A framework for quick web archiving

from qwarc.const import *
import aiohttp
import asyncio
import functools
import logging
import os
import pkg_resources
import platform
import time
import typing


PAGESIZE = os.sysconf('SC_PAGE_SIZE')


def get_rss():
	'''Get the current RSS of this process in bytes'''

	with open('/proc/self/statm', 'r') as fp:
		return int(fp.readline().split()[1]) * PAGESIZE


def get_disk_free():
	'''Get the current free disk space on the relevant partition in bytes'''

	st = os.statvfs('.')
	return st.f_bavail * st.f_frsize


def uses_too_much_memory(limit):
	'''
	Check whether the process is using too much memory

	For performance reasons, this actually only checks the memory usage on every 100th call.
	'''

	uses_too_much_memory.callCounter += 1
	# Only check every hundredth call
	if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
		return True
	return False
uses_too_much_memory.callCounter = 0


def too_little_disk_space(limit):
	'''
	Check whether the disk space is too small

	For performance reasons, this actually only checks the free disk space on every 100th call.
	'''

	too_little_disk_space.callCounter += 1
	if too_little_disk_space.callCounter % 100 == 0:
		too_little_disk_space.currentResult = (get_disk_free() < limit)
	return too_little_disk_space.currentResult
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False
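# Hypothetical usage sketch (not part of this module): a long-running loop might call
# these helpers with byte limits on every iteration; thanks to the call counters above,
# the actual /proc and statvfs lookups only happen on every 100th call.
#
#     while items_remaining():  # items_remaining/process_next_item are placeholders
#         if uses_too_much_memory(2 * 1024**3) or too_little_disk_space(10 * 1024**3):
#             break  # stop gracefully before the host runs out of memory or disk space
#         process_next_item()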
# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
	'''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''

	start = 0
	while True:
		start = aStr.find(sub, start)
		if start == -1:
			return
		yield start
		start += len(sub)


def str_get_between(aStr, a, b):
	'''Get the string after the first occurrence of a in aStr and the first occurrence of b after that of a, or None if there is no such string.'''

	aPos = aStr.find(a)
	if aPos == -1:
		return None
	offset = aPos + len(a)
	bPos = aStr.find(b, offset)
	if bPos == -1:
		return None
	return aStr[offset:bPos]


def maybe_str_get_between(x, a, b):
	'''Like str_get_between, but returns None if x evaluates to False and converts it to a str before matching.'''

	if x:
		return str_get_between(str(x), a, b)


def str_get_all_between(aStr, a, b):
	'''Generator yielding every string between an occurrence of a in aStr and the following occurrence of b.'''

	prevEnd = -1
	for aOffset in find_all(aStr, a):
		if aOffset < prevEnd:
			continue
		offset = aOffset + len(a)
		bPos = aStr.find(b, offset)
		if bPos != -1:
			yield aStr[offset:bPos]
			prevEnd = bPos + len(b)


def maybe_str_get_all_between(x, a, b):
	'''Like str_get_all_between, but yields no elements if x evaluates to False and converts x to a str before matching.'''

	if x:
		yield from str_get_all_between(str(x), a, b)
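# Illustrative examples (derived from the definitions above, not part of the module):
#
#     >>> str_get_between('<li>foo</li><li>bar</li>', '<li>', '</li>')
#     'foo'
#     >>> list(str_get_all_between('<li>foo</li><li>bar</li>', '<li>', '</li>'))
#     ['foo', 'bar']
#     >>> maybe_str_get_between(None, '<li>', '</li>') is None
#     True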
def generate_range_items(start, stop, step):
	'''
	Generator for items of `step` size between `start` and `stop` (inclusive)

	Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
	`b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
	Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

	Examples:
	- generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
	- generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
	- generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
	'''

	for i in range(start, stop + 1, step):
		yield f'{i}-{min(i + step - 1, stop)}'
async def handle_response_default(*, url, attempt, response, exc, redirectLevel, item):
	'''
	The default response handler, which behaves as follows:
	- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
	- If there was any exception and it is an asyncio.TimeoutError or an aiohttp.ClientError, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
	- If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
	- Otherwise, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- All responses are written to WARC by default.
	Note that this handler does not limit the number of retries on errors.

	Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), redirectLevel (int), item (qwarc.Item instance)
	At least one of response and exc is not None.
	The redirectLevel indicates how many redirects were followed to get to this url, i.e. it starts out as zero and increases by one for every redirect.
	The attempt starts from 1 for every url, i.e. it is reset on redirects. The handler is invoked at most once for each attempt.

	Returns: (one of the qwarc.ACTION_* constants, bool signifying whether to write to WARC or not)
	The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC.
	'''

	if response is None:
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (401, 403, 404, 405, 410):
		return ACTION_IGNORE, True
	if exc is not None:
		if isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
			await asyncio.sleep(5)
		return ACTION_RETRY, False  # Don't write to WARC since there might be an incomplete response
	if response.status in (200, 204, 206, 304):
		return ACTION_SUCCESS, True
	if response.status in (301, 302, 303, 307, 308):
		return ACTION_FOLLOW_OR_SUCCESS, True
	await asyncio.sleep(5)
	return ACTION_RETRY, True
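# Hedged sketch (not part of the module): a spec file could wrap the default handler to
# override individual decisions while keeping the rest of its logic, e.g. also treating
# HTTP 418 as a permanent error. The keyword-only signature and the (action, writeToWarc)
# return value follow the handler above; the wrapper name is made up for illustration.
#
#     async def handle_response_no_teapots(**kwargs):
#         if kwargs['response'] is not None and kwargs['response'].status == 418:
#             return ACTION_IGNORE, True
#         return await handle_response_default(**kwargs)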
async def handle_response_ignore_redirects(**kwargs):
	'''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''

	action, writeToWarc = await handle_response_default(**kwargs)
	if action == ACTION_FOLLOW_OR_SUCCESS:
		action = ACTION_SUCCESS
	return action, writeToWarc
def handle_response_limit_error_retries(maxRetries, handler = handle_response_default):
	'''
	A response handler that limits the number of retries on errors. It behaves as handler otherwise, which defaults to handle_response_default.

	Technically, this is actually a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))

	If you use the same limit many times, you should keep the return value (the response handler) of this function and reuse it to avoid creating a new function every time.
	'''

	async def _handler(**kwargs):
		action, writeToWarc = await handler(**kwargs)
		if action == ACTION_RETRY and kwargs['attempt'] > maxRetries:
			action = ACTION_RETRIES_EXCEEDED
		return action, writeToWarc
	return _handler
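# Reuse sketch based on the note in the docstring above (the fetch call mirrors the
# docstring and is illustrative):
#
#     handle_response_5_retries = handle_response_limit_error_retries(5)
#     ...
#     fetch(url, responseHandler = handle_response_5_retries)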
def handle_response_limit_redirect_depth(maxRedirects, handler = handle_response_default):
	'''
	A response handler that limits how many redirects are followed. It behaves as handler otherwise, which defaults to handle_response_default.

	The same details as for handle_response_limit_error_retries apply.
	'''

	async def _handler(**kwargs):
		action, writeToWarc = await handler(**kwargs)
		# redirectLevel starts off at 0, so if it is equal to maxRedirects - 1, there were exactly maxRedirects redirects
		if action == ACTION_FOLLOW_OR_SUCCESS and kwargs['redirectLevel'] >= maxRedirects - 1:
			action = ACTION_TOO_MANY_REDIRECTS
		return action, writeToWarc
	return _handler
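# Composition sketch (an assumption based on the factories above: since both accept a
# `handler` argument and return handlers with the same interface, they can be chained):
#
#     handler = handle_response_limit_error_retries(5, handler = handle_response_limit_redirect_depth(10))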
def _get_dependency_versions(*pkgs):
	'''Yield (distribution key, version) pairs for pkgs and all of their transitive requirements.'''

	pending = set(pkgs)
	have = set(pkgs)
	while pending:
		key = pending.pop()
		try:
			dist = pkg_resources.get_distribution(key)
		except pkg_resources.DistributionNotFound:
			logging.error(f'Unable to get distribution {key}')
			continue
		yield dist.key, dist.version
		for requirement in dist.requires():
			if requirement.key not in have:
				pending.add(requirement.key)
				have.add(requirement.key)
@functools.lru_cache(maxsize = 1)
def get_software_info(specDependencyPackages):
	# Based on crocoite.utils, authored by PromyLOPh in commit 6ccd72ab on 2018-12-08 under MIT licence
	baseDependencyPackageVersions = list(_get_dependency_versions(__package__))
	baseDependencyPackages = set(x[0] for x in baseDependencyPackageVersions)
	specDependencyPackageVersions = list(_get_dependency_versions(*specDependencyPackages))
	return {
		'platform': platform.platform(),
		'python': {
			'implementation': platform.python_implementation(),
			'version': platform.python_version(),
			'build': platform.python_build(),
		},
		'self': [{"package": package, "version": version} for package, version in baseDependencyPackageVersions],
		'spec': [{"package": package, "version": version} for package, version in specDependencyPackageVersions if package not in baseDependencyPackages],
	}
class LogFormatter(logging.Formatter):
	def __init__(self):
		super().__init__('%(asctime)s.%(msecs)03dZ %(name)s %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
		self.converter = time.gmtime

	def format(self, record):
		if not hasattr(record, 'itemString'):
			if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
				record.itemString = f'{record.itemType}:{record.itemValue}'
			else:
				record.itemString = 'None'
		return super().format(record)
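# Hypothetical setup sketch using the standard logging API (not part of this module);
# the itemType/itemValue extras are the record attributes handled by format() above:
#
#     streamHandler = logging.StreamHandler()
#     streamHandler.setFormatter(LogFormatter())
#     logging.getLogger().addHandler(streamHandler)
#     logging.getLogger().setLevel(logging.INFO)
#     logging.info('retrieved', extra = {'itemType': 'url', 'itemValue': 'https://example.org/'})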
class SpecDependencies(typing.NamedTuple):
	packages: tuple = ()
	files: tuple = ()
	extra: typing.Any = None
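# Illustrative instantiation (the package and file names are made-up examples):
#
#     deps = SpecDependencies(packages = ('beautifulsoup4',), files = ('urls.txt',))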
class ReadonlyFileView:
	'''
	A poor read-only view for a file object. It hides the writing methods and passes everything else through to the underlying file object. Note that this does *not* actually prevent modification at all.
	'''

	def __init__(self, fp):
		self._fp = fp

	def __getattr__(self, key):
		if key in ('write', 'writelines', 'truncate'):
			raise AttributeError
		if key == 'writable':
			return False
		return getattr(self._fp, key)
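# Usage sketch (the file name is illustrative): read access passes through, write access is hidden.
#
#     with open('example.warc.gz', 'rb') as fp:
#         view = ReadonlyFileView(fp)
#         data = view.read(4)       # delegated to fp.read via __getattr__
#         hasattr(view, 'write')    # False: __getattr__ raises AttributeError for 'write'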
class FrozenFileView:
	'''
	A poor minimal frozen view for a file object. It fixes the bounds of the file, i.e. if something is appended to the underlying file object, it does not become visible in the frozen view. Only seek, tell, and read are implemented.

	Note that seeks and reads will affect the underlying file object. The actual data is not really frozen either, and any changes on the underlying file object will affect the frozen view as well.
	'''

	def __init__(self, fp, begin, end):
		'''
		fp: file-like object
		begin: int, offset from beginning of the file
		end: int, offset from beginning of the file
		'''

		self._fp = fp
		self._begin = begin
		self._end = end

	def seek(self, offset, whence = os.SEEK_SET):
		if whence == os.SEEK_SET:
			return self._fp.seek(self._begin + offset, whence)
		elif whence == os.SEEK_CUR:
			return self._fp.seek(offset, whence)
		elif whence == os.SEEK_END:
			# Relative to the frozen view's end, i.e. an absolute offset in the underlying file
			return self._fp.seek(self._end + offset, os.SEEK_SET)
		raise NotImplementedError

	def tell(self):
		return self._fp.tell() - self._begin

	def read(self, size = -1):
		curPos = self._fp.tell()
		if curPos < self._begin:
			self._fp.seek(self._begin)
		elif curPos > self._end:
			return self._fp.read(0)
		if size == -1:
			return self._fp.read(self._end - self._fp.tell())
		return self._fp.read(min(size, self._end - self._fp.tell()))
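# Usage sketch (offsets are illustrative, fp is an assumed already-open file object):
# expose bytes 100..199 of fp as their own small file-like object.
#
#     view = FrozenFileView(fp, 100, 200)
#     view.seek(0)           # positions fp at absolute offset 100
#     chunk = view.read()    # reads at most up to offset 200 of fp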
class DummyClientResponse:
	'''A ClientResponse-like object for when no actual ClientResponse is available. Always evaluates to False when cast to a bool.'''

	def __init__(self):
		self._qhistory = None

	@property
	def qhistory(self):
		return self._qhistory

	@qhistory.setter
	def qhistory(self, history):
		self._qhistory = history

	def __bool__(self):
		return False
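# Falsy-behaviour sketch (the surrounding fetch logic is assumed, not shown here):
#
#     response = DummyClientResponse()
#     if not response:
#         pass  # this branch is taken: the dummy always evaluates to False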