A framework for quick web archiving
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

297 lines
10 KiB

  1. from qwarc.const import *
  2. import aiohttp
  3. import asyncio
  4. import functools
  5. import logging
  6. import os
  7. import pkg_resources
  8. import platform
  9. import time
  10. import typing
  11. PAGESIZE = os.sysconf('SC_PAGE_SIZE')
  12. def get_rss():
  13. '''Get the current RSS of this process in bytes'''
  14. with open('/proc/self/statm', 'r') as fp:
  15. return int(fp.readline().split()[1]) * PAGESIZE
  16. def get_disk_free():
  17. '''Get the current free disk space on the relevant partition in bytes'''
  18. st = os.statvfs('.')
  19. return st.f_bavail * st.f_frsize
  20. def uses_too_much_memory(limit):
  21. '''
  22. Check whether the process is using too much memory
  23. For performance reasons, this actually only checks the memory usage on every 100th call.
  24. '''
  25. uses_too_much_memory.callCounter += 1
  26. # Only check every hundredth call
  27. if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
  28. return True
  29. return False
  30. uses_too_much_memory.callCounter = 0
  31. def too_little_disk_space(limit):
  32. '''
  33. Check whether the disk space is too small
  34. For performance reasons, this actually only checks the free disk space on every 100th call.
  35. '''
  36. too_little_disk_space.callCounter += 1
  37. if too_little_disk_space.callCounter % 100 == 0:
  38. too_little_disk_space.currentResult = (get_disk_free() < limit)
  39. return too_little_disk_space.currentResult
  40. too_little_disk_space.callCounter = 0
  41. too_little_disk_space.currentResult = False
  42. # https://stackoverflow.com/a/4665027
  43. def find_all(aStr, sub):
  44. '''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''
  45. start = 0
  46. while True:
  47. start = aStr.find(sub, start)
  48. if start == -1:
  49. return
  50. yield start
  51. start += len(sub)
  52. def str_get_between(aStr, a, b):
  53. '''Get the string after the first occurrence of a in aStr and the first occurrence of b after that of a, or None if there is no such string.'''
  54. aPos = aStr.find(a)
  55. if aPos == -1:
  56. return None
  57. offset = aPos + len(a)
  58. bPos = aStr.find(b, offset)
  59. if bPos == -1:
  60. return None
  61. return aStr[offset:bPos]
  62. def maybe_str_get_between(x, a, b):
  63. '''Like str_get_between, but returns None if x evaluates to False and converts it to a str before matching.'''
  64. if x:
  65. return str_get_between(str(x), a, b)
  66. def str_get_all_between(aStr, a, b):
  67. '''Generator yielding every string between occurrences of a in aStr and the following occurrence of b.'''
  68. #TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
  69. # Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos+len(b).
  70. for aOffset in find_all(aStr, a):
  71. offset = aOffset + len(a)
  72. bPos = aStr.find(b, offset)
  73. if bPos != -1:
  74. yield aStr[offset:bPos]
  75. def maybe_str_get_all_between(x, a, b):
  76. '''Like str_get_all_between, but yields no elements if x evaluates to False and converts x to a str before matching.'''
  77. if x:
  78. yield from str_get_all_between(str(x), a, b)
  79. def generate_range_items(start, stop, step):
  80. '''
  81. Generator for items of `step` size between `start` and `stop` (inclusive)
  82. Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
  83. `b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
  84. Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).
  85. Examples:
  86. - generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
  87. - generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
  88. - generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
  89. '''
  90. for i in range(start, stop + 1, step):
  91. yield f'{i}-{min(i + step - 1, stop)}'
  92. async def handle_response_default(*, url, attempt, response, exc, redirectLevel, item):
  93. '''
  94. The default response handler, which behaves as follows:
  95. - If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
  96. - If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
  97. - If there was any exception and it is a asyncio.TimeoutError or a aiohttp.ClientError, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
  98. - If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
  99. - If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
  100. - Otherwise, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
  101. - All responses are written to WARC by default.
  102. Note that this handler does not limit the number of retries on errors.
  103. Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), redirectLevel (int), item (qwarc.Item instance)
  104. At least one of response and exc is not None.
  105. The redirectLevel indicates how many redirects were followed to get to this url, i.e. it starts out as zero and increases by one for every redirect.
  106. The attempt starts from 1 for every url, i.e. it is reset on redirects. The handler is invoked at most once for each attempt.
  107. Returns: (one of the qwarc.RESPONSE_* constants, bool signifying whether to write to WARC or not)
  108. The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC.
  109. '''
  110. if response is None:
  111. await asyncio.sleep(5)
  112. return ACTION_RETRY, True
  113. if response.status in (401, 403, 404, 405, 410):
  114. return ACTION_IGNORE, True
  115. if exc is not None:
  116. if isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
  117. await asyncio.sleep(5)
  118. return ACTION_RETRY, False # Don't write to WARC since there might be an incomplete response
  119. if response.status in (200, 204, 206, 304):
  120. return ACTION_SUCCESS, True
  121. if response.status in (301, 302, 303, 307, 308):
  122. return ACTION_FOLLOW_OR_SUCCESS, True
  123. await asyncio.sleep(5)
  124. return ACTION_RETRY, True
  125. async def handle_response_ignore_redirects(**kwargs):
  126. '''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''
  127. action, writeToWarc = await handle_response_default(**kwargs)
  128. if action == ACTION_FOLLOW_OR_SUCCESS:
  129. action = ACTION_SUCCESS
  130. return action, writeToWarc
  131. def handle_response_limit_error_retries(maxRetries, handler = handle_response_default):
  132. '''A response handler that limits the number of retries on errors. It behaves as handler otherwise, which defaults to handle_response_default.
  133. Technically, this is actually a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))
  134. If you use the same limit many times, you should keep the return value (the response handler) of this method and reuse it to avoid creating a new function every time.
  135. '''
  136. async def _handler(**kwargs):
  137. action, writeToWarc = await handler(**kwargs)
  138. if action == ACTION_RETRY and kwargs['attempt'] > maxRetries:
  139. action = ACTION_RETRIES_EXCEEDED
  140. return action, writeToWarc
  141. return _handler
  142. def handle_response_limit_redirect_depth(maxRedirects, handler = handle_response_default):
  143. '''
  144. A response handler that limits how many redirects are followed. It behaves as handler otherwise, which defaults to handle_response_default.
  145. The same details as for handle_response_limit_error_retries apply.
  146. '''
  147. async def _handler(**kwargs):
  148. action, writeToWarc = await handler(**kwargs)
  149. # redirectLevel starts off at 0 so if it is equal to maxRedirects - 1, there were exactly maxRedirects redirects
  150. if action == ACTION_FOLLOW_OR_SUCCESS and kwargs['redirectLevel'] >= maxRedirects - 1:
  151. action = ACTION_TOO_MANY_REDIRECTS
  152. return action, writeToWarc
  153. return _handler
  154. def _get_dependency_versions(*pkgs):
  155. pending = set(pkgs)
  156. have = set(pkgs)
  157. while pending:
  158. key = pending.pop()
  159. try:
  160. dist = pkg_resources.get_distribution(key)
  161. except pkg_resources.DistributionNotFound:
  162. logging.error(f'Unable to get distribution {key}')
  163. continue
  164. yield dist.key, dist.version
  165. for requirement in dist.requires():
  166. if requirement.key not in have:
  167. pending.add(requirement.key)
  168. have.add(requirement.key)
  169. @functools.lru_cache(maxsize = 1)
  170. def get_software_info(specDependencyPackages):
  171. # Based on crocoite.utils, authored by PromyLOPh in commit 6ccd72ab on 2018-12-08 under MIT licence
  172. baseDependencyPackageVersions = list(_get_dependency_versions(__package__))
  173. baseDependencyPackages = set(x[0] for x in baseDependencyPackageVersions)
  174. specDependencyPackageVersions = list(_get_dependency_versions(*specDependencyPackages))
  175. return {
  176. 'platform': platform.platform(),
  177. 'python': {
  178. 'implementation': platform.python_implementation(),
  179. 'version': platform.python_version(),
  180. 'build': platform.python_build(),
  181. },
  182. 'self': [{"package": package, "version": version} for package, version in baseDependencyPackageVersions],
  183. 'spec': [{"package": package, "version": version} for package, version in specDependencyPackageVersions if package not in baseDependencyPackages],
  184. }
  185. class LogFormatter(logging.Formatter):
  186. def __init__(self):
  187. super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
  188. self.converter = time.gmtime
  189. def format(self, record):
  190. if not hasattr(record, 'itemString'):
  191. if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
  192. record.itemString = f'{record.itemType}:{record.itemValue}'
  193. else:
  194. record.itemString = 'None'
  195. return super().format(record)
class SpecDependencies(typing.NamedTuple):
	'''Declares the external dependencies of a spec file. Immutable; all fields default to empty.'''

	# Names of Python packages the spec depends on.
	# NOTE(review): presumably passed to get_software_info for version recording — confirm against caller.
	packages: tuple = ()
	# Additional files the spec needs — TODO confirm how these are consumed by the rest of qwarc.
	files: tuple = ()
	# Arbitrary extra data attached by the spec; opaque to this module.
	extra: typing.Any = None
  200. class ReadonlyFileView:
  201. '''
  202. A poor read-only view for a file object. It hides the writing methods and passes everything else through to the underlying file object. Note that this does *not* actually prevent modification at all.
  203. '''
  204. def __init__(self, fp):
  205. self._fp = fp
  206. def __getattr__(self, key):
  207. if key in ('write', 'writelines', 'truncate'):
  208. raise AttributeError
  209. if key == 'writable':
  210. return False
  211. return getattr(self._fp, key)
  212. class DummyClientResponse:
  213. '''A ClientResponse-like object for when no actual ClientResponse is available. Always evaluates to False when cast to a bool.'''
  214. def __init__(self):
  215. self._qhistory = None
  216. @property
  217. def qhistory(self):
  218. return self._qhistory
  219. @qhistory.setter
  220. def qhistory(self, history):
  221. self._qhistory = history
  222. def __bool__(self):
  223. return False