A framework for quick web archiving
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

281 lines
9.6 KiB

  1. from qwarc.const import *
  2. import aiohttp
  3. import asyncio
  4. import functools
  5. import logging
  6. import os
  7. import pkg_resources
  8. import platform
  9. import time
  10. import typing
  11. PAGESIZE = os.sysconf('SC_PAGE_SIZE')
  12. def get_rss():
  13. '''Get the current RSS of this process in bytes'''
  14. with open('/proc/self/statm', 'r') as fp:
  15. return int(fp.readline().split()[1]) * PAGESIZE
  16. def get_disk_free():
  17. '''Get the current free disk space on the relevant partition in bytes'''
  18. st = os.statvfs('.')
  19. return st.f_bavail * st.f_frsize
  20. def uses_too_much_memory(limit):
  21. '''
  22. Check whether the process is using too much memory
  23. For performance reasons, this actually only checks the memory usage on every 100th call.
  24. '''
  25. uses_too_much_memory.callCounter += 1
  26. # Only check every hundredth call
  27. if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
  28. return True
  29. return False
  30. uses_too_much_memory.callCounter = 0
  31. def too_little_disk_space(limit):
  32. '''
  33. Check whether the disk space is too small
  34. For performance reasons, this actually only checks the free disk space on every 100th call.
  35. '''
  36. too_little_disk_space.callCounter += 1
  37. if too_little_disk_space.callCounter % 100 == 0:
  38. too_little_disk_space.currentResult = (get_disk_free() < limit)
  39. return too_little_disk_space.currentResult
  40. too_little_disk_space.callCounter = 0
  41. too_little_disk_space.currentResult = False
  42. # https://stackoverflow.com/a/4665027
  43. def find_all(aStr, sub):
  44. '''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''
  45. start = 0
  46. while True:
  47. start = aStr.find(sub, start)
  48. if start == -1:
  49. return
  50. yield start
  51. start += len(sub)
  52. def str_get_between(aStr, a, b):
  53. '''Get the string after the first occurrence of a in aStr and the first occurrence of b after that of a, or None if there is no such string.'''
  54. aPos = aStr.find(a)
  55. if aPos == -1:
  56. return None
  57. offset = aPos + len(a)
  58. bPos = aStr.find(b, offset)
  59. if bPos == -1:
  60. return None
  61. return aStr[offset:bPos]
  62. def maybe_str_get_between(x, a, b):
  63. '''Like str_get_between, but returns None if x evaluates to False and converts it to a str before matching.'''
  64. if x:
  65. return str_get_between(str(x), a, b)
  66. def str_get_all_between(aStr, a, b):
  67. '''Generator yielding every string between occurrences of a in aStr and the following occurrence of b.'''
  68. #TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
  69. # Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos+len(b).
  70. for aOffset in find_all(aStr, a):
  71. offset = aOffset + len(a)
  72. bPos = aStr.find(b, offset)
  73. if bPos != -1:
  74. yield aStr[offset:bPos]
  75. def maybe_str_get_all_between(x, a, b):
  76. '''Like str_get_all_between, but yields no elements if x evaluates to False and converts x to a str before matching.'''
  77. if x:
  78. yield from str_get_all_between(str(x), a, b)
  79. def generate_range_items(start, stop, step):
  80. '''
  81. Generator for items of `step` size between `start` and `stop` (inclusive)
  82. Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
  83. `b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
  84. Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).
  85. Examples:
  86. - generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
  87. - generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
  88. - generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
  89. '''
  90. for i in range(start, stop + 1, step):
  91. yield f'{i}-{min(i + step - 1, stop)}'
  92. async def handle_response_default(url, attempt, response, exc, item):
  93. '''
  94. The default response handler, which behaves as follows:
  95. - If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
  96. - If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
  97. - If there was any exception and it is a asyncio.TimeoutError or a aiohttp.ClientError, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
  98. - If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
  99. - If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
  100. - Otherwise, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
  101. - All responses are written to WARC by default.
  102. Note that this handler does not limit the number of retries on errors.
  103. Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), item (qwarc.Item instance)
  104. At least one of response and exc is not None.
  105. Returns: (one of the qwarc.RESPONSE_* constants, bool signifying whether to write to WARC or not)
  106. The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC.
  107. '''
  108. #TODO: Document that `attempt` is reset on redirects
  109. if response is None:
  110. await asyncio.sleep(5)
  111. return ACTION_RETRY, True
  112. if response.status in (401, 403, 404, 405, 410):
  113. return ACTION_IGNORE, True
  114. if exc is not None:
  115. if isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
  116. await asyncio.sleep(5)
  117. return ACTION_RETRY, False # Don't write to WARC since there might be an incomplete response
  118. if response.status in (200, 204, 206, 304):
  119. return ACTION_SUCCESS, True
  120. if response.status in (301, 302, 303, 307, 308):
  121. return ACTION_FOLLOW_OR_SUCCESS, True
  122. await asyncio.sleep(5)
  123. return ACTION_RETRY, True
  124. async def handle_response_ignore_redirects(url, attempt, response, exc, item):
  125. '''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''
  126. action, writeToWarc = await handle_response_default(url, attempt, response, exc, item)
  127. if action == ACTION_FOLLOW_OR_SUCCESS:
  128. action = ACTION_SUCCESS
  129. return action, writeToWarc
  130. def handle_response_limit_error_retries(maxRetries, handler = handle_response_default):
  131. '''A response handler that limits the number of retries on errors. It behaves as handler otherwise, which defaults to handle_response_default.
  132. Technically, this is actually a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))
  133. If you use the same limit many times, you should keep the return value (the response handler) of this method and reuse it to avoid creating a new function every time.
  134. '''
  135. async def _handler(url, attempt, response, exc, item):
  136. action, writeToWarc = await handler(url, attempt, response, exc, item)
  137. if action == ACTION_RETRY and attempt > maxRetries:
  138. action = ACTION_RETRIES_EXCEEDED
  139. return action, writeToWarc
  140. return _handler
  141. def _get_dependency_versions(*pkgs):
  142. pending = set(pkgs)
  143. have = set(pkgs)
  144. while pending:
  145. key = pending.pop()
  146. try:
  147. dist = pkg_resources.get_distribution(key)
  148. except pkg_resources.DistributionNotFound:
  149. logging.error(f'Unable to get distribution {key}')
  150. continue
  151. yield dist.key, dist.version
  152. for requirement in dist.requires():
  153. if requirement.key not in have:
  154. pending.add(requirement.key)
  155. have.add(requirement.key)
  156. @functools.lru_cache(maxsize = 1)
  157. def get_software_info(specFile, specDependencies):
  158. # Based on crocoite.utils, authored by PromyLOPh in commit 6ccd72ab on 2018-12-08 under MIT licence
  159. baseDependencyPackageVersions = list(_get_dependency_versions(__package__))
  160. baseDependencyPackages = set(x[0] for x in baseDependencyPackageVersions)
  161. specDependencyPackageVersions = list(_get_dependency_versions(*specDependencies.packages))
  162. return {
  163. 'platform': platform.platform(),
  164. 'python': {
  165. 'implementation': platform.python_implementation(),
  166. 'version': platform.python_version(),
  167. 'build': platform.python_build(),
  168. },
  169. 'self': [{"package": package, "version": version} for package, version in baseDependencyPackageVersions],
  170. 'spec': [{"package": package, "version": version} for package, version in specDependencyPackageVersions if package not in baseDependencyPackages],
  171. }
  172. class LogFormatter(logging.Formatter):
  173. def __init__(self):
  174. super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
  175. self.converter = time.gmtime
  176. def format(self, record):
  177. if not hasattr(record, 'itemString'):
  178. if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
  179. record.itemString = f'{record.itemType}:{record.itemValue}'
  180. else:
  181. record.itemString = 'None'
  182. return super().format(record)
class SpecDependencies(typing.NamedTuple):
    '''Dependencies declared by a spec file.'''
    # Package names whose versions get recorded (e.g. by get_software_info).
    packages: tuple = ()
    # Extra files the spec depends on — presumably paths; TODO confirm against callers.
    files: tuple = ()
    # Arbitrary extra payload; opaque to this module.
    extra: typing.Any = None
  187. class ReadonlyFileView:
  188. '''
  189. A poor read-only view for a file object. It hides the writing methods and passes everything else through to the underlying file object. Note that this does *not* actually prevent modification at all.
  190. '''
  191. def __init__(self, fp):
  192. self._fp = fp
  193. def __getattr__(self, key):
  194. if key in ('write', 'writelines', 'truncate'):
  195. raise AttributeError
  196. if key == 'writable':
  197. return False
  198. return getattr(self._fp, key)
  199. class DummyClientResponse:
  200. '''A ClientResponse-like object for when no actual ClientResponse is available. Always evaluates to False when cast to a bool.'''
  201. def __init__(self):
  202. self._qhistory = None
  203. @property
  204. def qhistory(self):
  205. return self._qhistory
  206. @qhistory.setter
  207. def qhistory(self, history):
  208. self._qhistory = history
  209. def __bool__(self):
  210. return False