A framework for quick web archiving

from qwarc.const import *
import aiohttp
import asyncio
import functools
import io
import logging
import os
import pkg_resources
import platform
import time
import typing
import zlib


PAGESIZE = os.sysconf('SC_PAGE_SIZE')


def get_rss():
	'''Get the current RSS of this process in bytes'''

	with open('/proc/self/statm', 'r') as fp:
		return int(fp.readline().split()[1]) * PAGESIZE


def get_disk_free():
	'''Get the current free disk space on the relevant partition in bytes'''

	st = os.statvfs('.')
	return st.f_bavail * st.f_frsize


def uses_too_much_memory(limit):
	'''
	Check whether the process is using too much memory

	For performance reasons, this actually only checks the memory usage on every 100th call.
	'''

	uses_too_much_memory.callCounter += 1
	# Only check every hundredth call
	if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
		return True
	return False
uses_too_much_memory.callCounter = 0


def too_little_disk_space(limit):
	'''
	Check whether the disk space is too small

	For performance reasons, this actually only checks the free disk space on every 100th call.
	'''

	too_little_disk_space.callCounter += 1
	if too_little_disk_space.callCounter % 100 == 0:
		too_little_disk_space.currentResult = (get_disk_free() < limit)
	return too_little_disk_space.currentResult
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False
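
# A minimal usage sketch (illustrative, not from the original source): both
# checkers are meant to be polled from a hot loop, with limits given in bytes.
#   while True:
#       if uses_too_much_memory(2 * 1024 ** 3):  # RSS above ~2 GiB
#           break
#       if too_little_disk_space(10 * 1024 ** 3):  # less than ~10 GiB free
#           break
#       ...  # do one unit of work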


# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
	'''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''

	start = 0
	while True:
		start = aStr.find(sub, start)
		if start == -1:
			return
		yield start
		start += len(sub)
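
# Example (illustrative, not from the original source):
#   >>> list(find_all('banana', 'an'))
#   [1, 3]
#   >>> list(find_all('aaaa', 'aa'))  # non-overlapping: the search resumes after each match
#   [0, 2]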


def str_get_between(aStr, a, b):
	'''Get the string between the first occurrence of a in aStr and the first occurrence of b after that of a, or None if there is no such string.'''

	aPos = aStr.find(a)
	if aPos == -1:
		return None
	offset = aPos + len(a)
	bPos = aStr.find(b, offset)
	if bPos == -1:
		return None
	return aStr[offset:bPos]
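
# Example (illustrative, not from the original source):
#   >>> str_get_between('<a href="https://example.org/">x</a>', 'href="', '"')
#   'https://example.org/'
#   >>> str_get_between('abc', 'x', 'c') is None
#   True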


def maybe_str_get_between(x, a, b):
	'''Like str_get_between, but returns None if x evaluates to False and converts it to a str before matching.'''

	if x:
		return str_get_between(str(x), a, b)


def str_get_all_between(aStr, a, b):
	'''Generator yielding every string between occurrences of a in aStr and the following occurrence of b.'''

	#TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
	# Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos + len(b).
	for aOffset in find_all(aStr, a):
		offset = aOffset + len(a)
		bPos = aStr.find(b, offset)
		if bPos != -1:
			yield aStr[offset:bPos]


def maybe_str_get_all_between(x, a, b):
	'''Like str_get_all_between, but yields no elements if x evaluates to False and converts x to a str before matching.'''

	if x:
		yield from str_get_all_between(str(x), a, b)
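
# Example (illustrative, not from the original source); note the half-overlapping
# matches described in the TODO above:
#   >>> list(str_get_all_between('aabc', 'a', 'c'))
#   ['ab', 'b']
#   >>> list(maybe_str_get_all_between(None, 'a', 'c'))
#   []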


def generate_range_items(start, stop, step):
	'''
	Generator for items of `step` size between `start` and `stop` (inclusive)

	Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
	`b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
	Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

	Examples:
	- generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
	- generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
	- generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
	'''

	for i in range(start, stop + 1, step):
		yield f'{i}-{min(i + step - 1, stop)}'
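
# Example of consuming the yielded ranges (illustrative, not from the original
# source), e.g. to shard an ID space into separate work items:
#   >>> [item.split('-') for item in generate_range_items(1, 6, 3)]
#   [['1', '3'], ['4', '6']]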


async def handle_response_default(url, attempt, response, exc):
	'''
	The default response handler, which behaves as follows:
	- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
	- If there was any exception and it is an asyncio.TimeoutError or an aiohttp.ClientError, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
	- If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
	- Otherwise, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- All responses are written to WARC by default.

	Note that this handler does not limit the number of retries on errors.

	Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
	At least one of response and exc is not None.
	Returns: (one of the qwarc.RESPONSE_* constants, bool signifying whether to write to WARC or not)
	The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC.
	'''

	#TODO: Document that `attempt` is reset on redirects
	if response is None:
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (401, 403, 404, 405, 410):
		return ACTION_IGNORE, True
	if exc is not None:
		if isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
			await asyncio.sleep(5)
			return ACTION_RETRY, False # Don't write to WARC since there might be an incomplete response
	if response.status in (200, 204, 206, 304):
		return ACTION_SUCCESS, True
	if response.status in (301, 302, 303, 307, 308):
		return ACTION_FOLLOW_OR_SUCCESS, True
	await asyncio.sleep(5)
	return ACTION_RETRY, True


async def handle_response_ignore_redirects(url, attempt, response, exc):
	'''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves like handle_response_default otherwise.'''

	action, writeToWarc = await handle_response_default(url, attempt, response, exc)
	if action == ACTION_FOLLOW_OR_SUCCESS:
		action = ACTION_SUCCESS
	return action, writeToWarc


def handle_response_limit_error_retries(maxRetries, handler = handle_response_default):
	'''A response handler that limits the number of retries on errors. Otherwise, it defers to handler, which defaults to handle_response_default.

	Technically, this is a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))

	If you use the same limit many times, you should keep the return value (the response handler) of this function and reuse it to avoid creating a new function every time.
	'''

	async def _handler(url, attempt, response, exc):
		action, writeToWarc = await handler(url, attempt, response, exc)
		if action == ACTION_RETRY and attempt > maxRetries:
			action = ACTION_RETRIES_EXCEEDED
		return action, writeToWarc
	return _handler
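
# A hedged usage sketch: the handlers compose, so a retry limit can wrap the
# non-redirecting handler. The fetch call site is an assumption based on the
# docstring above, not code from this file.
#   limitedHandler = handle_response_limit_error_retries(5, handler = handle_response_ignore_redirects)
#   # ... fetch(url, responseHandler = limitedHandler)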


def _get_dependency_versions(*pkgs):
	'''Yield (package, version) pairs for pkgs and all of their transitive dependencies.'''

	pending = set(pkgs)
	have = set(pkgs)
	while pending:
		key = pending.pop()
		try:
			dist = pkg_resources.get_distribution(key)
		except pkg_resources.DistributionNotFound:
			logging.error(f'Unable to get distribution {key}')
			continue
		yield dist.key, dist.version
		for requirement in dist.requires():
			if requirement.key not in have:
				pending.add(requirement.key)
				have.add(requirement.key)


@functools.lru_cache(maxsize = 1)
def get_software_info(specFile, specDependencies):
	# Based on crocoite.utils, authored by PromyLOPh in commit 6ccd72ab on 2018-12-08 under MIT licence
	baseDependencyPackageVersions = list(_get_dependency_versions(__package__))
	baseDependencyPackages = set(x[0] for x in baseDependencyPackageVersions)
	specDependencyPackageVersions = list(_get_dependency_versions(*specDependencies.packages))
	return {
		'platform': platform.platform(),
		'python': {
			'implementation': platform.python_implementation(),
			'version': platform.python_version(),
			'build': platform.python_build(),
		},
		'self': [{"package": package, "version": version} for package, version in baseDependencyPackageVersions],
		'spec': [{"package": package, "version": version} for package, version in specDependencyPackageVersions if package not in baseDependencyPackages],
	}


class LogFormatter(logging.Formatter):
	def __init__(self):
		super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
		self.converter = time.gmtime

	def format(self, record):
		if not hasattr(record, 'itemString'):
			if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
				record.itemString = f'{record.itemType}:{record.itemValue}'
			else:
				record.itemString = 'None'
		return super().format(record)
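
# Illustrative setup (not from the original source): attach the formatter to a
# handler and pass the item context via `extra`. The sample output line assumes
# a hypothetical timestamp.
#   handler = logging.StreamHandler()
#   handler.setFormatter(LogFormatter())
#   logging.getLogger().addHandler(handler)
#   logging.getLogger().setLevel(logging.INFO)
#   logging.info('retrieved', extra = {'itemType': 'url', 'itemValue': 'https://example.org/'})
#   # -> 2000-01-01 00:00:00.000Z INFO url:https://example.org/ retrieved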


class SpecDependencies(typing.NamedTuple):
	packages: tuple = ()
	files: tuple = ()
	extra: typing.Any = None


class ReadonlyFileView:
	'''
	A poor read-only view for a file object. It hides the writing methods and passes everything else through to the underlying file object. Note that this does *not* actually prevent modification at all.
	'''

	def __init__(self, fp):
		self._fp = fp

	def __getattr__(self, key):
		if key in ('write', 'writelines', 'truncate'):
			raise AttributeError
		if key == 'writable':
			return False
		return getattr(self._fp, key)
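
# Illustrative behaviour (not from the original source); note that `writable` is
# exposed as the plain value False, not as a method like on io objects:
#   view = ReadonlyFileView(open('/dev/null', 'rb'))
#   view.read()       # passes through to the underlying file
#   view.writable     # False (an attribute here, not a callable)
#   view.write(b'x')  # raises AttributeError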


def iter_file(f, length = None, blockSize = 1048576):
	'''Read `length` bytes from `f` in chunks of `blockSize` bytes. If `length` is `None`, read until EOF.'''

	read = 0
	while True:
		buf = f.read(blockSize)
		if not buf: # EOF
			if length and read < length:
				raise RuntimeError('Reached EOF before reading enough data')
			break
		if length and read + len(buf) > length:
			initialBufLen = len(buf)
			buf = buf[0 : length - read]
			f.seek(len(buf) - initialBufLen, io.SEEK_CUR)
		read += len(buf)
		yield buf
		if length and read >= length:
			if read > length: # This should never happen due to the truncation above.
				raise RuntimeError('Overread')
			break
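
# Example (illustrative, not from the original source): reading a bounded prefix
# leaves the file positioned at the boundary even though a larger block was read
# internally.
#   >>> f = io.BytesIO(b'abcdef')
#   >>> b''.join(iter_file(f, length = 4))
#   b'abcd'
#   >>> f.tell()
#   4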


def read_http_headers(f, copy = None):
	'''Read the status/request line and headers from the file-like object f. Returns a dict mapping the lower-cased header names to their values; whitespace and line endings around the values are not stripped. If copy is provided, every line read is also written to it.'''

	headers = {}
	# Status line or request line
	line = f.readline()
	if copy:
		copy.write(line)

	line = f.readline()
	if copy:
		copy.write(line)
	while line and line not in (b'\r\n', b'\r', b'\n'):
		# Split into header name and value
		name, value = line.split(b':', 1)
		name = name.strip(b' \t')
		#TODO name validation

		# Read next line
		line = f.readline()
		if copy:
			copy.write(line)

		# Handle continuation lines
		continuation = line[0:1] in (b' ', b'\t')
		if continuation:
			value = [value]  # Start from the value on the header line, then append the continuation lines
			while continuation:
				value.append(line)
				line = f.readline()
				if copy:
					copy.write(line)
				continuation = line[0:1] in (b' ', b'\t')
			value = b''.join(value)

		# Decode and store
		try:
			name = name.decode('utf-8')
		except UnicodeDecodeError:
			name = name.decode('iso-8859-1')
		try:
			value = value.decode('utf-8')
		except UnicodeDecodeError:
			value = value.decode('iso-8859-1')
		headers[name.lower()] = value

	# `line` is already the next line, if any
	return headers


def read_http_body(f, length, headers):
	'''Generator yielding the body of an HTTP message read from the file-like object f, dechunking if the headers indicate chunked transfer encoding; length is the expected body size for non-chunked messages.'''

	if 'chunked' in map(str.strip, headers.get('transfer-encoding', '').split(',')):
		while True:
			chunkLine = f.readline()
			if b';' in chunkLine:
				chunkLength = chunkLine.split(b';', 1)[0].strip()
			else:
				chunkLength = chunkLine.strip()
			chunkLength = int(chunkLength, base = 16)
			if chunkLength == 0:
				break
			yield from iter_file(f, length = chunkLength)
			assert f.read(2) == b'\r\n' # Chunk terminator
		# Consume trailer
		line = f.readline()
		while line and line not in (b'\r\n', b'\r', b'\n'):
			line = f.readline()
	else:
		yield from iter_file(f, length = length)
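
# A small end-to-end sketch (illustrative, not from the original source). Header
# values keep their raw surrounding whitespace, so convert or strip at the use site.
#   >>> f = io.BytesIO(b'HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello')
#   >>> headers = read_http_headers(f)
#   >>> b''.join(read_http_body(f, int(headers['content-length']), headers))
#   b'hello'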


class GzipWrapper:
	'''Wrap a file-like object, gzip-compressing everything written within one `with` block as a single gzip member.'''

	def __init__(self, f):
		self._file = f
		self._compressor = None

	def __enter__(self):
		self._compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
		return self

	def write(self, data):
		buf = self._compressor.compress(data)
		self._file.write(buf)

	def __exit__(self, excType, excVal, excTb):
		buf = self._compressor.flush()
		self._file.write(buf)
		self._file.flush()
		self._compressor = None
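
# Illustrative use (not from the original source): each `with` block emits one
# self-contained gzip member, the layout .warc.gz files use to store one record
# per member. The file name and record payloads are hypothetical.
#   with open('example.warc.gz', 'wb') as fp:
#       for record in (b'record 1', b'record 2'):
#           with GzipWrapper(fp) as g:
#               g.write(record)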