A framework for quick web archiving
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

183 lignes
6.5 KiB

  1. from qwarc.const import *
  2. import asyncio
  3. import os
  4. PAGESIZE = os.sysconf('SC_PAGE_SIZE')
  5. def get_rss():
  6. '''Get the current RSS of this process in bytes'''
  7. with open('/proc/self/statm', 'r') as fp:
  8. return int(fp.readline().split()[1]) * PAGESIZE
  9. def get_disk_free():
  10. '''Get the current free disk space on the relevant partition in bytes'''
  11. st = os.statvfs('.')
  12. return st.f_bavail * st.f_frsize
  13. def uses_too_much_memory(limit):
  14. '''
  15. Check whether the process is using too much memory
  16. For performance reasons, this actually only checks the memory usage on every 100th call.
  17. '''
  18. uses_too_much_memory.callCounter += 1
  19. # Only check every hundredth call
  20. if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
  21. return True
  22. return False
  23. uses_too_much_memory.callCounter = 0
  24. def too_little_disk_space(limit):
  25. '''
  26. Check whether the disk space is too small
  27. For performance reasons, this actually only checks the free disk space on every 100th call.
  28. '''
  29. too_little_disk_space.callCounter += 1
  30. if too_little_disk_space.callCounter % 100 == 0:
  31. too_little_disk_space.currentResult = (get_disk_free() < limit)
  32. return too_little_disk_space.currentResult
  33. too_little_disk_space.callCounter = 0
  34. too_little_disk_space.currentResult = False
  35. # https://stackoverflow.com/a/4665027
  36. def find_all(aStr, sub):
  37. '''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''
  38. start = 0
  39. while True:
  40. start = aStr.find(sub, start)
  41. if start == -1:
  42. return
  43. yield start
  44. start += len(sub)
  45. def str_get_between(aStr, a, b):
  46. '''Get the string after the first occurrence of a in aStr and the first occurrence of b after that of a, or None if there is no such string.'''
  47. aPos = aStr.find(a)
  48. if aPos == -1:
  49. return None
  50. offset = aPos + len(a)
  51. bPos = aStr.find(b, offset)
  52. if bPos == -1:
  53. return None
  54. return aStr[offset:bPos]
  55. def maybe_str_get_between(x, a, b):
  56. '''Like str_get_between, but returns None if x evaluates to False and converts it to a str before matching.'''
  57. if x:
  58. return str_get_between(str(x), a, b)
  59. def str_get_all_between(aStr, a, b):
  60. '''Generator yielding every string between occurrences of a in aStr and the following occurrence of b.'''
  61. #TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
  62. # Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos+len(b).
  63. for aOffset in find_all(aStr, a):
  64. offset = aOffset + len(a)
  65. bPos = aStr.find(b, offset)
  66. if bPos != -1:
  67. yield aStr[offset:bPos]
  68. def maybe_str_get_all_between(x, a, b):
  69. '''Like str_get_all_between, but yields no elements if x evaluates to False and converts x to a str before matching.'''
  70. if x:
  71. yield from str_get_all_between(str(x), a, b)
  72. def generate_range_items(start, stop, step):
  73. '''
  74. Generator for items of `step` size between `start` and `stop` (inclusive)
  75. Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
  76. `b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
  77. Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).
  78. Examples:
  79. - generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
  80. - generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
  81. - generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
  82. '''
  83. for i in range(start, stop + 1, step):
  84. yield '{}-{}'.format(i, min(i + step - 1, stop))
  85. async def handle_response_default(url, attempt, response, exc):
  86. '''
  87. The default response handler, which behaves as follows:
  88. - If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
  89. - If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
  90. - If there was any exception and it is a asyncio.TimeoutError or a aiohttp.ClientError, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
  91. - If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
  92. - If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
  93. - Otherwise, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
  94. - All responses are written to WARC by default.
  95. Note that this handler does not limit the number of retries on errors.
  96. Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
  97. At least one of response and exc is not None.
  98. Returns: (one of the qwarc.RESPONSE_* constants, bool signifying whether to write to WARC or not)
  99. '''
  100. #TODO: Document that `attempt` is reset on redirects
  101. if response is None:
  102. await asyncio.sleep(5)
  103. return ACTION_RETRY, True
  104. if response.status in (401, 403, 404, 405, 410):
  105. return ACTION_IGNORE, True
  106. if exc is not None and isinstance(exc, (asyncio.TimeoutError, _aiohttp.ClientError)):
  107. await asyncio.sleep(5)
  108. return ACTION_RETRY, True
  109. if response.status in (200, 204, 206, 304):
  110. return ACTION_SUCCESS, True
  111. if response.status in (301, 302, 303, 307, 308):
  112. return ACTION_FOLLOW_OR_SUCCESS, True
  113. await asyncio.sleep(5)
  114. return ACTION_RETRY, True
  115. async def handle_response_ignore_redirects(url, attempt, response, exc):
  116. '''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''
  117. action, writeToWarc = await handle_response_default(url, attempt, response, exc)
  118. if action == ACTION_FOLLOW_OR_SUCCESS:
  119. action = ACTION_SUCCESS
  120. return action, writeToWarc
  121. def handle_response_limit_error_retries(maxRetries):
  122. '''A response handler that limits the number of retries on errors. It behaves as handle_response_default otherwise.
  123. Technically, this is actually a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))
  124. If you use the same limit many times, you should keep the return value (the response handler) of this method and reuse it to avoid creating a new function every time.
  125. '''
  126. async def handler(url, attempt, response, exc):
  127. action, writeToWarc = await handle_response_default(url, attempt, response, exc)
  128. if action == ACTION_RETRY and attempt > maxRetries:
  129. action = ACTION_IGNORE
  130. return action, writeToWarc
  131. return handler