A framework for quick web archiving
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

183 lines
6.5 KiB

from qwarc.const import *
import aiohttp
import asyncio
import os
# Size of one memory page in bytes; used by get_rss to convert the page counts
# from /proc/self/statm into bytes. Linux-specific (os.sysconf).
PAGESIZE = os.sysconf('SC_PAGE_SIZE')
  5. def get_rss():
  6. '''Get the current RSS of this process in bytes'''
  7. with open('/proc/self/statm', 'r') as fp:
  8. return int(fp.readline().split()[1]) * PAGESIZE
  9. def get_disk_free():
  10. '''Get the current free disk space on the relevant partition in bytes'''
  11. st = os.statvfs('.')
  12. return st.f_bavail * st.f_frsize
  13. def uses_too_much_memory(limit):
  14. '''
  15. Check whether the process is using too much memory
  16. For performance reasons, this actually only checks the memory usage on every 100th call.
  17. '''
  18. uses_too_much_memory.callCounter += 1
  19. # Only check every hundredth call
  20. if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
  21. return True
  22. return False
  23. uses_too_much_memory.callCounter = 0
  24. def too_little_disk_space(limit):
  25. '''
  26. Check whether the disk space is too small
  27. For performance reasons, this actually only checks the free disk space on every 100th call.
  28. '''
  29. too_little_disk_space.callCounter += 1
  30. if too_little_disk_space.callCounter % 100 == 0:
  31. too_little_disk_space.currentResult = (get_disk_free() < limit)
  32. return too_little_disk_space.currentResult
  33. too_little_disk_space.callCounter = 0
  34. too_little_disk_space.currentResult = False
  35. # https://stackoverflow.com/a/4665027
  36. def find_all(aStr, sub):
  37. '''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''
  38. start = 0
  39. while True:
  40. start = aStr.find(sub, start)
  41. if start == -1:
  42. return
  43. yield start
  44. start += len(sub)
  45. def str_get_between(aStr, a, b):
  46. '''Get the string after the first occurrence of a in aStr and the first occurrence of b after that of a, or None if there is no such string.'''
  47. aPos = aStr.find(a)
  48. if aPos == -1:
  49. return None
  50. offset = aPos + len(a)
  51. bPos = aStr.find(b, offset)
  52. if bPos == -1:
  53. return None
  54. return aStr[offset:bPos]
  55. def maybe_str_get_between(x, a, b):
  56. '''Like str_get_between, but returns None if x evaluates to False and converts it to a str before matching.'''
  57. if x:
  58. return str_get_between(str(x), a, b)
  59. def str_get_all_between(aStr, a, b):
  60. '''Generator yielding every string between occurrences of a in aStr and the following occurrence of b.'''
  61. #TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
  62. # Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos+len(b).
  63. for aOffset in find_all(aStr, a):
  64. offset = aOffset + len(a)
  65. bPos = aStr.find(b, offset)
  66. if bPos != -1:
  67. yield aStr[offset:bPos]
  68. def maybe_str_get_all_between(x, a, b):
  69. '''Like str_get_all_between, but yields no elements if x evaluates to False and converts x to a str before matching.'''
  70. if x:
  71. yield from str_get_all_between(str(x), a, b)
  72. def generate_range_items(start, stop, step):
  73. '''
  74. Generator for items of `step` size between `start` and `stop` (inclusive)
  75. Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
  76. `b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
  77. Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).
  78. Examples:
  79. - generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
  80. - generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
  81. - generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
  82. '''
  83. for i in range(start, stop + 1, step):
  84. yield '{}-{}'.format(i, min(i + step - 1, stop))
  85. async def handle_response_default(url, attempt, response, exc):
  86. '''
  87. The default response handler, which behaves as follows:
  88. - If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
  89. - If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
  90. - If there was any exception and it is a asyncio.TimeoutError or a aiohttp.ClientError, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
  91. - If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
  92. - If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
  93. - Otherwise, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
  94. - All responses are written to WARC by default.
  95. Note that this handler does not limit the number of retries on errors.
  96. Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
  97. At least one of response and exc is not None.
  98. Returns: (one of the qwarc.RESPONSE_* constants, bool signifying whether to write to WARC or not)
  99. '''
  100. #TODO: Document that `attempt` is reset on redirects
  101. if response is None:
  102. await asyncio.sleep(5)
  103. return ACTION_RETRY, True
  104. if response.status in (401, 403, 404, 405, 410):
  105. return ACTION_IGNORE, True
  106. if exc is not None and isinstance(exc, (asyncio.TimeoutError, _aiohttp.ClientError)):
  107. await asyncio.sleep(5)
  108. return ACTION_RETRY, True
  109. if response.status in (200, 204, 206, 304):
  110. return ACTION_SUCCESS, True
  111. if response.status in (301, 302, 303, 307, 308):
  112. return ACTION_FOLLOW_OR_SUCCESS, True
  113. await asyncio.sleep(5)
  114. return ACTION_RETRY, True
  115. async def handle_response_ignore_redirects(url, attempt, response, exc):
  116. '''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''
  117. action, writeToWarc = await handle_response_default(url, attempt, response, exc)
  118. if action == ACTION_FOLLOW_OR_SUCCESS:
  119. action = ACTION_SUCCESS
  120. return action, writeToWarc
  121. def handle_response_limit_error_retries(maxRetries):
  122. '''A response handler that limits the number of retries on errors. It behaves as handle_response_default otherwise.
  123. Technically, this is actually a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))
  124. If you use the same limit many times, you should keep the return value (the response handler) of this method and reuse it to avoid creating a new function every time.
  125. '''
  126. async def handler(url, attempt, response, exc):
  127. action, writeToWarc = await handle_response_default(url, attempt, response, exc)
  128. if action == ACTION_RETRY and attempt > maxRetries:
  129. action = ACTION_IGNORE
  130. return action, writeToWarc
  131. return handler