from qwarc.const import *
import aiohttp
import asyncio
import functools
import io
import logging
import os
import pkg_resources
import platform
import time
import typing
import zlib


PAGESIZE = os.sysconf('SC_PAGE_SIZE')


def get_rss():
    '''Get the current RSS of this process in bytes'''

    with open('/proc/self/statm', 'r') as fp:
        return int(fp.readline().split()[1]) * PAGESIZE


def get_disk_free():
    '''Get the current free disk space on the relevant partition in bytes'''

    st = os.statvfs('.')
    return st.f_bavail * st.f_frsize


def uses_too_much_memory(limit):
    '''
    Check whether the process is using too much memory

    For performance reasons, this actually only checks the memory usage on every 100th call.
    '''

    uses_too_much_memory.callCounter += 1
    # Only check every hundredth call
    if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
        return True
    return False
uses_too_much_memory.callCounter = 0


def too_little_disk_space(limit):
    '''
    Check whether the free disk space is too small

    For performance reasons, this actually only checks the free disk space on every 100th call.
    '''

    too_little_disk_space.callCounter += 1
    if too_little_disk_space.callCounter % 100 == 0:
        too_little_disk_space.currentResult = (get_disk_free() < limit)
    return too_little_disk_space.currentResult
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False


# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
    '''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''

    start = 0
    while True:
        start = aStr.find(sub, start)
        if start == -1:
            return
        yield start
        start += len(sub)


def str_get_between(aStr, a, b):
    '''Get the substring of aStr between the first occurrence of a and the first occurrence of b after that of a, or None if there is no such string.'''

    aPos = aStr.find(a)
    if aPos == -1:
        return None
    offset = aPos + len(a)
    bPos = aStr.find(b, offset)
    if bPos == -1:
        return None
    return aStr[offset:bPos]


def maybe_str_get_between(x, a, b):
    '''Like str_get_between, but returns None if x evaluates to False, and converts x to a str before matching.'''

    if x:
        return str_get_between(str(x), a, b)


def str_get_all_between(aStr, a, b):
    '''Generator yielding every string between an occurrence of a in aStr and the following occurrence of b.'''

    #TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
    # Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos + len(b).
    for aOffset in find_all(aStr, a):
        offset = aOffset + len(a)
        bPos = aStr.find(b, offset)
        if bPos != -1:
            yield aStr[offset:bPos]


def maybe_str_get_all_between(x, a, b):
    '''Like str_get_all_between, but yields no elements if x evaluates to False, and converts x to a str before matching.'''

    if x:
        yield from str_get_all_between(str(x), a, b)


def generate_range_items(start, stop, step):
    '''
    Generator for items of `step` size between `start` and `stop` (inclusive)

    Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
    `b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
    Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

    Examples:
    - generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
    - generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
    - generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
    '''

    for i in range(start, stop + 1, step):
        yield f'{i}-{min(i + step - 1, stop)}'
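

# Illustrative sketch (not part of the original module): expected REPL
# behaviour of the string helpers and range generator defined above.
#
#   >>> str_get_between('<a href="x">link</a>', 'href="', '"')
#   'x'
#   >>> list(str_get_all_between('<i>1</i><i>2</i>', '<i>', '</i>'))
#   ['1', '2']
#   >>> list(generate_range_items(0, 25, 10))
#   ['0-9', '10-19', '20-25']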


async def handle_response_default(url, attempt, response, exc):
    '''
    The default response handler, which behaves as follows:
    - If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
    - If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
    - If there was any exception and it is an asyncio.TimeoutError or an aiohttp.ClientError, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
    - If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
    - If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified, or return otherwise.
    - Otherwise, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
    - All responses are written to WARC by default.

    Note that this handler does not limit the number of retries on errors.

    Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
    At least one of response and exc is not None.
    Returns: (one of the qwarc.ACTION_* constants, bool signifying whether to write to WARC or not)
    The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC.
    '''

    #TODO: Document that `attempt` is reset on redirects
    if response is None:
        await asyncio.sleep(5)
        return ACTION_RETRY, True
    if response.status in (401, 403, 404, 405, 410):
        return ACTION_IGNORE, True
    if exc is not None:
        if isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
            await asyncio.sleep(5)
        return ACTION_RETRY, False # Don't write to WARC since there might be an incomplete response
    if response.status in (200, 204, 206, 304):
        return ACTION_SUCCESS, True
    if response.status in (301, 302, 303, 307, 308):
        return ACTION_FOLLOW_OR_SUCCESS, True
    await asyncio.sleep(5)
    return ACTION_RETRY, True


async def handle_response_ignore_redirects(url, attempt, response, exc):
    '''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''

    action, writeToWarc = await handle_response_default(url, attempt, response, exc)
    if action == ACTION_FOLLOW_OR_SUCCESS:
        action = ACTION_SUCCESS
    return action, writeToWarc


def handle_response_limit_error_retries(maxRetries, handler = handle_response_default):
    '''
    A response handler that limits the number of retries on errors. It behaves as handler otherwise, which defaults to handle_response_default.

    Technically, this is a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))

    If you use the same limit many times, you should keep the return value (the response handler) of this function and reuse it to avoid creating a new function every time.
    '''

    async def _handler(url, attempt, response, exc):
        action, writeToWarc = await handler(url, attempt, response, exc)
        if action == ACTION_RETRY and attempt > maxRetries:
            action = ACTION_RETRIES_EXCEEDED
        return action, writeToWarc
    return _handler
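

# Illustrative sketch (not part of the original module), assuming the calling
# convention documented in handle_response_default: a spec file could wrap the
# factory above to also give up permanently on HTTP 429. The name
# handle_give_up_on_429 is hypothetical.
#
#   _limited = handle_response_limit_error_retries(5)
#
#   async def handle_give_up_on_429(url, attempt, response, exc):
#       if response is not None and exc is None and response.status == 429:
#           return ACTION_IGNORE, True
#       return await _limited(url, attempt, response, exc)
#
#   # fetch(..., responseHandler = handle_give_up_on_429)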


def _get_dependency_versions(*pkgs):
    pending = set(pkgs)
    have = set(pkgs)
    while pending:
        key = pending.pop()
        try:
            dist = pkg_resources.get_distribution(key)
        except pkg_resources.DistributionNotFound:
            logging.error(f'Unable to get distribution {key}')
            continue
        yield dist.key, dist.version
        for requirement in dist.requires():
            if requirement.key not in have:
                pending.add(requirement.key)
                have.add(requirement.key)


@functools.lru_cache(maxsize = 1)
def get_software_info(specFile, specDependencies):
    # Based on crocoite.utils, authored by PromyLOPh in commit 6ccd72ab on 2018-12-08 under MIT licence
    baseDependencyPackageVersions = list(_get_dependency_versions(__package__))
    baseDependencyPackages = set(x[0] for x in baseDependencyPackageVersions)
    specDependencyPackageVersions = list(_get_dependency_versions(*specDependencies.packages))
    return {
        'platform': platform.platform(),
        'python': {
            'implementation': platform.python_implementation(),
            'version': platform.python_version(),
            'build': platform.python_build(),
        },
        'self': [{'package': package, 'version': version} for package, version in baseDependencyPackageVersions],
        'spec': [{'package': package, 'version': version} for package, version in specDependencyPackageVersions if package not in baseDependencyPackages],
    }


class LogFormatter(logging.Formatter):
    def __init__(self):
        super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
        self.converter = time.gmtime

    def format(self, record):
        if not hasattr(record, 'itemString'):
            if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
                record.itemString = f'{record.itemType}:{record.itemValue}'
            else:
                record.itemString = 'None'
        return super().format(record)


class SpecDependencies(typing.NamedTuple):
    packages: tuple = ()
    files: tuple = ()
    extra: typing.Any = None


class ReadonlyFileView:
    '''
    A poor read-only view for a file object. It hides the writing methods and passes everything else through to the underlying file object.

    Note that this does *not* actually prevent modification at all.
    '''

    def __init__(self, fp):
        self._fp = fp

    def __getattr__(self, key):
        if key in ('write', 'writelines', 'truncate'):
            raise AttributeError
        if key == 'writable':
            return lambda: False  # writable is a method on file objects, so a callable must be returned
        return getattr(self._fp, key)


def iter_file(f, length = None, blockSize = 1048576):
    '''Read `length` bytes from `f` in chunks of `blockSize` bytes. If `length` is `None`, read until EOF.'''

    read = 0
    while True:
        buf = f.read(blockSize)
        if not buf: # EOF
            if length and read < length:
                raise RuntimeError('Reached EOF before reading enough data')
            break
        if length and read + len(buf) > length:
            # Truncate the buffer to the remaining length and rewind the file accordingly
            initialBufLen = len(buf)
            buf = buf[0 : length - read]
            f.seek(len(buf) - initialBufLen, io.SEEK_CUR)
        read += len(buf)
        yield buf
        if length and read >= length:
            if read > length:
                # This should never happen due to the truncation above.
                raise RuntimeError('Overread')
            break
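

# Illustrative sketch (not part of the original module): reading a bounded
# slice of a file through a read-only view. The filename and the process()
# consumer are hypothetical.
#
#   with open('example.warc', 'rb') as fp:
#       view = ReadonlyFileView(fp)
#       # Yields up to 4096 bytes in 1 KiB blocks; raises RuntimeError if EOF
#       # is reached before 4096 bytes were read.
#       for block in iter_file(view, length = 4096, blockSize = 1024):
#           process(block)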


def read_http_headers(f, copy = None):
    '''Read the HTTP header block (status/request line plus headers) from the file-like object `f`, optionally copying the raw lines to `copy`, and return a dict mapping lowercased header names to values.'''

    headers = {}

    # Status line or request line
    line = f.readline()
    if copy:
        copy.write(line)

    line = f.readline()
    if copy:
        copy.write(line)
    while line and line not in (b'\r\n', b'\r', b'\n'):
        # Split into header name and value
        name, value = line.split(b':', 1)
        name = name.strip(b' \t')
        #TODO name validation

        # Read next line
        line = f.readline()
        if copy:
            copy.write(line)

        # Handle continuation lines
        continuation = line[0:1] in (b' ', b'\t')
        if continuation:
            value = [value]  # Keep the part of the value read so far
            while continuation:
                value.append(line)
                line = f.readline()
                if copy:
                    copy.write(line)
                continuation = line[0:1] in (b' ', b'\t')
            value = b''.join(value)

        # Decode and store
        try:
            name = name.decode('utf-8')
        except UnicodeDecodeError:
            name = name.decode('iso-8859-1')
        try:
            value = value.decode('utf-8')
        except UnicodeDecodeError:
            value = value.decode('iso-8859-1')
        headers[name.lower()] = value

        # `line` is already the next line, if any

    return headers


def read_http_body(f, length, headers):
    '''Generator yielding the HTTP body read from the file-like object `f`, dechunking it if `headers` indicate chunked transfer encoding, else reading exactly `length` bytes.'''

    if 'chunked' in map(str.strip, headers.get('transfer-encoding', '').split(',')):
        while True:
            chunkLine = f.readline()
            if b';' in chunkLine:
                chunkLength = chunkLine.split(b';', 1)[0].strip()
            else:
                chunkLength = chunkLine.strip()
            chunkLength = int(chunkLength, base = 16)
            if chunkLength == 0:
                break
            yield from iter_file(f, length = chunkLength)
            assert f.read(2) == b'\r\n' # Chunk terminator

        # Consume the trailer
        line = f.readline()
        while line and line not in (b'\r\n', b'\r', b'\n'):
            line = f.readline()
    else:
        yield from iter_file(f, length = length)


class GzipWrapper:
    '''Context manager wrapping a file-like object such that writes are gzip-compressed; the compressed stream is flushed to the underlying file on exit.'''

    def __init__(self, f):
        self._file = f
        self._compressor = None

    def __enter__(self):
        # 16 + zlib.MAX_WBITS selects a gzip (rather than zlib) header
        self._compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
        return self

    def write(self, data):
        buf = self._compressor.compress(data)
        self._file.write(buf)

    def __exit__(self, excType, excVal, excTb):
        buf = self._compressor.flush()
        self._file.write(buf)
        self._file.flush()
        self._compressor = None
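

# Illustrative sketch (not part of the original module): parsing a raw HTTP
# response with read_http_headers and read_http_body. The byte string is a
# made-up example; note that header values keep their surrounding whitespace
# since read_http_headers does not strip them (int() tolerates it below).
#
#   raw = io.BytesIO(
#       b'HTTP/1.1 200 OK\r\n'
#       b'Content-Length: 5\r\n'
#       b'\r\n'
#       b'hello'
#   )
#   headers = read_http_headers(raw)
#   body = b''.join(read_http_body(raw, int(headers['content-length']), headers))
#   assert body == b'hello'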