A framework for quick web archiving

from qwarc.const import *
import aiohttp
import asyncio
import functools
import io
import logging
import os
import pkg_resources
import platform
import time
import typing
import zlib


PAGESIZE = os.sysconf('SC_PAGE_SIZE')


def get_rss():
	'''Get the current RSS of this process in bytes'''

	with open('/proc/self/statm', 'r') as fp:
		return int(fp.readline().split()[1]) * PAGESIZE


def get_disk_free():
	'''Get the current free disk space on the relevant partition in bytes'''

	st = os.statvfs('.')
	return st.f_bavail * st.f_frsize


def uses_too_much_memory(limit):
	'''
	Check whether the process is using too much memory

	For performance reasons, this actually only checks the memory usage on every 100th call.
	'''

	uses_too_much_memory.callCounter += 1
	# Only check every hundredth call
	if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
		return True
	return False
uses_too_much_memory.callCounter = 0


def too_little_disk_space(limit):
	'''
	Check whether the disk space is too small

	For performance reasons, this actually only checks the free disk space on every 100th call.
	'''

	too_little_disk_space.callCounter += 1
	if too_little_disk_space.callCounter % 100 == 0:
		too_little_disk_space.currentResult = (get_disk_free() < limit)
	return too_little_disk_space.currentResult
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False
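
# A minimal usage sketch (illustrative, not from the original source): both
# checkers are meant to be polled from a hot loop, with limits given in bytes.
#   while True:
#       if uses_too_much_memory(2 * 1024 ** 3):  # RSS above ~2 GiB
#           break
#       if too_little_disk_space(10 * 1024 ** 3):  # less than ~10 GiB free
#           break
#       ...  # do one unit of work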


# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
	'''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''

	start = 0
	while True:
		start = aStr.find(sub, start)
		if start == -1:
			return
		yield start
		start += len(sub)
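
# Example (illustrative, not from the original source):
#   >>> list(find_all('banana', 'an'))
#   [1, 3]
#   >>> list(find_all('aaaa', 'aa'))  # non-overlapping: the search resumes after each match
#   [0, 2]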


def str_get_between(aStr, a, b):
	'''Get the string between the first occurrence of a in aStr and the first occurrence of b after that of a, or None if there is no such string.'''

	aPos = aStr.find(a)
	if aPos == -1:
		return None
	offset = aPos + len(a)
	bPos = aStr.find(b, offset)
	if bPos == -1:
		return None
	return aStr[offset:bPos]
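
# Example (illustrative, not from the original source):
#   >>> str_get_between('<a href="https://example.org/">x</a>', 'href="', '"')
#   'https://example.org/'
#   >>> str_get_between('abc', 'x', 'c') is None
#   True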


def maybe_str_get_between(x, a, b):
	'''Like str_get_between, but returns None if x evaluates to False and converts it to a str before matching.'''

	if x:
		return str_get_between(str(x), a, b)


def str_get_all_between(aStr, a, b):
	'''Generator yielding every string between occurrences of a in aStr and the following occurrence of b.'''

	#TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
	# Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos + len(b).
	for aOffset in find_all(aStr, a):
		offset = aOffset + len(a)
		bPos = aStr.find(b, offset)
		if bPos != -1:
			yield aStr[offset:bPos]


def maybe_str_get_all_between(x, a, b):
	'''Like str_get_all_between, but yields no elements if x evaluates to False and converts x to a str before matching.'''

	if x:
		yield from str_get_all_between(str(x), a, b)
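
# Example (illustrative, not from the original source); note the half-overlapping
# matches described in the TODO above:
#   >>> list(str_get_all_between('aabc', 'a', 'c'))
#   ['ab', 'b']
#   >>> list(maybe_str_get_all_between(None, 'a', 'c'))
#   []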


def generate_range_items(start, stop, step):
	'''
	Generator for items of `step` size between `start` and `stop` (inclusive)

	Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
	`b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
	Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

	Examples:
	- generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
	- generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
	- generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
	'''

	for i in range(start, stop + 1, step):
		yield f'{i}-{min(i + step - 1, stop)}'
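
# Example of consuming the yielded ranges (illustrative, not from the original
# source), e.g. to shard an ID space into separate work items:
#   >>> [item.split('-') for item in generate_range_items(1, 6, 3)]
#   [['1', '3'], ['4', '6']]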


async def handle_response_default(url, attempt, response, exc):
	'''
	The default response handler, which behaves as follows:
	- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
	- If there was any exception and it is an asyncio.TimeoutError or an aiohttp.ClientError, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
	- If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
	- Otherwise, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- All responses are written to WARC by default.

	Note that this handler does not limit the number of retries on errors.

	Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
	At least one of response and exc is not None.
	Returns: (one of the qwarc.RESPONSE_* constants, bool signifying whether to write to WARC or not)
	The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC.
	'''

	#TODO: Document that `attempt` is reset on redirects
	if response is None:
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (401, 403, 404, 405, 410):
		return ACTION_IGNORE, True
	if exc is not None:
		if isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
			await asyncio.sleep(5)
			return ACTION_RETRY, False # Don't write to WARC since there might be an incomplete response
	if response.status in (200, 204, 206, 304):
		return ACTION_SUCCESS, True
	if response.status in (301, 302, 303, 307, 308):
		return ACTION_FOLLOW_OR_SUCCESS, True
	await asyncio.sleep(5)
	return ACTION_RETRY, True


async def handle_response_ignore_redirects(url, attempt, response, exc):
	'''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves like handle_response_default otherwise.'''

	action, writeToWarc = await handle_response_default(url, attempt, response, exc)
	if action == ACTION_FOLLOW_OR_SUCCESS:
		action = ACTION_SUCCESS
	return action, writeToWarc


def handle_response_limit_error_retries(maxRetries, handler = handle_response_default):
	'''A response handler that limits the number of retries on errors. Otherwise, it defers to handler, which defaults to handle_response_default.

	Technically, this is a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))

	If you use the same limit many times, you should keep the return value (the response handler) of this function and reuse it to avoid creating a new function every time.
	'''

	async def _handler(url, attempt, response, exc):
		action, writeToWarc = await handler(url, attempt, response, exc)
		if action == ACTION_RETRY and attempt > maxRetries:
			action = ACTION_RETRIES_EXCEEDED
		return action, writeToWarc
	return _handler
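
# A hedged usage sketch: the handlers compose, so a retry limit can wrap the
# non-redirecting handler. The fetch call site is an assumption based on the
# docstring above, not code from this file.
#   limitedHandler = handle_response_limit_error_retries(5, handler = handle_response_ignore_redirects)
#   # ... fetch(url, responseHandler = limitedHandler)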


def _get_dependency_versions(*pkgs):
	'''Yield (package, version) pairs for pkgs and all of their transitive dependencies.'''

	pending = set(pkgs)
	have = set(pkgs)
	while pending:
		key = pending.pop()
		try:
			dist = pkg_resources.get_distribution(key)
		except pkg_resources.DistributionNotFound:
			logging.error(f'Unable to get distribution {key}')
			continue
		yield dist.key, dist.version
		for requirement in dist.requires():
			if requirement.key not in have:
				pending.add(requirement.key)
				have.add(requirement.key)


@functools.lru_cache(maxsize = 1)
def get_software_info(specFile, specDependencies):
	# Based on crocoite.utils, authored by PromyLOPh in commit 6ccd72ab on 2018-12-08 under MIT licence
	baseDependencyPackageVersions = list(_get_dependency_versions(__package__))
	baseDependencyPackages = set(x[0] for x in baseDependencyPackageVersions)
	specDependencyPackageVersions = list(_get_dependency_versions(*specDependencies.packages))
	return {
		'platform': platform.platform(),
		'python': {
			'implementation': platform.python_implementation(),
			'version': platform.python_version(),
			'build': platform.python_build(),
		},
		'self': [{"package": package, "version": version} for package, version in baseDependencyPackageVersions],
		'spec': [{"package": package, "version": version} for package, version in specDependencyPackageVersions if package not in baseDependencyPackages],
	}


class LogFormatter(logging.Formatter):
	def __init__(self):
		super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
		self.converter = time.gmtime

	def format(self, record):
		if not hasattr(record, 'itemString'):
			if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
				record.itemString = f'{record.itemType}:{record.itemValue}'
			else:
				record.itemString = 'None'
		return super().format(record)
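
# Illustrative setup (not from the original source): attach the formatter to a
# handler and pass the item context via `extra`. The sample output line assumes
# a hypothetical timestamp.
#   handler = logging.StreamHandler()
#   handler.setFormatter(LogFormatter())
#   logging.getLogger().addHandler(handler)
#   logging.getLogger().setLevel(logging.INFO)
#   logging.info('retrieved', extra = {'itemType': 'url', 'itemValue': 'https://example.org/'})
#   # -> 2000-01-01 00:00:00.000Z INFO url:https://example.org/ retrieved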


class SpecDependencies(typing.NamedTuple):
	packages: tuple = ()
	files: tuple = ()
	extra: typing.Any = None


class ReadonlyFileView:
	'''
	A poor read-only view for a file object. It hides the writing methods and passes everything else through to the underlying file object. Note that this does *not* actually prevent modification at all.
	'''

	def __init__(self, fp):
		self._fp = fp

	def __getattr__(self, key):
		if key in ('write', 'writelines', 'truncate'):
			raise AttributeError
		if key == 'writable':
			return False
		return getattr(self._fp, key)
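
# Illustrative behaviour (not from the original source); note that `writable` is
# exposed as the plain value False, not as a method like on io objects:
#   view = ReadonlyFileView(open('/dev/null', 'rb'))
#   view.read()       # passes through to the underlying file
#   view.writable     # False (an attribute here, not a callable)
#   view.write(b'x')  # raises AttributeError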


def iter_file(f, length = None, blockSize = 1048576):
	'''Read `length` bytes from `f` in chunks of `blockSize` bytes. If `length` is `None`, read until EOF.'''

	read = 0
	while True:
		buf = f.read(blockSize)
		if not buf: # EOF
			if length and read < length:
				raise RuntimeError('Reached EOF before reading enough data')
			break
		if length and read + len(buf) > length:
			initialBufLen = len(buf)
			buf = buf[0 : length - read]
			f.seek(len(buf) - initialBufLen, io.SEEK_CUR)
		read += len(buf)
		yield buf
		if length and read >= length:
			if read > length: # This should never happen due to the truncation above.
				raise RuntimeError('Overread')
			break
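
# Example (illustrative, not from the original source): reading a bounded prefix
# leaves the file positioned at the boundary even though a larger block was read
# internally.
#   >>> f = io.BytesIO(b'abcdef')
#   >>> b''.join(iter_file(f, length = 4))
#   b'abcd'
#   >>> f.tell()
#   4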


def read_http_headers(f, copy = None):
	'''Read the status/request line and headers from the file-like object f. Returns a dict mapping the lower-cased header names to their values; whitespace and line endings around the values are not stripped. If copy is provided, every line read is also written to it.'''

	headers = {}
	# Status line or request line
	line = f.readline()
	if copy:
		copy.write(line)

	line = f.readline()
	if copy:
		copy.write(line)
	while line and line not in (b'\r\n', b'\r', b'\n'):
		# Split into header name and value
		name, value = line.split(b':', 1)
		name = name.strip(b' \t')
		#TODO name validation

		# Read next line
		line = f.readline()
		if copy:
			copy.write(line)

		# Handle continuation lines
		continuation = line[0:1] in (b' ', b'\t')
		if continuation:
			value = [value]  # Start from the value on the header line, then append the continuation lines
			while continuation:
				value.append(line)
				line = f.readline()
				if copy:
					copy.write(line)
				continuation = line[0:1] in (b' ', b'\t')
			value = b''.join(value)

		# Decode and store
		try:
			name = name.decode('utf-8')
		except UnicodeDecodeError:
			name = name.decode('iso-8859-1')
		try:
			value = value.decode('utf-8')
		except UnicodeDecodeError:
			value = value.decode('iso-8859-1')
		headers[name.lower()] = value

	# `line` is already the next line, if any
	return headers


def read_http_body(f, length, headers):
	'''Generator yielding the body of an HTTP message read from the file-like object f, dechunking if the headers indicate chunked transfer encoding; length is the expected body size for non-chunked messages.'''

	if 'chunked' in map(str.strip, headers.get('transfer-encoding', '').split(',')):
		while True:
			chunkLine = f.readline()
			if b';' in chunkLine:
				chunkLength = chunkLine.split(b';', 1)[0].strip()
			else:
				chunkLength = chunkLine.strip()
			chunkLength = int(chunkLength, base = 16)
			if chunkLength == 0:
				break
			yield from iter_file(f, length = chunkLength)
			assert f.read(2) == b'\r\n' # Chunk terminator
		# Consume trailer
		line = f.readline()
		while line and line not in (b'\r\n', b'\r', b'\n'):
			line = f.readline()
	else:
		yield from iter_file(f, length = length)
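
# A small end-to-end sketch (illustrative, not from the original source). Header
# values keep their raw surrounding whitespace, so convert or strip at the use site.
#   >>> f = io.BytesIO(b'HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello')
#   >>> headers = read_http_headers(f)
#   >>> b''.join(read_http_body(f, int(headers['content-length']), headers))
#   b'hello'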


class GzipWrapper:
	'''Wrap a file-like object, gzip-compressing everything written within one `with` block as a single gzip member.'''

	def __init__(self, f):
		self._file = f
		self._compressor = None

	def __enter__(self):
		self._compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
		return self

	def write(self, data):
		buf = self._compressor.compress(data)
		self._file.write(buf)

	def __exit__(self, excType, excVal, excTb):
		buf = self._compressor.flush()
		self._file.write(buf)
		self._file.flush()
		self._compressor = None
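
# Illustrative use (not from the original source): each `with` block emits one
# self-contained gzip member, the layout .warc.gz files use to store one record
# per member. The file name and record payloads are hypothetical.
#   with open('example.warc.gz', 'wb') as fp:
#       for record in (b'record 1', b'record 2'):
#           with GzipWrapper(fp) as g:
#               g.write(record)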