# JustAnotherArchivist / qwarc


			
							import qwarc.aiohttp
from qwarc.const import *
import qwarc.utils
import qwarc.warc


import aiohttp as _aiohttp
if _aiohttp.__version__ != '2.3.10':
	raise ImportError('aiohttp must be version 2.3.10')
import asyncio
import collections
import concurrent.futures
import contextlib
import io
import itertools
import logging
import os
import random
import sqlite3
import yarl


class Item:
	itemType = None
	defaultResponseHandler = staticmethod(qwarc.utils.handle_response_default)

	def __init__(self, qwarcObj, itemValue, session, headers, warc):
		'''
		Set up the per-item state.

		qwarcObj: the QWARC object this item belongs to; flush_subitems delegates to it
		itemValue: the value identifying this item within its item type
		session: the client session through which fetch sends its requests
		headers: iterable of (name, value) pairs used as the default headers of every request (see _merge_headers)
		warc: the WARC object to which fetch writes responses
		'''

		self.qwarcObj, self.itemValue = qwarcObj, itemValue
		self.session, self.headers, self.warc = session, headers, warc

		# Subclasses may set the base URL before calling super().__init__, so only fall back to None if nothing is there yet.
		try:
			self._baseUrl
		except AttributeError:
			self._baseUrl = None

		# Counters which fetch updates after every completed request
		self.stats = dict(tx = 0, rx = 0, requests = 0)

		logContext = {'itemType': self.itemType, 'itemValue': self.itemValue}
		self.logger = logging.LoggerAdapter(logging.getLogger(), logContext)

		# (item type, item value) pairs collected through add_subitem
		self.childItems = []

	@property
	def baseUrl(self):
		'''The URL (yarl.URL or None) against which fetch resolves incomplete URLs'''
		return self._baseUrl

	@baseUrl.setter
	def baseUrl(self, baseUrl):
		# None and yarl.URL instances are stored as they are; anything else is converted through yarl.URL first.
		if baseUrl is not None and not isinstance(baseUrl, yarl.URL):
			baseUrl = yarl.URL(baseUrl)
		self._baseUrl = baseUrl

	def _merge_headers(self, headers, extraHeaders = []):
		'''
		Combine self.headers, extraHeaders, and headers (in this order) into one list of (name, value) pairs.

		Header names are compared case-insensitively (using casefold). A later entry replaces an earlier one with the same name, a value of None removes the header, and a tuple value is expanded into one pair per element.
		'''

		merged = {}  # header name as last written -> value; relies on dicts preserving insertion order (guaranteed from Python 3.7, implementation detail in CPython 3.6)
		spelling = {}  # casefolded header name -> the spelling currently used as the key in merged
		for name, value in itertools.chain(self.headers, extraHeaders, headers):
			folded = name.casefold()
			if value is None:
				# Removal request; nothing happens if the header isn't present.
				if folded in spelling:
					del merged[spelling.pop(folded)]
				continue
			if folded in spelling and name != spelling[folded]:
				# Same header under a different spelling: drop the old spelling so that only the new one remains.
				del merged[spelling[folded]]
			merged[name] = value
			spelling[folded] = name

		pairs = []
		for name, value in merged.items():
			if isinstance(value, tuple):
				# A tuple means that the header is to be sent once per element.
				pairs.extend((name, single) for single in value)
			else:
				pairs.append((name, value))
		return pairs

	async def fetch(self, url, responseHandler = None, method = 'GET', data = None, headers = [], verify_ssl = True, timeout = 60, fromResponse = None):
		'''
		HTTP GET or POST a URL

		url: str or yarl.URL; if this is not a complete URL, it is evaluated relative to self.baseUrl
		responseHandler: None or a callable that determines how the response is handled; if None, self.defaultResponseHandler is used. See qwarc.utils.handle_response_default for details.
			The handler is awaited with the keyword arguments url, attempt, response, exc, redirectLevel, and item, and it has to return a tuple (action, writeToWarc).
		method: str, must be 'GET' or 'POST'
		data: dict or list/tuple of lists/tuples of length two or bytes or file-like or None, the data to be sent in the request body
		headers: list of 2-tuples, additional or overriding headers for this request only
			To remove one of the default headers, pass a value of None.
			If a header appears multiple times, only the last one is used. To send a header multiple times, pass a tuple of values.
		verify_ssl: bool, whether the SSL/TLS certificate should be validated
		timeout: int or float, how long the fetch may take at most in total (sending request until finishing reading the response)
		fromResponse: ClientResponse or None; if provided, use fromResponse.url for the url completion (instead of self.baseUrl) and add it as a Referer header

		Returns response (a ClientResponse object or a qwarc.utils.DummyClientResponse object)
			The returned object carries a qhistory attribute, a tuple of (response, exception) pairs with one entry per request made by this call, including retries and followed redirects.

		Raises ValueError if url is incomplete and neither fromResponse nor self.baseUrl is available to complete it.
		'''

		#TODO: Rewrite using 'async with self.session.get'

		url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
		if not url.scheme or not url.host:
			# Incomplete URL: resolve against the referring response if there is one, else against the base URL
			if fromResponse is not None:
				url = fromResponse.url.join(url)
			elif not self.baseUrl:
				raise ValueError('Incomplete URL and no baseUrl to join it with')
			else:
				url = self.baseUrl.join(url)
		originalUrl = url # Only used for the log message when the redirect limit is exceeded
		if responseHandler is None:
			responseHandler = self.defaultResponseHandler
		assert method in ('GET', 'POST'), 'method must be GET or POST'
		headers = self._merge_headers(headers, extraHeaders = [('Referer', str(fromResponse.url))] if fromResponse is not None else [])
		history = [] # (response, exception) of every request made below
		attempt = 0 # Number of requests made for the current URL; reset when a redirect is followed
		redirectLevel = 0 # Number of redirects followed so far
		# Each cycle makes one request; the loop is only left by returning (or by an exception that isn't handled below).
		while True:
			attempt += 1
			response = None
			exc = None
			action = ACTION_RETRY
			writeToWarc = True
			try:
				try:
					# NOTE(review): the timeout block is entered anew in every cycle, so the limit presumably applies per request rather than to the whole fetch call -- confirm against aiohttp 2.3.10's Timeout
					with _aiohttp.Timeout(timeout):
						self.logger.info(f'Fetching {url}')
						response = await self.session.request(method, url, data = data, headers = headers, allow_redirects = False, verify_ssl = verify_ssl)
						try:
							# Read the body to the end in chunks of up to 1 MiB. The chunks are discarded here.
							while True:
								ret = await response.content.read(1048576)
								if not ret:
									break
						except:
							# No calling the handleResponse callback here because this is really bad. The not-so-bad exceptions (e.g. an error during reading the response) will be caught further down.
							response.close()
							raise
						else:
							# The sizes of the raw request and response data are determined by seeking to the end of the respective file-like object.
							# NOTE(review): rawRequestData and rawResponseData are presumably provided by qwarc.aiohttp.ClientResponse -- confirm
							response.rawRequestData.seek(0, io.SEEK_END)
							tx = response.rawRequestData.tell()
							response.rawResponseData.seek(0, io.SEEK_END)
							rx = response.rawResponseData.tell()
							self.logger.info(f'Fetched {url}: {response.status} (tx {tx}, rx {rx})')
							self.stats['tx'] += tx
							self.stats['rx'] += rx
							self.stats['requests'] += 1
				except (asyncio.TimeoutError, _aiohttp.ClientError) as e:
					# The handler decides what happens after a failed request as well; response is None if the request itself failed.
					self.logger.warning(f'Request for {url} failed: {e!r}')
					action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = e, redirectLevel = redirectLevel, item = self)
					exc = e # Pass the exception outward for the history
				else:
					action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, redirectLevel = redirectLevel, item = self)
				# Only responses which were read without an exception are written to the WARC, and only if the handler wants that.
				if response and exc is None and writeToWarc:
					self.warc.write_client_response(response)
				history.append((response, exc))
				# After an exception, the caller gets a dummy object instead of the response.
				retResponse = response if exc is None else qwarc.utils.DummyClientResponse()
				if action in (ACTION_SUCCESS, ACTION_IGNORE):
					retResponse.qhistory = tuple(history)
					return retResponse
				elif action == ACTION_FOLLOW_OR_SUCCESS:
					redirectUrl = response.headers.get('Location') or response.headers.get('URI')
					if not redirectUrl:
						# No redirect target in the headers, so this response is the final one.
						retResponse.qhistory = tuple(history)
						return retResponse
					url = url.join(yarl.URL(redirectUrl))
					# For these status codes, a POST is continued as a GET without a request body.
					if response.status in (301, 302, 303) and method == 'POST':
						method = 'GET'
						data = None
					attempt = 0
					redirectLevel += 1
				elif action == ACTION_RETRIES_EXCEEDED:
					self.logger.error(f'Request for {url} failed {attempt} times')
					retResponse.qhistory = tuple(history)
					return retResponse
				elif action == ACTION_TOO_MANY_REDIRECTS:
					self.logger.error(f'Request for {url} (from {originalUrl}) exceeded redirect limit')
					retResponse.qhistory = tuple(history)
					return retResponse
				elif action == ACTION_RETRY:
					# Nothing to do, just go to the next cycle
					pass
			finally:
				# Runs on every way out of the cycle, including the returns above and propagating exceptions.
				if response:
					await response.release()

	async def process(self):
		'''
		Process this item; has to be overridden by subclasses.

		The base implementation always raises NotImplementedError. QWARC.run schedules the coroutine returned by this method as the task of the item.
		'''
		raise NotImplementedError

	@classmethod
	def generate(cls):
		'''
		Yield the item values with which the database is seeded for this item type.

		QWARC._insert_generated_items pairs every yielded value with cls.itemType and inserts it with status STATUS_TODO.
		'''
		yield from () # Generate no items by default

	def add_subitem(self, itemClassOrType, itemValue):
		'''
		Remember a sub-item for later insertion into the item database.

		itemClassOrType: an Item subclass, or an item type value as used for Item.itemType
		itemValue: the value of the sub-item

		The (item type, item value) pair is appended to self.childItems unless it is already in there.
		'''

		# issubclass raises a TypeError if its first argument is not a class, so plain item type values (e.g. a str) must be excluded before calling it.
		if isinstance(itemClassOrType, type) and issubclass(itemClassOrType, Item):
			itemType = itemClassOrType.itemType
		else:
			itemType = itemClassOrType
		item = (itemType, itemValue)
		if item not in self.childItems:
			self.childItems.append(item)

	async def flush_subitems(self):
		'''Hand the sub-items collected so far to the QWARC object, whose flush_subitems inserts them into the database and then clears self.childItems'''
		await self.qwarcObj.flush_subitems(self)

	def clear_subitems(self):
		'''Forget all collected sub-items by rebinding self.childItems to a new, empty list'''
		self.childItems = []

	@classmethod
	def get_subclasses(cls):
		'''Yield all direct and indirect subclasses of cls, each direct subclass being followed immediately by its own descendants'''
		for child in cls.__subclasses__():
			yield child
			# Recurse through the child's own classmethod to reach the deeper levels.
			yield from child.get_subclasses()


class QWARC:
	def __init__(self, itemClasses, warcBasePath, dbPath, command, specFile, specDependencies, logFilename, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0, warcDedupe = False):
		'''
		itemClasses: iterable of Item
		warcBasePath: str, base name of the WARC files
		dbPath: str, path to the sqlite3 database file
		command: list, the command line used to invoke qwarc
		specFile: str, path to the spec file
		specDependencies: qwarc.utils.SpecDependencies
		logFilename: str, name of the log file written by this process
		concurrency: int, number of concurrently processed items
		memoryLimit: int, gracefully stop when the process uses more than memoryLimit bytes of RSS; 0 disables the memory check
		minFreeDisk: int, pause when there's less than minFreeDisk space on the partition where WARCs are written; 0 disables the disk space check
		warcSizeLimit: int, size of each WARC file; 0 if the WARCs should not be split
		'''

		self._itemClasses = itemClasses
		self._itemTypeMap = {cls.itemType: cls for cls in itemClasses}
		self._warcBasePath = warcBasePath
		self._dbPath = dbPath
		self._command = command
		self._specFile = specFile
		self._specDependencies = specDependencies
		self._logFilename = logFilename
		self._concurrency = concurrency
		self._memoryLimit = memoryLimit
		self._minFreeDisk = minFreeDisk
		self._warcSizeLimit = warcSizeLimit
		self._warcDedupe = warcDedupe

		self._reset_working_vars()

	def _reset_working_vars(self):
		# Working variables
		self._db = None
		self._tasks = set()
		self._sleepTasks = set()
		self._sessions = [] # aiohttp.ClientSession instances
		self._freeSessions = collections.deque() # ClientSession instances that are currently free
		self._warc = None

	@contextlib.asynccontextmanager
	async def exclusive_db_lock(self):
		c = self._db.cursor()
		while True:
			try:
				c.execute('BEGIN EXCLUSIVE')
				break
			except sqlite3.OperationalError as e:
				if str(e) != 'database is locked':
					raise
				await asyncio.sleep(1)
		try:
			yield c
			c.execute('COMMIT')
		except:
			c.execute('ROLLBACK')
			raise

	def _make_item(self, itemType, itemValue, session, headers):
		try:
			itemClass = self._itemTypeMap[itemType]
		except KeyError:
			raise RuntimeError(f'No such item type: {itemType!r}')
		return itemClass(self, itemValue, session, headers, self._warc)

	async def _wait_for_free_task(self):
		'''
		Wait until at least one task in self._tasks has finished and do the bookkeeping for all finished tasks.

		Finished sleep tasks are merely removed from self._sleepTasks. For every finished process task, the outcome is logged, the status of its item in the database is set to STATUS_DONE or STATUS_ERROR, the sub-items collected by the item are inserted, and the item's session is returned to self._freeSessions.
		Returns immediately if there are no tasks.
		'''
		if not self._tasks:
			return
		done, pending = await asyncio.wait(self._tasks, return_when = concurrent.futures.FIRST_COMPLETED)
		for future in done:
			newStatus = STATUS_DONE
			# taskType (and id, itemType, itemValue, item on process tasks) are custom attributes set on the task objects in run.
			if future.taskType == 'sleep':
				self._sleepTasks.remove(future)
			elif future.taskType == 'process':
				item = future.item
			# Retrieve the result only to find out how the task ended; the return value itself is not used.
			try:
				future.result()
			except asyncio.CancelledError as e:
				# Got cancelled, nothing we can do about it, but let's log a warning if it's a process task
				if future.taskType == 'process':
					logging.error(f'Task for {future.itemType}:{future.itemValue} cancelled: {future!r}')
					newStatus = STATUS_ERROR
			except Exception as e:
				if future.taskType == 'process':
					logging.error(f'{future.itemType}:{future.itemValue} failed: {e!r} ({item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx)', exc_info = e)
					newStatus = STATUS_ERROR
			else:
				if future.taskType == 'process':
					logging.info(f'{future.itemType}:{future.itemValue} done: {item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx')
			# Everything below only concerns process tasks.
			if future.taskType != 'process':
				continue
			async with self.exclusive_db_lock() as cursor:
				cursor.execute('UPDATE items SET status = ? WHERE id = ?', (newStatus, future.id))
			# The sub-items are inserted regardless of whether the item succeeded.
			await self._insert_subitems(item)
			self._freeSessions.append(item.session)
		# Only the tasks which had not finished when asyncio.wait returned remain tracked.
		self._tasks = pending

	async def _insert_subitems(self, item):
		async with self.exclusive_db_lock() as cursor:
			if item.childItems:
				it = iter(item.childItems)
				while True:
					values = [(t, v, STATUS_TODO) for t, v in itertools.islice(it, 100000)]
					if not values:
						break
					cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)

	async def run(self, loop):
		'''
		Process items from the database until none are left or a graceful shutdown is requested.

		loop: the asyncio event loop, passed to the connectors and client sessions

		Creates one client session per concurrency slot, the WARC object, and the database connection (including the items table and the generated items), then keeps up to self._concurrency tasks running.
		The loop stops gracefully when a file named STOP exists in the working directory, when the memory limit is exceeded, or when no item is left that is not done.
		On an exception or KeyboardInterrupt, all tasks are cancelled and the exception is re-raised. In every case, the sessions, the WARC, and the database are closed and the working variables are reset at the end.
		'''

		def add_sleep_task():
			# Dummy task which occupies a slot for a random time; the average sleep time is equal to the concurrency.
			sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
			sleepTask.taskType = 'sleep'
			self._tasks.add(sleepTask)
			self._sleepTasks.add(sleepTask)

		# The setup is inside the try block as well so that whatever has already been created gets closed if a later step fails.
		try:
			for _ in range(self._concurrency):
				session = _aiohttp.ClientSession(
				  connector = qwarc.aiohttp.TCPConnector(loop = loop),
				  request_class = qwarc.aiohttp.ClientRequest,
				  response_class = qwarc.aiohttp.ClientResponse,
				  loop = loop
				)
				self._sessions.append(session)
				self._freeSessions.append(session)

			self._warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit, self._warcDedupe, self._command, self._specFile, self._specDependencies, self._logFilename)

			self._db = sqlite3.connect(self._dbPath, timeout = 1)
			self._db.isolation_level = None # Transactions are handled manually below.
			self._db.execute('PRAGMA synchronous = OFF')

			async with self.exclusive_db_lock() as cursor:
				cursor.execute('SELECT name FROM sqlite_master WHERE type = "table" AND name = "items"')
				result = cursor.fetchone()
				if not result:
					self._create_db(cursor)
				self._insert_generated_items(cursor)

			while True:
				# Wait for a free slot; sleep tasks count towards the concurrency as well.
				while len(self._tasks) >= self._concurrency:
					await self._wait_for_free_task()

				if os.path.exists('STOP'):
					logging.info('Gracefully shutting down due to STOP file')
					break
				if self._memoryLimit and qwarc.utils.uses_too_much_memory(self._memoryLimit):
					logging.info(f'Gracefully shutting down due to memory usage (current = {qwarc.utils.get_rss()} > limit = {self._memoryLimit})')
					break

				if self._minFreeDisk and qwarc.utils.too_little_disk_space(self._minFreeDisk):
					logging.info('Disk space is low, sleeping')
					add_sleep_task()
					continue

				async with self.exclusive_db_lock() as cursor:
					cursor.execute('SELECT id, type, value, status FROM items WHERE status = ? LIMIT 1', (STATUS_TODO,))
					result = cursor.fetchone()
					if not result:
						if cursor.execute('SELECT id, status FROM items WHERE status != ? LIMIT 1', (STATUS_DONE,)).fetchone():
							# There is currently no item to do, but there are still some in progress, so more TODOs may appear in the future.
							# It would be nice if we could just await wait_for_free_task() here, but that doesn't work because those TODOs might be in another process.
							# So instead, we insert a dummy task which just sleeps a bit. Average sleep time is equal to concurrency, i.e. one check per second.
							#TODO: The average sleep time is too large if there are only few sleep tasks; scale with len(sleepTasks)/self._concurrency?
							add_sleep_task()
							continue
						else:
							# Really nothing to do anymore
							break
					id_, itemType, itemValue, status = result
					# Claim the item within the same transaction as the SELECT.
					cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_INPROGRESS, id_))

				session = self._freeSessions.popleft()
				item = self._make_item(itemType, itemValue, session, DEFAULT_HEADERS)
				task = asyncio.ensure_future(item.process())
				#TODO: Is there a better way to add custom information to a task/coroutine object?
				task.taskType = 'process'
				task.id = id_
				task.itemType = itemType
				task.itemValue = itemValue
				task.item = item
				self._tasks.add(task)

			# Graceful end: the sleep tasks are no longer needed, but the process tasks get to finish.
			for sleepTask in self._sleepTasks:
				sleepTask.cancel()

			while self._tasks:
				await self._wait_for_free_task()

			logging.info('Done')
		except (Exception, KeyboardInterrupt):
			# Kill all tasks
			for task in self._tasks:
				task.cancel()
			# asyncio.wait raises a ValueError on an empty set, which would take the place of the exception being handled here.
			if self._tasks:
				await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)

			raise
		finally:
			for session in self._sessions:
				session.close()
			# The WARC and the database are still None if the setup failed before creating them.
			if self._warc is not None:
				self._warc.close()
			if self._db is not None:
				self._db.close()

			self._reset_working_vars()

	async def flush_subitems(self, item):
		'''
		Insert the sub-items collected by item into the database right away and clear the item's list afterwards.

		item: Item whose childItems are inserted through _insert_subitems and then reset through item.clear_subitems
		'''
		await self._insert_subitems(item)
		item.clear_subitems()

	def _create_db(self, cursor):
		cursor.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
		cursor.execute('CREATE INDEX items_status_idx ON items (status)')
		cursor.execute('CREATE UNIQUE INDEX items_type_value_idx ON items (type, value)')

	def _insert_generated_items(self, cursor):
		it = itertools.chain((cls.itemType, value, STATUS_TODO) for cls in self._itemClasses for value in cls.generate())
		while True:
			values = tuple(itertools.islice(it, 100000))
			if not values:
				break
			cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)