
Initial commit

tags/v0.1.0
JustAnotherArchivist 5 years ago
commit e892a6b6a7
7 changed files with 830 additions and 0 deletions
  1. qwarc/__init__.py  +316  -0
  2. qwarc/aiohttp.py  +113  -0
  3. qwarc/cli.py  +81  -0
  4. qwarc/const.py  +23  -0
  5. qwarc/utils.py  +182  -0
  6. qwarc/warc.py  +93  -0
  7. setup.py  +22  -0

+ 316
- 0
qwarc/__init__.py

@@ -0,0 +1,316 @@
import qwarc.aiohttp
from qwarc.const import *
import qwarc.utils
import qwarc.warc


import aiohttp as _aiohttp
if _aiohttp.__version__ != '2.3.10':
	raise ImportError('aiohttp must be version 2.3.10')
import asyncio
import collections
import concurrent.futures
import itertools
import logging
import os
import random
import sqlite3
import yarl


class Item:
	itemType = None

	def __init__(self, itemValue, session, headers, warc):
		self.itemValue = itemValue
		self.session = session
		self.headers = headers
		self.warc = warc
		self.stats = {'tx': 0, 'rx': 0, 'requests': 0}

		self.childItems = []

	async def fetch(self, url, responseHandler = qwarc.utils.handle_response_default):
		'''
		HTTP GET a URL

		url: str or yarl.URL
		responseHandler: a callable that determines how the response is handled. See qwarc.utils.handle_response_default for details.

		Returns response (a ClientResponse object or None) and history (a tuple of (response, exception) tuples).
		response can be None and history can be an empty tuple, depending on the circumstances (e.g. timeouts).
		'''

		#TODO: Rewrite using 'async with self.session.get'

		url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
		history = []
		attempt = 0
		#TODO redirectLevel
		while True:
			attempt += 1
			response = None
			exc = None
			action = ACTION_RETRY
			writeToWarc = True
			try:
				try:
					with _aiohttp.Timeout(60):
						logging.info('Fetching {}'.format(url))
						response = await self.session.get(url, headers = self.headers, allow_redirects = False)
						try:
							ret = await response.text(errors = 'surrogateescape')
						except:
							# Not calling the responseHandler callback here because this is really bad; the not-so-bad exceptions (e.g. an error during reading the response) will be caught further down.
							response.close()
							raise
						else:
							tx = len(response.rawRequestData)
							rx = len(response.rawResponseData)
							logging.info('Fetched {}: {} (tx {}, rx {})'.format(url, response.status, tx, rx))
							self.stats['tx'] += tx
							self.stats['rx'] += rx
							self.stats['requests'] += 1
				except (asyncio.TimeoutError, _aiohttp.ClientError) as e:
					logging.error('Request for {} failed: {!r}'.format(url, e))
					action, writeToWarc = await responseHandler(url, attempt, response, e)
					exc = e # Pass the exception outward for the history
				else:
					action, writeToWarc = await responseHandler(url, attempt, response, None)
				history.append((response, exc))
				if action in (ACTION_SUCCESS, ACTION_IGNORE):
					return response, tuple(history)
				elif action == ACTION_FOLLOW_OR_SUCCESS:
					redirectUrl = response.headers.get('Location') or response.headers.get('URI')
					if not redirectUrl:
						return response, tuple(history)
					url = url.join(yarl.URL(redirectUrl))
					attempt = 0
				elif action == ACTION_RETRY:
					# Nothing to do, just go to the next cycle
					pass
			finally:
				if response:
					if writeToWarc:
						self.warc.write_client_response(response)
					await response.release()

	async def process(self):
		raise NotImplementedError

	@classmethod
	def generate(cls):
		yield from () # Generate no items by default

	@classmethod
	def _gen(cls):
		for x in cls.generate():
			yield (cls.itemType, x, STATUS_TODO)

	def add_item(self, itemClassOrType, itemValue):
		if isinstance(itemClassOrType, type) and issubclass(itemClassOrType, Item):
			item = (itemClassOrType.itemType, itemValue)
		else:
			item = (itemClassOrType, itemValue)
		if item not in self.childItems:
			self.childItems.append(item)


class QWARC:
	def __init__(self, itemClasses, warcBasePath, dbPath, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0):
		'''
		itemClasses: iterable of Item subclasses
		warcBasePath: str, base name of the WARC files
		dbPath: str, path to the sqlite3 database file
		concurrency: int, number of concurrently processed items
		memoryLimit: int, gracefully stop when the process uses more than memoryLimit bytes of RSS; 0 disables the memory check
		minFreeDisk: int, pause when there's less than minFreeDisk space on the partition where WARCs are written; 0 disables the disk space check
		warcSizeLimit: int, size of each WARC file; 0 if the WARCs should not be split
		'''

		self._itemClasses = itemClasses
		self._itemTypeMap = {cls.itemType: cls for cls in itemClasses}
		self._warcBasePath = warcBasePath
		self._dbPath = dbPath
		self._concurrency = concurrency
		self._memoryLimit = memoryLimit
		self._minFreeDisk = minFreeDisk
		self._warcSizeLimit = warcSizeLimit

	async def obtain_exclusive_db_lock(self, db):
		c = db.cursor()
		while True:
			try:
				c.execute('BEGIN EXCLUSIVE')
				break
			except sqlite3.OperationalError as e:
				if str(e) != 'database is locked':
					raise
				await asyncio.sleep(1)
		return c

	def _make_item(self, itemType, itemValue, session, headers, warc):
		try:
			itemClass = self._itemTypeMap[itemType]
		except KeyError:
			raise RuntimeError('No such item type: {!r}'.format(itemType))
		return itemClass(itemValue, session, headers, warc)

	async def run(self, loop):
		headers = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')] #TODO: Move elsewhere

		tasks = set()
		sleepTasks = set()
		sessions = [] # aiohttp.ClientSession instances
		freeSessions = collections.deque() # ClientSession instances that are currently free

		for i in range(self._concurrency):
			session = _aiohttp.ClientSession(
				connector = qwarc.aiohttp.TCPConnector(loop = loop),
				request_class = qwarc.aiohttp.ClientRequest,
				response_class = qwarc.aiohttp.ClientResponse,
				skip_auto_headers = ['Accept-Encoding'],
				loop = loop
			)
			sessions.append(session)
			freeSessions.append(session)

		warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit)

		db = sqlite3.connect(self._dbPath, timeout = 1)
		db.isolation_level = None # Transactions are handled manually below.
		db.execute('PRAGMA synchronous = OFF')

		try:
			async def wait_for_free_task():
				nonlocal tasks, freeSessions, db
				done, pending = await asyncio.wait(tasks, return_when = concurrent.futures.FIRST_COMPLETED)
				for future in done:
					# TODO Replace all of this with `if future.cancelled():`
					try:
						await future #TODO: Is this actually necessary? asyncio.wait only returns 'done' futures...
					except concurrent.futures.CancelledError as e:
						# Got cancelled, nothing we can do about it, but let's log a warning if it's a process task
						if isinstance(future, asyncio.Task):
							if future.taskType == 'process':
								logging.warning('Task for {}:{} cancelled: {!r}'.format(future.itemType, future.itemValue, future))
							elif future.taskType == 'sleep':
								sleepTasks.remove(future)
						continue
					if future.taskType == 'sleep':
						# Dummy task for an empty todo list or low disk space, see below.
						sleepTasks.remove(future)
						continue
					item = future.item
					logging.info('{itemType}:{itemValue} done: {requests} requests, {tx} tx, {rx} rx'.format(itemType = future.itemType, itemValue = future.itemValue, **item.stats))
					cursor = await self.obtain_exclusive_db_lock(db)
					try:
						cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_DONE, future.id))
						if item.childItems:
							it = iter(item.childItems)
							while True:
								values = [(t, v, STATUS_TODO) for t, v in itertools.islice(it, 100000)]
								if not values:
									break
								cursor.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
						cursor.execute('COMMIT')
					except:
						cursor.execute('ROLLBACK')
						raise
					freeSessions.append(item.session)
				tasks = pending

			while True:
				while len(tasks) >= self._concurrency:
					await wait_for_free_task()

				if self._minFreeDisk and qwarc.utils.too_little_disk_space(self._minFreeDisk):
					logging.info('Disk space is low, sleeping')
					sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
					sleepTask.taskType = 'sleep'
					tasks.add(sleepTask)
					sleepTasks.add(sleepTask)
					continue

				cursor = await self.obtain_exclusive_db_lock(db)
				try:
					cursor.execute('SELECT id, type, value, status FROM items WHERE status = ? LIMIT 1', (STATUS_TODO,))
					result = cursor.fetchone()
					if not result:
						if cursor.execute('SELECT id, status FROM items WHERE status != ? LIMIT 1', (STATUS_DONE,)).fetchone():
							# There is currently no item to do, but there are still some in progress, so more TODOs may appear in the future.
							# It would be nice if we could just await wait_for_free_task() here, but that doesn't work because those TODOs might be in another process.
							# So instead, we insert a dummy task which just sleeps a bit. Average sleep time is equal to concurrency, i.e. one check per second.
							#TODO: The average sleep time is too large if there are only few sleep tasks; scale with len(sleepTasks)/self._concurrency?
							sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
							sleepTask.taskType = 'sleep'
							tasks.add(sleepTask)
							sleepTasks.add(sleepTask)
							cursor.execute('COMMIT')
							continue
						else:
							# Really nothing to do anymore
							#TODO: Another process may be running create_db, in which case we'd still want to wait...
							# create_db could insert a dummy item which is marked as done when the DB is ready
							cursor.execute('COMMIT')
							break
					id, itemType, itemValue, status = result
					cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_INPROGRESS, id))
					cursor.execute('COMMIT')
				except:
					cursor.execute('ROLLBACK')
					raise

				session = freeSessions.popleft()
				item = self._make_item(itemType, itemValue, session, headers, warc)
				task = asyncio.ensure_future(item.process())
				#TODO: Is there a better way to add custom information to a task/coroutine object?
				task.taskType = 'process'
				task.id = id
				task.itemType = itemType
				task.itemValue = itemValue
				task.item = item
				tasks.add(task)
				if os.path.exists('STOP'):
					logging.info('Gracefully shutting down due to STOP file')
					break
				if self._memoryLimit and qwarc.utils.uses_too_much_memory(self._memoryLimit):
					logging.info('Gracefully shutting down due to memory usage (current = {} > limit = {})'.format(qwarc.utils.get_rss(), self._memoryLimit))
					break

			for sleepTask in sleepTasks:
				sleepTask.cancel()

			while len(tasks):
				await wait_for_free_task()

			logging.info('Done')
		except (Exception, KeyboardInterrupt) as e:
			# Kill all tasks
			for task in tasks:
				task.cancel()
			await asyncio.wait(tasks, return_when = concurrent.futures.ALL_COMPLETED)

			raise
		finally:
			for session in sessions:
				session.close()
			warc.close()
			db.close()

	def create_db(self):
		db = sqlite3.connect(self._dbPath, timeout = 1)
		db.execute('PRAGMA synchronous = OFF')
		with db:
			db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
			db.execute('CREATE INDEX items_status_idx ON items (status)')

		it = itertools.chain(*(i._gen() for i in self._itemClasses))
		while True:
			values = tuple(itertools.islice(it, 100000))
			if not values:
				break
			with db:
				db.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
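
qwarc is driven by a user-supplied spec file (loaded by qwarc/cli.py below): the spec is imported and every subclass of qwarc.Item it defines is used as an item class. As a rough sketch of how the API above is meant to be used, here is a hypothetical minimal spec file; the class name, item type strings, and URLs are made up and not part of this commit:

import qwarc
import qwarc.utils


# Reuse one handler instance, as recommended in qwarc/utils.py below.
retryThreeTimes = qwarc.utils.handle_response_limit_error_retries(3)


class ExampleRange(qwarc.Item):
	# Hypothetical item type; each Item subclass needs a unique itemType string.
	itemType = 'example-range'

	@classmethod
	def generate(cls):
		# One item value per block of 100 IDs; create_db stores these in the sqlite database.
		yield from qwarc.utils.generate_range_items(1, 1000, 100)

	async def process(self):
		start, stop = map(int, self.itemValue.split('-'))
		for i in range(start, stop + 1):
			# Hypothetical URL; fetch() writes the traffic to the WARC and returns the response plus its redirect/retry history.
			response, history = await self.fetch('https://example.org/posts/{}'.format(i), responseHandler = retryThreeTimes)
			if response is not None and response.status == 200:
				body = await response.text(errors = 'surrogateescape')
				for link in qwarc.utils.str_get_all_between(body, 'href="', '"'):
					# Queue follow-up items for another Item subclass whose itemType is 'example-link' (not shown).
					self.add_item('example-link', link)

With the console script declared in setup.py below, such a spec would be run as e.g. `qwarc --concurrency 10 spec.py` (filename hypothetical); on the first run, cli.py calls create_db() to fill the items table from the generate() methods, then executes QWARC.run().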

+ 113
- 0
qwarc/aiohttp.py

@@ -0,0 +1,113 @@
import aiohttp
import aiohttp.client_proto
import aiohttp.connector
import functools
import itertools
import time


# aiohttp does not expose the raw data sent over the wire, so we need to get a bit creative...
# The ResponseHandler handles received data; the writes are done directly on the underlying transport.
# So the ResponseHandler is replaced with a class which keeps all received data in a list, and the transport's write method is replaced with one which sends back all written data to the ResponseHandler.
# Because the ResponseHandler instance disappears when the connection is closed (ClientResponse.{_response_eof,close,release}), ClientResponse copies the references to the data objects from the ResponseHandler.
# aiohttp also does connection pooling/reuse, so ClientRequest resets the raw data when the request is sent. (This would not work with pipelining, but aiohttp does not support pipelining: https://github.com/aio-libs/aiohttp/issues/1740 )
# This code has been developed for aiohttp version 2.3.10.

#TODO: THERE IS A MEMORY LEAK HERE SOMEWHERE! I spent a whole day trying to find it without success.


class RawData:
	def __init__(self):
		self.requestTimestamp = None
		self.requestData = []
		self.responseTimestamp = None
		self.responseData = []


class ResponseHandler(aiohttp.client_proto.ResponseHandler):
	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self.rawData = None
		self.remoteAddress = None

	def data_received(self, data):
		super().data_received(data)
		if not data:
			return
		if self.rawData.responseTimestamp is None:
			self.rawData.responseTimestamp = time.time()
		self.rawData.responseData.append(data)

	def reset_raw_data(self):
		self.rawData = RawData()


def make_transport_write(transport, protocol):
	transport._real_write = transport.write
	def write(self, data):
		if protocol.rawData.requestTimestamp is None:
			protocol.rawData.requestTimestamp = time.time()
		protocol.rawData.requestData.append(data)
		self._real_write(data)
	return write


class TCPConnector(aiohttp.connector.TCPConnector):
	def __init__(self, *args, loop = None, **kwargs):
		super().__init__(*args, loop = loop, **kwargs)
		self._factory = functools.partial(ResponseHandler, loop = loop)

	async def _wrap_create_connection(self, protocolFactory, host, port, *args, **kwargs): #FIXME: Uses internal API
		transport, protocol = await super()._wrap_create_connection(protocolFactory, host, port, *args, **kwargs)
		transport.write = make_transport_write(transport, protocol).__get__(transport, type(transport)) # https://stackoverflow.com/a/28127947
		protocol.remoteAddress = (host, port)
		return (transport, protocol)


class ClientRequest(aiohttp.client_reqrep.ClientRequest):
	def send(self, connection):
		connection.protocol.reset_raw_data()
		return super().send(connection)


class ClientResponse(aiohttp.client_reqrep.ClientResponse):
	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self._rawData = None
		self._remoteAddress = None

	async def start(self, connection, readUntilEof):
		self._rawData = connection.protocol.rawData
		self._remoteAddress = connection.protocol.remoteAddress
		return (await super().start(connection, readUntilEof))

	@property
	def rawRequestTimestamp(self):
		return self._rawData.requestTimestamp

	@property
	def rawRequestData(self):
		return b''.join(self._rawData.requestData)

	@property
	def rawResponseTimestamp(self):
		return self._rawData.responseTimestamp

	@property
	def rawResponseData(self):
		return b''.join(self._rawData.responseData)

	@property
	def remoteAddress(self):
		return self._remoteAddress

	def set_history(self, history):
		self._history = history #FIXME: Uses private attribute of aiohttp.client_reqrep.ClientResponse

	def iter_all(self):
		return itertools.chain(self.history, (self,))

	async def release(self):
		if not self.closed:
			self.connection.reset_raw_data()
		await super().release()
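
The least obvious part of the code above is probably the line in TCPConnector._wrap_create_connection that swaps out transport.write via __get__. A self-contained sketch of that bound-method trick, with hypothetical names and no aiohttp involved:

class FakeTransport:
	'''Stands in for an asyncio transport; only the write method matters here.'''

	def write(self, data):
		print('wire <-', data)


def make_recording_write(transport, sink):
	# Same pattern as make_transport_write above: remember the original bound method,
	# then hand back a replacement that records the data before forwarding it.
	transport._real_write = transport.write
	def write(self, data):
		sink.append(data)
		self._real_write(data)
	return write


captured = []
t = FakeTransport()
# function.__get__(obj, type(obj)) binds the plain function to this one instance,
# so assigning it to t.write patches only this transport object, not the class.
t.write = make_recording_write(t, captured).__get__(t, type(t))
t.write(b'GET / HTTP/1.1\r\n')
print(captured)  # [b'GET / HTTP/1.1\r\n']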

+ 81
- 0
qwarc/cli.py

@@ -0,0 +1,81 @@
import argparse
import asyncio
import importlib.util
import logging
import os.path
import qwarc
import sys
import time


def setup_logging(logFilename):
	rootLogger = logging.getLogger()
	rootLogger.handlers = []
	rootLogger.setLevel(logging.INFO)

	formatter = logging.Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
	formatter.converter = time.gmtime

	fileHandler = logging.FileHandler(logFilename)
	fileHandler.setFormatter(formatter)
	rootLogger.addHandler(fileHandler)

	stderrHandler = logging.StreamHandler()
	stderrHandler.setFormatter(formatter)
	rootLogger.addHandler(stderrHandler)


def check_files(specFilename, logFilename):
	success = True
	if not os.path.isfile(specFilename):
		print('Error: "{}" does not exist or is not a regular file'.format(specFilename), file = sys.stderr)
		success = False
	if os.path.exists(logFilename):
		print('Error: "{}" already exists'.format(logFilename), file = sys.stderr)
		success = False
	if os.path.exists('STOP'):
		print('Error: "STOP" exists', file = sys.stderr)
		success = False
	return success


def main():
	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
	parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
	parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
	parser.add_argument('--concurrency', type = int, default = 1)
	parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'gracefully stop when the process uses more than LIMIT bytes of RSS; disable if 0', default = 0)
	parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes of disk space are free; disable if 0', default = 0)
	parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
	parser.add_argument('specfile')

	args = parser.parse_args()

	if not check_files(args.specfile, args.log):
		sys.exit(1)

	setup_logging(args.log)

	spec = importlib.util.spec_from_file_location('spec', args.specfile)
	specMod = importlib.util.module_from_spec(spec)
	spec.loader.exec_module(specMod)

	a = qwarc.QWARC(
		itemClasses = qwarc.Item.__subclasses__(),
		warcBasePath = args.warc,
		dbPath = args.database,
		concurrency = args.concurrency,
		memoryLimit = args.memorylimit,
		minFreeDisk = args.disklimit,
		warcSizeLimit = args.warcsplit,
	)
	if not os.path.exists(args.database):
		a.create_db()

	loop = asyncio.get_event_loop()
	try:
		loop.run_until_complete(a.run(loop))
	except (Exception, KeyboardInterrupt) as e:
		logging.exception('Unhandled error')
	loop.close()

+ 23
- 0
qwarc/const.py

@@ -0,0 +1,23 @@
STATUS_TODO = 0
'''Status of an item that has not been processed yet'''

STATUS_INPROGRESS = 1
'''Status of an item that is currently being processed'''

STATUS_DONE = 2
'''Status of an item that has been processed'''

#TODO: Add a STATUS_ERROR?

ACTION_SUCCESS = 0
'''Treat this response as a success'''

ACTION_IGNORE = 1 #TODO Replace with ACTION_SUCCESS since it's really the same thing.
'''Ignore this response'''

ACTION_RETRY = 2
'''Retry the same request'''

ACTION_FOLLOW_OR_SUCCESS = 3
'''If the response contains a Location or URI header, follow it. Otherwise, treat it as a success.'''
#TODO: Rename to ACTION_FOLLOW maybe? However, the current name makes it more clear what qwarc does when there's a redirect without a redirect target...
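
These ACTION_* constants are the values a response handler returns to Item.fetch() to control what happens next; the default behaviour is implemented by qwarc.utils.handle_response_default in the next file. As an illustration only, a hypothetical custom handler might back off on HTTP 429 and defer to the default handler for everything else:

import asyncio

from qwarc.const import ACTION_RETRY
import qwarc.utils


async def handle_response_backoff_429(url, attempt, response, exc):
	# Hypothetical: retry rate-limited responses with a growing delay and do not
	# write them to the WARC; defer to the default handler for everything else.
	if response is not None and response.status == 429:
		await asyncio.sleep(min(5 * attempt, 60))
		return ACTION_RETRY, False
	return await qwarc.utils.handle_response_default(url, attempt, response, exc)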

+ 182
- 0
qwarc/utils.py

@@ -0,0 +1,182 @@
from qwarc.const import *
import aiohttp as _aiohttp
import asyncio
import os


PAGESIZE = os.sysconf('SC_PAGE_SIZE')


def get_rss():
	'''Get the current RSS of this process in bytes'''

	with open('/proc/self/statm', 'r') as fp:
		return int(fp.readline().split()[1]) * PAGESIZE


def get_disk_free():
	'''Get the current free disk space on the relevant partition in bytes'''

	st = os.statvfs('.')
	return st.f_bavail * st.f_frsize


def uses_too_much_memory(limit):
	'''
	Check whether the process is using too much memory

	For performance reasons, this actually only checks the memory usage on every 100th call.
	'''

	uses_too_much_memory.callCounter += 1
	# Only check every hundredth call
	if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
		return True
	return False
uses_too_much_memory.callCounter = 0


def too_little_disk_space(limit):
	'''
	Check whether the free disk space is too small

	For performance reasons, this actually only checks the free disk space on every 100th call.
	'''

	too_little_disk_space.callCounter += 1
	if too_little_disk_space.callCounter % 100 == 0:
		too_little_disk_space.currentResult = (get_disk_free() < limit)
	return too_little_disk_space.currentResult
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False


# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
	'''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''

	start = 0
	while True:
		start = aStr.find(sub, start)
		if start == -1:
			return
		yield start
		start += len(sub)


def str_get_between(aStr, a, b):
	'''Get the substring of aStr between the first occurrence of a and the first occurrence of b after that, or None if there is no such substring.'''

	aPos = aStr.find(a)
	if aPos == -1:
		return None
	offset = aPos + len(a)
	bPos = aStr.find(b, offset)
	if bPos == -1:
		return None
	return aStr[offset:bPos]


def maybe_str_get_between(x, a, b):
	'''Like str_get_between, but returns None if x evaluates to False, and converts x to a str before matching.'''

	if x:
		return str_get_between(str(x), a, b)


def str_get_all_between(aStr, a, b):
	'''Generator yielding every string between an occurrence of a in aStr and the following occurrence of b.'''

	#TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
	# Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos+len(b).

	for aOffset in find_all(aStr, a):
		offset = aOffset + len(a)
		bPos = aStr.find(b, offset)
		if bPos != -1:
			yield aStr[offset:bPos]


def maybe_str_get_all_between(x, a, b):
	'''Like str_get_all_between, but yields no elements if x evaluates to False, and converts x to a str before matching.'''

	if x:
		yield from str_get_all_between(str(x), a, b)


def generate_range_items(start, stop, step):
	'''
	Generator for items of `step` size between `start` and `stop` (inclusive)

	Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
	`b - a + 1` may be smaller than `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
	Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

	Examples:
	- generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
	- generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
	- generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
	'''

	for i in range(start, stop + 1, step):
		yield '{}-{}'.format(i, min(i + step - 1, stop))


async def handle_response_default(url, attempt, response, exc):
	'''
	The default response handler, which behaves as follows:
	- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
	- If there was any exception and it is an asyncio.TimeoutError or an aiohttp.ClientError, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
	- If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
	- Otherwise, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.

	All responses are written to WARC by default.

	Note that this handler does not limit the number of retries on errors.

	Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
	At least one of response and exc is not None.
	Returns: (one of the ACTION_* constants from qwarc.const, bool signifying whether to write to WARC or not)
	'''

	#TODO: Document that `attempt` is reset on redirects

	if response is None:
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (401, 403, 404, 405, 410):
		return ACTION_IGNORE, True
	if exc is not None and isinstance(exc, (asyncio.TimeoutError, _aiohttp.ClientError)):
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (200, 204, 206, 304):
		return ACTION_SUCCESS, True
	if response.status in (301, 302, 303, 307, 308):
		return ACTION_FOLLOW_OR_SUCCESS, True
	await asyncio.sleep(5)
	return ACTION_RETRY, True


async def handle_response_ignore_redirects(url, attempt, response, exc):
	'''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''

	action, writeToWarc = await handle_response_default(url, attempt, response, exc)
	if action == ACTION_FOLLOW_OR_SUCCESS:
		action = ACTION_SUCCESS
	return action, writeToWarc


def handle_response_limit_error_retries(maxRetries):
	'''A response handler factory that limits the number of retries on errors; the returned handler behaves as handle_response_default otherwise.

	It is a factory rather than a handler so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))

	If you use the same limit many times, you should keep the return value (the response handler) of this function and reuse it to avoid creating a new function every time.
	'''

	async def handler(url, attempt, response, exc):
		action, writeToWarc = await handle_response_default(url, attempt, response, exc)
		if action == ACTION_RETRY and attempt > maxRetries:
			action = ACTION_IGNORE
		return action, writeToWarc
	return handler
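
A few illustrative calls for the string and range helpers above (the values shown follow directly from the code):

from qwarc.utils import str_get_between, str_get_all_between, generate_range_items

print(str_get_between('<title>qwarc</title>', '<title>', '</title>'))  # qwarc
print(list(str_get_all_between('aabc', 'a', 'c')))  # ['ab', 'b'] -- the half-overlap noted in the TODO above
print(list(generate_range_items(0, 42, 10)))  # ['0-9', '10-19', '20-29', '30-39', '40-42']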

+ 93
- 0
qwarc/warc.py

@@ -0,0 +1,93 @@
import fcntl
import io
import logging
import time
import warcio


class WARCWriter(warcio.warcwriter.WARCWriter):
	def _do_write_req_resp(self, req, resp, params): #FIXME: Internal API
		# Write request before response, like wget and wpull; cf. https://github.com/webrecorder/warcio/issues/20
		self._write_warc_record(self.out, req)
		self._write_warc_record(self.out, resp)


class WARC:
	def __init__(self, prefix, maxFileSize):
		'''
		Initialise the WARC writer

		prefix: str, path prefix for WARCs; a dash, a five-digit number, and ".warc.gz" will be appended.
		maxFileSize: int, maximum size of an individual WARC. Use 0 to disable splitting.
		'''

		self._prefix = prefix
		self._counter = 0
		self._maxFileSize = maxFileSize

		self._closed = True
		self._file = None
		self._warcWriter = None

		self._cycle()

	def _cycle(self):
		'''Close the current file, open the next file that doesn't exist yet'''

		#TODO: This opens a new file also at the end, which can result in empty WARCs. Should try to reorder this to only open a WARC when writing a record, and to only close the current WARC if the size is exceeded after write_client_response.
		self.close()
		while True:
			filename = '{}-{:05d}.warc.gz'.format(self._prefix, self._counter)
			try:
				# Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
				self._file = open(filename, 'xb')
				fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
			except FileExistsError:
				logging.info('{} already exists, skipping'.format(filename))
				self._counter += 1
			else:
				break
		logging.info('Opened {}'.format(filename))
		self._warcWriter = WARCWriter(self._file, gzip = True)
		self._closed = False
		self._counter += 1

	def write_client_response(self, response):
		'''
		Write the requests and responses stored in a ClientResponse instance to the currently opened WARC.
		A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
		'''

		for r in response.iter_all():
			requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp))
			requestRecord = self._warcWriter.create_warc_record(
				str(r.url),
				'request',
				payload = io.BytesIO(r.rawRequestData),
				warc_headers_dict = {
					'WARC-Date': requestDate,
					'WARC-IP-Address': r.remoteAddress[0],
				}
			)
			responseRecord = self._warcWriter.create_warc_record(
				str(r.url),
				'response',
				payload = io.BytesIO(r.rawResponseData),
				warc_headers_dict = {
					'WARC-Date': requestDate,
					'WARC-IP-Address': r.remoteAddress[0],
				}
			)
			self._warcWriter.write_request_response_pair(requestRecord, responseRecord)

		if self._maxFileSize and self._file.tell() > self._maxFileSize:
			self._cycle()

	def close(self):
		'''Close the currently opened WARC'''

		if not self._closed:
			self._file.close()
			self._warcWriter = None
			self._file = None
			self._closed = True
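
The output of this class is a series of ordinary gzipped WARC files named `<prefix>-00000.warc.gz`, `<prefix>-00001.warc.gz`, and so on, with each request record written immediately before its response record. They could be inspected with warcio, for example (the filename below is hypothetical):

from warcio.archiveiterator import ArchiveIterator

with open('qwarc-00000.warc.gz', 'rb') as fp:  # hypothetical output file
	for record in ArchiveIterator(fp):
		# Records arrive in request/response pairs, request first (see _do_write_req_resp above).
		print(record.rec_type, record.rec_headers.get_header('WARC-Target-URI'))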

+ 22
- 0
setup.py

@@ -0,0 +1,22 @@
import setuptools


setuptools.setup(
	name = 'qwarc',
	version = '0.0-dev',
	description = 'A framework for quick web archival',
	author = 'JustAnotherArchivist',
	url = 'https://github.com/JustAnotherArchivist/qwarc',
	classifiers = [
		'Development Status :: 3 - Alpha',
		'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
		'Programming Language :: Python :: 3.6',
	],
	packages = ['qwarc'],
	install_requires = ['aiohttp==2.3.10', 'warcio', 'yarl'],
	entry_points = {
		'console_scripts': [
			'qwarc = qwarc.cli:main',
		],
	},
)
