@@ -0,0 +1,316 @@ qwarc/__init__.py
import qwarc.aiohttp
from qwarc.const import *
import qwarc.utils
import qwarc.warc
import aiohttp as _aiohttp
if _aiohttp.__version__ != '2.3.10':
	raise ImportError('aiohttp must be version 2.3.10')
import asyncio
import collections
import concurrent.futures
import itertools
import logging
import os
import random
import sqlite3
import yarl
class Item:
	itemType = None

	def __init__(self, itemValue, session, headers, warc):
		self.itemValue = itemValue
		self.session = session
		self.headers = headers
		self.warc = warc
		self.stats = {'tx': 0, 'rx': 0, 'requests': 0}
		self.childItems = []

	async def fetch(self, url, responseHandler = qwarc.utils.handle_response_default):
		'''
		HTTP GET a URL

		url: str or yarl.URL
		responseHandler: a callable that determines how the response is handled. See qwarc.utils.handle_response_default for details.

		Returns response (a ClientResponse object or None) and history (a tuple of (response, exception) tuples).
		response can be None and history can be an empty tuple, depending on the circumstances (e.g. timeouts).
		'''

		#TODO: Rewrite using 'async with self.session.get'

		url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
		history = []
		attempt = 0
		#TODO redirectLevel
		while True:
			attempt += 1
			response = None
			exc = None
			action = ACTION_RETRY
			writeToWarc = True
			try:
				try:
					with _aiohttp.Timeout(60):
						logging.info('Fetching {}'.format(url))
						response = await self.session.get(url, headers = self.headers, allow_redirects = False)
						try:
							# Read the body to the end so that rawRequestData/rawResponseData are complete
							await response.text(errors = 'surrogateescape')
						except:
							# Not calling the responseHandler callback here because this is really bad. The not-so-bad exceptions (e.g. an error during reading the response) will be caught further down.
							response.close()
							raise
						else:
							tx = len(response.rawRequestData)
							rx = len(response.rawResponseData)
							logging.info('Fetched {}: {} (tx {}, rx {})'.format(url, response.status, tx, rx))
							self.stats['tx'] += tx
							self.stats['rx'] += rx
							self.stats['requests'] += 1
				except (asyncio.TimeoutError, _aiohttp.ClientError) as e:
					logging.error('Request for {} failed: {!r}'.format(url, e))
					action, writeToWarc = await responseHandler(url, attempt, response, e)
					exc = e # Pass the exception outward for the history
				else:
					action, writeToWarc = await responseHandler(url, attempt, response, None)
				history.append((response, exc))
				if action in (ACTION_SUCCESS, ACTION_IGNORE):
					return response, tuple(history)
				elif action == ACTION_FOLLOW_OR_SUCCESS:
					redirectUrl = response.headers.get('Location') or response.headers.get('URI')
					if not redirectUrl:
						return response, tuple(history)
					url = url.join(yarl.URL(redirectUrl))
					attempt = 0
				elif action == ACTION_RETRY:
					# Nothing to do, just go to the next cycle
					pass
			finally:
				if response:
					if writeToWarc:
						self.warc.write_client_response(response)
					await response.release()

	async def process(self):
		raise NotImplementedError

	@classmethod
	def generate(cls):
		yield from () # Generate no items by default

	@classmethod
	def _gen(cls):
		for x in cls.generate():
			yield (cls.itemType, x, STATUS_TODO)

	def add_item(self, itemClassOrType, itemValue):
		# itemClassOrType may be an Item subclass or an itemType string
		if isinstance(itemClassOrType, type) and issubclass(itemClassOrType, Item):
			item = (itemClassOrType.itemType, itemValue)
		else:
			item = (itemClassOrType, itemValue)
		if item not in self.childItems:
			self.childItems.append(item)
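
# A minimal sketch of a spec-file item, kept as a comment so it isn't picked up by
# qwarc.Item.__subclasses__(); UserItem, 'user', and the URL are illustrative only.
# Spec files subclass Item, set itemType, and implement generate() and process():
#
# class UserItem(Item):
# 	itemType = 'user'
#
# 	@classmethod
# 	def generate(cls):
# 		# One itemValue per item; create_db stores these as (type, value, STATUS_TODO) rows.
# 		yield from ('alice', 'bob')
#
# 	async def process(self):
# 		response, history = await self.fetch('https://example.org/users/{}'.format(self.itemValue))
# 		if response is not None and response.status == 200:
# 			# Child items are inserted into the database once this item finishes.
# 			self.add_item(UserItem, 'carol')
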
class QWARC:
	def __init__(self, itemClasses, warcBasePath, dbPath, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0):
		'''
		itemClasses: iterable of Item
		warcBasePath: str, base name of the WARC files
		dbPath: str, path to the sqlite3 database file
		concurrency: int, number of concurrently processed items
		memoryLimit: int, gracefully stop when the process uses more than memoryLimit bytes of RSS; 0 disables the memory check
		minFreeDisk: int, pause when there's less than minFreeDisk space on the partition where WARCs are written; 0 disables the disk space check
		warcSizeLimit: int, size of each WARC file; 0 if the WARCs should not be split
		'''

		self._itemClasses = itemClasses
		self._itemTypeMap = {cls.itemType: cls for cls in itemClasses}
		self._warcBasePath = warcBasePath
		self._dbPath = dbPath
		self._concurrency = concurrency
		self._memoryLimit = memoryLimit
		self._minFreeDisk = minFreeDisk
		self._warcSizeLimit = warcSizeLimit

	async def obtain_exclusive_db_lock(self, db):
		c = db.cursor()
		while True:
			try:
				c.execute('BEGIN EXCLUSIVE')
				break
			except sqlite3.OperationalError as e:
				if str(e) != 'database is locked':
					raise
				await asyncio.sleep(1)
		return c

	def _make_item(self, itemType, itemValue, session, headers, warc):
		try:
			itemClass = self._itemTypeMap[itemType]
		except KeyError:
			raise RuntimeError('No such item type: {!r}'.format(itemType))
		return itemClass(itemValue, session, headers, warc)
	async def run(self, loop):
		headers = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')] #TODO: Move elsewhere

		tasks = set()
		sleepTasks = set()
		sessions = [] # aiohttp.ClientSession instances
		freeSessions = collections.deque() # ClientSession instances that are currently free

		for i in range(self._concurrency):
			session = _aiohttp.ClientSession(
				connector = qwarc.aiohttp.TCPConnector(loop = loop),
				request_class = qwarc.aiohttp.ClientRequest,
				response_class = qwarc.aiohttp.ClientResponse,
				skip_auto_headers = ['Accept-Encoding'],
				loop = loop
			)
			sessions.append(session)
			freeSessions.append(session)

		warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit)

		db = sqlite3.connect(self._dbPath, timeout = 1)
		db.isolation_level = None # Transactions are handled manually below.
		db.execute('PRAGMA synchronous = OFF')

		try:
			async def wait_for_free_task():
				nonlocal tasks
				done, pending = await asyncio.wait(tasks, return_when = concurrent.futures.FIRST_COMPLETED)
				for future in done:
					#TODO: Replace all of this with `if future.cancelled():`
					try:
						await future #TODO: Is this actually necessary? asyncio.wait only returns 'done' futures...
					except concurrent.futures.CancelledError:
						# Got cancelled, nothing we can do about it, but let's log a warning if it's a process task
						if isinstance(future, asyncio.Task):
							if future.taskType == 'process':
								logging.warning('Task for {}:{} cancelled: {!r}'.format(future.itemType, future.itemValue, future))
							elif future.taskType == 'sleep':
								sleepTasks.remove(future)
						continue
					if future.taskType == 'sleep':
						# Dummy task for an empty todo list or low disk space, see below.
						sleepTasks.remove(future)
						continue
					item = future.item
					logging.info('{itemType}:{itemValue} done: {requests} requests, {tx} tx, {rx} rx'.format(itemType = future.itemType, itemValue = future.itemValue, **item.stats))
					cursor = await self.obtain_exclusive_db_lock(db)
					try:
						cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_DONE, future.id))
						if item.childItems:
							it = iter(item.childItems)
							while True:
								values = [(t, v, STATUS_TODO) for t, v in itertools.islice(it, 100000)]
								if not values:
									break
								cursor.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
						cursor.execute('COMMIT')
					except:
						cursor.execute('ROLLBACK')
						raise
					freeSessions.append(item.session)
				tasks = pending

			while True:
				while len(tasks) >= self._concurrency:
					await wait_for_free_task()

				if self._minFreeDisk and qwarc.utils.too_little_disk_space(self._minFreeDisk):
					logging.info('Disk space is low, sleeping')
					sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
					sleepTask.taskType = 'sleep'
					tasks.add(sleepTask)
					sleepTasks.add(sleepTask)
					continue

				cursor = await self.obtain_exclusive_db_lock(db)
				try:
					cursor.execute('SELECT id, type, value, status FROM items WHERE status = ? LIMIT 1', (STATUS_TODO,))
					result = cursor.fetchone()
					if not result:
						if cursor.execute('SELECT id, status FROM items WHERE status != ? LIMIT 1', (STATUS_DONE,)).fetchone():
							# There is currently no item to do, but there are still some in progress, so more TODOs may appear in the future.
							# It would be nice if we could just await wait_for_free_task() here, but that doesn't work because those TODOs might be in another process.
							# So instead, we insert a dummy task which just sleeps a bit. Average sleep time is equal to concurrency, i.e. one check per second.
							#TODO: The average sleep time is too large if there are only few sleep tasks; scale with len(sleepTasks)/self._concurrency?
							sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
							sleepTask.taskType = 'sleep'
							tasks.add(sleepTask)
							sleepTasks.add(sleepTask)
							cursor.execute('COMMIT')
							continue
						else:
							# Really nothing to do anymore
							#TODO: Another process may be running create_db, in which case we'd still want to wait...
							# create_db could insert a dummy item which is marked as done when the DB is ready
							cursor.execute('COMMIT')
							break
					id, itemType, itemValue, status = result
					cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_INPROGRESS, id))
					cursor.execute('COMMIT')
				except:
					cursor.execute('ROLLBACK')
					raise

				session = freeSessions.popleft()
				item = self._make_item(itemType, itemValue, session, headers, warc)
				task = asyncio.ensure_future(item.process())
				#TODO: Is there a better way to add custom information to a task/coroutine object?
				task.taskType = 'process'
				task.id = id
				task.itemType = itemType
				task.itemValue = itemValue
				task.item = item
				tasks.add(task)

				if os.path.exists('STOP'):
					logging.info('Gracefully shutting down due to STOP file')
					break
				if self._memoryLimit and qwarc.utils.uses_too_much_memory(self._memoryLimit):
					logging.info('Gracefully shutting down due to memory usage (current = {} > limit = {})'.format(qwarc.utils.get_rss(), self._memoryLimit))
					break

			for sleepTask in sleepTasks:
				sleepTask.cancel()
			while len(tasks):
				await wait_for_free_task()

			logging.info('Done')
		except (Exception, KeyboardInterrupt):
			# Kill all tasks
			for task in tasks:
				task.cancel()
			if tasks:
				await asyncio.wait(tasks, return_when = concurrent.futures.ALL_COMPLETED)
			raise
		finally:
			for session in sessions:
				session.close()
			warc.close()
			db.close()

	def create_db(self):
		db = sqlite3.connect(self._dbPath, timeout = 1)
		db.execute('PRAGMA synchronous = OFF')
		with db:
			db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
			db.execute('CREATE INDEX items_status_idx ON items (status)')

		it = itertools.chain(*(i._gen() for i in self._itemClasses))
		while True:
			values = tuple(itertools.islice(it, 100000))
			if not values:
				break
			with db:
				db.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
@@ -0,0 +1,113 @@ qwarc/aiohttp.py
import aiohttp
import aiohttp.client_proto
import aiohttp.connector
import functools
import itertools
import time


# aiohttp does not expose the raw data sent over the wire, so we need to get a bit creative...
# The ResponseHandler handles received data; the writes are done directly on the underlying transport.
# So ResponseHandler is replaced with a class which keeps all received data in a list, and the transport's write method is replaced with one which sends back all written data to the ResponseHandler.
# Because the ResponseHandler instance disappears when the connection is closed (ClientResponse.{_response_eof,close,release}), ClientResponse copies the references to the data objects from the ResponseHandler.
# aiohttp also does connection pooling/reuse, so ClientRequest resets the raw data when the request is sent. (This would not work with pipelining, but aiohttp does not support pipelining: https://github.com/aio-libs/aiohttp/issues/1740 )
# This code has been developed for aiohttp version 2.3.10.

#TODO: THERE IS A MEMORY LEAK HERE SOMEWHERE! I spent a whole day trying to find it without success.
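
# Rough data flow, as a sketch of the mechanism described above:
#   ClientRequest.send            -> protocol.reset_raw_data()          (fresh RawData per request on the pooled connection)
#   patched transport.write       -> rawData.requestData.append(data)   (records the request bytes and timestamp)
#   ResponseHandler.data_received -> rawData.responseData.append(data)  (records the response bytes and timestamp)
#   ClientResponse.start          -> copies the rawData/remoteAddress references so they survive the connection's release
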
class RawData:
	def __init__(self):
		self.requestTimestamp = None
		self.requestData = []
		self.responseTimestamp = None
		self.responseData = []


class ResponseHandler(aiohttp.client_proto.ResponseHandler):
	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self.rawData = None
		self.remoteAddress = None

	def data_received(self, data):
		super().data_received(data)
		if not data:
			return
		if self.rawData.responseTimestamp is None:
			self.rawData.responseTimestamp = time.time()
		self.rawData.responseData.append(data)

	def reset_raw_data(self):
		self.rawData = RawData()


def make_transport_write(transport, protocol):
	'''Build a replacement for transport.write which records all written data on the protocol's RawData instance'''

	transport._real_write = transport.write
	def write(self, data):
		if protocol.rawData.requestTimestamp is None:
			protocol.rawData.requestTimestamp = time.time()
		protocol.rawData.requestData.append(data)
		self._real_write(data)
	return write


class TCPConnector(aiohttp.connector.TCPConnector):
	def __init__(self, *args, loop = None, **kwargs):
		super().__init__(*args, loop = loop, **kwargs)
		self._factory = functools.partial(ResponseHandler, loop = loop)

	async def _wrap_create_connection(self, protocolFactory, host, port, *args, **kwargs): #FIXME: Uses internal API
		transport, protocol = await super()._wrap_create_connection(protocolFactory, host, port, *args, **kwargs)
		transport.write = make_transport_write(transport, protocol).__get__(transport, type(transport)) # https://stackoverflow.com/a/28127947
		protocol.remoteAddress = (host, port)
		return (transport, protocol)


class ClientRequest(aiohttp.client_reqrep.ClientRequest):
	def send(self, connection):
		connection.protocol.reset_raw_data()
		return super().send(connection)


class ClientResponse(aiohttp.client_reqrep.ClientResponse):
	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self._rawData = None
		self._remoteAddress = None

	async def start(self, connection, readUntilEof):
		self._rawData = connection.protocol.rawData
		self._remoteAddress = connection.protocol.remoteAddress
		return (await super().start(connection, readUntilEof))

	@property
	def rawRequestTimestamp(self):
		return self._rawData.requestTimestamp

	@property
	def rawRequestData(self):
		return b''.join(self._rawData.requestData)

	@property
	def rawResponseTimestamp(self):
		return self._rawData.responseTimestamp

	@property
	def rawResponseData(self):
		return b''.join(self._rawData.responseData)

	@property
	def remoteAddress(self):
		return self._remoteAddress

	def set_history(self, history):
		self._history = history #FIXME: Uses private attribute of aiohttp.client_reqrep.ClientResponse

	def iter_all(self):
		'''Iterate over all responses in the redirect history, followed by this response itself'''
		return itertools.chain(self.history, (self,))

	async def release(self):
		if not self.closed:
			self.connection.protocol.reset_raw_data()
		await super().release()
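
# Sketch of how the raw properties are consumed downstream (cf. qwarc.warc.WARC.write_client_response);
# `response` here is a hypothetical ClientResponse obtained through a session using these classes:
#
# for r in response.iter_all(): # responses in the redirect history first, the final response last
# 	wireRequest = r.rawRequestData   # the exact bytes sent, usable as a WARC 'request' record payload
# 	wireResponse = r.rawResponseData # the exact bytes received, usable as a WARC 'response' record payload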
@@ -0,0 +1,81 @@ qwarc/cli.py
import argparse
import asyncio
import importlib.util
import logging
import os.path
import qwarc
import sys
import time


def setup_logging(logFilename):
	rootLogger = logging.getLogger()
	rootLogger.handlers = []
	rootLogger.setLevel(logging.INFO)

	formatter = logging.Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
	formatter.converter = time.gmtime

	fileHandler = logging.FileHandler(logFilename)
	fileHandler.setFormatter(formatter)
	rootLogger.addHandler(fileHandler)

	stderrHandler = logging.StreamHandler()
	stderrHandler.setFormatter(formatter)
	rootLogger.addHandler(stderrHandler)
def check_files(specFilename, logFilename):
	success = True
	if not os.path.isfile(specFilename):
		print('Error: "{}" does not exist or is not a regular file'.format(specFilename), file = sys.stderr)
		success = False
	if os.path.exists(logFilename):
		print('Error: "{}" already exists'.format(logFilename), file = sys.stderr)
		success = False
	if os.path.exists('STOP'):
		print('Error: "STOP" exists', file = sys.stderr)
		success = False
	return success
def main():
	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
	parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
	parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
	parser.add_argument('--concurrency', type = int, default = 1)
	parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'gracefully stop when the process uses more than LIMIT bytes of RSS; disabled if 0', default = 0)
	parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes of disk space are free; disabled if 0', default = 0)
	parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disabled if 0', default = 0)
	parser.add_argument('specfile')

	args = parser.parse_args()

	if not check_files(args.specfile, args.log):
		sys.exit(1)

	setup_logging(args.log)

	spec = importlib.util.spec_from_file_location('spec', args.specfile)
	specMod = importlib.util.module_from_spec(spec)
	spec.loader.exec_module(specMod)

	a = qwarc.QWARC(
		itemClasses = qwarc.Item.__subclasses__(),
		warcBasePath = args.warc,
		dbPath = args.database,
		concurrency = args.concurrency,
		memoryLimit = args.memorylimit,
		minFreeDisk = args.disklimit,
		warcSizeLimit = args.warcsplit,
	)

	if not os.path.exists(args.database):
		a.create_db()

	loop = asyncio.get_event_loop()
	try:
		loop.run_until_complete(a.run(loop))
	except (Exception, KeyboardInterrupt):
		logging.exception('Unhandled error')
	finally:
		loop.close()
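
# A typical invocation, assuming a hypothetical spec file 'myspec.py' that defines qwarc.Item subclasses:
#
#   qwarc --concurrency 10 --warcsplit 1073741824 myspec.py
#
# Note that check_files aborts if the log file (default ./qwarc.log) or a STOP file already exists,
# so those need to be removed between runs.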
@@ -0,0 +1,23 @@ qwarc/const.py
STATUS_TODO = 0
'''Status of an item that has not been processed yet'''

STATUS_INPROGRESS = 1
'''Status of an item that is currently being processed'''

STATUS_DONE = 2
'''Status of an item that has been processed'''

#TODO: Add a STATUS_ERROR?

ACTION_SUCCESS = 0
'''Treat this response as a success'''

ACTION_IGNORE = 1 #TODO: Replace with ACTION_SUCCESS since it's really the same thing.
'''Ignore this response'''

ACTION_RETRY = 2
'''Retry the same request'''

ACTION_FOLLOW_OR_SUCCESS = 3
'''If the response contains a Location or URI header, follow it. Otherwise, treat it as a success.'''
#TODO: Rename to ACTION_FOLLOW maybe? However, the current name makes it more clear what qwarc does when there's a redirect without a redirect target...
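
# A minimal sketch of a custom response handler built on these constants; the 429 handling is
# illustrative, not something qwarc ships (cf. qwarc.utils.handle_response_default for the
# default behaviour and the expected signature):
#
# async def handle_response_retry_429(url, attempt, response, exc):
# 	if response is not None and response.status == 429:
# 		return ACTION_RETRY, False # Retry rate-limited requests and don't write them to WARC
# 	return ACTION_SUCCESS, True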
@@ -0,0 +1,182 @@ qwarc/utils.py
from qwarc.const import *
import aiohttp
import asyncio
import os


PAGESIZE = os.sysconf('SC_PAGE_SIZE')


def get_rss():
	'''Get the current RSS of this process in bytes'''

	with open('/proc/self/statm', 'r') as fp:
		return int(fp.readline().split()[1]) * PAGESIZE


def get_disk_free():
	'''Get the current free disk space on the relevant partition in bytes'''

	st = os.statvfs('.')
	return st.f_bavail * st.f_frsize


def uses_too_much_memory(limit):
	'''
	Check whether the process is using too much memory

	For performance reasons, this actually only checks the memory usage on every 100th call.
	'''

	uses_too_much_memory.callCounter += 1
	# Only check every hundredth call
	if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
		return True
	return False
uses_too_much_memory.callCounter = 0


def too_little_disk_space(limit):
	'''
	Check whether the free disk space is too small

	For performance reasons, this actually only checks the free disk space on every 100th call.
	'''

	too_little_disk_space.callCounter += 1
	if too_little_disk_space.callCounter % 100 == 0:
		too_little_disk_space.currentResult = (get_disk_free() < limit)
	return too_little_disk_space.currentResult
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False
# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
	'''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr'''

	start = 0
	while True:
		start = aStr.find(sub, start)
		if start == -1:
			return
		yield start
		start += len(sub)


def str_get_between(aStr, a, b):
	'''Get the substring of aStr between the first occurrence of a and the first occurrence of b after that a, or None if there is no such string'''

	aPos = aStr.find(a)
	if aPos == -1:
		return None
	offset = aPos + len(a)
	bPos = aStr.find(b, offset)
	if bPos == -1:
		return None
	return aStr[offset:bPos]


def maybe_str_get_between(x, a, b):
	'''Like str_get_between, but returns None if x evaluates to False, and converts x to a str before matching'''

	if x:
		return str_get_between(str(x), a, b)


def str_get_all_between(aStr, a, b):
	'''Generator yielding every string between an occurrence of a in aStr and the following occurrence of b'''

	#TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
	# Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos+len(b).

	for aOffset in find_all(aStr, a):
		offset = aOffset + len(a)
		bPos = aStr.find(b, offset)
		if bPos != -1:
			yield aStr[offset:bPos]


def maybe_str_get_all_between(x, a, b):
	'''Like str_get_all_between, but yields no elements if x evaluates to False, and converts x to a str before matching'''

	if x:
		yield from str_get_all_between(str(x), a, b)
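
# Worked examples for the helpers above:
#   str_get_between('<a href="/x">link</a>', 'href="', '"')  -> '/x'
#   str_get_between('abc', 'x', 'c')                         -> None
#   list(str_get_all_between('aabc', 'a', 'c'))              -> ['ab', 'b'] (the half-overlap noted in the TODO)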
def generate_range_items(start, stop, step):
	'''
	Generator for items of `step` size between `start` and `stop` (inclusive)

	Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
	`b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
	Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

	Examples:
	- generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
	- generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
	- generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
	'''

	for i in range(start, stop + 1, step):
		yield '{}-{}'.format(i, min(i + step - 1, stop))
async def handle_response_default(url, attempt, response, exc):
	'''
	The default response handler, which behaves as follows:
	- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
	- If there was any exception and it is an asyncio.TimeoutError or an aiohttp.ClientError, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
	- If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
	- Otherwise, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- All responses are written to WARC by default.

	Note that this handler does not limit the number of retries on errors.

	Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
	At least one of response and exc is not None.
	Returns: (one of the qwarc.const.ACTION_* constants, bool signifying whether to write to WARC or not)
	'''

	#TODO: Document that `attempt` is reset on redirects

	if response is None:
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (401, 403, 404, 405, 410):
		return ACTION_IGNORE, True
	if exc is not None and isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (200, 204, 206, 304):
		return ACTION_SUCCESS, True
	if response.status in (301, 302, 303, 307, 308):
		return ACTION_FOLLOW_OR_SUCCESS, True
	await asyncio.sleep(5)
	return ACTION_RETRY, True


async def handle_response_ignore_redirects(url, attempt, response, exc):
	'''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves like handle_response_default otherwise.'''

	action, writeToWarc = await handle_response_default(url, attempt, response, exc)
	if action == ACTION_FOLLOW_OR_SUCCESS:
		action = ACTION_SUCCESS
	return action, writeToWarc


def handle_response_limit_error_retries(maxRetries):
	'''
	A response handler that limits the number of retries on errors. It behaves like handle_response_default otherwise.

	Technically, this is a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))

	If you use the same limit many times, you should keep the return value (the response handler) of this function and reuse it to avoid creating a new function every time.
	'''

	async def handler(url, attempt, response, exc):
		action, writeToWarc = await handle_response_default(url, attempt, response, exc)
		if action == ACTION_RETRY and attempt > maxRetries:
			action = ACTION_IGNORE
		return action, writeToWarc
	return handler
@@ -0,0 +1,93 @@ qwarc/warc.py
import fcntl
import io
import logging
import time
import warcio


class WARCWriter(warcio.warcwriter.WARCWriter):
	def _do_write_req_resp(self, req, resp, params): #FIXME: Internal API
		# Write request before response, like wget and wpull; cf. https://github.com/webrecorder/warcio/issues/20
		self._write_warc_record(self.out, req)
		self._write_warc_record(self.out, resp)


class WARC:
	def __init__(self, prefix, maxFileSize):
		'''
		Initialise the WARC writer

		prefix: str, path prefix for WARCs; a dash, a five-digit number, and ".warc.gz" will be appended.
		maxFileSize: int, maximum size of an individual WARC. Use 0 to disable splitting.
		'''

		self._prefix = prefix
		self._counter = 0
		self._maxFileSize = maxFileSize

		self._closed = True
		self._file = None
		self._warcWriter = None

		self._cycle()

	def _cycle(self):
		'''Close the current file and open the next file that doesn't exist yet'''

		#TODO: This opens a new file also at the end, which can result in empty WARCs. Should try to reorder this to only open a WARC when writing a record, and to only close the current WARC if the size is exceeded after write_client_response.

		self.close()
		while True:
			filename = '{}-{:05d}.warc.gz'.format(self._prefix, self._counter)
			try:
				# Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
				self._file = open(filename, 'xb')
				fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
			except FileExistsError:
				logging.info('{} already exists, skipping'.format(filename))
				self._counter += 1
			else:
				break
		logging.info('Opened {}'.format(filename))
		self._warcWriter = WARCWriter(self._file, gzip = True)
		self._closed = False
		self._counter += 1

	def write_client_response(self, response):
		'''
		Write the requests and responses stored in a ClientResponse instance to the currently opened WARC.

		A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
		'''

		for r in response.iter_all():
			requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp))
			requestRecord = self._warcWriter.create_warc_record(
				str(r.url),
				'request',
				payload = io.BytesIO(r.rawRequestData),
				warc_headers_dict = {
					'WARC-Date': requestDate,
					'WARC-IP-Address': r.remoteAddress[0],
				}
			)
			responseRecord = self._warcWriter.create_warc_record(
				str(r.url),
				'response',
				payload = io.BytesIO(r.rawResponseData),
				warc_headers_dict = {
					'WARC-Date': requestDate,
					'WARC-IP-Address': r.remoteAddress[0],
				}
			)
			self._warcWriter.write_request_response_pair(requestRecord, responseRecord)

		if self._maxFileSize and self._file.tell() > self._maxFileSize:
			self._cycle()

	def close(self):
		'''Close the currently opened WARC'''

		if not self._closed:
			self._file.close()
			self._warcWriter = None
			self._file = None
			self._closed = True
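
# Lifecycle sketch, mirroring how qwarc.QWARC.run uses this class ('./example' is illustrative):
#
# warc = WARC('./example', 0)          # immediately opens ./example-00000.warc.gz
# warc.write_client_response(response) # response: a qwarc.aiohttp.ClientResponse
# warc.close()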
@@ -0,0 +1,22 @@ setup.py
import setuptools


setuptools.setup(
	name = 'qwarc',
	version = '0.0-dev',
	description = 'A framework for quick web archival',
	author = 'JustAnotherArchivist',
	url = 'https://github.com/JustAnotherArchivist/qwarc',
	classifiers = [
		'Development Status :: 3 - Alpha',
		'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
		'Programming Language :: Python :: 3.6',
	],
	packages = ['qwarc'],
	install_requires = ['aiohttp==2.3.10', 'warcio', 'yarl'],
	entry_points = {
		'console_scripts': [
			'qwarc = qwarc.cli:main',
		],
	},
)