
Initial commit

tags/v0.1.0
JustAnotherArchivist 5 years ago
commit e892a6b6a7
7 changed files with 830 additions and 0 deletions
  1. qwarc/__init__.py  +316  -0
  2. qwarc/aiohttp.py  +113  -0
  3. qwarc/cli.py  +81  -0
  4. qwarc/const.py  +23  -0
  5. qwarc/utils.py  +182  -0
  6. qwarc/warc.py  +93  -0
  7. setup.py  +22  -0

+ 316
- 0
qwarc/__init__.py

@@ -0,0 +1,316 @@
import qwarc.aiohttp
from qwarc.const import *
import qwarc.utils
import qwarc.warc


import aiohttp as _aiohttp
if _aiohttp.__version__ != '2.3.10':
	raise ImportError('aiohttp must be version 2.3.10')
import asyncio
import collections
import concurrent.futures
import itertools
import logging
import os
import random
import sqlite3
import yarl


class Item:
	itemType = None

	def __init__(self, itemValue, session, headers, warc):
		self.itemValue = itemValue
		self.session = session
		self.headers = headers
		self.warc = warc
		self.stats = {'tx': 0, 'rx': 0, 'requests': 0}

		self.childItems = []

	async def fetch(self, url, responseHandler = qwarc.utils.handle_response_default):
		'''
		HTTP GET a URL

		url: str or yarl.URL
		responseHandler: a callable that determines how the response is handled. See qwarc.utils.handle_response_default for details.

		Returns response (a ClientResponse object or None) and history (a tuple of (response, exception) tuples).
		response can be None and history can be an empty tuple, depending on the circumstances (e.g. timeouts).
		'''

		#TODO: Rewrite using 'async with self.session.get'

		url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
		history = []
		attempt = 0
		#TODO redirectLevel
		while True:
			attempt += 1
			response = None
			exc = None
			action = ACTION_RETRY
			writeToWarc = True
			try:
				try:
					with _aiohttp.Timeout(60):
						logging.info('Fetching {}'.format(url))
						response = await self.session.get(url, headers = self.headers, allow_redirects = False)
						try:
							ret = await response.text(errors = 'surrogateescape')
						except:
							# Not calling the responseHandler callback here because this is really bad; the not-so-bad exceptions (e.g. an error during reading the response) will be caught further down.
							response.close()
							raise
						else:
							tx = len(response.rawRequestData)
							rx = len(response.rawResponseData)
							logging.info('Fetched {}: {} (tx {}, rx {})'.format(url, response.status, tx, rx))
							self.stats['tx'] += tx
							self.stats['rx'] += rx
							self.stats['requests'] += 1
				except (asyncio.TimeoutError, _aiohttp.ClientError) as e:
					logging.error('Request for {} failed: {!r}'.format(url, e))
					action, writeToWarc = await responseHandler(url, attempt, response, e)
					exc = e # Pass the exception outward for the history
				else:
					action, writeToWarc = await responseHandler(url, attempt, response, None)
				history.append((response, exc))
				if action in (ACTION_SUCCESS, ACTION_IGNORE):
					return response, tuple(history)
				elif action == ACTION_FOLLOW_OR_SUCCESS:
					redirectUrl = response.headers.get('Location') or response.headers.get('URI')
					if not redirectUrl:
						return response, tuple(history)
					url = url.join(yarl.URL(redirectUrl))
					attempt = 0
				elif action == ACTION_RETRY:
					# Nothing to do, just go to the next cycle
					pass
			finally:
				if response:
					if writeToWarc:
						self.warc.write_client_response(response)
					await response.release()

	async def process(self):
		raise NotImplementedError

	@classmethod
	def generate(cls):
		yield from () # Generate no items by default

	@classmethod
	def _gen(cls):
		for x in cls.generate():
			yield (cls.itemType, x, STATUS_TODO)

	def add_item(self, itemClassOrType, itemValue):
		if isinstance(itemClassOrType, type) and issubclass(itemClassOrType, Item):
			item = (itemClassOrType.itemType, itemValue)
		else:
			item = (itemClassOrType, itemValue)
		if item not in self.childItems:
			self.childItems.append(item)


class QWARC:
	def __init__(self, itemClasses, warcBasePath, dbPath, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0):
		'''
		itemClasses: iterable of Item subclasses
		warcBasePath: str, base name of the WARC files
		dbPath: str, path to the sqlite3 database file
		concurrency: int, number of concurrently processed items
		memoryLimit: int, gracefully stop when the process uses more than memoryLimit bytes of RSS; 0 disables the memory check
		minFreeDisk: int, pause when there's less than minFreeDisk space on the partition where WARCs are written; 0 disables the disk space check
		warcSizeLimit: int, size of each WARC file; 0 if the WARCs should not be split
		'''

		self._itemClasses = itemClasses
		self._itemTypeMap = {cls.itemType: cls for cls in itemClasses}
		self._warcBasePath = warcBasePath
		self._dbPath = dbPath
		self._concurrency = concurrency
		self._memoryLimit = memoryLimit
		self._minFreeDisk = minFreeDisk
		self._warcSizeLimit = warcSizeLimit

	async def obtain_exclusive_db_lock(self, db):
		c = db.cursor()
		while True:
			try:
				c.execute('BEGIN EXCLUSIVE')
				break
			except sqlite3.OperationalError as e:
				if str(e) != 'database is locked':
					raise
				await asyncio.sleep(1)
		return c

	def _make_item(self, itemType, itemValue, session, headers, warc):
		try:
			itemClass = self._itemTypeMap[itemType]
		except KeyError:
			raise RuntimeError('No such item type: {!r}'.format(itemType))
		return itemClass(itemValue, session, headers, warc)

	async def run(self, loop):
		headers = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')] #TODO: Move elsewhere

		tasks = set()
		sleepTasks = set()
		sessions = [] # aiohttp.ClientSession instances
		freeSessions = collections.deque() # ClientSession instances that are currently free

		for i in range(self._concurrency):
			session = _aiohttp.ClientSession(
				connector = qwarc.aiohttp.TCPConnector(loop = loop),
				request_class = qwarc.aiohttp.ClientRequest,
				response_class = qwarc.aiohttp.ClientResponse,
				skip_auto_headers = ['Accept-Encoding'],
				loop = loop
			)
			sessions.append(session)
			freeSessions.append(session)

		warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit)

		db = sqlite3.connect(self._dbPath, timeout = 1)
		db.isolation_level = None # Transactions are handled manually below.
		db.execute('PRAGMA synchronous = OFF')

		try:
			async def wait_for_free_task():
				nonlocal tasks, freeSessions, db
				done, pending = await asyncio.wait(tasks, return_when = concurrent.futures.FIRST_COMPLETED)
				for future in done:
					# TODO Replace all of this with `if future.cancelled():`
					try:
						await future #TODO: Is this actually necessary? asyncio.wait only returns 'done' futures...
					except concurrent.futures.CancelledError as e:
						# Got cancelled, nothing we can do about it, but let's log a warning if it's a process task
						if isinstance(future, asyncio.Task):
							if future.taskType == 'process':
								logging.warning('Task for {}:{} cancelled: {!r}'.format(future.itemType, future.itemValue, future))
							elif future.taskType == 'sleep':
								sleepTasks.remove(future)
						continue
					if future.taskType == 'sleep':
						# Dummy task for an empty todo list or low disk space, see below.
						sleepTasks.remove(future)
						continue
					item = future.item
					logging.info('{itemType}:{itemValue} done: {requests} requests, {tx} tx, {rx} rx'.format(itemType = future.itemType, itemValue = future.itemValue, **item.stats))
					cursor = await self.obtain_exclusive_db_lock(db)
					try:
						cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_DONE, future.id))
						if item.childItems:
							it = iter(item.childItems)
							while True:
								values = [(t, v, STATUS_TODO) for t, v in itertools.islice(it, 100000)]
								if not values:
									break
								cursor.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
						cursor.execute('COMMIT')
					except:
						cursor.execute('ROLLBACK')
						raise
					freeSessions.append(item.session)
				tasks = pending

			while True:
				while len(tasks) >= self._concurrency:
					await wait_for_free_task()

				if self._minFreeDisk and qwarc.utils.too_little_disk_space(self._minFreeDisk):
					logging.info('Disk space is low, sleeping')
					sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
					sleepTask.taskType = 'sleep'
					tasks.add(sleepTask)
					sleepTasks.add(sleepTask)
					continue

				cursor = await self.obtain_exclusive_db_lock(db)
				try:
					cursor.execute('SELECT id, type, value, status FROM items WHERE status = ? LIMIT 1', (STATUS_TODO,))
					result = cursor.fetchone()
					if not result:
						if cursor.execute('SELECT id, status FROM items WHERE status != ? LIMIT 1', (STATUS_DONE,)).fetchone():
							# There is currently no item to do, but there are still some in progress, so more TODOs may appear in the future.
							# It would be nice if we could just await wait_for_free_task() here, but that doesn't work because those TODOs might be in another process.
							# So instead, we insert a dummy task which just sleeps a bit. Average sleep time is equal to concurrency, i.e. one check per second.
							#TODO: The average sleep time is too large if there are only few sleep tasks; scale with len(sleepTasks)/self._concurrency?
							sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
							sleepTask.taskType = 'sleep'
							tasks.add(sleepTask)
							sleepTasks.add(sleepTask)
							cursor.execute('COMMIT')
							continue
						else:
							# Really nothing to do anymore
							#TODO: Another process may be running create_db, in which case we'd still want to wait...
							# create_db could insert a dummy item which is marked as done when the DB is ready
							cursor.execute('COMMIT')
							break
					id, itemType, itemValue, status = result
					cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_INPROGRESS, id))
					cursor.execute('COMMIT')
				except:
					cursor.execute('ROLLBACK')
					raise

				session = freeSessions.popleft()
				item = self._make_item(itemType, itemValue, session, headers, warc)
				task = asyncio.ensure_future(item.process())
				#TODO: Is there a better way to add custom information to a task/coroutine object?
				task.taskType = 'process'
				task.id = id
				task.itemType = itemType
				task.itemValue = itemValue
				task.item = item
				tasks.add(task)
				if os.path.exists('STOP'):
					logging.info('Gracefully shutting down due to STOP file')
					break
				if self._memoryLimit and qwarc.utils.uses_too_much_memory(self._memoryLimit):
					logging.info('Gracefully shutting down due to memory usage (current = {} > limit = {})'.format(qwarc.utils.get_rss(), self._memoryLimit))
					break

			for sleepTask in sleepTasks:
				sleepTask.cancel()

			while len(tasks):
				await wait_for_free_task()

			logging.info('Done')
		except (Exception, KeyboardInterrupt) as e:
			# Kill all tasks
			for task in tasks:
				task.cancel()
			await asyncio.wait(tasks, return_when = concurrent.futures.ALL_COMPLETED)

			raise
		finally:
			for session in sessions:
				session.close()
			warc.close()
			db.close()

	def create_db(self):
		db = sqlite3.connect(self._dbPath, timeout = 1)
		db.execute('PRAGMA synchronous = OFF')
		with db:
			db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
			db.execute('CREATE INDEX items_status_idx ON items (status)')

		it = itertools.chain(*(i._gen() for i in self._itemClasses))
		while True:
			values = tuple(itertools.islice(it, 100000))
			if not values:
				break
			with db:
				db.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
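
qwarc is driven by a user-supplied spec file (loaded by qwarc/cli.py below): the spec is imported and every subclass of qwarc.Item it defines is used as an item class. As a rough sketch of how the API above is meant to be used, here is a hypothetical minimal spec file; the class name, item type strings, and URLs are made up and not part of this commit:

import qwarc
import qwarc.utils


# Reuse one handler instance, as recommended in qwarc/utils.py below.
retryThreeTimes = qwarc.utils.handle_response_limit_error_retries(3)


class ExampleRange(qwarc.Item):
	# Hypothetical item type; each Item subclass needs a unique itemType string.
	itemType = 'example-range'

	@classmethod
	def generate(cls):
		# One item value per block of 100 IDs; create_db stores these in the sqlite database.
		yield from qwarc.utils.generate_range_items(1, 1000, 100)

	async def process(self):
		start, stop = map(int, self.itemValue.split('-'))
		for i in range(start, stop + 1):
			# Hypothetical URL; fetch() writes the traffic to the WARC and returns the response plus its redirect/retry history.
			response, history = await self.fetch('https://example.org/posts/{}'.format(i), responseHandler = retryThreeTimes)
			if response is not None and response.status == 200:
				body = await response.text(errors = 'surrogateescape')
				for link in qwarc.utils.str_get_all_between(body, 'href="', '"'):
					# Queue follow-up items for another Item subclass whose itemType is 'example-link' (not shown).
					self.add_item('example-link', link)

With the console script declared in setup.py below, such a spec would be run as e.g. `qwarc --concurrency 10 spec.py` (filename hypothetical); on the first run, cli.py calls create_db() to fill the items table from the generate() methods, then executes QWARC.run().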

+ 113
- 0
qwarc/aiohttp.py

@@ -0,0 +1,113 @@
import aiohttp
import aiohttp.client_proto
import aiohttp.connector
import functools
import itertools
import time


# aiohttp does not expose the raw data sent over the wire, so we need to get a bit creative...
# The ResponseHandler handles received data; the writes are done directly on the underlying transport.
# So the ResponseHandler is replaced with a class which keeps all received data in a list, and the transport's write method is replaced with one which sends back all written data to the ResponseHandler.
# Because the ResponseHandler instance disappears when the connection is closed (ClientResponse.{_response_eof,close,release}), ClientResponse copies the references to the data objects from the ResponseHandler.
# aiohttp also does connection pooling/reuse, so ClientRequest resets the raw data when the request is sent. (This would not work with pipelining, but aiohttp does not support pipelining: https://github.com/aio-libs/aiohttp/issues/1740 )
# This code has been developed for aiohttp version 2.3.10.

#TODO: THERE IS A MEMORY LEAK HERE SOMEWHERE! I spent a whole day trying to find it without success.


class RawData:
	def __init__(self):
		self.requestTimestamp = None
		self.requestData = []
		self.responseTimestamp = None
		self.responseData = []


class ResponseHandler(aiohttp.client_proto.ResponseHandler):
	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self.rawData = None
		self.remoteAddress = None

	def data_received(self, data):
		super().data_received(data)
		if not data:
			return
		if self.rawData.responseTimestamp is None:
			self.rawData.responseTimestamp = time.time()
		self.rawData.responseData.append(data)

	def reset_raw_data(self):
		self.rawData = RawData()


def make_transport_write(transport, protocol):
	transport._real_write = transport.write
	def write(self, data):
		if protocol.rawData.requestTimestamp is None:
			protocol.rawData.requestTimestamp = time.time()
		protocol.rawData.requestData.append(data)
		self._real_write(data)
	return write


class TCPConnector(aiohttp.connector.TCPConnector):
	def __init__(self, *args, loop = None, **kwargs):
		super().__init__(*args, loop = loop, **kwargs)
		self._factory = functools.partial(ResponseHandler, loop = loop)

	async def _wrap_create_connection(self, protocolFactory, host, port, *args, **kwargs): #FIXME: Uses internal API
		transport, protocol = await super()._wrap_create_connection(protocolFactory, host, port, *args, **kwargs)
		transport.write = make_transport_write(transport, protocol).__get__(transport, type(transport)) # https://stackoverflow.com/a/28127947
		protocol.remoteAddress = (host, port)
		return (transport, protocol)


class ClientRequest(aiohttp.client_reqrep.ClientRequest):
	def send(self, connection):
		connection.protocol.reset_raw_data()
		return super().send(connection)


class ClientResponse(aiohttp.client_reqrep.ClientResponse):
	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self._rawData = None
		self._remoteAddress = None

	async def start(self, connection, readUntilEof):
		self._rawData = connection.protocol.rawData
		self._remoteAddress = connection.protocol.remoteAddress
		return (await super().start(connection, readUntilEof))

	@property
	def rawRequestTimestamp(self):
		return self._rawData.requestTimestamp

	@property
	def rawRequestData(self):
		return b''.join(self._rawData.requestData)

	@property
	def rawResponseTimestamp(self):
		return self._rawData.responseTimestamp

	@property
	def rawResponseData(self):
		return b''.join(self._rawData.responseData)

	@property
	def remoteAddress(self):
		return self._remoteAddress

	def set_history(self, history):
		self._history = history #FIXME: Uses private attribute of aiohttp.client_reqrep.ClientResponse

	def iter_all(self):
		return itertools.chain(self.history, (self,))

	async def release(self):
		if not self.closed:
			self.connection.reset_raw_data()
		await super().release()
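
The least obvious part of the code above is probably the line in TCPConnector._wrap_create_connection that swaps out transport.write via __get__. A self-contained sketch of that bound-method trick, with hypothetical names and no aiohttp involved:

class FakeTransport:
	'''Stands in for an asyncio transport; only the write method matters here.'''

	def write(self, data):
		print('wire <-', data)


def make_recording_write(transport, sink):
	# Same pattern as make_transport_write above: remember the original bound method,
	# then hand back a replacement that records the data before forwarding it.
	transport._real_write = transport.write
	def write(self, data):
		sink.append(data)
		self._real_write(data)
	return write


captured = []
t = FakeTransport()
# function.__get__(obj, type(obj)) binds the plain function to this one instance,
# so assigning it to t.write patches only this transport object, not the class.
t.write = make_recording_write(t, captured).__get__(t, type(t))
t.write(b'GET / HTTP/1.1\r\n')
print(captured)  # [b'GET / HTTP/1.1\r\n']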

+ 81
- 0
qwarc/cli.py

@@ -0,0 +1,81 @@
import argparse
import asyncio
import importlib.util
import logging
import os.path
import qwarc
import sys
import time


def setup_logging(logFilename):
	rootLogger = logging.getLogger()
	rootLogger.handlers = []
	rootLogger.setLevel(logging.INFO)

	formatter = logging.Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
	formatter.converter = time.gmtime

	fileHandler = logging.FileHandler(logFilename)
	fileHandler.setFormatter(formatter)
	rootLogger.addHandler(fileHandler)

	stderrHandler = logging.StreamHandler()
	stderrHandler.setFormatter(formatter)
	rootLogger.addHandler(stderrHandler)


def check_files(specFilename, logFilename):
	success = True
	if not os.path.isfile(specFilename):
		print('Error: "{}" does not exist or is not a regular file'.format(specFilename), file = sys.stderr)
		success = False
	if os.path.exists(logFilename):
		print('Error: "{}" already exists'.format(logFilename), file = sys.stderr)
		success = False
	if os.path.exists('STOP'):
		print('Error: "STOP" exists', file = sys.stderr)
		success = False
	return success


def main():
	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
	parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
	parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
	parser.add_argument('--concurrency', type = int, default = 1)
	parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'gracefully stop when the process uses more than LIMIT bytes of RSS; disable if 0', default = 0)
	parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes of disk space are free; disable if 0', default = 0)
	parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
	parser.add_argument('specfile')

	args = parser.parse_args()

	if not check_files(args.specfile, args.log):
		sys.exit(1)

	setup_logging(args.log)

	spec = importlib.util.spec_from_file_location('spec', args.specfile)
	specMod = importlib.util.module_from_spec(spec)
	spec.loader.exec_module(specMod)

	a = qwarc.QWARC(
		itemClasses = qwarc.Item.__subclasses__(),
		warcBasePath = args.warc,
		dbPath = args.database,
		concurrency = args.concurrency,
		memoryLimit = args.memorylimit,
		minFreeDisk = args.disklimit,
		warcSizeLimit = args.warcsplit,
	)
	if not os.path.exists(args.database):
		a.create_db()

	loop = asyncio.get_event_loop()
	try:
		loop.run_until_complete(a.run(loop))
	except (Exception, KeyboardInterrupt) as e:
		logging.exception('Unhandled error')
	loop.close()

+ 23
- 0
qwarc/const.py

@@ -0,0 +1,23 @@
STATUS_TODO = 0
'''Status of an item that has not been processed yet'''

STATUS_INPROGRESS = 1
'''Status of an item that is currently being processed'''

STATUS_DONE = 2
'''Status of an item that has been processed'''

#TODO: Add a STATUS_ERROR?

ACTION_SUCCESS = 0
'''Treat this response as a success'''

ACTION_IGNORE = 1 #TODO Replace with ACTION_SUCCESS since it's really the same thing.
'''Ignore this response'''

ACTION_RETRY = 2
'''Retry the same request'''

ACTION_FOLLOW_OR_SUCCESS = 3
'''If the response contains a Location or URI header, follow it. Otherwise, treat it as a success.'''
#TODO: Rename to ACTION_FOLLOW maybe? However, the current name makes it more clear what qwarc does when there's a redirect without a redirect target...
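
These ACTION_* constants are the values a response handler returns to Item.fetch() to control what happens next; the default behaviour is implemented by qwarc.utils.handle_response_default in the next file. As an illustration only, a hypothetical custom handler might back off on HTTP 429 and defer to the default handler for everything else:

import asyncio

from qwarc.const import ACTION_RETRY
import qwarc.utils


async def handle_response_backoff_429(url, attempt, response, exc):
	# Hypothetical: retry rate-limited responses with a growing delay and do not
	# write them to the WARC; defer to the default handler for everything else.
	if response is not None and response.status == 429:
		await asyncio.sleep(min(5 * attempt, 60))
		return ACTION_RETRY, False
	return await qwarc.utils.handle_response_default(url, attempt, response, exc)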

+ 182
- 0
qwarc/utils.py

@@ -0,0 +1,182 @@
from qwarc.const import *
import aiohttp as _aiohttp
import asyncio
import os


PAGESIZE = os.sysconf('SC_PAGE_SIZE')


def get_rss():
	'''Get the current RSS of this process in bytes'''

	with open('/proc/self/statm', 'r') as fp:
		return int(fp.readline().split()[1]) * PAGESIZE


def get_disk_free():
	'''Get the current free disk space on the relevant partition in bytes'''

	st = os.statvfs('.')
	return st.f_bavail * st.f_frsize


def uses_too_much_memory(limit):
	'''
	Check whether the process is using too much memory

	For performance reasons, this actually only checks the memory usage on every 100th call.
	'''

	uses_too_much_memory.callCounter += 1
	# Only check every hundredth call
	if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
		return True
	return False
uses_too_much_memory.callCounter = 0


def too_little_disk_space(limit):
	'''
	Check whether the free disk space is too small

	For performance reasons, this actually only checks the free disk space on every 100th call.
	'''

	too_little_disk_space.callCounter += 1
	if too_little_disk_space.callCounter % 100 == 0:
		too_little_disk_space.currentResult = (get_disk_free() < limit)
	return too_little_disk_space.currentResult
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False


# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
	'''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''

	start = 0
	while True:
		start = aStr.find(sub, start)
		if start == -1:
			return
		yield start
		start += len(sub)


def str_get_between(aStr, a, b):
	'''Get the substring of aStr between the first occurrence of a and the first occurrence of b after that, or None if there is no such substring.'''

	aPos = aStr.find(a)
	if aPos == -1:
		return None
	offset = aPos + len(a)
	bPos = aStr.find(b, offset)
	if bPos == -1:
		return None
	return aStr[offset:bPos]


def maybe_str_get_between(x, a, b):
	'''Like str_get_between, but returns None if x evaluates to False, and converts x to a str before matching.'''

	if x:
		return str_get_between(str(x), a, b)


def str_get_all_between(aStr, a, b):
	'''Generator yielding every string between an occurrence of a in aStr and the following occurrence of b.'''

	#TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
	# Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos+len(b).

	for aOffset in find_all(aStr, a):
		offset = aOffset + len(a)
		bPos = aStr.find(b, offset)
		if bPos != -1:
			yield aStr[offset:bPos]


def maybe_str_get_all_between(x, a, b):
	'''Like str_get_all_between, but yields no elements if x evaluates to False, and converts x to a str before matching.'''

	if x:
		yield from str_get_all_between(str(x), a, b)


def generate_range_items(start, stop, step):
	'''
	Generator for items of `step` size between `start` and `stop` (inclusive)

	Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
	`b - a + 1` may be smaller than `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
	Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

	Examples:
	- generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
	- generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
	- generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
	'''

	for i in range(start, stop + 1, step):
		yield '{}-{}'.format(i, min(i + step - 1, stop))


async def handle_response_default(url, attempt, response, exc):
	'''
	The default response handler, which behaves as follows:
	- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
	- If there was any exception and it is an asyncio.TimeoutError or an aiohttp.ClientError, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
	- If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
	- Otherwise, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.

	All responses are written to WARC by default.

	Note that this handler does not limit the number of retries on errors.

	Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
	At least one of response and exc is not None.
	Returns: (one of the ACTION_* constants from qwarc.const, bool signifying whether to write to WARC or not)
	'''

	#TODO: Document that `attempt` is reset on redirects

	if response is None:
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (401, 403, 404, 405, 410):
		return ACTION_IGNORE, True
	if exc is not None and isinstance(exc, (asyncio.TimeoutError, _aiohttp.ClientError)):
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (200, 204, 206, 304):
		return ACTION_SUCCESS, True
	if response.status in (301, 302, 303, 307, 308):
		return ACTION_FOLLOW_OR_SUCCESS, True
	await asyncio.sleep(5)
	return ACTION_RETRY, True


async def handle_response_ignore_redirects(url, attempt, response, exc):
	'''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''

	action, writeToWarc = await handle_response_default(url, attempt, response, exc)
	if action == ACTION_FOLLOW_OR_SUCCESS:
		action = ACTION_SUCCESS
	return action, writeToWarc


def handle_response_limit_error_retries(maxRetries):
	'''A response handler factory that limits the number of retries on errors; the returned handler behaves as handle_response_default otherwise.

	It is a factory rather than a handler so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))

	If you use the same limit many times, you should keep the return value (the response handler) of this function and reuse it to avoid creating a new function every time.
	'''

	async def handler(url, attempt, response, exc):
		action, writeToWarc = await handle_response_default(url, attempt, response, exc)
		if action == ACTION_RETRY and attempt > maxRetries:
			action = ACTION_IGNORE
		return action, writeToWarc
	return handler
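
A few illustrative calls for the string and range helpers above (the values shown follow directly from the code):

from qwarc.utils import str_get_between, str_get_all_between, generate_range_items

print(str_get_between('<title>qwarc</title>', '<title>', '</title>'))  # qwarc
print(list(str_get_all_between('aabc', 'a', 'c')))  # ['ab', 'b'] -- the half-overlap noted in the TODO above
print(list(generate_range_items(0, 42, 10)))  # ['0-9', '10-19', '20-29', '30-39', '40-42']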

+ 93
- 0
qwarc/warc.py

@@ -0,0 +1,93 @@
import fcntl
import io
import logging
import time
import warcio


class WARCWriter(warcio.warcwriter.WARCWriter):
	def _do_write_req_resp(self, req, resp, params): #FIXME: Internal API
		# Write request before response, like wget and wpull; cf. https://github.com/webrecorder/warcio/issues/20
		self._write_warc_record(self.out, req)
		self._write_warc_record(self.out, resp)


class WARC:
	def __init__(self, prefix, maxFileSize):
		'''
		Initialise the WARC writer

		prefix: str, path prefix for WARCs; a dash, a five-digit number, and ".warc.gz" will be appended.
		maxFileSize: int, maximum size of an individual WARC. Use 0 to disable splitting.
		'''

		self._prefix = prefix
		self._counter = 0
		self._maxFileSize = maxFileSize

		self._closed = True
		self._file = None
		self._warcWriter = None

		self._cycle()

	def _cycle(self):
		'''Close the current file, open the next file that doesn't exist yet'''

		#TODO: This opens a new file also at the end, which can result in empty WARCs. Should try to reorder this to only open a WARC when writing a record, and to only close the current WARC if the size is exceeded after write_client_response.
		self.close()
		while True:
			filename = '{}-{:05d}.warc.gz'.format(self._prefix, self._counter)
			try:
				# Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
				self._file = open(filename, 'xb')
				fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
			except FileExistsError:
				logging.info('{} already exists, skipping'.format(filename))
				self._counter += 1
			else:
				break
		logging.info('Opened {}'.format(filename))
		self._warcWriter = WARCWriter(self._file, gzip = True)
		self._closed = False
		self._counter += 1

	def write_client_response(self, response):
		'''
		Write the requests and responses stored in a ClientResponse instance to the currently opened WARC.
		A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
		'''

		for r in response.iter_all():
			requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp))
			requestRecord = self._warcWriter.create_warc_record(
				str(r.url),
				'request',
				payload = io.BytesIO(r.rawRequestData),
				warc_headers_dict = {
					'WARC-Date': requestDate,
					'WARC-IP-Address': r.remoteAddress[0],
				}
			)
			responseRecord = self._warcWriter.create_warc_record(
				str(r.url),
				'response',
				payload = io.BytesIO(r.rawResponseData),
				warc_headers_dict = {
					'WARC-Date': requestDate,
					'WARC-IP-Address': r.remoteAddress[0],
				}
			)
			self._warcWriter.write_request_response_pair(requestRecord, responseRecord)

		if self._maxFileSize and self._file.tell() > self._maxFileSize:
			self._cycle()

	def close(self):
		'''Close the currently opened WARC'''

		if not self._closed:
			self._file.close()
			self._warcWriter = None
			self._file = None
			self._closed = True
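
The output of this class is a series of ordinary gzipped WARC files named `<prefix>-00000.warc.gz`, `<prefix>-00001.warc.gz`, and so on, with each request record written immediately before its response record. They could be inspected with warcio, for example (the filename below is hypothetical):

from warcio.archiveiterator import ArchiveIterator

with open('qwarc-00000.warc.gz', 'rb') as fp:  # hypothetical output file
	for record in ArchiveIterator(fp):
		# Records arrive in request/response pairs, request first (see _do_write_req_resp above).
		print(record.rec_type, record.rec_headers.get_header('WARC-Target-URI'))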

+ 22
- 0
setup.py

@@ -0,0 +1,22 @@
import setuptools


setuptools.setup(
	name = 'qwarc',
	version = '0.0-dev',
	description = 'A framework for quick web archival',
	author = 'JustAnotherArchivist',
	url = 'https://github.com/JustAnotherArchivist/qwarc',
	classifiers = [
		'Development Status :: 3 - Alpha',
		'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
		'Programming Language :: Python :: 3.6',
	],
	packages = ['qwarc'],
	install_requires = ['aiohttp==2.3.10', 'warcio', 'yarl'],
	entry_points = {
		'console_scripts': [
			'qwarc = qwarc.cli:main',
		],
	},
)
