
Add extensive debug logging

master
JustAnotherArchivist 3 years ago
parent commit f025c4e9f3
4 changed files with 111 additions and 18 deletions
  1. qwarc/__init__.py  +70 -9
  2. qwarc/cli.py       +4 -1
  3. qwarc/utils.py     +1 -1
  4. qwarc/warc.py      +36 -7

qwarc/__init__.py (+70 -9)

@@ -20,6 +20,9 @@ import sqlite3
 import yarl


+logger = logging.getLogger(__name__)
+
+
 class Item:
     itemType = None
     defaultResponseHandler = staticmethod(qwarc.utils.handle_response_default)
@@ -33,7 +36,7 @@ class Item:
         if not hasattr(self, '_baseUrl'): # To allow subclasses to set the baseUrl before calling super().__init__
             self._baseUrl = None
         self.stats = {'tx': 0, 'rx': 0, 'requests': 0}
-        self.logger = logging.LoggerAdapter(logging.getLogger(), {'itemType': self.itemType, 'itemValue': self.itemValue})
+        self.logger = logging.LoggerAdapter(logging.getLogger(f'{type(self).__module__}.{type(self).__name__}'), {'itemType': self.itemType, 'itemValue': self.itemValue})

         self.childItems = []

@@ -43,6 +46,7 @@ class Item:


     @baseUrl.setter
     def baseUrl(self, baseUrl):
+        self.logger.debug(f'Setting baseUrl to {baseUrl!r}')
         if baseUrl is None:
             self._baseUrl = None
         elif isinstance(baseUrl, yarl.URL):
@@ -93,13 +97,27 @@ class Item:


         #TODO: Rewrite using 'async with self.session.get'

+        if self.logger.isEnabledFor(logging.DEBUG):
+            l = locals()
+            modified = []
+            for kwarg, kwdef in self.fetch.__kwdefaults__.items():
+                if l[kwarg] != kwdef:
+                    modified.append(kwarg)
+            if modified:
+                self.logger.debug(f'fetch({url!r}) with ...')
+                for kwarg in modified:
+                    self.logger.debug(f'... {kwarg} = {repr(l[kwarg]).replace(chr(10), " ")}') # aiohttp.ClientResponse's repr can contain newlines
+            else:
+                self.logger.debug(f'fetch({url!r})')
         url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
         if not url.scheme or not url.host:
             if fromResponse is not None:
+                self.logger.debug(f'Joining incomplete URL {url} with fromResponse URL {fromResponse.url}')
                 url = fromResponse.url.join(url)
             elif not self.baseUrl:
                 raise ValueError('Incomplete URL and no baseUrl to join it with')
             else:
+                self.logger.debug(f'Joining incomplete URL {url} with base URL {self.baseUrl}')
                 url = self.baseUrl.join(url)
         originalUrl = url
         if responseHandler is None:
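Note: the new debug block inspects fetch's keyword-only defaults via __kwdefaults__ so that only arguments differing from their defaults get logged. The same trick in isolation (function and arguments invented for illustration):

    def fetch(url, *, method='GET', data=None, timeout=60):
        # __kwdefaults__ maps each keyword-only parameter to its default value.
        l = locals()
        modified = [k for k, default in fetch.__kwdefaults__.items() if l[k] != default]
        print(f'fetch({url!r})' + (f' with {modified}' if modified else ''))

    fetch('https://example.org/', method='POST')
    # Prints: fetch('https://example.org/') with ['method']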
@@ -115,6 +133,7 @@ class Item:
                 exc = None
                 action = ACTION_RETRY
                 writeToWarc = True
+                self.logger.debug(f'Trying to fetch {url}, attempt {attempt}, redirect level {redirectLevel}')
                 try:
                     try:
                         with _aiohttp.Timeout(timeout):
@@ -144,20 +163,25 @@ class Item:
                             exc = e # Pass the exception outward for the history
                         else:
                             action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, redirectLevel = redirectLevel, item = self)
+                            self.logger.debug(f'Response handler result: {action}, {writeToWarc}')
                 if response and exc is None and writeToWarc:
                     self.warc.write_client_response(response)
                 history.append((response, exc))
                 retResponse = response if exc is None else qwarc.utils.DummyClientResponse()
                 if action in (ACTION_SUCCESS, ACTION_IGNORE):
+                    self.logger.debug('Fetch loop done')
                     retResponse.qhistory = tuple(history)
                     return retResponse
                 elif action == ACTION_FOLLOW_OR_SUCCESS:
                     redirectUrl = response.headers.get('Location') or response.headers.get('URI')
                     if not redirectUrl:
+                        self.logger.debug('Got redirect but no redirect target, fetch loop done')
                         retResponse.qhistory = tuple(history)
                         return retResponse
                     url = url.join(yarl.URL(redirectUrl))
+                    self.logger.debug(f'Next URL: {url}')
                     if response.status in (301, 302, 303) and method == 'POST':
+                        self.logger.debug(f'{response.status} on POST, dropping to GET and clearing data')
                         method = 'GET'
                         data = None
                     attempt = 0
@@ -175,6 +199,7 @@ class Item:
                     pass
             finally:
                 if response:
+                    self.logger.debug('Releasing response')
                     await response.release()

     async def process(self):
@@ -189,13 +214,16 @@ class Item:
             item = (itemClassOrType.itemType, itemValue)
         else:
             item = (itemClassOrType, itemValue)
+        self.logger.debug(f'Adding subitem {itemClassOrType!r} {itemValue!r}')
         if item not in self.childItems:
             self.childItems.append(item)

     async def flush_subitems(self):
+        self.logger.debug('Flushing subitems')
         await self.qwarcObj.flush_subitems(self)

     def clear_subitems(self):
+        self.logger.debug('Clearing subitems')
         self.childItems = []

     @classmethod
@@ -257,6 +285,7 @@ class QWARC:


     @contextlib.asynccontextmanager
     async def exclusive_db_lock(self):
+        logger.debug('Attempting to obtain DB lock')
         c = self._db.cursor()
         while True:
             try:
@@ -266,14 +295,18 @@ class QWARC:
             if str(e) != 'database is locked':
                 raise
             await asyncio.sleep(1)
+        logger.debug('Obtained DB lock')
         try:
             yield c
+            logger.debug('Committing DB transaction')
             c.execute('COMMIT')
         except:
+            logger.debug('Rolling back DB transaction')
             c.execute('ROLLBACK')
             raise

     def _make_item(self, itemType, itemValue, session, headers):
+        logger.debug(f'Making item {itemType!r} {itemValue!r}')
         try:
             itemClass = self._itemTypeMap[itemType]
         except KeyError:
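Note: exclusive_db_lock wraps a manually managed exclusive SQLite transaction and retries while another process holds the lock. A self-contained sketch of the pattern, assuming the elided loop body issues BEGIN EXCLUSIVE (DB path and details are invented):

    import asyncio, contextlib, sqlite3

    db = sqlite3.connect('items.db', timeout=1)
    db.isolation_level = None  # autocommit mode; transactions are issued manually

    @contextlib.asynccontextmanager
    async def exclusive_db_lock():
        c = db.cursor()
        while True:
            try:
                c.execute('BEGIN EXCLUSIVE')  # assumption: this is what the elided loop body runs
                break
            except sqlite3.OperationalError as e:
                if str(e) != 'database is locked':
                    raise
                await asyncio.sleep(1)  # another process holds the lock; retry
        try:
            yield c
            c.execute('COMMIT')
        except:
            c.execute('ROLLBACK')
            raise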
@@ -281,29 +314,34 @@ class QWARC:
         return itemClass(self, itemValue, session, headers, self._warc)

     async def _wait_for_free_task(self):
+        logger.debug('Waiting for task(s) to finish')
         if not self._tasks:
+            logger.debug('No tasks to wait for')
             return
         done, pending = await asyncio.wait(self._tasks, return_when = concurrent.futures.FIRST_COMPLETED)
+        logger.debug(f'{len(done)} task(s) finished, {len(pending)} remaining')
         for future in done:
+            logger.debug(f'Finished {future.taskType} task: {future!r}')
             newStatus = STATUS_DONE
             if future.taskType == 'sleep':
                 self._sleepTasks.remove(future)
             elif future.taskType == 'process':
                 item = future.item
+                logger.debug(f'Finished item: {item!r}')
             try:
                 future.result()
             except asyncio.CancelledError as e:
                 # Got cancelled, nothing we can do about it, but let's log a warning if it's a process task
                 if future.taskType == 'process':
-                    logging.error(f'Task for {future.itemType}:{future.itemValue} cancelled: {future!r}')
+                    logger.error(f'Task for {future.itemType}:{future.itemValue} cancelled: {future!r}')
                     newStatus = STATUS_ERROR
             except Exception as e:
                 if future.taskType == 'process':
-                    logging.error(f'{future.itemType}:{future.itemValue} failed: {e!r} ({item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx)', exc_info = e)
+                    logger.error(f'{future.itemType}:{future.itemValue} failed: {e!r} ({item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx)', exc_info = e)
                     newStatus = STATUS_ERROR
             else:
                 if future.taskType == 'process':
-                    logging.info(f'{future.itemType}:{future.itemValue} done: {item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx')
+                    logger.info(f'{future.itemType}:{future.itemValue} done: {item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx')
             if future.taskType != 'process':
                 continue
             async with self.exclusive_db_lock() as cursor:
@@ -313,6 +351,7 @@ class QWARC:
         self._tasks = pending

     async def _insert_subitems(self, item):
+        logger.debug(f'Inserting {len(item.childItems)} subitem(s) from {item!r}')
         if item.childItems:
             async with self.exclusive_db_lock() as cursor:
                 it = iter(item.childItems)
@@ -325,6 +364,7 @@ class QWARC:
     async def run(self, loop):
         self._closed = _STATE_RUNNING

+        logger.debug(f'Creating {self._concurrency} sessions')
         for i in range(self._concurrency):
             session = _aiohttp.ClientSession(
                 connector = qwarc.aiohttp.TCPConnector(loop = loop),
@@ -335,12 +375,16 @@ class QWARC:
             self._sessions.append(session)
             self._freeSessions.append(session)

+        logger.debug('Constructing WARC object')
         self._warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit, self._warcDedupe, self._command, self._specFile, self._specDependencies, self._logFilename)

+        logger.debug('Connecting to the database')
         self._db = sqlite3.connect(self._dbPath, timeout = 1)
+        self._db.set_trace_callback(logging.getLogger(f'{__name__}[sqlite3]').debug)
         self._db.isolation_level = None # Transactions are handled manually below.

         # Setting the synchronous PRAGMA is not possible if the DB is currently locked by another process but also can't be done inside a transaction... So just retry until it succeeds.
+        logger.debug('Setting synchronous PRAGMA')
         while True:
             try:
                 self._db.execute('PRAGMA synchronous = OFF')
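Note: sqlite3's set_trace_callback invokes the given callable with the text of every SQL statement the connection executes, so handing it a logger's debug method mirrors all SQL into the debug log. A minimal demonstration:

    import logging, sqlite3

    logging.basicConfig(level=logging.DEBUG, format='%(name)s %(levelname)s %(message)s')

    db = sqlite3.connect(':memory:')
    db.set_trace_callback(logging.getLogger('example[sqlite3]').debug)
    db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY)')
    # Logs: example[sqlite3] DEBUG CREATE TABLE items (id INTEGER PRIMARY KEY)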
@@ -350,6 +394,7 @@ class QWARC:
                 await asyncio.sleep(1)
             else:
                 break
+        logger.debug('Set synchronous PRAGMA')

         async with self.exclusive_db_lock() as cursor:
             cursor.execute('SELECT name FROM sqlite_master WHERE type = "table" AND name = "items"')
@@ -364,31 +409,35 @@ class QWARC:
                 await self._wait_for_free_task()

                 if self._closed == _STATE_CLOSING:
+                    logger.debug('Currently closing')
                     break

                 if os.path.exists('STOP'):
-                    logging.info('Gracefully shutting down due to STOP file')
+                    logger.info('Gracefully shutting down due to STOP file')
                     break
                 if self._memoryLimit and qwarc.utils.uses_too_much_memory(self._memoryLimit):
-                    logging.info(f'Gracefully shutting down due to memory usage (current = {qwarc.utils.get_rss()} > limit = {self._memoryLimit})')
+                    logger.info(f'Gracefully shutting down due to memory usage (current = {qwarc.utils.get_rss()} > limit = {self._memoryLimit})')
                     break

                 if self._minFreeDisk and qwarc.utils.too_little_disk_space(self._minFreeDisk):
-                    logging.info('Disk space is low, sleeping')
+                    logger.info('Disk space is low, sleeping')
                     sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
                     sleepTask.taskType = 'sleep'
                     self._tasks.add(sleepTask)
                     self._sleepTasks.add(sleepTask)
                     continue

+                logger.debug('Trying to get an item from the DB')
                 async with self.exclusive_db_lock() as cursor:
                     cursor.execute('SELECT id, type, value, status FROM items WHERE status = ? LIMIT 1', (STATUS_TODO,))
                     result = cursor.fetchone()
                     if not result:
+                        logger.debug('Found no TODO item')
                         if cursor.execute('SELECT id, status FROM items WHERE status != ? LIMIT 1', (STATUS_DONE,)).fetchone():
                             # There is currently no item to do, but there are still some in progress, so more TODOs may appear in the future.
                             # It would be nice if we could just await wait_for_free_task() here, but that doesn't work because those TODOs might be in another process.
                             # So instead, we insert a dummy task which just sleeps a bit. Average sleep time is equal to concurrency, i.e. one check per second.
+                            logger.debug('There are incomplete items that may spawn further TODOs, starting sleep task')
                             #TODO: The average sleep time is too large if there are only few sleep tasks; scale with len(sleepTasks)/self._concurrency?
                             sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
                             sleepTask.taskType = 'sleep'
@@ -397,8 +446,10 @@ class QWARC:
                             continue
                         else:
                             # Really nothing to do anymore
+                            logger.debug('All items are done')
                             break
                     id_, itemType, itemValue, status = result
+                    logger.debug(f'Got item {itemType}:{itemValue} ({id_})')
                     cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_INPROGRESS, id_))

                 session = self._freeSessions.popleft()
@@ -412,17 +463,20 @@ class QWARC:
                 task.item = item
                 self._tasks.add(task)

+            logger.debug(f'Cancelling {len(self._sleepTasks)} sleep task(s)')
             for sleepTask in self._sleepTasks:
                 sleepTask.cancel()

+            logger.debug('Waiting for tasks to finish')
             while len(self._tasks):
                 await self._wait_for_free_task()

-            logging.info('Done')
+            logger.info('Done')
         except (Exception, KeyboardInterrupt) as e:
-            # Kill all tasks
+            logger.debug(f'Got an exception, cancelling {len(self._tasks)} task(s)')
             for task in self._tasks:
                 task.cancel()
+            logger.debug('Waiting for tasks to get cancelled')
             await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
             self._tasks.clear()


@@ -435,11 +489,13 @@ class QWARC:
             item.clear_subitems()

     def _create_db(self, cursor):
+        logger.debug('Creating database')
         cursor.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
         cursor.execute('CREATE INDEX items_status_idx ON items (status)')
         cursor.execute('CREATE UNIQUE INDEX items_type_value_idx ON items (type, value)')

     def _insert_generated_items(self, cursor):
+        logger.debug('Inserting generated items')
         it = itertools.chain((cls.itemType, value, STATUS_TODO) for cls in self._itemClasses for value in cls.generate())
         while True:
             values = tuple(itertools.islice(it, 100000))
while True: while True:
values = tuple(itertools.islice(it, 100000)) values = tuple(itertools.islice(it, 100000))
@@ -453,15 +509,20 @@ class QWARC:


         self._closed = _STATE_CLOSING

+        logger.debug('Closing')
         if self._tasks:
             logger.warning(f'Cancelling {len(self._tasks)} task(s) remaining on cleanup')
             for task in self._tasks:
                 task.cancel()
+            logger.debug('Waiting for tasks to get cancelled')
             await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
             self._tasks.clear()
+        logger.debug('Closing DB')
         self._db.close()
+        logger.debug('Closing sessions')
         for session in self._sessions:
             session.close()
+        logger.debug('Closing WARC')
         self._warc.close()

         self._reset_working_vars() # Also resets self._closed

qwarc/cli.py (+4 -1)

@@ -9,6 +9,9 @@ import qwarc.version
 import sys


+logger = logging.getLogger(__name__)
+
+
 def setup_logging(logFilename, logLevel, logLevelStderr):
     if logLevelStderr is None:
         logLevelStderr = logLevel
@@ -91,7 +94,7 @@ def main():
     try:
         loop.run_until_complete(a.run(loop))
     except (Exception, KeyboardInterrupt) as e:
-        logging.exception('Unhandled error')
+        logger.exception('Unhandled error')
     finally:
         loop.run_until_complete(a.close())
         loop.close()

qwarc/utils.py (+1 -1)

@@ -245,7 +245,7 @@ def get_software_info(specDependencyPackages):


 class LogFormatter(logging.Formatter):
     def __init__(self):
-        super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
+        super().__init__('%(asctime)s.%(msecs)03dZ %(name)s %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
         self.converter = time.gmtime

     def format(self, record):
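Note: with %(name)s added, each line now records which logger emitted it. Assuming %(itemString)s renders roughly as 'itemType:itemValue' (its exact composition is built in format() and not shown in this diff), a log line would look approximately like:

    2020-03-03 12:00:00.000Z qwarc.Item DEBUG url:https://example.org/ fetch('https://example.org/')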


qwarc/warc.py (+36 -7)

@@ -11,6 +11,9 @@ import time
 import warcio


+logger = logging.getLogger(__name__)
+
+
 class WARC:
     def __init__(self, prefix, maxFileSize, dedupe, command, specFile, specDependencies, logFilename):
         '''
@@ -54,37 +57,42 @@ class WARC:
             return
         while True:
             filename = f'{self._prefix}-{self._counter:05d}.warc.gz'
+            logger.debug(f'Trying to open {filename}')
             try:
                 # Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
                 self._file = open(filename, 'xb')
                 fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
             except FileExistsError:
-                logging.info(f'{filename} already exists, skipping')
+                logger.debug(f'{filename} already exists, skipping')
                 self._counter += 1
             else:
                 break
-        logging.info(f'Opened {filename}')
+        logger.info(f'Opened {filename}')
         self._open_journal(filename)
         self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True, warc_version = '1.1')
         self._closed = False
         self._counter += 1

     def _open_journal(self, filename):
+        logger.debug(f'Trying to open {filename}.qwarcjournal')
         try:
             self._journalFile = open(f'{filename}.qwarcjournal', 'xb')
             fcntl.flock(self._journalFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
         except FileExistsError:
-            logging.error(f'{filename}.qwarcjournal already exists!')
+            logger.error(f'{filename}.qwarcjournal already exists!')
             raise RuntimeError(f'Unable to create journal file for {filename}: {filename}.qwarcjournal already exists')
         except OSError as e:
             if e.errno == errno.EWOULDBLOCK:
-                logging.error(f'{filename}.qwarcjournal is already locked!')
+                logger.error(f'{filename}.qwarcjournal is already locked!')
                 raise RuntimeError(f'Unable to lock journal file {filename}.qwarcjournal') from e
             raise
+        logger.debug(f'Opened {filename}.qwarcjournal')
         self._journalClean = True

     def _write_record(self, record):
-        # Write the current offset to the journal file
+        logger.debug(f'Preparing to write record {record.rec_headers.get_header("WARC-Record-ID")}')
+
+        logger.debug('Writing journal entry')
         # Since the size can only grow, it is not necessary to explicitly delete the previous contents.
         self._journalFile.seek(0)
         previousSize = self._file.tell()
@@ -92,19 +100,22 @@ class WARC:
         self._journalFile.flush()
         self._journalClean = False

+        logger.debug('Writing record')
         try:
             self._warcWriter.write_record(record)
         except (OSError, IOError):
+            logger.debug(f'Error occurred, truncating to previous size {previousSize}')
             self._file.truncate(previousSize)
             raise
         else:
-            # Mark the write as ok
+            logger.debug('Write ok, cleaning journal')
             self._journalFile.seek(-4, os.SEEK_END) # len(b'no \n')
             self._journalFile.write(b'yes\n')
             self._journalFile.flush()
             self._journalClean = True

     def _write_warcinfo_record(self):
+        logger.debug('Writing warcinfo record')
         data = {
             'software': qwarc.utils.get_software_info(self._specDependencies.packages),
             'command': self._command,
@@ -116,8 +127,10 @@ class WARC:
         }
         payload = io.BytesIO(json.dumps(data, indent = 2).encode('utf-8'))
         # Workaround for https://github.com/webrecorder/warcio/issues/87
+        logger.debug('Calculating digest')
         digester = warcio.utils.Digester('sha1')
         digester.update(payload.getvalue())
+        logger.debug('Creating record object')
         record = self._warcWriter.create_warc_record(
             None,
             'warcinfo',
@@ -134,10 +147,13 @@ class WARC:
         A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
         '''

+        logger.debug(f'Writing response(s) for {response.url}')
         self._ensure_opened()
         for r in response.iter_all():
             usec = f'{(r.rawRequestTimestamp - int(r.rawRequestTimestamp)):.6f}'[2:]
             requestDate = time.strftime(f'%Y-%m-%dT%H:%M:%S.{usec}Z', time.gmtime(r.rawRequestTimestamp))
+            logger.debug(f'Writing request/response for {requestDate} {r.url}')
+            logger.debug('Constructing request record')
             r.rawRequestData.seek(0, io.SEEK_END)
             length = r.rawRequestData.tell()
             r.rawRequestData.seek(0)
@@ -153,6 +169,7 @@ class WARC:
                 }
             )
             requestRecordID = requestRecord.rec_headers.get_header('WARC-Record-ID')
+            logger.debug('Constructing response record')
             r.rawResponseData.seek(0, io.SEEK_END)
             length = r.rawResponseData.tell()
             r.rawResponseData.seek(0)
@@ -171,8 +188,10 @@ class WARC:
             payloadDigest = responseRecord.rec_headers.get_header('WARC-Payload-Digest')
             assert payloadDigest is not None
             if self._dedupe and responseRecord.payload_length > 100: # Don't deduplicate small responses; the additional headers are typically larger than the payload dedupe savings...
+                logger.debug(f'Checking for duplicates with {payloadDigest}')
                 if payloadDigest in self._dedupeMap:
                     refersToRecordId, refersToUri, refersToDate = self._dedupeMap[payloadDigest]
+                    logger.debug(f'Record is a duplicate, creating revisit record to {refersToRecordId} ({refersToDate} {refersToUri})')
                     responseHttpHeaders = responseRecord.http_headers
                     responseRecord = self._warcWriter.create_revisit_record(
                         str(r.url),
@@ -192,18 +211,24 @@ class WARC:
                     # Workaround for https://github.com/webrecorder/warcio/issues/94
                     responseRecord.rec_headers.replace_header('WARC-Profile', 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest')
                 else:
+                    logger.debug('New record, creating dedupe map entry')
                     self._dedupeMap[payloadDigest] = (responseRecord.rec_headers.get_header('WARC-Record-ID'), str(r.url), requestDate)
+            logger.debug('Writing request and response records')
             self._write_record(requestRecord)
             self._write_record(responseRecord)
+        logger.debug(f'Done writing responses for {response.url}')

         if self._maxFileSize and self._file.tell() > self._maxFileSize:
+            logger.debug('File exceeds max size, closing')
             self._close_file()
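Note: the dedupe map keyed by WARC-Payload-Digest is what turns repeated payloads into revisit records pointing at the first occurrence. A stripped-down sketch of that bookkeeping (record construction elided, names illustrative):

    # Stripped-down sketch of digest-based dedupe; details are illustrative.
    dedupeMap = {}  # payload digest -> (record ID, URI, date) of the first copy

    def classify(digest, recordId, uri, date, payloadLength):
        if payloadLength <= 100:
            return 'response'  # small payloads aren't worth a revisit record
        if digest in dedupeMap:
            refersToRecordId, refersToUri, refersToDate = dedupeMap[digest]
            return f'revisit of {refersToRecordId}'  # identical-payload-digest profile
        dedupeMap[digest] = (recordId, uri, date)
        return 'response'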


     def _write_resource_records(self):
         '''Write spec file and dependencies'''
         assert self._metaWarcinfoRecordID is not None, 'write_warcinfo_record must be called first'

+        logger.debug('Writing resource records')
         for type_, contentType, fn in itertools.chain((('specfile', 'application/x-python', self._specFile),), map(lambda x: ('spec-dependency-file', 'application/octet-stream', x), self._specDependencies.files)):
+            logger.debug(f'Writing resource record for {fn}')
             with open(fn, 'rb') as f:
                 f.seek(0, io.SEEK_END)
                 length = f.tell()
@@ -224,6 +249,7 @@ class WARC:
     def _write_log_record(self):
         assert self._metaWarcinfoRecordID is not None, 'write_warcinfo_record must be called first'

+        logger.debug('Writing log record')
         rootLogger = logging.getLogger()
         for handler in rootLogger.handlers: #FIXME: Uses undocumented attribute handlers
             handler.flush()
@@ -246,10 +272,12 @@ class WARC:
         '''Close the currently opened WARC'''

         if not self._closed:
+            logger.debug('Closing WARC file')
             self._file.close()
             journalFilename = self._journalFile.name
             self._journalFile.close()
             if self._journalClean:
+                logger.debug(f'Deleting journal {journalFilename}')
                 os.remove(journalFilename)
             self._warcWriter = None
             self._file = None
@@ -259,10 +287,11 @@ class WARC:


     def _write_meta_warc(self, callback):
         filename = f'{self._prefix}-meta.warc.gz'
+        logger.debug(f'Opening {filename}')
         self._file = open(filename, 'ab')
         try:
             fcntl.flock(self._file.fileno(), fcntl.LOCK_EX)
-            logging.info(f'Opened {filename}')
+            logger.info(f'Opened {filename}')
             self._open_journal(filename)
             self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True, warc_version = '1.1')
             self._closed = False

