
Add extensive debug logging

Branch: master
JustAnotherArchivist, 3 years ago
Parent commit: f025c4e9f3
4 changed files with 111 additions and 18 deletions:
  1. qwarc/__init__.py (+70 -9)
  2. qwarc/cli.py (+4 -1)
  3. qwarc/utils.py (+1 -1)
  4. qwarc/warc.py (+36 -7)

qwarc/__init__.py (+70 -9)

@@ -20,6 +20,9 @@ import sqlite3
import yarl


logger = logging.getLogger(__name__)


class Item:
itemType = None
defaultResponseHandler = staticmethod(qwarc.utils.handle_response_default)
@@ -33,7 +36,7 @@ class Item:
if not hasattr(self, '_baseUrl'): # To allow subclasses to set the baseUrl before calling super().__init__
self._baseUrl = None
self.stats = {'tx': 0, 'rx': 0, 'requests': 0}
self.logger = logging.LoggerAdapter(logging.getLogger(), {'itemType': self.itemType, 'itemValue': self.itemValue})
self.logger = logging.LoggerAdapter(logging.getLogger(f'{type(self).__module__}.{type(self).__name__}'), {'itemType': self.itemType, 'itemValue': self.itemValue})

self.childItems = []
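
The LoggerAdapter above stamps every message with the item's context, and the new per-class logger name shows up via %(name)s in the updated LogFormatter below. A minimal standalone sketch of the pattern, assuming a formatter that assembles the %(itemString)s field from those extras (how LogFormatter.format() actually builds itemString is not shown in this diff):

import logging

class ItemFormatter(logging.Formatter):
	def format(self, record):
		# Hypothetical itemString assembly; the real one lives in qwarc.utils.LogFormatter.format()
		record.itemString = f'{getattr(record, "itemType", None)}:{getattr(record, "itemValue", None)}'
		return super().format(record)

handler = logging.StreamHandler()
handler.setFormatter(ItemFormatter('%(asctime)s %(name)s %(levelname)s %(itemString)s %(message)s'))
logging.getLogger().addHandler(handler)
logging.getLogger().setLevel(logging.DEBUG)

logger = logging.LoggerAdapter(logging.getLogger('qwarc.Item'), {'itemType': 'url', 'itemValue': 'https://example.org/'})
logger.debug('fetch starting')
# e.g.: 2023-06-05 12:00:00,000 qwarc.Item DEBUG url:https://example.org/ fetch starting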

@@ -43,6 +46,7 @@ class Item:

@baseUrl.setter
def baseUrl(self, baseUrl):
self.logger.debug(f'Setting baseUrl to {baseUrl!r}')
if baseUrl is None:
self._baseUrl = None
elif isinstance(baseUrl, yarl.URL):
@@ -93,13 +97,27 @@ class Item:

#TODO: Rewrite using 'async with self.session.get'

if self.logger.isEnabledFor(logging.DEBUG):
l = locals()
modified = []
for kwarg, kwdef in self.fetch.__kwdefaults__.items():
if l[kwarg] != kwdef:
modified.append(kwarg)
if modified:
self.logger.debug(f'fetch({url!r}) with ...')
for kwarg in modified:
self.logger.debug(f'... {kwarg} = {repr(l[kwarg]).replace(chr(10), " ")}') # aiohttp.ClientResponse's repr can contain newlines
else:
self.logger.debug(f'fetch({url!r})')
url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
if not url.scheme or not url.host:
if fromResponse is not None:
self.logger.debug(f'Joining incomplete URL {url} with fromResponse URL {fromResponse.url}')
url = fromResponse.url.join(url)
elif not self.baseUrl:
raise ValueError('Incomplete URL and no baseUrl to join it with')
else:
self.logger.debug(f'Joining incomplete URL {url} with base URL {self.baseUrl}')
url = self.baseUrl.join(url)
originalUrl = url
if responseHandler is None:
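
The block above logs only the keyword arguments that differ from their defaults: __kwdefaults__ maps every keyword-only parameter to its default value, and the isEnabledFor() guard skips the locals()/repr() work entirely when DEBUG logging is off. A self-contained sketch of the same trick, condensed to a single log line, with a hypothetical fetch signature:

import logging

logging.basicConfig(level = logging.DEBUG)
logger = logging.getLogger('sketch')

def fetch(url, *, method = 'GET', timeout = 60, verifySsl = True):
	if logger.isEnabledFor(logging.DEBUG):
		l = locals()
		modified = [kwarg for kwarg, kwdef in fetch.__kwdefaults__.items() if l[kwarg] != kwdef]
		if modified:
			logger.debug(f'fetch({url!r}) with ' + ', '.join(f'{kwarg} = {l[kwarg]!r}' for kwarg in modified))
		else:
			logger.debug(f'fetch({url!r})')

fetch('https://example.org/', timeout = 10)
# -> DEBUG:sketch:fetch('https://example.org/') with timeout = 10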
@@ -115,6 +133,7 @@ class Item:
exc = None
action = ACTION_RETRY
writeToWarc = True
self.logger.debug(f'Trying to fetch {url}, attempt {attempt}, redirect level {redirectLevel}')
try:
try:
with _aiohttp.Timeout(timeout):
@@ -144,20 +163,25 @@ class Item:
exc = e # Pass the exception outward for the history
else:
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, redirectLevel = redirectLevel, item = self)
self.logger.debug(f'Response handler result: {action}, {writeToWarc}')
if response and exc is None and writeToWarc:
self.warc.write_client_response(response)
history.append((response, exc))
retResponse = response if exc is None else qwarc.utils.DummyClientResponse()
if action in (ACTION_SUCCESS, ACTION_IGNORE):
self.logger.debug('Fetch loop done')
retResponse.qhistory = tuple(history)
return retResponse
elif action == ACTION_FOLLOW_OR_SUCCESS:
redirectUrl = response.headers.get('Location') or response.headers.get('URI')
if not redirectUrl:
self.logger.debug('Got redirect but no redirect target, fetch loop done')
retResponse.qhistory = tuple(history)
return retResponse
url = url.join(yarl.URL(redirectUrl))
self.logger.debug(f'Next URL: {url}')
if response.status in (301, 302, 303) and method == 'POST':
self.logger.debug(f'{response.status} on POST, dropping to GET and clearing data')
method = 'GET'
data = None
attempt = 0
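
For reference, the redirect handling above condensed into one place: the Location (or legacy URI) header is joined onto the current URL, and a 301/302/303 answer to a POST is retried as a GET without the request body. A sketch under those assumptions:

import yarl

def next_request(url, status, headers, method, data):
	redirectUrl = headers.get('Location') or headers.get('URI')
	if not redirectUrl:
		return None  # No redirect target; the fetch loop ends here
	url = url.join(yarl.URL(redirectUrl))
	if status in (301, 302, 303) and method == 'POST':
		method, data = 'GET', None  # Drop to GET and clear the body
	return url, method, data

print(next_request(yarl.URL('https://example.org/a'), 302, {'Location': '/b'}, 'POST', b'x'))
# -> (URL('https://example.org/b'), 'GET', None)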
@@ -175,6 +199,7 @@ class Item:
pass
finally:
if response:
self.logger.debug('Releasing response')
await response.release()

async def process(self):
@@ -189,13 +214,16 @@ class Item:
item = (itemClassOrType.itemType, itemValue)
else:
item = (itemClassOrType, itemValue)
self.logger.debug(f'Adding subitem {itemClassOrType!r} {itemValue!r}')
if item not in self.childItems:
self.childItems.append(item)

async def flush_subitems(self):
self.logger.debug('Flushing subitems')
await self.qwarcObj.flush_subitems(self)

def clear_subitems(self):
self.logger.debug('Clearing subitems')
self.childItems = []

@classmethod
@@ -257,6 +285,7 @@ class QWARC:

@contextlib.asynccontextmanager
async def exclusive_db_lock(self):
logger.debug('Attempting to obtain DB lock')
c = self._db.cursor()
while True:
try:
@@ -266,14 +295,18 @@ class QWARC:
if str(e) != 'database is locked':
raise
await asyncio.sleep(1)
logger.debug('Obtained DB lock')
try:
yield c
logger.debug('Committing DB transaction')
c.execute('COMMIT')
except:
logger.debug('Rolling back DB transaction')
c.execute('ROLLBACK')
raise
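
exclusive_db_lock is the entire concurrency story for the item database: BEGIN EXCLUSIVE raises 'database is locked' while another connection holds the lock, so it is retried with a sleep, then COMMIT on clean exit and ROLLBACK on any exception. A hypothetical usage sketch (the connection must use manual transactions, i.e. isolation_level = None, as run() sets below; the STATUS_DONE value is assumed):

STATUS_DONE = 2  # assumption; defined elsewhere in qwarc

async def mark_done(qwarcObj, itemId):
	async with qwarcObj.exclusive_db_lock() as cursor:
		cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_DONE, itemId))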

def _make_item(self, itemType, itemValue, session, headers):
logger.debug(f'Making item {itemType!r} {itemValue!r}')
try:
itemClass = self._itemTypeMap[itemType]
except KeyError:
@@ -281,29 +314,34 @@ class QWARC:
return itemClass(self, itemValue, session, headers, self._warc)

async def _wait_for_free_task(self):
logger.debug('Waiting for task(s) to finish')
if not self._tasks:
logger.debug('No tasks to wait for')
return
done, pending = await asyncio.wait(self._tasks, return_when = concurrent.futures.FIRST_COMPLETED)
logger.debug(f'{len(done)} task(s) finished, {len(pending)} remaining')
for future in done:
logger.debug(f'Finished {future.taskType} task: {future!r}')
newStatus = STATUS_DONE
if future.taskType == 'sleep':
self._sleepTasks.remove(future)
elif future.taskType == 'process':
item = future.item
logger.debug(f'Finished item: {item!r}')
try:
future.result()
except asyncio.CancelledError as e:
# Got cancelled, nothing we can do about it, but let's log an error if it's a process task
if future.taskType == 'process':
logging.error(f'Task for {future.itemType}:{future.itemValue} cancelled: {future!r}')
logger.error(f'Task for {future.itemType}:{future.itemValue} cancelled: {future!r}')
newStatus = STATUS_ERROR
except Exception as e:
if future.taskType == 'process':
logging.error(f'{future.itemType}:{future.itemValue} failed: {e!r} ({item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx)', exc_info = e)
logger.error(f'{future.itemType}:{future.itemValue} failed: {e!r} ({item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx)', exc_info = e)
newStatus = STATUS_ERROR
else:
if future.taskType == 'process':
logging.info(f'{future.itemType}:{future.itemValue} done: {item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx')
logger.info(f'{future.itemType}:{future.itemValue} done: {item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx')
if future.taskType != 'process':
continue
async with self.exclusive_db_lock() as cursor:
@@ -313,6 +351,7 @@ class QWARC:
self._tasks = pending
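
_wait_for_free_task tells worker tasks and dummy sleep tasks apart purely through attributes tacked onto the task objects (taskType, item, ...), which plain asyncio allows. A runnable sketch of that pattern with asyncio.wait and FIRST_COMPLETED:

import asyncio
import concurrent.futures

async def main():
	work = asyncio.ensure_future(asyncio.sleep(0.1, result = 'payload'))
	work.taskType = 'process'
	wakeup = asyncio.ensure_future(asyncio.sleep(5))
	wakeup.taskType = 'sleep'
	done, pending = await asyncio.wait({work, wakeup}, return_when = concurrent.futures.FIRST_COMPLETED)
	for future in done:
		print(future.taskType, future.result())  # -> process payload
	for future in pending:
		future.cancel()  # Like the sleep task cancellation at the end of run()

asyncio.get_event_loop().run_until_complete(main())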

async def _insert_subitems(self, item):
logger.debug(f'Inserting {len(item.childItems)} subitem(s) from {item!r}')
if item.childItems:
async with self.exclusive_db_lock() as cursor:
it = iter(item.childItems)
@@ -325,6 +364,7 @@ class QWARC:
async def run(self, loop):
self._closed = _STATE_RUNNING

logger.debug(f'Creating {self._concurrency} sessions')
for i in range(self._concurrency):
session = _aiohttp.ClientSession(
connector = qwarc.aiohttp.TCPConnector(loop = loop),
@@ -335,12 +375,16 @@ class QWARC:
self._sessions.append(session)
self._freeSessions.append(session)

logger.debug('Constructing WARC object')
self._warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit, self._warcDedupe, self._command, self._specFile, self._specDependencies, self._logFilename)

logger.debug('Connecting to the database')
self._db = sqlite3.connect(self._dbPath, timeout = 1)
self._db.set_trace_callback(logging.getLogger(f'{__name__}[sqlite3]').debug)
self._db.isolation_level = None # Transactions are handled manually below.

# Setting the synchronous PRAGMA is not possible if the DB is currently locked by another process but also can't be done inside a transaction... So just retry until it succeeds.
logger.debug('Setting synchronous PRAGMA')
while True:
try:
self._db.execute('PRAGMA synchronous = OFF')
@@ -350,6 +394,7 @@ class QWARC:
await asyncio.sleep(1)
else:
break
logger.debug('Set synchronous PRAGMA')
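
The set_trace_callback() wiring above hands every SQL statement sqlite3 executes to the callback as a string, so pointing it at a DEBUG logger traces all database activity, including these PRAGMA retries, for free. A quick demonstration:

import logging
import sqlite3

logging.basicConfig(level = logging.DEBUG)

db = sqlite3.connect(':memory:')
db.set_trace_callback(logging.getLogger('demo[sqlite3]').debug)
db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
# -> DEBUG:demo[sqlite3]:CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)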

async with self.exclusive_db_lock() as cursor:
cursor.execute('SELECT name FROM sqlite_master WHERE type = "table" AND name = "items"')
@@ -364,31 +409,35 @@ class QWARC:
await self._wait_for_free_task()

if self._closed == _STATE_CLOSING:
logger.debug('Currently closing')
break

if os.path.exists('STOP'):
logging.info('Gracefully shutting down due to STOP file')
logger.info('Gracefully shutting down due to STOP file')
break
if self._memoryLimit and qwarc.utils.uses_too_much_memory(self._memoryLimit):
logging.info(f'Gracefully shutting down due to memory usage (current = {qwarc.utils.get_rss()} > limit = {self._memoryLimit})')
logger.info(f'Gracefully shutting down due to memory usage (current = {qwarc.utils.get_rss()} > limit = {self._memoryLimit})')
break

if self._minFreeDisk and qwarc.utils.too_little_disk_space(self._minFreeDisk):
logging.info('Disk space is low, sleeping')
logger.info('Disk space is low, sleeping')
sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
sleepTask.taskType = 'sleep'
self._tasks.add(sleepTask)
self._sleepTasks.add(sleepTask)
continue

logger.debug('Trying to get an item from the DB')
async with self.exclusive_db_lock() as cursor:
cursor.execute('SELECT id, type, value, status FROM items WHERE status = ? LIMIT 1', (STATUS_TODO,))
result = cursor.fetchone()
if not result:
logger.debug('Found no TODO item')
if cursor.execute('SELECT id, status FROM items WHERE status != ? LIMIT 1', (STATUS_DONE,)).fetchone():
# There is currently no item to do, but there are still some in progress, so more TODOs may appear in the future.
# It would be nice if we could just await wait_for_free_task() here, but that doesn't work because those TODOs might be in another process.
# So instead, we insert a dummy task which just sleeps a bit. Average sleep time is equal to concurrency, i.e. one check per second.
logger.debug('There are incomplete items that may spawn further TODOs, starting sleep task')
#TODO: The average sleep time is too large if there are only few sleep tasks; scale with len(sleepTasks)/self._concurrency?
sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
sleepTask.taskType = 'sleep'
@@ -397,8 +446,10 @@ class QWARC:
continue
else:
# Really nothing to do anymore
logger.debug('All items are done')
break
id_, itemType, itemValue, status = result
logger.debug(f'Got item {itemType}:{itemValue} ({id_})')
cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_INPROGRESS, id_))

session = self._freeSessions.popleft()
@@ -412,17 +463,20 @@ class QWARC:
task.item = item
self._tasks.add(task)

logger.debug(f'Cancelling {len(self._sleepTasks)} sleep task(s)')
for sleepTask in self._sleepTasks:
sleepTask.cancel()

logger.debug('Waiting for tasks to finish')
while len(self._tasks):
await self._wait_for_free_task()

logging.info('Done')
logger.info('Done')
except (Exception, KeyboardInterrupt) as e:
# Kill all tasks
logger.debug(f'Got an exception, cancelling {len(self._tasks)} task(s)')
for task in self._tasks:
task.cancel()
logger.debug('Waiting for tasks to get cancelled')
await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
self._tasks.clear()

@@ -435,11 +489,13 @@ class QWARC:
item.clear_subitems()

def _create_db(self, cursor):
logger.debug('Creating database')
cursor.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
cursor.execute('CREATE INDEX items_status_idx ON items (status)')
cursor.execute('CREATE UNIQUE INDEX items_type_value_idx ON items (type, value)')

def _insert_generated_items(self, cursor):
logger.debug('Inserting generated items')
it = itertools.chain((cls.itemType, value, STATUS_TODO) for cls in self._itemClasses for value in cls.generate())
while True:
values = tuple(itertools.islice(it, 100000))
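
The loop above consumes the generator in fixed-size chunks: islice() pulls at most 100000 tuples at a time, so executemany() never has to materialise the entire item list. A sketch of the pattern; the INSERT statement is an assumption based on the schema from _create_db and its unique (type, value) index:

import itertools

def insert_in_batches(cursor, rows, batchSize = 100000):
	it = iter(rows)
	while True:
		values = tuple(itertools.islice(it, batchSize))
		if not values:
			break
		cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)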
@@ -453,15 +509,20 @@ class QWARC:

self._closed = _STATE_CLOSING

logger.debug('Closing')
if self._tasks:
logger.warning(f'Cancelling {len(self._tasks)} task(s) remaining on cleanup')
for task in self._tasks:
task.cancel()
logger.debug('Waiting for tasks to get cancelled')
await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
self._tasks.clear()
logger.debug('Closing DB')
self._db.close()
logger.debug('Closing sessions')
for session in self._sessions:
session.close()
logger.debug('Closing WARC')
self._warc.close()

self._reset_working_vars() # Also resets self._closed

qwarc/cli.py (+4 -1)

@@ -9,6 +9,9 @@ import qwarc.version
import sys


logger = logging.getLogger(__name__)


def setup_logging(logFilename, logLevel, logLevelStderr):
if logLevelStderr is None:
logLevelStderr = logLevel
@@ -91,7 +94,7 @@ def main():
try:
loop.run_until_complete(a.run(loop))
except (Exception, KeyboardInterrupt) as e:
logging.exception('Unhandled error')
logger.exception('Unhandled error')
finally:
loop.run_until_complete(a.close())
loop.close()

qwarc/utils.py (+1 -1)

@@ -245,7 +245,7 @@ def get_software_info(specDependencyPackages):

class LogFormatter(logging.Formatter):
def __init__(self):
super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
super().__init__('%(asctime)s.%(msecs)03dZ %(name)s %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
self.converter = time.gmtime

def format(self, record):


qwarc/warc.py (+36 -7)

@@ -11,6 +11,9 @@ import time
import warcio


logger = logging.getLogger(__name__)


class WARC:
def __init__(self, prefix, maxFileSize, dedupe, command, specFile, specDependencies, logFilename):
'''
@@ -54,37 +57,42 @@ class WARC:
return
while True:
filename = f'{self._prefix}-{self._counter:05d}.warc.gz'
logger.debug(f'Trying to open {filename}')
try:
# Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
self._file = open(filename, 'xb')
fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
except FileExistsError:
logging.info(f'{filename} already exists, skipping')
logger.debug(f'{filename} already exists, skipping')
self._counter += 1
else:
break
logging.info(f'Opened {filename}')
logger.info(f'Opened {filename}')
self._open_journal(filename)
self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True, warc_version = '1.1')
self._closed = False
self._counter += 1

def _open_journal(self, filename):
logger.debug(f'Trying to open {filename}.qwarcjournal')
try:
self._journalFile = open(f'{filename}.qwarcjournal', 'xb')
fcntl.flock(self._journalFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
except FileExistsError:
logging.error(f'{filename}.qwarcjournal already exists!')
logger.error(f'{filename}.qwarcjournal already exists!')
raise RuntimeError(f'Unable to create journal file for {filename}: {filename}.qwarcjournal already exists')
except OSError as e:
if e.errno == errno.EWOULDBLOCK:
logging.error(f'{filename}.qwarcjournal is already locked!')
logger.error(f'{filename}.qwarcjournal is already locked!')
raise RuntimeError(f'Unable to lock journal file {filename}.qwarcjournal') from e
raise
logger.debug(f'Opened {filename}.qwarcjournal')
self._journalClean = True
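
Both _ensure_opened and _open_journal rely on the same two-step exclusivity: open mode 'xb' fails with FileExistsError if the file already exists, and the non-blocking flock fails with EWOULDBLOCK if another process holds a lock on it, so together they guarantee a single writer per file. As a standalone helper:

import fcntl

def open_exclusive(filename):
	f = open(filename, 'xb')  # Raises FileExistsError if the file already exists
	try:
		fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)  # Raises OSError (EWOULDBLOCK) if already locked
	except OSError:
		f.close()
		raise
	return f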

def _write_record(self, record):
# Write the current offset to the journal file
logger.debug(f'Preparing to write record {record.rec_headers.get_header("WARC-Record-ID")}')

logger.debug('Writing journal entry')
# Since the size can only grow, it is not necessary to explicitly delete the previous contents.
self._journalFile.seek(0)
previousSize = self._file.tell()
@@ -92,19 +100,22 @@ class WARC:
self._journalFile.flush()
self._journalClean = False

logger.debug('Writing record')
try:
self._warcWriter.write_record(record)
except (OSError, IOError):
logger.debug(f'Error occurred, truncating to previous size {previousSize}')
self._file.truncate(previousSize)
raise
else:
# Mark the write as ok
logger.debug('Write ok, cleaning journal')
self._journalFile.seek(-4, os.SEEK_END) # len(b'no \n')
self._journalFile.write(b'yes\n')
self._journalFile.flush()
self._journalClean = True
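
The journal is what makes _write_record recoverable: the pre-write offset goes to the side file first, then the record is written, and only then is the journal's trailing 'no \n' flipped to 'yes\n'; a failed write truncates the WARC back to the recorded offset. A sketch with plain files (the exact journal entry layout is not visible in this diff and is assumed here):

import os

def journalled_write(warcFile, journalFile, data):
	journalFile.seek(0)
	previousSize = warcFile.tell()
	journalFile.write(f'{previousSize} no \n'.encode('utf-8'))  # Assumed layout: offset plus the ok flag
	journalFile.flush()
	try:
		warcFile.write(data)
	except (OSError, IOError):
		warcFile.truncate(previousSize)  # Undo the partial write
		raise
	else:
		journalFile.seek(-4, os.SEEK_END)  # len(b'no \n')
		journalFile.write(b'yes\n')
		journalFile.flush()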

def _write_warcinfo_record(self):
logger.debug('Writing warcinfo record')
data = {
'software': qwarc.utils.get_software_info(self._specDependencies.packages),
'command': self._command,
@@ -116,8 +127,10 @@ class WARC:
}
payload = io.BytesIO(json.dumps(data, indent = 2).encode('utf-8'))
# Workaround for https://github.com/webrecorder/warcio/issues/87
logger.debug('Calculating digest')
digester = warcio.utils.Digester('sha1')
digester.update(payload.getvalue())
logger.debug('Creating record object')
record = self._warcWriter.create_warc_record(
None,
'warcinfo',
@@ -134,10 +147,13 @@ class WARC:
A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
'''

logger.debug(f'Writing response(s) for {response.url}')
self._ensure_opened()
for r in response.iter_all():
usec = f'{(r.rawRequestTimestamp - int(r.rawRequestTimestamp)):.6f}'[2:]
requestDate = time.strftime(f'%Y-%m-%dT%H:%M:%S.{usec}Z', time.gmtime(r.rawRequestTimestamp))
logger.debug(f'Writing request/response for {requestDate} {r.url}')
logger.debug('Constructing request record')
r.rawRequestData.seek(0, io.SEEK_END)
length = r.rawRequestData.tell()
r.rawRequestData.seek(0)
@@ -153,6 +169,7 @@ class WARC:
}
)
requestRecordID = requestRecord.rec_headers.get_header('WARC-Record-ID')
logger.debug('Constructing response record')
r.rawResponseData.seek(0, io.SEEK_END)
length = r.rawResponseData.tell()
r.rawResponseData.seek(0)
@@ -171,8 +188,10 @@ class WARC:
payloadDigest = responseRecord.rec_headers.get_header('WARC-Payload-Digest')
assert payloadDigest is not None
if self._dedupe and responseRecord.payload_length > 100: # Don't deduplicate small responses; the additional headers are typically larger than the payload dedupe savings...
logger.debug(f'Checking for duplicates with {payloadDigest}')
if payloadDigest in self._dedupeMap:
refersToRecordId, refersToUri, refersToDate = self._dedupeMap[payloadDigest]
logger.debug(f'Record is a duplicate, creating revisit record to {refersToRecordId} ({refersToDate} {refersToUri})')
responseHttpHeaders = responseRecord.http_headers
responseRecord = self._warcWriter.create_revisit_record(
str(r.url),
@@ -192,18 +211,24 @@ class WARC:
# Workaround for https://github.com/webrecorder/warcio/issues/94
responseRecord.rec_headers.replace_header('WARC-Profile', 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest')
else:
logger.debug('New record, creating dedupe map entry')
self._dedupeMap[payloadDigest] = (responseRecord.rec_headers.get_header('WARC-Record-ID'), str(r.url), requestDate)
logger.debug('Writing request and response records')
self._write_record(requestRecord)
self._write_record(responseRecord)
logger.debug(f'Done writing responses for {response.url}')

if self._maxFileSize and self._file.tell() > self._maxFileSize:
logger.debug('File exceeds max size, closing')
self._close_file()
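
The dedupe map itself is just a dict from WARC-Payload-Digest to the first record that carried that payload; a repeated digest produces a revisit record pointing back at the original. The bookkeeping in isolation:

def dedupe_lookup(dedupeMap, payloadDigest, recordId, uri, date):
	# Returns (refersToRecordId, refersToUri, refersToDate) for a duplicate payload,
	# or None after registering this record as the first occurrence.
	if payloadDigest in dedupeMap:
		return dedupeMap[payloadDigest]
	dedupeMap[payloadDigest] = (recordId, uri, date)
	return None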

def _write_resource_records(self):
'''Write spec file and dependencies'''
assert self._metaWarcinfoRecordID is not None, 'write_warcinfo_record must be called first'

logger.debug('Writing resource records')
for type_, contentType, fn in itertools.chain((('specfile', 'application/x-python', self._specFile),), map(lambda x: ('spec-dependency-file', 'application/octet-stream', x), self._specDependencies.files)):
logger.debug(f'Writing resource record for {fn}')
with open(fn, 'rb') as f:
f.seek(0, io.SEEK_END)
length = f.tell()
@@ -224,6 +249,7 @@ class WARC:
def _write_log_record(self):
assert self._metaWarcinfoRecordID is not None, 'write_warcinfo_record must be called first'

logger.debug('Writing log record')
rootLogger = logging.getLogger()
for handler in rootLogger.handlers: #FIXME: Uses undocumented attribute handlers
handler.flush()
@@ -246,10 +272,12 @@ class WARC:
'''Close the currently opened WARC'''

if not self._closed:
logger.debug('Closing WARC file')
self._file.close()
journalFilename = self._journalFile.name
self._journalFile.close()
if self._journalClean:
logger.debug(f'Deleting journal {journalFilename}')
os.remove(journalFilename)
self._warcWriter = None
self._file = None
@@ -259,10 +287,11 @@ class WARC:

def _write_meta_warc(self, callback):
filename = f'{self._prefix}-meta.warc.gz'
logger.debug(f'Opening {filename}')
self._file = open(filename, 'ab')
try:
fcntl.flock(self._file.fileno(), fcntl.LOCK_EX)
logging.info(f'Opened {filename}')
logger.info(f'Opened {filename}')
self._open_journal(filename)
self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True, warc_version = '1.1')
self._closed = False

