Bläddra i källkod

Refactor cleanup code

- Run the cleanup code on exceptions (e.g. ^C). There were several effects of that not happening previously; most notably, the log file was not written to the meta WARC.
- Cancel remaining tasks, which avoids a pile of asyncio warnings and errors on crashes.
- Close the DB before the WARC, or rather, close the WARC last. This is mostly a semantic change to further ensure that the log written to the meta WARC is as complete as possible.
master
JustAnotherArchivist 3 år sedan
förälder
incheckning
3c8b45b3a6
2 ändrade filer med 34 tillägg och 6 borttagningar
  1. +32
    -6
      qwarc/__init__.py
  2. +2
    -0
      qwarc/cli.py

+ 32
- 6
qwarc/__init__.py Visa fil

@@ -208,6 +208,11 @@ class Item:
return f'<{type(self).__module__}.{type(self).__name__} object {id(self)}, itemType = {self.itemType!r}, itemValue = {self.itemValue!r}>' return f'<{type(self).__module__}.{type(self).__name__} object {id(self)}, itemType = {self.itemType!r}, itemValue = {self.itemValue!r}>'




# QWARC._closed values
# Lifecycle states stored in QWARC._closed.
_STATE_RUNNING = 1  # Set at the start of QWARC.run()
_STATE_CLOSING = 2  # Set by QWARC.close() while cleanup is in progress; the run loop breaks when it sees this
_STATE_CLOSED = 3  # Value assigned alongside the other working variables; close() returns immediately in this state

class QWARC: class QWARC:
def __init__(self, itemClasses, warcBasePath, dbPath, command, specFile, specDependencies, logFilename, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0, warcDedupe = False): def __init__(self, itemClasses, warcBasePath, dbPath, command, specFile, specDependencies, logFilename, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0, warcDedupe = False):
''' '''
@@ -248,6 +253,7 @@ class QWARC:
self._sessions = [] # aiohttp.ClientSession instances self._sessions = [] # aiohttp.ClientSession instances
self._freeSessions = collections.deque() # ClientSession instances that are currently free self._freeSessions = collections.deque() # ClientSession instances that are currently free
self._warc = None self._warc = None
self._closed = _STATE_CLOSED


@contextlib.asynccontextmanager @contextlib.asynccontextmanager
async def exclusive_db_lock(self): async def exclusive_db_lock(self):
@@ -317,6 +323,8 @@ class QWARC:
cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values) cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)


async def run(self, loop): async def run(self, loop):
self._closed = _STATE_RUNNING

for i in range(self._concurrency): for i in range(self._concurrency):
session = _aiohttp.ClientSession( session = _aiohttp.ClientSession(
connector = qwarc.aiohttp.TCPConnector(loop = loop), connector = qwarc.aiohttp.TCPConnector(loop = loop),
@@ -355,6 +363,9 @@ class QWARC:
while len(self._tasks) >= self._concurrency: while len(self._tasks) >= self._concurrency:
await self._wait_for_free_task() await self._wait_for_free_task()


if self._closed == _STATE_CLOSING:
break

if os.path.exists('STOP'): if os.path.exists('STOP'):
logging.info('Gracefully shutting down due to STOP file') logging.info('Gracefully shutting down due to STOP file')
break break
@@ -413,15 +424,11 @@ class QWARC:
for task in self._tasks: for task in self._tasks:
task.cancel() task.cancel()
await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED) await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
self._tasks.clear()


raise raise
finally: finally:
for session in self._sessions:
session.close()
self._warc.close()
self._db.close()

self._reset_working_vars()
await self.close()


async def flush_subitems(self, item): async def flush_subitems(self, item):
await self._insert_subitems(item) await self._insert_subitems(item)
@@ -439,3 +446,22 @@ class QWARC:
if not values: if not values:
break break
cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values) cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)

async def close(self):
    '''
    Cancel any remaining tasks and release the database, HTTP sessions, and WARC.

    Returns immediately if cleanup is already in progress or has finished, so it may
    be called more than once (e.g. from run()'s finally clause and again from the CLI).

    Every cleanup step runs even if an earlier one raises, and the WARC is closed last
    so that the log written to the meta WARC is as complete as possible. The working
    variables (including the closed state) are reset at the very end regardless of
    errors; any exception raised during cleanup is propagated afterwards.
    '''

    if self._closed in (_STATE_CLOSING, _STATE_CLOSED):
        return

    self._closed = _STATE_CLOSING

    # ExitStack runs callbacks in reverse registration order and keeps going when one
    # of them raises, so the steps are registered last-to-first:
    # DB -> sessions (in list order) -> WARC -> reset of the working variables.
    with contextlib.ExitStack() as cleanup:
        cleanup.callback(self._reset_working_vars) # Also resets self._closed
        # The WARC is still None if run() failed before opening it.
        if self._warc is not None:
            cleanup.callback(self._warc.close)
        for session in reversed(self._sessions):
            cleanup.callback(session.close)
        cleanup.callback(self._db.close)

        if self._tasks:
            logger.warning(f'Cancelling {len(self._tasks)} task(s) remaining on cleanup')
            for task in self._tasks:
                task.cancel()
            # Wait for the cancellations to be processed before tearing down the resources the tasks use.
            await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
            self._tasks.clear()

+ 2
- 0
qwarc/cli.py Visa fil

@@ -83,4 +83,6 @@ def main():
loop.run_until_complete(a.run(loop)) loop.run_until_complete(a.run(loop))
except (Exception, KeyboardInterrupt) as e: except (Exception, KeyboardInterrupt) as e:
logging.exception('Unhandled error') logging.exception('Unhandled error')
finally:
loop.run_until_complete(a.close())
loop.close() loop.close()

Laddar…
Avbryt
Spara