@@ -208,6 +208,11 @@ class Item:
return f'<{type(self).__module__}.{type(self).__name__} object {id(self)}, itemType = {self.itemType!r}, itemValue = {self.itemValue!r}>'
return f'<{type(self).__module__}.{type(self).__name__} object {id(self)}, itemType = {self.itemType!r}, itemValue = {self.itemValue!r}>'
# QWARC._closed values
_STATE_RUNNING = 1
_STATE_CLOSING = 2
_STATE_CLOSED = 3
class QWARC:
class QWARC:
def __init__(self, itemClasses, warcBasePath, dbPath, command, specFile, specDependencies, logFilename, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0, warcDedupe = False):
def __init__(self, itemClasses, warcBasePath, dbPath, command, specFile, specDependencies, logFilename, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0, warcDedupe = False):
'''
'''
@@ -248,6 +253,7 @@ class QWARC:
self._sessions = [] # aiohttp.ClientSession instances
self._sessions = [] # aiohttp.ClientSession instances
self._freeSessions = collections.deque() # ClientSession instances that are currently free
self._freeSessions = collections.deque() # ClientSession instances that are currently free
self._warc = None
self._warc = None
self._closed = _STATE_CLOSED
@contextlib.asynccontextmanager
@contextlib.asynccontextmanager
async def exclusive_db_lock(self):
async def exclusive_db_lock(self):
@@ -317,6 +323,8 @@ class QWARC:
cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)
cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)
async def run(self, loop):
async def run(self, loop):
self._closed = _STATE_RUNNING
for i in range(self._concurrency):
for i in range(self._concurrency):
session = _aiohttp.ClientSession(
session = _aiohttp.ClientSession(
connector = qwarc.aiohttp.TCPConnector(loop = loop),
connector = qwarc.aiohttp.TCPConnector(loop = loop),
@@ -355,6 +363,9 @@ class QWARC:
while len(self._tasks) >= self._concurrency:
while len(self._tasks) >= self._concurrency:
await self._wait_for_free_task()
await self._wait_for_free_task()
if self._closed == _STATE_CLOSING:
break
if os.path.exists('STOP'):
if os.path.exists('STOP'):
logging.info('Gracefully shutting down due to STOP file')
logging.info('Gracefully shutting down due to STOP file')
break
break
@@ -413,15 +424,11 @@ class QWARC:
for task in self._tasks:
for task in self._tasks:
task.cancel()
task.cancel()
await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
self._tasks.clear()
raise
raise
finally:
finally:
for session in self._sessions:
session.close()
self._warc.close()
self._db.close()
self._reset_working_vars()
await self.close()
async def flush_subitems(self, item):
async def flush_subitems(self, item):
await self._insert_subitems(item)
await self._insert_subitems(item)
@@ -439,3 +446,22 @@ class QWARC:
if not values:
if not values:
break
break
cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)
cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)
async def close(self):
    '''
    Shut down this instance and release its resources.

    Does nothing if a close is already in progress or has already finished.
    Otherwise, any tasks still present are cancelled and awaited, the database,
    the HTTP sessions, and the WARC are closed, and the working variables are reset.
    '''

    # Already closing or closed: nothing to do
    if self._closed in (_STATE_CLOSING, _STATE_CLOSED):
        return
    self._closed = _STATE_CLOSING

    if self._tasks:
        # Log through the logging module like the STOP-file message in run();
        # no 'logger' object is visible in this file.
        logging.warning(f'Cancelling {len(self._tasks)} task(s) remaining on cleanup')
        for task in self._tasks:
            task.cancel()
        # Wait until every cancelled task has actually finished before tearing down what they use
        await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)
        self._tasks.clear()

    self._db.close()
    for session in self._sessions:
        session.close()
    self._warc.close()

    self._reset_working_vars() # Also resets self._closed