diff --git a/qwarc/cli.py b/qwarc/cli.py index 01648e7..e9bcbcf 100644 --- a/qwarc/cli.py +++ b/qwarc/cli.py @@ -4,19 +4,9 @@ import importlib.util import logging import os.path import qwarc +import qwarc.utils import qwarc.version import sys -import time - - -class Formatter(logging.Formatter): - def format(self, record): - if not hasattr(record, 'itemString'): - if hasattr(record, 'itemType') and hasattr(record, 'itemValue'): - record.itemString = f'{record.itemType}:{record.itemValue}' - else: - record.itemString = 'None' - return super().format(record) def setup_logging(logFilename): @@ -24,8 +14,7 @@ def setup_logging(logFilename): rootLogger.handlers = [] rootLogger.setLevel(logging.INFO) - formatter = Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S') - formatter.converter = time.gmtime + formatter = qwarc.utils.LogFormatter() fileHandler = logging.FileHandler(logFilename) fileHandler.setFormatter(formatter) diff --git a/qwarc/utils.py b/qwarc/utils.py index 60d1cbd..8ac80c5 100644 --- a/qwarc/utils.py +++ b/qwarc/utils.py @@ -6,6 +6,7 @@ import logging import os import pkg_resources import platform +import time PAGESIZE = os.sysconf('SC_PAGE_SIZE') @@ -215,3 +216,17 @@ def get_software_info(): }, 'self': [{"package": package, "version": version} for package, version in _get_dependency_versions(__package__)], } + + +class LogFormatter(logging.Formatter): + def __init__(self): + super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S') + self.converter = time.gmtime + + def format(self, record): + if not hasattr(record, 'itemString'): + if hasattr(record, 'itemType') and hasattr(record, 'itemValue'): + record.itemString = f'{record.itemType}:{record.itemValue}' + else: + record.itemString = 'None' + return super().format(record) diff --git a/qwarc/warc.py b/qwarc/warc.py index fe8d1c2..db2d648 100644 --- a/qwarc/warc.py +++ b/qwarc/warc.py @@ -1,8 +1,11 @@ import fcntl +import gzip import io import json import logging +import os import qwarc.utils +import tempfile import time import warcio @@ -28,6 +31,19 @@ class WARC: self._dedupe = dedupe self._dedupeMap = {} + self._logFile = None + self._logHandler = None + self._setup_logger() + + def _setup_logger(self): + rootLogger = logging.getLogger() + formatter = qwarc.utils.LogFormatter() + self._logFile = tempfile.NamedTemporaryFile(prefix = 'qwarc-warc-', suffix = '.log.gz', delete = False) + self._logHandler = logging.StreamHandler(io.TextIOWrapper(gzip.GzipFile(filename = self._logFile.name, mode = 'wb'), encoding = 'utf-8')) + self._logHandler.setFormatter(formatter) + rootLogger.addHandler(self._logHandler) + self._logHandler.setLevel(logging.INFO) + def _ensure_opened(self): '''Open the next file that doesn't exist yet if there is currently no file opened''' @@ -116,7 +132,7 @@ class WARC: if self._maxFileSize and self._file.tell() > self._maxFileSize: self.close() - def close(self): + def _close_file(self): '''Close the currently opened WARC''' if not self._closed: @@ -124,3 +140,40 @@ class WARC: self._warcWriter = None self._file = None self._closed = True + + def _write_meta_warc(self): + filename = f'{self._prefix}-meta.warc.gz' + #TODO: Handle OSError on fcntl.flock and retry + self._file = open(filename, 'ab') + try: + fcntl.flock(self._file.fileno(), fcntl.LOCK_EX) + logging.info(f'Opened {filename}') + self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True) + self._closed = False + + self.write_warcinfo_record() + + self._logHandler.flush() + self._logHandler.stream.close() + record = self._warcWriter.create_warc_record( + 'urn:qwarc:log', + 'resource', + payload = gzip.GzipFile(self._logFile.name), + warc_headers_dict = {'Content-Type': 'text/plain; charset=utf-8'}, + ) + self._warcWriter.write_record(record) + finally: + self._close_file() + + def close(self): + '''Clean up everything.''' + self._close_file() + self._write_meta_warc() + logging.getLogger().removeHandler(self._logHandler) + try: + os.remove(self._logFile.name) + except OSError: + logging.error('Could not remove temporary log file') + self._logFile = None + self._logHandler.close() + self._logHandler = None