import argparse
import asyncio
import importlib.util
import logging
import os.path
import qwarc
import qwarc.utils
import qwarc.version
import sys


logger = logging.getLogger(__name__)


def setup_logging(logFilename, logLevel, logLevelStderr):
    """Configure the root logger to write to a log file and to stderr.

    Any handlers already attached to the root logger are discarded. Both new
    handlers use ``qwarc.utils.LogFormatter``.

    Args:
        logFilename: Path of the log file to write to.
        logLevel: Numeric logging level for the file handler.
        logLevelStderr: Numeric logging level for the stderr handler, or
            ``None`` to use the same level as the log file.
    """
    if logLevelStderr is None:
        logLevelStderr = logLevel

    rootLogger = logging.getLogger()
    rootLogger.handlers = []
    # The root logger has to let through everything that either handler wants,
    # so it gets the more verbose (numerically lower) of the two levels.
    rootLogger.setLevel(min(logLevel, logLevelStderr))

    formatter = qwarc.utils.LogFormatter()

    fileHandler = logging.FileHandler(logFilename)
    fileHandler.setFormatter(formatter)
    fileHandler.setLevel(logLevel)
    rootLogger.addHandler(fileHandler)

    stderrHandler = logging.StreamHandler()
    stderrHandler.setFormatter(formatter)
    stderrHandler.setLevel(logLevelStderr)
    rootLogger.addHandler(stderrHandler)


def check_files(specFilename, logFilename):
    """Check the file system preconditions for a run.

    All checks are always performed, and one message per failed check is
    printed to stderr, so the user sees every problem at once.

    Args:
        specFilename: Path of the spec file; must be an existing regular file.
        logFilename: Path of the log file; must not exist yet.

    Returns:
        ``True`` if the spec file is a regular file, the log file does not
        exist, and no ``STOP`` entry exists in the current working directory;
        ``False`` otherwise.
    """
    success = True
    if not os.path.isfile(specFilename):
        print(f'Error: "{specFilename}" does not exist or is not a regular file', file = sys.stderr)
        success = False
    if os.path.exists(logFilename):
        print(f'Error: "{logFilename}" already exists', file = sys.stderr)
        success = False
    if os.path.exists('STOP'):
        print('Error: "STOP" exists', file = sys.stderr)
        success = False
    return success


def main():
    """Command line entry point: parse arguments, load the spec file, and run qwarc.

    Exits with status 1 if the file precondition checks fail or if the spec
    file cannot be loaded as a Python module. Exceptions (including
    ``KeyboardInterrupt``) raised while running are logged, after which the
    ``QWARC`` instance and the event loop are closed.
    """
    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', action = 'version', version = f'qwarc {qwarc.version.__version__}')
    parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
    parser.add_argument('--loglevel', metavar = 'LEVEL', help = 'verbosity of the log file', choices = ('INFO', 'DEBUG'), default = 'INFO')
    parser.add_argument('--loglevelstderr', metavar = 'LEVEL', help = 'verbosity of the log messages on stderr; set equal to the log file level if None', choices = (None, 'CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'), default = None)
    parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
    parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
    parser.add_argument('--concurrency', type = int, default = 1)
    parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes memory is free; disable if 0', default = 0)
    parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes disk space is free; disable if 0', default = 0)
    parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
    parser.add_argument('--warcdedupe', action = 'store_true', help = 'enable deduplication of WARC records')
    parser.add_argument('specfile')

    args = parser.parse_args()

    # Translate the level names into the numeric constants of the logging module.
    args.loglevel = getattr(logging, args.loglevel)
    if args.loglevelstderr is not None:
        args.loglevelstderr = getattr(logging, args.loglevelstderr)
    else:
        args.loglevelstderr = args.loglevel

    # The checks must run before logging is set up: setting up logging creates
    # the log file, which check_files requires not to exist.
    if not check_files(args.specfile, args.log):
        sys.exit(1)

    setup_logging(args.log, args.loglevel, args.loglevelstderr)

    # Execute the spec file as a module named 'spec'.
    spec = importlib.util.spec_from_file_location('spec', args.specfile)
    if spec is None or spec.loader is None:
        # spec_from_file_location returns None when no import loader matches
        # the file name suffix; fail with a clear message instead of an
        # AttributeError further down.
        logger.error(f'Cannot load "{args.specfile}" as a Python module (does it have a .py extension?)')
        sys.exit(1)
    specMod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(specMod)
    specDependencies = specMod.__dict__.get('specDependencies', qwarc.utils.SpecDependencies())

    qwarcInstance = qwarc.QWARC(
        # Item subclasses are collected only after the spec file was executed.
        itemClasses = set(qwarc.Item.get_subclasses()),
        warcBasePath = args.warc,
        dbPath = args.database,
        command = sys.argv,
        specFile = args.specfile,
        specDependencies = specDependencies,
        logFilename = args.log,
        concurrency = args.concurrency,
        memoryLimit = args.memorylimit,
        minFreeDisk = args.disklimit,
        warcSizeLimit = args.warcsplit,
        warcDedupe = args.warcdedupe,
    )

    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # get_event_loop() raises RuntimeError when there is no current event
        # loop (newer Python versions no longer create one implicitly).
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(qwarcInstance.run(loop))
    except (Exception, KeyboardInterrupt):
        # NOTE(review): the error is only logged and main() then returns
        # normally, so callers cannot tell failure from success -- confirm
        # that this is intended.
        logger.exception('Unhandled error')
    finally:
        try:
            loop.run_until_complete(qwarcInstance.close())
        finally:
            # Close the loop even if closing the QWARC instance raised.
            loop.close()