import argparse
import asyncio
import importlib.util
import logging
import os.path
import sys
import time

import qwarc


def setup_logging(logFilename):
    """Configure the root logger to write to *logFilename* and to stderr.

    Any handlers already attached to the root logger are discarded. Records
    at INFO level and above are emitted, with timestamps rendered in UTC
    (hence the literal ``Z`` suffix in the format string).
    """
    rootLogger = logging.getLogger()
    rootLogger.handlers = []
    rootLogger.setLevel(logging.INFO)

    formatter = logging.Formatter(
        '%(asctime)s.%(msecs)03dZ %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    # Render timestamps in UTC rather than local time.
    formatter.converter = time.gmtime

    fileHandler = logging.FileHandler(logFilename)
    fileHandler.setFormatter(formatter)
    rootLogger.addHandler(fileHandler)

    stderrHandler = logging.StreamHandler()
    stderrHandler.setFormatter(formatter)
    rootLogger.addHandler(stderrHandler)


def check_files(specFilename, logFilename):
    """Verify the filesystem preconditions for a run.

    Checks that *specFilename* is an existing regular file, that
    *logFilename* does not exist yet, and that no ``STOP`` entry exists in
    the current working directory. Every failed check is reported on stderr
    (all checks are evaluated, so the user sees every problem at once).

    Returns:
        True if all checks passed, False otherwise.
    """
    success = True
    if not os.path.isfile(specFilename):
        # The filename must be interpolated; a bare '{}' is useless to the user.
        print('Error: "{}" does not exist or is not a regular file'.format(specFilename), file=sys.stderr)
        success = False
    if os.path.exists(logFilename):
        print('Error: "{}" already exists'.format(logFilename), file=sys.stderr)
        success = False
    if os.path.exists('STOP'):
        print('Error: "STOP" exists', file=sys.stderr)
        success = False
    return success


def main():
    """Command-line entry point.

    Parses the arguments, validates the files, sets up logging, executes the
    spec file as a module, and runs a ``qwarc.QWARC`` instance on the asyncio
    event loop. Exits with status 1 if the file checks fail or the spec file
    cannot be loaded.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--log', metavar='LOGFILE', default='./qwarc.log')
    parser.add_argument('--database', metavar='DBFILE', default='./qwarc.db')
    parser.add_argument('--warc', metavar='PREFIX', help='prefix for the WARC filenames', default='./qwarc')
    parser.add_argument('--concurrency', type=int, default=1)
    # The limits are byte counts; without type=int a value given on the
    # command line would be passed on as a str while the default is an int.
    parser.add_argument('--memorylimit', metavar='LIMIT', type=int, help='pause when less than LIMIT bytes memory is free; disable if 0', default=0)
    parser.add_argument('--disklimit', metavar='LIMIT', type=int, help='pause when less than LIMIT bytes disk space is free; disable if 0', default=0)
    parser.add_argument('--warcsplit', metavar='SIZE', type=int, help='split WARCs into files of SIZE bytes; disable if 0', default=0)
    parser.add_argument('specfile')
    args = parser.parse_args()

    if not check_files(args.specfile, args.log):
        sys.exit(1)

    setup_logging(args.log)

    # Execute the spec file as a module named 'spec'. This happens before
    # qwarc.Item.__subclasses__() is evaluated below, so any Item subclasses
    # the spec file defines are included in that list.
    spec = importlib.util.spec_from_file_location('spec', args.specfile)
    if spec is None or spec.loader is None:
        # spec_from_file_location returns None when no loader can be
        # determined for the path (e.g. an unrecognised file suffix).
        logging.error('Could not load spec file "{}"'.format(args.specfile))
        sys.exit(1)
    specMod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(specMod)

    a = qwarc.QWARC(
        itemClasses=qwarc.Item.__subclasses__(),
        warcBasePath=args.warc,
        dbPath=args.database,
        concurrency=args.concurrency,
        memoryLimit=args.memorylimit,
        minFreeDisk=args.disklimit,
        warcSizeLimit=args.warcsplit,
    )
    # Only initialise the database on the first run; an existing file is reused.
    if not os.path.exists(args.database):
        a.create_db()

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(a.run(loop))
    except (Exception, KeyboardInterrupt):
        # Top-level boundary: record the traceback in the log instead of
        # letting it escape; Ctrl-C is logged the same way.
        logging.exception('Unhandled error')
    finally:
        loop.close()