A framework for quick web archiving
Non puoi selezionare più di 25 argomenti Gli argomenti devono iniziare con una lettera o un numero, possono includere trattini ('-') e possono essere lunghi fino a 35 caratteri.

101 righe
3.8 KiB

  1. import argparse
  2. import asyncio
  3. import importlib.util
  4. import logging
  5. import os.path
  6. import qwarc
  7. import qwarc.utils
  8. import qwarc.version
  9. import sys
  10. logger = logging.getLogger(__name__)
  11. def setup_logging(logFilename, logLevel, logLevelStderr):
  12. if logLevelStderr is None:
  13. logLevelStderr = logLevel
  14. rootLogger = logging.getLogger()
  15. rootLogger.handlers = []
  16. rootLogger.setLevel(min(logLevel, logLevelStderr))
  17. formatter = qwarc.utils.LogFormatter()
  18. fileHandler = logging.FileHandler(logFilename)
  19. fileHandler.setFormatter(formatter)
  20. fileHandler.setLevel(logLevel)
  21. rootLogger.addHandler(fileHandler)
  22. stderrHandler = logging.StreamHandler()
  23. stderrHandler.setFormatter(formatter)
  24. stderrHandler.setLevel(logLevelStderr)
  25. rootLogger.addHandler(stderrHandler)
  26. def check_files(specFilename, logFilename):
  27. success = True
  28. if not os.path.isfile(specFilename):
  29. print(f'Error: "{specFilename}" does not exist or is not a regular file', file = sys.stderr)
  30. success = False
  31. if os.path.exists(logFilename):
  32. print(f'Error: "{logFilename}" already exists', file = sys.stderr)
  33. success = False
  34. if os.path.exists('STOP'):
  35. print('Error: "STOP" exists', file = sys.stderr)
  36. success = False
  37. return success
  38. def main():
  39. parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
  40. parser.add_argument('--version', action = 'version', version = f'qwarc {qwarc.version.__version__}')
  41. parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
  42. parser.add_argument('--loglevel', metavar = 'LEVEL', help = 'verbosity of the log file', choices = ('INFO', 'DEBUG'), default = 'INFO')
  43. parser.add_argument('--loglevelstderr', metavar = 'LEVEL', help = 'verbosity of the log messages on stderr; set equal to the log file level if None', choices = (None, 'CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'), default = None)
  44. parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
  45. parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
  46. parser.add_argument('--concurrency', type = int, default = 1)
  47. parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes memory is free; disable if 0', default = 0)
  48. parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes disk space is free; disable if 0', default = 0)
  49. parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
  50. parser.add_argument('--warcdedupe', action = 'store_true', help = 'enable deduplication of WARC records')
  51. parser.add_argument('specfile')
  52. args = parser.parse_args()
  53. args.loglevel = getattr(logging, args.loglevel)
  54. args.loglevelstderr = getattr(logging, args.loglevelstderr) if args.loglevelstderr is not None else args.loglevel
  55. if not check_files(args.specfile, args.log):
  56. sys.exit(1)
  57. setup_logging(args.log, args.loglevel, args.loglevelstderr)
  58. spec = importlib.util.spec_from_file_location('spec', args.specfile)
  59. specMod = importlib.util.module_from_spec(spec)
  60. spec.loader.exec_module(specMod)
  61. specDependencies = specMod.__dict__.get('specDependencies', qwarc.utils.SpecDependencies())
  62. a = qwarc.QWARC(
  63. itemClasses = set(qwarc.Item.get_subclasses()),
  64. warcBasePath = args.warc,
  65. dbPath = args.database,
  66. command = sys.argv,
  67. specFile = args.specfile,
  68. specDependencies = specDependencies,
  69. logFilename = args.log,
  70. concurrency = args.concurrency,
  71. memoryLimit = args.memorylimit,
  72. minFreeDisk = args.disklimit,
  73. warcSizeLimit = args.warcsplit,
  74. warcDedupe = args.warcdedupe,
  75. )
  76. loop = asyncio.get_event_loop()
  77. try:
  78. loop.run_until_complete(a.run(loop))
  79. except (Exception, KeyboardInterrupt) as e:
  80. logger.exception('Unhandled error')
  81. finally:
  82. loop.run_until_complete(a.close())
  83. loop.close()