A framework for quick web archiving
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

101 行
3.8 KiB

  1. import argparse
  2. import asyncio
  3. import importlib.util
  4. import logging
  5. import os.path
  6. import qwarc
  7. import qwarc.utils
  8. import qwarc.version
  9. import sys
  10. logger = logging.getLogger(__name__)
  11. def setup_logging(logFilename, logLevel, logLevelStderr):
  12. if logLevelStderr is None:
  13. logLevelStderr = logLevel
  14. rootLogger = logging.getLogger()
  15. rootLogger.handlers = []
  16. rootLogger.setLevel(min(logLevel, logLevelStderr))
  17. formatter = qwarc.utils.LogFormatter()
  18. fileHandler = logging.FileHandler(logFilename)
  19. fileHandler.setFormatter(formatter)
  20. fileHandler.setLevel(logLevel)
  21. rootLogger.addHandler(fileHandler)
  22. stderrHandler = logging.StreamHandler()
  23. stderrHandler.setFormatter(formatter)
  24. stderrHandler.setLevel(logLevelStderr)
  25. rootLogger.addHandler(stderrHandler)
  26. def check_files(specFilename, logFilename):
  27. success = True
  28. if not os.path.isfile(specFilename):
  29. print(f'Error: "{specFilename}" does not exist or is not a regular file', file = sys.stderr)
  30. success = False
  31. if os.path.exists(logFilename):
  32. print(f'Error: "{logFilename}" already exists', file = sys.stderr)
  33. success = False
  34. if os.path.exists('STOP'):
  35. print('Error: "STOP" exists', file = sys.stderr)
  36. success = False
  37. return success
  38. def main():
  39. parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
  40. parser.add_argument('--version', action = 'version', version = f'qwarc {qwarc.version.__version__}')
  41. parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
  42. parser.add_argument('--loglevel', metavar = 'LEVEL', help = 'verbosity of the log file', choices = ('INFO', 'DEBUG'), default = 'INFO')
  43. parser.add_argument('--loglevelstderr', metavar = 'LEVEL', help = 'verbosity of the log messages on stderr; set equal to the log file level if None', choices = (None, 'CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'), default = None)
  44. parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
  45. parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
  46. parser.add_argument('--concurrency', type = int, default = 1)
  47. parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes memory is free; disable if 0', default = 0)
  48. parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes disk space is free; disable if 0', default = 0)
  49. parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
  50. parser.add_argument('--warcdedupe', action = 'store_true', help = 'enable deduplication of WARC records')
  51. parser.add_argument('specfile')
  52. args = parser.parse_args()
  53. args.loglevel = getattr(logging, args.loglevel)
  54. args.loglevelstderr = getattr(logging, args.loglevelstderr) if args.loglevelstderr is not None else args.loglevel
  55. if not check_files(args.specfile, args.log):
  56. sys.exit(1)
  57. setup_logging(args.log, args.loglevel, args.loglevelstderr)
  58. spec = importlib.util.spec_from_file_location('spec', args.specfile)
  59. specMod = importlib.util.module_from_spec(spec)
  60. spec.loader.exec_module(specMod)
  61. specDependencies = specMod.__dict__.get('specDependencies', qwarc.utils.SpecDependencies())
  62. a = qwarc.QWARC(
  63. itemClasses = set(qwarc.Item.get_subclasses()),
  64. warcBasePath = args.warc,
  65. dbPath = args.database,
  66. command = sys.argv,
  67. specFile = args.specfile,
  68. specDependencies = specDependencies,
  69. logFilename = args.log,
  70. concurrency = args.concurrency,
  71. memoryLimit = args.memorylimit,
  72. minFreeDisk = args.disklimit,
  73. warcSizeLimit = args.warcsplit,
  74. warcDedupe = args.warcdedupe,
  75. )
  76. loop = asyncio.get_event_loop()
  77. try:
  78. loop.run_until_complete(a.run(loop))
  79. except (Exception, KeyboardInterrupt) as e:
  80. logger.exception('Unhandled error')
  81. finally:
  82. loop.run_until_complete(a.close())
  83. loop.close()