A framework for quick web archiving
Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

90 wiersze
3.1 KiB

  1. import argparse
  2. import asyncio
  3. import importlib.util
  4. import logging
  5. import os.path
  6. import qwarc
  7. import qwarc.utils
  8. import qwarc.version
  9. import sys
  10. def setup_logging(logFilename):
  11. rootLogger = logging.getLogger()
  12. rootLogger.handlers = []
  13. rootLogger.setLevel(logging.INFO)
  14. formatter = qwarc.utils.LogFormatter()
  15. fileHandler = logging.FileHandler(logFilename)
  16. fileHandler.setFormatter(formatter)
  17. rootLogger.addHandler(fileHandler)
  18. stderrHandler = logging.StreamHandler()
  19. stderrHandler.setFormatter(formatter)
  20. rootLogger.addHandler(stderrHandler)
  21. def check_files(specFilename, logFilename):
  22. success = True
  23. if not os.path.isfile(specFilename):
  24. print(f'Error: "{specFilename}" does not exist or is not a regular file', file = sys.stderr)
  25. success = False
  26. if os.path.exists(logFilename):
  27. print(f'Error: "{logFilename}" already exists', file = sys.stderr)
  28. success = False
  29. if os.path.exists('STOP'):
  30. print('Error: "STOP" exists', file = sys.stderr)
  31. success = False
  32. return success
  33. def main():
  34. parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
  35. parser.add_argument('--version', action = 'version', version = f'qwarc {qwarc.version.__version__}')
  36. parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
  37. parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
  38. parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
  39. parser.add_argument('--concurrency', type = int, default = 1)
  40. parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes memory is free; disable if 0', default = 0)
  41. parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes disk space is free; disable if 0', default = 0)
  42. parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
  43. parser.add_argument('--warcdedupe', action = 'store_true', help = 'enable deduplication of WARC records')
  44. parser.add_argument('specfile')
  45. args = parser.parse_args()
  46. if not check_files(args.specfile, args.log):
  47. sys.exit(1)
  48. setup_logging(args.log)
  49. spec = importlib.util.spec_from_file_location('spec', args.specfile)
  50. specMod = importlib.util.module_from_spec(spec)
  51. spec.loader.exec_module(specMod)
  52. specDependencies = specMod.__dict__.get('specDependencies', qwarc.utils.SpecDependencies())
  53. a = qwarc.QWARC(
  54. itemClasses = set(qwarc.Item.get_subclasses()),
  55. warcBasePath = args.warc,
  56. dbPath = args.database,
  57. command = sys.argv,
  58. specFile = args.specfile,
  59. specDependencies = specDependencies,
  60. logFilename = args.log,
  61. concurrency = args.concurrency,
  62. memoryLimit = args.memorylimit,
  63. minFreeDisk = args.disklimit,
  64. warcSizeLimit = args.warcsplit,
  65. warcDedupe = args.warcdedupe,
  66. )
  67. if not os.path.exists(args.database):
  68. a.create_db()
  69. loop = asyncio.get_event_loop()
  70. try:
  71. loop.run_until_complete(a.run(loop))
  72. except (Exception, KeyboardInterrupt) as e:
  73. logging.exception('Unhandled error')
  74. loop.close()