A framework for quick web archiving
Non puoi selezionare più di 25 argomenti. Gli argomenti devono iniziare con una lettera o un numero, possono includere trattini ('-') e possono essere lunghi fino a 35 caratteri.

82 righe
2.7 KiB

  1. import argparse
  2. import asyncio
  3. import importlib.util
  4. import logging
  5. import os.path
  6. import qwarc
  7. import sys
  8. import time
  9. def setup_logging(logFilename):
  10. rootLogger = logging.getLogger()
  11. rootLogger.handlers = []
  12. rootLogger.setLevel(logging.INFO)
  13. formatter = logging.Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
  14. formatter.converter = time.gmtime
  15. fileHandler = logging.FileHandler(logFilename)
  16. fileHandler.setFormatter(formatter)
  17. rootLogger.addHandler(fileHandler)
  18. stderrHandler = logging.StreamHandler()
  19. stderrHandler.setFormatter(formatter)
  20. rootLogger.addHandler(stderrHandler)
  21. def check_files(specFilename, logFilename):
  22. success = True
  23. if not os.path.isfile(specFilename):
  24. print('Error: "{}" does not exist or is not a regular file', file = sys.stderr)
  25. success = False
  26. if os.path.exists(logFilename):
  27. print('Error: "{}" already exists'.format(logFilename), file = sys.stderr)
  28. success = False
  29. if os.path.exists('STOP'):
  30. print('Error: "STOP" exists', file = sys.stderr)
  31. success = False
  32. return success
  33. def main():
  34. parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
  35. parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
  36. parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
  37. parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
  38. parser.add_argument('--concurrency', type = int, default = 1)
  39. parser.add_argument('--memorylimit', metavar = 'LIMIT', help = 'pause when less than LIMIT bytes memory is free; disable if 0', default = 0)
  40. parser.add_argument('--disklimit', metavar = 'LIMIT', help = 'pause when less than LIMIT bytes disk space is free; disable if 0', default = 0)
  41. parser.add_argument('--warcsplit', metavar = 'SIZE', help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
  42. parser.add_argument('specfile')
  43. args = parser.parse_args()
  44. if not check_files(args.specfile, args.log):
  45. sys.exit(1)
  46. setup_logging(args.log)
  47. spec = importlib.util.spec_from_file_location('spec', args.specfile)
  48. specMod = importlib.util.module_from_spec(spec)
  49. spec.loader.exec_module(specMod)
  50. a = qwarc.QWARC(
  51. itemClasses = qwarc.Item.__subclasses__(),
  52. warcBasePath = args.warc,
  53. dbPath = args.database,
  54. concurrency = args.concurrency,
  55. memoryLimit = args.memorylimit,
  56. minFreeDisk = args.disklimit,
  57. warcSizeLimit = args.warcsplit,
  58. )
  59. if not os.path.exists(args.database):
  60. a.create_db()
  61. loop = asyncio.get_event_loop()
  62. try:
  63. loop.run_until_complete(a.run(loop))
  64. except (Exception, KeyboardInterrupt) as e:
  65. logging.exception('Unhandled error')
  66. loop.close()