Parcourir la source

Use log filename in the target URI of the log resource record

tags/v0.2.0
JustAnotherArchivist il y a 4 ans
Parent
révision
50d46ad51c
3 fichiers modifiés avec 10 ajouts et 5 suppressions
  1. qwarc/__init__.py (+4, -2)
  2. qwarc/cli.py (+1, -0)
  3. qwarc/warc.py (+5, -3)

+ 4
- 2
qwarc/__init__.py Voir le fichier

@@ -130,7 +130,7 @@ class Item:


class QWARC:
def __init__(self, itemClasses, warcBasePath, dbPath, command, specFile, specDependencies, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0, warcDedupe = False):
def __init__(self, itemClasses, warcBasePath, dbPath, command, specFile, specDependencies, logFilename, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0, warcDedupe = False):
'''
itemClasses: iterable of Item
warcBasePath: str, base name of the WARC files
@@ -138,6 +138,7 @@ class QWARC:
command: list, the command line used to invoke qwarc
specFile: str, path to the spec file
specDependencies: qwarc.utils.SpecDependencies
logFilename: str, name of the log file written by this process
concurrency: int, number of concurrently processed items
memoryLimit: int, gracefully stop when the process uses more than memoryLimit bytes of RSS; 0 disables the memory check
minFreeDisk: int, pause when there's less than minFreeDisk space on the partition where WARCs are written; 0 disables the disk space check
@@ -151,6 +152,7 @@ class QWARC:
self._command = command
self._specFile = specFile
self._specDependencies = specDependencies
self._logFilename = logFilename
self._concurrency = concurrency
self._memoryLimit = memoryLimit
self._minFreeDisk = minFreeDisk
@@ -195,7 +197,7 @@ class QWARC:
sessions.append(session)
freeSessions.append(session)

warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit, self._warcDedupe, self._command, self._specFile, self._specDependencies)
warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit, self._warcDedupe, self._command, self._specFile, self._specDependencies, self._logFilename)

db = sqlite3.connect(self._dbPath, timeout = 1)
db.isolation_level = None # Transactions are handled manually below.


+ 1
- 0
qwarc/cli.py Voir le fichier

@@ -71,6 +71,7 @@ def main():
command = sys.argv,
specFile = args.specfile,
specDependencies = specDependencies,
logFilename = args.log,
concurrency = args.concurrency,
memoryLimit = args.memorylimit,
minFreeDisk = args.disklimit,


+ 5
- 3
qwarc/warc.py Voir le fichier

@@ -12,7 +12,7 @@ import warcio


class WARC:
def __init__(self, prefix, maxFileSize, dedupe, command, specFile, specDependencies):
def __init__(self, prefix, maxFileSize, dedupe, command, specFile, specDependencies, logFilename):
'''
Initialise the WARC writer

@@ -22,6 +22,7 @@ class WARC:
command: list, the command line call for qwarc
specFile: str, path to the spec file
specDependencies: qwarc.utils.SpecDependencies
logFilename: str, name of the log file written by this process
'''

self._prefix = prefix
@@ -42,6 +43,7 @@ class WARC:
self._logFile = None
self._logHandler = None
self._setup_logger()
self._logFilename = logFilename

self._dataWarcinfoRecordID = None
self._metaWarcinfoRecordID = None
@@ -182,10 +184,10 @@ class WARC:
self._logHandler.flush()
self._logHandler.stream.close()
record = self._warcWriter.create_warc_record(
'urn:qwarc:log',
f'file://{self._logFilename}',
'resource',
payload = gzip.GzipFile(self._logFile.name),
warc_headers_dict = {'Content-Type': 'text/plain; charset=utf-8', 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID},
warc_headers_dict = {'X-QWARC-Type': 'log', 'Content-Type': 'text/plain; charset=utf-8', 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID},
)
self._warcWriter.write_record(record)



Chargement…
Annuler
Enregistrer