diff --git a/qwarc/warc.py b/qwarc/warc.py index 4ab26b9..46dfc66 100644 --- a/qwarc/warc.py +++ b/qwarc/warc.py @@ -43,6 +43,9 @@ class WARC: self._logHandler = None self._setup_logger() + self._metaWarcinfoRecordID = None + self._write_meta_warc(self._write_initial_meta_records) + def _setup_logger(self): rootLogger = logging.getLogger() formatter = qwarc.utils.LogFormatter() @@ -91,6 +94,7 @@ class WARC: warc_headers_dict = {'Content-Type': 'application/json; charset=utf-8'}, ) self._warcWriter.write_record(record) + return record.rec_headers.get_header('WARC-Record-ID') def write_client_response(self, response): ''' @@ -151,6 +155,7 @@ class WARC: def write_resource_records(self): '''Write spec file and dependencies''' + assert self._metaWarcinfoRecordID is not None, 'write_warcinfo_record must be called first' for type_, fn in itertools.chain((('specfile', self._specFile),), map(lambda x: ('spec-dependency-file', x), self._specDependencies.files)): with open(fn, 'rb') as f: @@ -158,10 +163,27 @@ class WARC: f'file://{fn}', 'resource', payload = f, - warc_headers_dict = {'X-QWARC-Type': type_}, + warc_headers_dict = {'X-QWARC-Type': type_, 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID}, ) self._warcWriter.write_record(record) + def _write_initial_meta_records(self): + self._metaWarcinfoRecordID = self.write_warcinfo_record() + self.write_resource_records() + + def _write_log_record(self): + assert self._metaWarcinfoRecordID is not None, 'write_warcinfo_record must be called first' + + self._logHandler.flush() + self._logHandler.stream.close() + record = self._warcWriter.create_warc_record( + 'urn:qwarc:log', + 'resource', + payload = gzip.GzipFile(self._logFile.name), + warc_headers_dict = {'Content-Type': 'text/plain; charset=utf-8', 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID}, + ) + self._warcWriter.write_record(record) + def _close_file(self): '''Close the currently opened WARC''' @@ -171,7 +193,7 @@ class WARC: self._file = None self._closed = True - def _write_meta_warc(self): + def _write_meta_warc(self, callback): filename = f'{self._prefix}-meta.warc.gz' #TODO: Handle OSError on fcntl.flock and retry self._file = open(filename, 'ab') @@ -181,26 +203,14 @@ class WARC: self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True) self._closed = False - self.write_warcinfo_record() - - self.write_resource_records() - - self._logHandler.flush() - self._logHandler.stream.close() - record = self._warcWriter.create_warc_record( - 'urn:qwarc:log', - 'resource', - payload = gzip.GzipFile(self._logFile.name), - warc_headers_dict = {'Content-Type': 'text/plain; charset=utf-8'}, - ) - self._warcWriter.write_record(record) + callback() finally: self._close_file() def close(self): '''Clean up everything.''' self._close_file() - self._write_meta_warc() + self._write_meta_warc(self._write_log_record) logging.getLogger().removeHandler(self._logHandler) try: os.remove(self._logFile.name)