瀏覽代碼

Only open a WARC file when necessary to avoid producing empty WARCs at the end

tags/v0.2.0
JustAnotherArchivist 4 年之前
父節點
當前提交
9cff6bd5c1
共有 1 個文件被更改,包括 6 次插入和 7 次删除
  1. +6
    -7
      qwarc/warc.py

+ 6
- 7
qwarc/warc.py 查看文件

@@ -26,13 +26,11 @@ class WARC:
self._dedupe = dedupe self._dedupe = dedupe
self._dedupeMap = {} self._dedupeMap = {}


self._cycle()
def _ensure_opened(self):
'''Open the next file that doesn't exist yet if there is currently no file opened'''


def _cycle(self):
'''Close the current file, open the next file that doesn't exist yet'''

#TODO: This opens a new file also at the end, which can result in empty WARCs. Should try to reorder this to only open a WARC when writing a record, and to only close the current WARC if the size is exceeded after write_client_response.
self.close()
if not self._closed:
return
while True: while True:
filename = f'{self._prefix}-{self._counter:05d}.warc.gz' filename = f'{self._prefix}-{self._counter:05d}.warc.gz'
try: try:
@@ -55,6 +53,7 @@ class WARC:
A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC. A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
''' '''


self._ensure_opened()
for r in response.iter_all(): for r in response.iter_all():
requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp)) requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp))
requestRecord = self._warcWriter.create_warc_record( requestRecord = self._warcWriter.create_warc_record(
@@ -103,7 +102,7 @@ class WARC:
self._warcWriter.write_record(responseRecord) self._warcWriter.write_record(responseRecord)


if self._maxFileSize and self._file.tell() > self._maxFileSize: if self._maxFileSize and self._file.tell() > self._maxFileSize:
self._cycle()
self.close()


def close(self): def close(self):
'''Close the currently opened WARC''' '''Close the currently opened WARC'''


Loading…
取消
儲存