From 73877ecb963ac6f8ab5856042885854727120c25 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 23 Dec 2017 02:25:19 +0100 Subject: [PATCH 1/3] Initial commit --- README.md | 13 +++++++++++++ warc-peek.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 README.md create mode 100644 warc-peek.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..8ff46e6 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +`warc-peek.py` is a small script to help look into gzipped WARC files without decompressing the entire file. It searches a window in the file for gzip's magic bytes `1F 8B`, attempts decompression, compares the result to the expected beginning of a WARC record, and prints all valid offsets. These can then be used with e.g. `tail` and `zless` to actually look at the records. + +# Usage + + warc-peek.py WARCFILE OFFSET LENGTH + +Opens `WARCFILE`, reads `LENGTH` bytes starting at `OFFSET` (zero-based), and prints valid WARC record offsets to stdout (one integer per line). + +# Caveats + +* This script only works with WARCs in which each record is compressed individually. This is what the specification recommends and what most tools should generate by default, but there definitely exist valid compressed WARCs which can't be processed in this way. +* When you want to use `tail -c+OFFSET WARCFILE | zless` to look at the records, keep in mind that `tail` uses one-based indices, i.e. you will have to add one to the indices returned by `warc-peek.py`. +* `warc-peek.py` will miss valid record offsets in the last 512 bytes of the window. This is because a certain length of the compressed data is necessary to be able to decompress it. `warc-peek.py` uses 512 bytes for this and will therefore not attempt decompression when `1F 8B` is found in the last 512 bytes of the window. You can increase `LENGTH` to compensate for this if necessary.
#!/usr/bin/env python3
"""Locate individually gzipped WARC records inside a window of a .warc.gz file.

A window of the file is scanned for the gzip magic bytes ``1F 8B``. At every
hit a short trial decompression is attempted, and the absolute file offset is
reported when the decompressed data begins like a WARC record.
"""

import argparse
import logging
import zlib


logger = logging.getLogger('warc-peek')

# The two bytes every gzip member starts with.
_GZIP_MAGIC = b'\x1f\x8b'

# Number of compressed bytes handed to zlib for each candidate. This is
# generous, but it ensures that enough input is available to decompress the
# beginning of a record. Candidates with fewer bytes left in the window are
# not tried at all.
_PROBE_LENGTH = 512

# Version lines that may open a WARC record. Kept as a tuple so that
# bytes.startswith() can test all of them in a single call.
_WARC_VERSION_LINES = (b'WARC/1.0\r\n', b'WARC/1.1\r\n')


def finditer(b, sub):
    """Yield the index of every occurrence of ``sub`` in ``b``, ascending.

    The search resumes one byte after the start of the previous match, so
    overlapping occurrences are reported as well.
    """
    pos = b.find(sub)
    while pos >= 0:
        yield pos
        pos = b.find(sub, pos + 1)


def find_offsets(warcfile, offset, length):
    """Yield absolute offsets of gzipped WARC records found in a file window.

    Args:
        warcfile: Path of the file to inspect; it is opened in binary mode.
        offset: Zero-based byte offset at which the window starts.
        length: Number of bytes to read starting at ``offset``.

    Yields:
        ``offset`` plus the position inside the window of each gzip member
        whose decompressed data starts with a known WARC version line.

    Raises:
        OSError: If the file cannot be opened or read. Because this is a
            generator, the error surfaces on the first iteration.

    Candidates located in the last 512 bytes of the window are never tried,
    so a record starting there is not reported.
    """
    with open(warcfile, 'rb') as fp:
        fp.seek(offset)
        buffer = fp.read(length)

    logger.debug('Buffer length: %d', len(buffer))
    # Highest window position that still has _PROBE_LENGTH bytes after it.
    last_probe_start = len(buffer) - _PROBE_LENGTH
    for pos in finditer(buffer, _GZIP_MAGIC):
        logger.debug('Trying relative offset %d', pos)
        if pos > last_probe_start:
            # Positions only grow from here on, so nothing further can fit.
            break
        # "MAX_WBITS | 32" lets zlib detect the gzip header automatically.
        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
        try:
            dec = decompressor.decompress(buffer[pos:pos + _PROBE_LENGTH])
        except zlib.error:
            # The magic bytes were a coincidence, not the start of a member.
            continue
        logger.debug('First 100 bytes of decompressed data: %r', dec[:100])
        if dec.startswith(_WARC_VERSION_LINES):
            yield offset + pos


def main():
    """Parse the command line and print one record offset per line."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true', help='Enable debug output')
    parser.add_argument('warcfile', help='A .warc.gz file')
    parser.add_argument('offset', type=int, help='Zero-based byte offset of the window')
    parser.add_argument('length', type=int, help='Length in bytes of the window')
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(
            format='{asctime} {levelname} {name} {message}',
            style='{',
            level=logging.DEBUG,
        )
    for offset in find_offsets(args.warcfile, args.offset, args.length):
        print(offset)


if __name__ == '__main__':
    main()