Browse Source

Refactor

master
JustAnotherArchivist 6 years ago
parent
commit
96a329578e
1 changed files with 42 additions and 18 deletions
  1. +42
    -18
      warc-peek.py

+ 42
- 18
warc-peek.py View File

@@ -1,8 +1,13 @@
import os
import sys
#!/usr/bin/env python3

import argparse
import logging
import zlib


logger = logging.getLogger('warc-peek')


def finditer(b, sub):
pos = 0
while True:
@@ -13,19 +18,38 @@ def finditer(b, sub):
pos += 1


# Legacy command-line flow: sys.argv[1] is the file to inspect, sys.argv[2] the
# byte offset where the window starts, and sys.argv[3] the window length.
with open(sys.argv[1], 'rb') as fp:
    fp.seek(int(sys.argv[2]), os.SEEK_SET)
    buffer = fp.read(int(sys.argv[3]))

# Every occurrence of the gzip magic bytes is a candidate member start.
for pos in finditer(buffer, b'\x1f\x8b'):
    if pos > len(buffer) - 512:  # 512 bytes might be a bit too much, but at least it ensures that the decompression will work.
        break
    try:
        # MAX_WBITS | 32 lets zlib auto-detect the gzip/zlib header.
        dec = zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(buffer[pos:pos+512])
    except zlib.error:
        # Not a valid compressed stream at this position; only decompression
        # failures are skipped so that Ctrl-C and real bugs still surface.
        continue
    if dec.startswith(b'WARC/1.0\r\n'):
        # Report the absolute offset within the file.
        print(int(sys.argv[2]) + pos)
def find_offsets(warcfile, offset, length):
    """Yield absolute file offsets at which a compressed WARC record begins.

    Reads ``length`` bytes of ``warcfile`` starting at byte ``offset`` and
    probes every occurrence of the gzip magic bytes inside that window. A
    candidate is reported when the first 512 bytes from it decompress to data
    that starts with the WARC/1.0 version line.

    Candidates with fewer than 512 bytes remaining in the window are not
    probed, so record starts very close to the end of the window are missed.

    Args:
        warcfile: Path of the file to inspect.
        offset: Zero-based byte offset where the window starts.
        length: Number of bytes to read for the window.

    Yields:
        ``offset + pos`` for each matching position ``pos`` in the window.
    """
    with open(warcfile, 'rb') as fp:
        fp.seek(offset)
        buffer = fp.read(length)

    logger.debug('Buffer length: %d', len(buffer))
    for pos in finditer(buffer, b'\x1f\x8b'):
        logger.debug('Trying relative offset %d', pos)
        if pos > len(buffer) - 512:  # 512 bytes might be a bit too much, but at least it ensures that the decompression will work.
            break
        try:
            # MAX_WBITS | 32 lets zlib auto-detect the gzip/zlib header.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(buffer[pos:pos+512])
        except zlib.error:
            # The magic bytes occurred by chance inside other data. Only
            # decompression failures are skipped; KeyboardInterrupt and
            # genuine programming errors are allowed to propagate.
            continue
        logger.debug('First 100 bytes of decompressed data: %r', dec[:100])
        if dec.startswith(b'WARC/1.0\r\n'):
            yield offset + pos


if __name__ == '__main__':
    # Command-line interface: an optional debug switch followed by the file
    # and the two integer values describing the window to scan.
    cli = argparse.ArgumentParser()
    cli.add_argument('--debug', help='Enable debug output', action='store_true')
    cli.add_argument('warcfile', help='A .warc.gz file')
    for positional, description in (
        ('offset', 'Zero-based byte offset of the window'),
        ('length', 'Length in bytes of the window'),
    ):
        cli.add_argument(positional, help=description, type=int)
    options = cli.parse_args()

    # Logging is only configured when debug output was requested.
    if options.debug:
        logging.basicConfig(level=logging.DEBUG, style='{', format='{asctime} {levelname} {name} {message}')

    # Print each discovered offset on its own line as soon as it is found.
    for record_offset in find_offsets(options.warcfile, options.offset, options.length):
        print(record_offset)

Loading…
Cancel
Save