Browse Source

Add --meta mode for dump-responses which prefixes each line with information about the file and record

master
JustAnotherArchivist 3 years ago
parent
commit
f51adccd3f
1 changed files with 38 additions and 5 deletions
  1. +38
    -5
      warc-tiny

+ 38
- 5
warc-tiny View File

@@ -3,7 +3,9 @@
# Tiny tool for WARC stuff.
# Operating modes:
# warc-tiny colour FILES -- coloured output of the WARCs for easier reading
# warc-tiny dump-responses FILES -- dump the HTTP response bodies to stdout
# warc-tiny dump-responses [-m|--meta] FILES -- dump the HTTP response bodies to stdout
# With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:https://example.org/: foobar'
# The record offset is printed as -1 when it is not known (currently always the case, since dump-responses does not track offsets).
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests

import base64
@@ -283,23 +285,54 @@ class VerifyMode(ProcessMode):


class DumpResponsesMode(ProcessMode):
    """Dump the bodies of HTTP response records to stdout.

    Without metadata, body chunks are written to stdout unchanged. With
    metadata (`-m`/`--meta`), the output is line-oriented: every line of a
    body is prefixed with `filename:offset:recordID:targetURI:`. The offset
    is always written as -1 because this mode does not know the record offset.
    """

    @classmethod
    def split_args(cls, args):
        """Separate this mode's own option from the remaining arguments.

        Returns a pair `(init_args, remaining_args)`. `init_args` is `(True,)`
        when the first argument is `-m` or `--meta` (which is then consumed)
        and `(False,)` otherwise.
        """
        # Guard against an empty argument list; indexing it would raise IndexError.
        if args and args[0] in ('-m', '--meta'):
            return (True,), args[1:]
        return (False,), args

    def __init__(self, withMeta):
        """Set up the per-record state; `withMeta` enables the per-line metadata prefix."""
        self._printEOR = False  # Whether a body chunk was written for the current record
        self._isResponse = False  # Whether the current record is an HTTP response record
        self._withMeta = withMeta
        if withMeta:
            # These are only needed (and only maintained) in metadata mode.
            self._recordID = None
            self._targetURI = None
            self._buffer = b''  # Trailing partial line that has not been terminated by LF yet

    def _write(self, data):
        """Write `data` (bytes) to stdout, adding the metadata prefix to each complete line if enabled."""
        if not self._withMeta:
            sys.stdout.buffer.write(data)
            return

        lines = (self._buffer + data).split(b'\n')
        # The last element is the (possibly empty) unterminated remainder.
        # Since there's an explicit `_write(b'\r\n')` at the end of the record, this implicitly resets the buffer as well.
        self._buffer = lines.pop()
        if not lines:
            # No complete line yet; nothing to print and no need to build the prefix.
            return

        # The prefix is identical for every line of this call, so build and encode it once.
        prefix = ':'.join((self._filename, '-1', self._recordID, self._targetURI, '')).encode('utf-8')
        out = sys.stdout.buffer
        for line in lines:
            out.write(prefix)
            out.write(line)
            out.write(b'\n')

    def process_event(self, event):
        """Update the record state from `event` and write HTTP response body data."""
        if type(event) is NewFile:
            self._filename = event.filename
        elif type(event) is BeginOfRecord:
            # A record may lack a Content-Type header; treat that as "not an HTTP response" instead of
            # letting next() raise StopIteration.
            warcContentType = next((x[1] for x in event.warcHeaders if x[0] == b'Content-Type'), b'')
            warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
            self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
            self._printEOR = False
            if self._withMeta:
                # Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
                self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
                self._targetURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
                self._buffer = b''
        elif type(event) is HTTPBodyChunk:
            if self._isResponse:
                self._printEOR = True
                self._write(event.data)
        elif type(event) is EndOfRecord:
            if self._printEOR:
                # Separates consecutive bodies; in metadata mode this also flushes the last partial line.
                self._write(b'\r\n')


class COLOURS:


Loading…
Cancel
Save