From 4c90bacaede78778b71d3ba4ab023d83523b785b Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 2 Jul 2020 02:01:30 +0000 Subject: [PATCH] Shield values in colons with angled brackets --- warc-tiny | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/warc-tiny b/warc-tiny index 3a45b47..0ec6204 100755 --- a/warc-tiny +++ b/warc-tiny @@ -4,8 +4,9 @@ # Operating modes: # warc-tiny colour FILES -- coloured output of the WARCs for easier reading # warc-tiny dump-responses [-m|--meta] FILES -- dump the HTTP response bodies to stdout -# With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:https://example.org/: foobar' +# With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:<https://example.org/>: foobar' # The record offset may be -1 if it is not known. +# The filename is wrapped in angled brackets if it contains a colon; the target URI is always wrapped in angled brackets (since it virtually always contains a colon). 
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests import base64 @@ -309,13 +310,15 @@ class DumpResponsesMode(ProcessMode): lines = buf.split(b'\n') self._buffer = lines.pop() # Since there's an explicit `_write(b'\r\n')` at the end of the record, this implicitly resets the buffer as well for line in lines: - sys.stdout.buffer.write(':'.join((self._filename, '-1', self._recordID, self._targetURI, '')).encode('utf-8')) + sys.stdout.buffer.write(':'.join((self._filename, '-1', self._recordID, '<' + self._targetURI + '>', '')).encode('utf-8')) sys.stdout.buffer.write(line) sys.stdout.buffer.write(b'\n') def process_event(self, event): if type(event) is NewFile: self._filename = event.filename + if ':' in self._filename: + self._filename = '<' + self._filename + '>' elif type(event) is BeginOfRecord: warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type') warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')