Browse Source

Shield values containing colons with angle brackets

master
JustAnotherArchivist 5 months ago
parent
commit
4c90bacaed
1 changed file with 5 additions and 2 deletions
  1. +5
    -2
      warc-tiny

+ 5
- 2
warc-tiny View File

@@ -4,8 +4,9 @@
# Operating modes:
# warc-tiny colour FILES -- coloured output of the WARCs for easier reading
# warc-tiny dump-responses [-m|--meta] FILES -- dump the HTTP response bodies to stdout
# With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:https://example.org/: foobar'
# With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:<https://example.org/>: foobar'
# The record offset may be -1 if it is not known.
# The filename is wrapped in angle brackets if it contains a colon; the target URI is always wrapped in angle brackets (since it virtually always contains a colon).
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests

import base64
@@ -309,13 +310,15 @@ class DumpResponsesMode(ProcessMode):
lines = buf.split(b'\n')
self._buffer = lines.pop() # Since there's an explicit `_write(b'\r\n')` at the end of the record, this implicitly resets the buffer as well
for line in lines:
sys.stdout.buffer.write(':'.join((self._filename, '-1', self._recordID, self._targetURI, '')).encode('utf-8'))
sys.stdout.buffer.write(':'.join((self._filename, '-1', self._recordID, '<' + self._targetURI + '>', '')).encode('utf-8'))
sys.stdout.buffer.write(line)
sys.stdout.buffer.write(b'\n')

def process_event(self, event):
if type(event) is NewFile:
self._filename = event.filename
if ':' in self._filename:
self._filename = '<' + self._filename + '>'
elif type(event) is BeginOfRecord:
warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')


Loading…
Cancel
Save