From 761606a5be5a605af3333574bacd912baca2f233 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 9 Jan 2023 20:55:10 +0000 Subject: [PATCH] Add options to pass the URL context out through warc-dump-responses and http-response-bodies --- http-response-bodies.c | 79 ++++++++++++++++++++++++++++++++++++++++-- warc-dump-responses.c | 34 ++++++++++++++++-- 2 files changed, 109 insertions(+), 4 deletions(-) diff --git a/http-response-bodies.c b/http-response-bodies.c index 00f25ec..ad0691c 100644 --- a/http-response-bodies.c +++ b/http-response-bodies.c @@ -21,10 +21,10 @@ #endif int main(int argc, char* argv[]) { - //TODO --meta or a similar way to get something like that? - // Read stdin, decode HTTP responses, dump all bodies to stdout. + // stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses). // One LF is inserted at the end of each response to ensure that a new response always begins on a new line. + // If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF. // Headers and chunk lines must fit into BUFSIZE. // Does not fully comply with the HTTP spec. For example, headers must be capitalised canonically, and continuation lines are unsupported. char buf[2 * BUFSIZE]; @@ -37,6 +37,13 @@ int main(int argc, char* argv[]) { size_t nscan; size_t bytes_read; size_t length; + bool html_fake_base = false; + char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below. 
+ size_t urllen; + + if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) { + html_fake_base = true; + } while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) { bufp = buf; @@ -50,6 +57,54 @@ checkstate: } DEBUG_PRINTF("State: %d\n", state); if (state == STATE_HEADERS) { + if (n < 9) { + fprintf(stderr, "Error: too little data before HTTP headers\n"); + return 1; + } + // Handle optional URL + Length line + url = NULL; + if (memcmp(bufp, "HTTP/1.1 ", 9) != 0) { + DEBUG_PRINTF("No HTTP header, looking for URL line\n"); + m0 = memmem(bufp, n, "\n", 1); + if (!m0 || m0 == bufp) { + fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n"); + return 1; + } + m1 = m0; + // Skip over length field, which we don't need. + --m0; + while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0; + if (*m0 != ' ') { + fprintf(stderr, "Error: URL line has unexpected format\n"); + return 1; + } + // Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace. + url = bufp; + urllen = m0 - bufp; + if (!memmem(url, urllen, "://", 3)) { + fprintf(stderr, "Error: URL line contains no scheme\n"); + return 1; + } + m0 = url; + while (m0 < bufp + urllen && *m0 != '\r' && *m0 != '\n' && *m0 != ' ' && *m0 != '\t') ++m0; + if (m0 != bufp + urllen) { + fprintf(stderr, "Error: URL contains CR, LF, or whitespace\n"); + return 1; + } + DEBUG_PRINTF("Found URL: "); + for (int i = 0; i < (urllen > 64 ? 64 : urllen); ++i) DEBUG_PRINTF(isprint(*(url + i)) ? 
"%c" : "\\x%02x", *(url + i) & 0xFF); + if (urllen > 64) DEBUG_PRINTF("<...>"); + DEBUG_PRINTF("\n"); + // Skip over URL line and continue processing below + DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 1 - bufp); + n = n - (m1 + 1 - bufp); + bufp = m1 + 1; + } + if (html_fake_base && !url) { + fprintf(stderr, "Error: --html-fake-base requires URL lines\n"); + return 1; + } + if (n < 9) { fprintf(stderr, "Error: too little data before HTTP headers\n"); return 1; @@ -120,6 +175,26 @@ checkstate: state = STATE_CHUNK_LINE; } + if (html_fake_base) { + m0 = memmem(bufp, n, "\r\nContent-Type:", 15); + if (m0 && m0 < eoh) { + DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp); + m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2); + if (!m1) { + fprintf(stderr, "Error: CRLF after Content-Type missing\n"); + return 1; + } + m0 += 15; + while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0; + if (memcmp(m0, "text/html", 9) == 0) { + DEBUG_PRINTF("Is HTML response, inserting fake base tag\n"); + fprintf(stdout, "\n"); + } + } + } + DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp); n = n - (eoh - bufp); bufp = eoh; diff --git a/warc-dump-responses.c b/warc-dump-responses.c index 1384022..e2516b4 100644 --- a/warc-dump-responses.c +++ b/warc-dump-responses.c @@ -20,10 +20,9 @@ #endif int main(int argc, char* argv[]) { - //TODO --meta or a similar way to get something like that? - // Read stdin, decode WARC, dump all response record bodies to stdout. // One LF is inserted at the end of each response to ensure that a new record always begins on a new line. + // If the --meta option is given, one line is printed before each record consisting of the WARC-Target-URI, a space, the record length in bytes in decimal notation, and a LF. // Headers must fit into BUFSIZE. // Does not fully comply with the WARC spec. 
For example, headers must be capitalised canonically, and continuation lines are unsupported. char buf[2 * BUFSIZE]; @@ -35,6 +34,11 @@ int main(int argc, char* argv[]) { size_t record_bytes_read; size_t record_length; size_t nscan; + bool meta = false; + + if (argc == 2 && strcmp(argv[1], "--meta") == 0) { + meta = true; + } while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) { bufp = buf; @@ -104,6 +108,32 @@ checkstate: state = STATE_OTHER_RECORD; } + if (meta && state == STATE_RESPONSE_RECORD) { + m0 = memmem(bufp, n, "\r\nWARC-Target-URI:", 18); + if (!m0) { + fprintf(stderr, "Error: WARC-Target-URI missing\n"); + return 1; + } + DEBUG_PRINTF("Found WARC-Target-URI header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp); + m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2); + if (!m1) { + fprintf(stderr, "Error: CRLF after WARC-Target-URI missing\n"); + return 1; + } + m0 += 18; + while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0; + DEBUG_PRINTF("WARC-Target-URI value starts at %p (offset %zu)\n", (void*)m0, m0 - bufp); + --m1; + while (m1 > m0 && (*m1 == ' ' || *m1 == '\t')) --m1; + DEBUG_PRINTF("WARC-Target-URI value ends at %p (offset %zu)\n", (void*)(m1 + 1), m1 + 1 - bufp); + if (m1 <= m0) { + fprintf(stderr, "Error: empty WARC-Target-URI\n"); + return 1; + } + fwrite(m0, 1, m1 + 1 - m0, stdout); + fprintf(stdout, " %zu\n", record_length); + } + m0 = memmem(bufp, n, "\r\n\r\n", 4); if (!m0) { fprintf(stderr, "Error: end of headers not found\n");