From 9879db1195172e94e18f5054da6680c2db35c83c Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 3 May 2023 04:18:11 +0000 Subject: [PATCH] Proper HTTP/1.0 support HTTP/1.0 does not mandate a Content-Length header in responses since keep-alive connections aren't a thing; the connection closure then signals the end of the response. This change requires the URL metadata line for processing HTTP/1.0 data, so it plays well with `warc-dump-responses --meta`. --- http-response-bodies.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/http-response-bodies.c b/http-response-bodies.c index 1ac302d..f7184eb 100644 --- a/http-response-bodies.c +++ b/http-response-bodies.c @@ -68,6 +68,8 @@ int main(int argc, char* argv[]) { bool html_fake_base = false; char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below. size_t urllen; + bool have_response_length = false; + size_t response_length; if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) { html_fake_base = true; @@ -89,9 +91,10 @@ checkstate: fprintf(stderr, "Error: too little data before HTTP headers\n"); return 1; } - // Handle optional URL + Length line + // Handle URL + Length line; optional for HTTP/1.1 but required for HTTP/1.0 url = NULL; urllen = 0; + response_length = 0; if (memcmp(bufp, "HTTP/1.1 ", 9) != 0 && memcmp(bufp, "HTTP/1.0 ", 9) != 0) { DEBUG_PRINTF("No HTTP header, looking for URL line\n"); m0 = memmem(bufp, n, "\n", 1); @@ -100,13 +103,24 @@ checkstate: return 1; } m1 = m0; - // Skip over length field, which we don't need. + // Skip back over length field --m0; while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0; if (*m0 != ' ') { fprintf(stderr, "Error: URL line has unexpected format\n"); return 1; } + // Read length + if (sscanf(m0, " %zu%ln", &response_length, &nscan) <= 0) { + fprintf(stderr, "Error: URL line contains no length\n"); + return 1; + } + if (nscan != m1 - m0) { + fprintf(stderr, "Error: URL line length read mismatch\n"); + return 1; + } + have_response_length = true; + DEBUG_PRINTF("Response length: %zu\n", response_length); // Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace. url = bufp; urllen = m0 - bufp; @@ -133,6 +147,10 @@ checkstate: fprintf(stderr, "Error: --html-fake-base requires URL lines\n"); return 1; } + if (memcmp(bufp, "HTTP/1.0 ", 9) == 0 && !have_response_length) { + fprintf(stderr, "Error: HTTP/1.0 requires URL metadata lines\n"); + return 1; + } if (n < 9) { fprintf(stderr, "Error: too little data before HTTP headers\n"); @@ -148,8 +166,15 @@ checkstate: eoh += 4; DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp); - m0 = memcasemem(bufp, eoh - bufp, "\r\ncontent-length:", 17); - if (m0 && m0 < eoh) { + if (memcmp(bufp, "HTTP/1.0 ", 9) == 0) { + // HTTP 1.0 doesn't have TE, so just use the response_length for the content length and skip the other header parsing + if (bufp + response_length < eoh) { + fprintf(stderr, "Error: end of headers occurs after alleged response length\n"); + return 1; + } + length = response_length - (eoh - bufp); + state = STATE_BODY; + } else if ((m0 = memcasemem(bufp, eoh - bufp, "\r\ncontent-length:", 17)) && m0 < eoh) { DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp); m1 = memmem(m0 + 1, eoh - (m0 + 1), "\r\n", 2); if (!m1) {