Browse Source

Proper HTTP/1.0 support

HTTP/1.0 does not mandate a Content-Length header in responses, because it has no standard keep-alive (persistent) connections; when the header is absent, the closing of the connection signals the end of the response.
This change requires the URL metadata line for processing HTTP/1.0 data, so it plays well with `warc-dump-responses --meta`.
master
JustAnotherArchivist 1 year ago
parent
commit
9879db1195
1 changed file with 29 additions and 4 deletions
  1. +29
    -4
      http-response-bodies.c

+ 29
- 4
http-response-bodies.c View File

@@ -68,6 +68,8 @@ int main(int argc, char* argv[]) {
bool html_fake_base = false;
char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
size_t urllen;
bool have_response_length = false;
size_t response_length;

if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
html_fake_base = true;
@@ -89,9 +91,10 @@ checkstate:
fprintf(stderr, "Error: too little data before HTTP headers\n");
return 1;
}
// Handle optional URL + Length line
// Handle URL + Length line; optional for HTTP/1.1 but required for HTTP/1.0
url = NULL;
urllen = 0;
response_length = 0;
if (memcmp(bufp, "HTTP/1.1 ", 9) != 0 && memcmp(bufp, "HTTP/1.0 ", 9) != 0) {
DEBUG_PRINTF("No HTTP header, looking for URL line\n");
m0 = memmem(bufp, n, "\n", 1);
@@ -100,13 +103,24 @@ checkstate:
return 1;
}
m1 = m0;
// Skip over length field, which we don't need.
// Skip back over length field
--m0;
while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0;
if (*m0 != ' ') {
fprintf(stderr, "Error: URL line has unexpected format\n");
return 1;
}
// Read length
if (sscanf(m0, " %zu%ln", &response_length, &nscan) <= 0) {
fprintf(stderr, "Error: URL line contains no length\n");
return 1;
}
if (nscan != m1 - m0) {
fprintf(stderr, "Error: URL line length read mismatch\n");
return 1;
}
have_response_length = true;
DEBUG_PRINTF("Response length: %zu\n", response_length);
// Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace.
url = bufp;
urllen = m0 - bufp;
@@ -133,6 +147,10 @@ checkstate:
fprintf(stderr, "Error: --html-fake-base requires URL lines\n");
return 1;
}
if (memcmp(bufp, "HTTP/1.0 ", 9) == 0 && !have_response_length) {
fprintf(stderr, "Error: HTTP/1.0 requires URL metadata lines\n");
return 1;
}

if (n < 9) {
fprintf(stderr, "Error: too little data before HTTP headers\n");
@@ -148,8 +166,15 @@ checkstate:
eoh += 4;
DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);

m0 = memcasemem(bufp, eoh - bufp, "\r\ncontent-length:", 17);
if (m0 && m0 < eoh) {
if (memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
// HTTP 1.0 doesn't have TE, so just use the response_length for the content length and skip the other header parsing
if (bufp + response_length < eoh) {
fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
return 1;
}
length = response_length - (eoh - bufp);
state = STATE_BODY;
} else if ((m0 = memcasemem(bufp, eoh - bufp, "\r\ncontent-length:", 17)) && m0 < eoh) {
DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
m1 = memmem(m0 + 1, eoh - (m0 + 1), "\r\n", 2);
if (!m1) {


Loading…
Cancel
Save