Add options to pass the URL context out through warc-dump-responses and http-response-bodies

1 year ago · 761606a5be
--- a/http-response-bodies.c
+++ b/http-response-bodies.c
@@ -21,10 +21,10 @@
 #endif

 int main(int argc, char* argv[]) {
 	//TODO --meta or a similar way to get something like that?

 	// Read stdin, decode HTTP responses, dump all bodies to stdout.
 	// stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
 	// One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
 	// If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF.
 	// Headers and chunk lines must fit into BUFSIZE.
 	// Does not fully comply with the HTTP spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
 	char buf[2 * BUFSIZE];
@@ -37,6 +37,13 @@ int main(int argc, char* argv[]) {
 	size_t nscan;
 	size_t bytes_read;
 	size_t length;
 	bool html_fake_base = false;
 	char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
 	size_t urllen;

 	if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
 		html_fake_base = true;
 	}

 	while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
 		bufp = buf;
@@ -50,6 +57,54 @@ checkstate:
 		}
 		DEBUG_PRINTF("State: %d\n", state);
 		if (state == STATE_HEADERS) {
 			if (n < 9) {
 				fprintf(stderr, "Error: too little data before HTTP headers\n");
 				return 1;
 			}
 			// Handle optional URL + Length line
 			url = NULL;
 			if (memcmp(bufp, "HTTP/1.1 ", 9) != 0) {
 				DEBUG_PRINTF("No HTTP header, looking for URL line\n");
 				m0 = memmem(bufp, n, "\n", 1);
 				if (!m0 || m0 == bufp) {
 					fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n");
 					return 1;
 				}
 				m1 = m0;
 				// Skip over length field, which we don't need.
 				--m0;
 				while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0;
 				if (*m0 != ' ') {
 					fprintf(stderr, "Error: URL line has unexpected format\n");
 					return 1;
 				}
 				// Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace.
 				url = bufp;
 				urllen = m0 - bufp;
 				if (!memmem(url, urllen, "://", 3)) {
 					fprintf(stderr, "Error: URL line contains no scheme\n");
 					return 1;
 				}
 				m0 = url;
 				while (m0 < bufp + urllen && *m0 != '\r' && *m0 != '\n' && *m0 != ' ' && *m0 != '\t') ++m0;
 				if (m0 != bufp + urllen) {
 					fprintf(stderr, "Error: URL contains CR, LF, or whitespace\n");
 					return 1;
 				}
 				DEBUG_PRINTF("Found URL: ");
 				for (int i = 0; i < (urllen > 64 ? 64 : urllen); ++i) DEBUG_PRINTF(isprint(*(url + i)) ? "%c" : "\\x%02x", *(url + i) & 0xFF);
 				if (urllen > 64) DEBUG_PRINTF("<...>");
 				DEBUG_PRINTF("\n");
 				// Skip over URL line and continue processing below
 				DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 1 - bufp);
 				n = n - (m1 + 1 - bufp);
 				bufp = m1 + 1;
 			}
 			if (html_fake_base && !url) {
 				fprintf(stderr, "Error: --html-fake-base requires URL lines\n");
 				return 1;
 			}

 			if (n < 9) {
 				fprintf(stderr, "Error: too little data before HTTP headers\n");
 				return 1;
@@ -120,6 +175,26 @@ checkstate:
 					state = STATE_CHUNK_LINE;
 				}

 				if (html_fake_base) {
 					m0 = memmem(bufp, n, "\r\nContent-Type:", 15);
 					if (m0 && m0 < eoh) {
 						DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
 						m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
 						if (!m1) {
 							fprintf(stderr, "Error: CRLF after Content-Type missing\n");
 							return 1;
 						}
 						m0 += 15;
 						while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
 						if (memcmp(m0, "text/html", 9) == 0) {
 							DEBUG_PRINTF("Is HTML response, inserting fake base tag\n");
 							fprintf(stdout, "<base href=\"");
 							fwrite(url, 1, urllen, stdout);
 							fprintf(stdout, "\">\n");
 						}
 					}
 				}

 				DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp);
 				n = n - (eoh - bufp);
 				bufp = eoh;
--- a/warc-dump-responses.c
+++ b/warc-dump-responses.c
@@ -20,10 +20,9 @@
 #endif

 int main(int argc, char* argv[]) {
 	//TODO --meta or a similar way to get something like that?

 	// Read stdin, decode WARC, dump all response record bodies to stdout.
 	// One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
 	// If the --meta option is given, one line is printed before each record consisting of the WARC-Target-URI, a space, the record length in bytes in decimal notation, and a LF.
 	// Headers must fit into BUFSIZE.
 	// Does not fully comply with the WARC spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
 	char buf[2 * BUFSIZE];
@@ -35,6 +34,11 @@ int main(int argc, char* argv[]) {
 	size_t record_bytes_read;
 	size_t record_length;
 	size_t nscan;
 	bool meta = false;

 	if (argc == 2 && strcmp(argv[1], "--meta") == 0) {
 		meta = true;
 	}

 	while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
 		bufp = buf;
@@ -104,6 +108,32 @@ checkstate:
 					state = STATE_OTHER_RECORD;
 				}

 				if (meta && state == STATE_RESPONSE_RECORD) {
 					m0 = memmem(bufp, n, "\r\nWARC-Target-URI:", 18);
 					if (!m0) {
 						fprintf(stderr, "Error: WARC-Target-URI missing\n");
 						return 1;
 					}
 					DEBUG_PRINTF("Found WARC-Target-URI header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
 					m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
 					if (!m1) {
 						fprintf(stderr, "Error: CRLF after WARC-Target-URI missing\n");
 						return 1;
 					}
 					m0 += 18;
 					while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
 					DEBUG_PRINTF("WARC-Target-URI value starts at %p (offset %zu)\n", (void*)m0, m0 - bufp);
 					--m1;
 					while (m1 > m0 && (*m1 == ' ' || *m1 == '\t')) --m1;
 					DEBUG_PRINTF("WARC-Target-URI value ends at %p (offset %zu)\n", (void*)(m1 + 1), m1 + 1 - bufp);
 					if (m1 <= m0) {
 						fprintf(stderr, "Error: empty WARC-Target-URI\n");
 						return 1;
 					}
 					fwrite(m0, 1, m1 + 1 - m0, stdout);
 					fprintf(stdout, " %zu\n", record_length);
 				}

 				m0 = memmem(bufp, n, "\r\n\r\n", 4);
 				if (!m0) {
 					fprintf(stderr, "Error: end of headers not found\n");