//TODO --meta or a similar way to get something like that?
// Read stdin, decode HTTP responses, dump all bodies to stdout.
// stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
// One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
// If --html-fake-base is provided and the input contains URL data, every HTML response (i.e. one with a 'Content-Type: text/html' header) is prefixed with a single LF-terminated line containing a fake <base> tag: <base href="URL">.
// Headers and chunk lines must fit into BUFSIZE.
// Does not fully comply with the HTTP spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
char buf[2 * BUFSIZE];
@@ -37,6 +37,13 @@ int main(int argc, char* argv[]) {
size_t nscan;
size_t bytes_read;
size_t length;
bool html_fake_base = false;
char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
size_t urllen;
if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
html_fake_base = true;
}
while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
bufp = buf;
@@ -50,6 +57,54 @@ checkstate:
}
DEBUG_PRINTF("State: %d\n", state);
if (state == STATE_HEADERS) {
if (n < 9) {
fprintf(stderr, "Error: too little data before HTTP headers\n");
return 1;
}
// Handle optional URL + Length line
url = NULL;
if (memcmp(bufp, "HTTP/1.1 ", 9) != 0) {
DEBUG_PRINTF("No HTTP header, looking for URL line\n");
m0 = memmem(bufp, n, "\n", 1);
if (!m0 || m0 == bufp) {
fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n");
//TODO --meta or a similar way to get something like that?
// Read stdin, decode WARC, dump all response record bodies to stdout.
// One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
// If the --meta option is given, each record is preceded by one line consisting of the WARC-Target-URI, a space, the record length in bytes in decimal notation, and a LF.
// Headers must fit into BUFSIZE.
// Does not fully comply with the WARC spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
char buf[2 * BUFSIZE];
@@ -35,6 +34,11 @@ int main(int argc, char* argv[]) {
size_t record_bytes_read;
size_t record_length;
size_t nscan;
bool meta = false;
if (argc == 2 && strcmp(argv[1], "--meta") == 0) {