Browse Source

Replace memmove with pointer arithmetic

master
JustAnotherArchivist 1 year ago
parent
commit
a432631d9b
1 changed file with 27 additions and 24 deletions
  1. +27
    -24
      warc-dump-responses.c

+ 27
- 24
warc-dump-responses.c View File

@@ -27,6 +27,7 @@ int main(int argc, char* argv[]) {
char buf[2 * BUFSIZE]; char buf[2 * BUFSIZE];
size_t n; size_t n;
int state = STATE_BEFORE_RECORD; int state = STATE_BEFORE_RECORD;
char* bufp;
char* m0; char* m0;
char* m1; char* m1;
size_t record_bytes_read; size_t record_bytes_read;
@@ -34,8 +35,9 @@ int main(int argc, char* argv[]) {
size_t nscan; size_t nscan;


while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) { while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
bufp = buf;
checkstate: checkstate:
DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)&buf);
DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
if (n == 0) { if (n == 0) {
break; break;
} }
@@ -45,50 +47,50 @@ checkstate:
fprintf(stderr, "Error: too little data before WARC headers\n"); fprintf(stderr, "Error: too little data before WARC headers\n");
return 1; return 1;
} }
if (memcmp(buf, "WARC/1.0\r\n", 10) == 0 || memcmp(buf, "WARC/1.1\r\n", 10) == 0) {
if (memcmp(bufp, "WARC/1.0\r\n", 10) == 0 || memcmp(bufp, "WARC/1.1\r\n", 10) == 0) {
// Got some headers; find the record type, content length, and end of headers // Got some headers; find the record type, content length, and end of headers
m0 = memmem(buf, n, "\r\nContent-Length:", 17);
m0 = memmem(bufp, n, "\r\nContent-Length:", 17);
if (!m0) { if (!m0) {
fprintf(stderr, "Error: Content-Length missing\n"); fprintf(stderr, "Error: Content-Length missing\n");
return 1; return 1;
} }
DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - buf);
m1 = memmem(m0 + 1, n - (m0 + 1 - buf), "\r\n", 2);
DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
if (!m1) { if (!m1) {
fprintf(stderr, "Error: CRLF after Content-Length missing\n"); fprintf(stderr, "Error: CRLF after Content-Length missing\n");
return 1; return 1;
} }
m0 += 17; m0 += 17;
while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
if (!sscanf(m0, "%zu%n", &record_length, &nscan)) { if (!sscanf(m0, "%zu%n", &record_length, &nscan)) {
fprintf(stderr, "Error: invalid Content-Length\n"); fprintf(stderr, "Error: invalid Content-Length\n");
return 1; return 1;
} }
if (nscan > n - (m0 - buf)) {
if (nscan > n - (m0 - bufp)) {
fprintf(stderr, "Error: buffer overread\n"); fprintf(stderr, "Error: buffer overread\n");
return 1; return 1;
} }
m0 += nscan; m0 += nscan;
while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
if (m0 != m1) { if (m0 != m1) {
fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n"); fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
return 1; return 1;
} }
DEBUG_PRINTF("Record body length: %zu\n", record_length); DEBUG_PRINTF("Record body length: %zu\n", record_length);


m0 = memmem(buf, n, "\r\nWARC-Type:", 12);
m0 = memmem(bufp, n, "\r\nWARC-Type:", 12);
if (!m0) { if (!m0) {
fprintf(stderr, "Error: WARC-Type missing\n"); fprintf(stderr, "Error: WARC-Type missing\n");
return 1; return 1;
} }
DEBUG_PRINTF("Found WARC-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - buf);
m1 = memmem(m0, n - (m0 - buf), "\r\n", 2);
DEBUG_PRINTF("Found WARC-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
m1 = memmem(m0, n - (m0 - bufp), "\r\n", 2);
if (!m1) { if (!m1) {
fprintf(stderr, "Error: CRLF after WARC-Type missing\n"); fprintf(stderr, "Error: CRLF after WARC-Type missing\n");
return 1; return 1;
} }
m0 += 12; m0 += 12;
while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
if (memcmp(m0, "response", 8) == 0) { if (memcmp(m0, "response", 8) == 0) {
DEBUG_PRINTF("Response record\n"); DEBUG_PRINTF("Response record\n");
state = STATE_RESPONSE_RECORD; state = STATE_RESPONSE_RECORD;
@@ -97,18 +99,17 @@ checkstate:
state = STATE_OTHER_RECORD; state = STATE_OTHER_RECORD;
} }


m0 = memmem(buf, n, "\r\n\r\n", 4);
m0 = memmem(bufp, n, "\r\n\r\n", 4);
if (!m0) { if (!m0) {
fprintf(stderr, "Error: end of headers not found\n"); fprintf(stderr, "Error: end of headers not found\n");
return 1; return 1;
} }
m0 += 4; m0 += 4;
DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)m0, m0 - buf);
DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)m0, m0 - bufp);


//TODO Replace all the memmove business with pointer logic to avoid needless constant memory copying (is more wrooom).
DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", n - (m0 - buf), m0 - buf);
memmove(buf, m0, n - (m0 - buf));
n = n - (m0 - buf);
DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m0 - bufp);
n = n - (m0 - bufp);
bufp = m0;
record_bytes_read = 0; record_bytes_read = 0;
goto checkstate; goto checkstate;
} else { } else {
@@ -121,7 +122,7 @@ checkstate:
DEBUG_PRINTF("Partial record\n"); DEBUG_PRINTF("Partial record\n");
if (state == STATE_RESPONSE_RECORD) { if (state == STATE_RESPONSE_RECORD) {
DEBUG_PRINTF("Copying %zu bytes to stdout\n", n); DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
fwrite(buf, 1, n, stdout);
fwrite(bufp, 1, n, stdout);
} }
record_bytes_read += n; record_bytes_read += n;
DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length); DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length);
@@ -130,18 +131,20 @@ checkstate:
DEBUG_PRINTF("Full record\n"); DEBUG_PRINTF("Full record\n");
if (state == STATE_RESPONSE_RECORD) { if (state == STATE_RESPONSE_RECORD) {
DEBUG_PRINTF("Copying %zu bytes to stdout\n", record_length - record_bytes_read); DEBUG_PRINTF("Copying %zu bytes to stdout\n", record_length - record_bytes_read);
fwrite(buf, 1, record_length - record_bytes_read, stdout);
fwrite(bufp, 1, record_length - record_bytes_read, stdout);
fprintf(stdout, "\n"); fprintf(stdout, "\n");
} }
if (memcmp(buf + record_length - record_bytes_read, "\r\n\r\n", 4) != 0) {
if (memcmp(bufp + record_length - record_bytes_read, "\r\n\r\n", 4) != 0) {
fprintf(stderr, "Error: end of block not found\n"); fprintf(stderr, "Error: end of block not found\n");
return 1; return 1;
} }
DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", n - (record_length + 4 - record_bytes_read), record_length + 4 - record_bytes_read);
memmove(buf, buf + record_length + 4 - record_bytes_read, n - (record_length + 4 - record_bytes_read));
DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", record_length + 4 - record_bytes_read);
n = n - (record_length + 4 - record_bytes_read); n = n - (record_length + 4 - record_bytes_read);
bufp = bufp + record_length + 4 - record_bytes_read;
if (n < BUFSIZE) { if (n < BUFSIZE) {
DEBUG_PRINTF("Refilling buffer\n");
DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
memmove(buf, bufp, n);
bufp = buf;
n += fread(buf + n, 1, BUFSIZE, stdin); n += fread(buf + n, 1, BUFSIZE, stdin);
} }
state = STATE_BEFORE_RECORD; state = STATE_BEFORE_RECORD;


Loading…
Cancel
Save