The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

157 lines
5.2 KiB

  1. #define _GNU_SOURCE
  2. #include <stdbool.h>
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <string.h>
  /* Size of one read chunk in bytes; override at build time with -DBUFSIZE=<n>. */
  #if !defined(BUFSIZE)
  #define BUFSIZE 1048576
  #endif

  /* Parser states: waiting for record headers, or inside a record body. */
  enum {
      STATE_BEFORE_RECORD = 0,   /* expecting a "WARC/1.x" version line next */
      STATE_RESPONSE_RECORD = 1, /* inside a response record body (copied to stdout) */
      STATE_OTHER_RECORD = 2     /* inside any other record body (skipped) */
  };

  /* printf-style tracing to stderr; expands to a no-op unless built with -DDEBUG. */
  #if !defined(DEBUG)
  #define DEBUG_PRINTF(...) ((void)0)
  #else
  #define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
  #endif
  17. int main(int argc, char* argv[]) {
  18. //TODO --meta or a similar way to get something like that?
  19. // Read stdin, decode WARC, dump all response record bodies to stdout.
  20. // One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
  21. // Headers must fit into BUFSIZE.
  22. char buf[2 * BUFSIZE];
  23. size_t n;
  24. int state = STATE_BEFORE_RECORD;
  25. char* m0;
  26. char* m1;
  27. size_t record_bytes_read;
  28. size_t record_length;
  29. size_t nscan;
  30. while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
  31. checkstate:
  32. DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)&buf);
  33. if (n == 0) {
  34. break;
  35. }
  36. DEBUG_PRINTF("State: %d\n", state);
  37. if (state == STATE_BEFORE_RECORD) {
  38. if (n < 10) {
  39. fprintf(stderr, "Error: too little data before WARC headers\n");
  40. return 1;
  41. }
  42. if (memcmp(buf, "WARC/1.0\r\n", 10) == 0 || memcmp(buf, "WARC/1.1\r\n", 10) == 0) {
  43. // Got some headers; find the record type, content length, and end of headers
  44. m0 = memmem(buf, n, "\r\nContent-Length:", 17);
  45. if (!m0) {
  46. fprintf(stderr, "Error: Content-Length missing\n");
  47. return 1;
  48. }
  49. DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - buf);
  50. m1 = memmem(m0 + 1, n - (m0 + 1 - buf), "\r\n", 2);
  51. if (!m1) {
  52. fprintf(stderr, "Error: CRLF after Content-Length missing\n");
  53. return 1;
  54. }
  55. m0 += 17;
  56. while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  57. if (!sscanf(m0, "%zu%n", &record_length, &nscan)) {
  58. fprintf(stderr, "Error: invalid Content-Length\n");
  59. return 1;
  60. }
  61. if (nscan > n - (m0 - buf)) {
  62. fprintf(stderr, "Error: buffer overread\n");
  63. return 1;
  64. }
  65. m0 += nscan;
  66. while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  67. if (m0 != m1) {
  68. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  69. return 1;
  70. }
  71. DEBUG_PRINTF("Record body length: %zu\n", record_length);
  72. m0 = memmem(buf, n, "\r\nWARC-Type:", 12);
  73. if (!m0) {
  74. fprintf(stderr, "Error: WARC-Type missing\n");
  75. return 1;
  76. }
  77. DEBUG_PRINTF("Found WARC-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - buf);
  78. m1 = memmem(m0, n - (m0 - buf), "\r\n", 2);
  79. if (!m1) {
  80. fprintf(stderr, "Error: CRLF after WARC-Type missing\n");
  81. return 1;
  82. }
  83. m0 += 12;
  84. while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  85. if (memcmp(m0, "response", 8) == 0) {
  86. DEBUG_PRINTF("Response record\n");
  87. state = STATE_RESPONSE_RECORD;
  88. } else {
  89. DEBUG_PRINTF("Other record\n");
  90. state = STATE_OTHER_RECORD;
  91. }
  92. m0 = memmem(buf, n, "\r\n\r\n", 4);
  93. if (!m0) {
  94. fprintf(stderr, "Error: end of headers not found\n");
  95. return 1;
  96. }
  97. m0 += 4;
  98. DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)m0, m0 - buf);
  99. //TODO Replace all the memmove business with pointer logic to avoid needless constant memory copying (is more wrooom).
  100. DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", n - (m0 - buf), m0 - buf);
  101. memmove(buf, m0, n - (m0 - buf));
  102. n = n - (m0 - buf);
  103. record_bytes_read = 0;
  104. goto checkstate;
  105. } else {
  106. fprintf(stderr, "Error: expected header line, got something else\n");
  107. return 1;
  108. }
  109. } else if (state == STATE_RESPONSE_RECORD || state == STATE_OTHER_RECORD) {
  110. if (record_length + 4 - record_bytes_read > n) {
  111. // Only got part of the record body
  112. DEBUG_PRINTF("Partial record\n");
  113. if (state == STATE_RESPONSE_RECORD) {
  114. DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
  115. fwrite(buf, 1, n, stdout);
  116. }
  117. record_bytes_read += n;
  118. DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length);
  119. } else {
  120. // Remainder of the record is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
  121. DEBUG_PRINTF("Full record\n");
  122. if (state == STATE_RESPONSE_RECORD) {
  123. DEBUG_PRINTF("Copying %zu bytes to stdout\n", record_length - record_bytes_read);
  124. fwrite(buf, 1, record_length - record_bytes_read, stdout);
  125. fprintf(stdout, "\n");
  126. }
  127. if (memcmp(buf + record_length - record_bytes_read, "\r\n\r\n", 4) != 0) {
  128. fprintf(stderr, "Error: end of block not found\n");
  129. return 1;
  130. }
  131. DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", n - (record_length + 4 - record_bytes_read), record_length + 4 - record_bytes_read);
  132. memmove(buf, buf + record_length + 4 - record_bytes_read, n - (record_length + 4 - record_bytes_read));
  133. n = n - (record_length + 4 - record_bytes_read);
  134. if (n < BUFSIZE) {
  135. DEBUG_PRINTF("Refilling buffer\n");
  136. n += fread(buf + n, 1, BUFSIZE, stdin);
  137. }
  138. state = STATE_BEFORE_RECORD;
  139. goto checkstate;
  140. }
  141. }
  142. }
  143. if (state != STATE_BEFORE_RECORD) {
  144. fprintf(stderr, "Error: incomplete record at the end of input\n");
  145. return 1;
  146. }
  147. }