The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

157 lignes
5.2 KiB

  1. #define _GNU_SOURCE
  2. #include <stdbool.h>
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <string.h>
  6. #ifndef BUFSIZE
  7. #define BUFSIZE 1048576
  8. #endif
  9. #define STATE_BEFORE_RECORD 0
  10. #define STATE_RESPONSE_RECORD 1
  11. #define STATE_OTHER_RECORD 2
  12. #ifdef DEBUG
  13. #define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
  14. #else
  15. #define DEBUG_PRINTF(...) do {} while (false)
  16. #endif
  17. int main(int argc, char* argv[]) {
  18. //TODO --meta or a similar way to get something like that?
  19. // Read stdin, decode WARC, dump all response record bodies to stdout.
  20. // One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
  21. // Headers must fit into BUFSIZE.
  22. char buf[2 * BUFSIZE];
  23. size_t n;
  24. int state = STATE_BEFORE_RECORD;
  25. char* m0;
  26. char* m1;
  27. size_t record_bytes_read;
  28. size_t record_length;
  29. size_t nscan;
  30. while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
  31. checkstate:
  32. DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)&buf);
  33. if (n == 0) {
  34. break;
  35. }
  36. DEBUG_PRINTF("State: %d\n", state);
  37. if (state == STATE_BEFORE_RECORD) {
  38. if (n < 10) {
  39. fprintf(stderr, "Error: too little data before WARC headers\n");
  40. return 1;
  41. }
  42. if (memcmp(buf, "WARC/1.0\r\n", 10) == 0 || memcmp(buf, "WARC/1.1\r\n", 10) == 0) {
  43. // Got some headers; find the record type, content length, and end of headers
  44. m0 = memmem(buf, n, "\r\nContent-Length:", 17);
  45. if (!m0) {
  46. fprintf(stderr, "Error: Content-Length missing\n");
  47. return 1;
  48. }
  49. DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - buf);
  50. m1 = memmem(m0 + 1, n - (m0 + 1 - buf), "\r\n", 2);
  51. if (!m1) {
  52. fprintf(stderr, "Error: CRLF after Content-Length missing\n");
  53. return 1;
  54. }
  55. m0 += 17;
  56. while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  57. if (!sscanf(m0, "%zu%n", &record_length, &nscan)) {
  58. fprintf(stderr, "Error: invalid Content-Length\n");
  59. return 1;
  60. }
  61. if (nscan > n - (m0 - buf)) {
  62. fprintf(stderr, "Error: buffer overread\n");
  63. return 1;
  64. }
  65. m0 += nscan;
  66. while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  67. if (m0 != m1) {
  68. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  69. return 1;
  70. }
  71. DEBUG_PRINTF("Record body length: %zu\n", record_length);
  72. m0 = memmem(buf, n, "\r\nWARC-Type:", 12);
  73. if (!m0) {
  74. fprintf(stderr, "Error: WARC-Type missing\n");
  75. return 1;
  76. }
  77. DEBUG_PRINTF("Found WARC-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - buf);
  78. m1 = memmem(m0, n - (m0 - buf), "\r\n", 2);
  79. if (!m1) {
  80. fprintf(stderr, "Error: CRLF after WARC-Type missing\n");
  81. return 1;
  82. }
  83. m0 += 12;
  84. while (m0 < buf + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  85. if (memcmp(m0, "response", 8) == 0) {
  86. DEBUG_PRINTF("Response record\n");
  87. state = STATE_RESPONSE_RECORD;
  88. } else {
  89. DEBUG_PRINTF("Other record\n");
  90. state = STATE_OTHER_RECORD;
  91. }
  92. m0 = memmem(buf, n, "\r\n\r\n", 4);
  93. if (!m0) {
  94. fprintf(stderr, "Error: end of headers not found\n");
  95. return 1;
  96. }
  97. m0 += 4;
  98. DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)m0, m0 - buf);
  99. //TODO Replace all the memmove business with pointer logic to avoid needless constant memory copying (is more wrooom).
  100. DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", n - (m0 - buf), m0 - buf);
  101. memmove(buf, m0, n - (m0 - buf));
  102. n = n - (m0 - buf);
  103. record_bytes_read = 0;
  104. goto checkstate;
  105. } else {
  106. fprintf(stderr, "Error: expected header line, got something else\n");
  107. return 1;
  108. }
  109. } else if (state == STATE_RESPONSE_RECORD || state == STATE_OTHER_RECORD) {
  110. if (record_length + 4 - record_bytes_read > n) {
  111. // Only got part of the record body
  112. DEBUG_PRINTF("Partial record\n");
  113. if (state == STATE_RESPONSE_RECORD) {
  114. DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
  115. fwrite(buf, 1, n, stdout);
  116. }
  117. record_bytes_read += n;
  118. DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length);
  119. } else {
  120. // Remainder of the record is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
  121. DEBUG_PRINTF("Full record\n");
  122. if (state == STATE_RESPONSE_RECORD) {
  123. DEBUG_PRINTF("Copying %zu bytes to stdout\n", record_length - record_bytes_read);
  124. fwrite(buf, 1, record_length - record_bytes_read, stdout);
  125. fprintf(stdout, "\n");
  126. }
  127. if (memcmp(buf + record_length - record_bytes_read, "\r\n\r\n", 4) != 0) {
  128. fprintf(stderr, "Error: end of block not found\n");
  129. return 1;
  130. }
  131. DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", n - (record_length + 4 - record_bytes_read), record_length + 4 - record_bytes_read);
  132. memmove(buf, buf + record_length + 4 - record_bytes_read, n - (record_length + 4 - record_bytes_read));
  133. n = n - (record_length + 4 - record_bytes_read);
  134. if (n < BUFSIZE) {
  135. DEBUG_PRINTF("Refilling buffer\n");
  136. n += fread(buf + n, 1, BUFSIZE, stdin);
  137. }
  138. state = STATE_BEFORE_RECORD;
  139. goto checkstate;
  140. }
  141. }
  142. }
  143. if (state != STATE_BEFORE_RECORD) {
  144. fprintf(stderr, "Error: incomplete record at the end of input\n");
  145. return 1;
  146. }
  147. }