You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

chunker 1.8 KiB

11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. #!/bin/bash
  2. # Move uploaded .warc.gz files to an archive directory.
  3. # When the archive is large enough, make a tar and start with a
  4. # new archive.
  5. #
  6. # Be careful: this script assumes that any file in the upload directory
  7. # that has a name that ends with *.warc.gz is a fully uploaded file and
  8. # can be moved somewhere else. Remember this when running Rsync.
  9. #
  10. INCOMING_UPLOADS_DIR="${1}" # /home/archiveteam/uploads
  11. CHUNKER_WORKING_DIR="${2}" # /home/archiveteam/processed
  12. PACKING_QUEUE_DIR="${CHUNKER_WORKING_DIR}/archive"
  13. MEGABYTES_PER_CHUNK=$((1024*25))
  14. # if not specified in command-line arguments
  15. if [ -z "${INCOMING_UPLOADS_DIR}" ]
  16. then
  17. source ./config.sh || exit 1
  18. fi
  19. BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK))
  20. mkdir -p "${CHUNKER_WORKING_DIR}" || exit 1
  21. mkdir -p "${PACKING_QUEUE_DIR}" || exit 1
  22. mkdir -p "${CHUNKER_WORKING_DIR}/current" || exit 1
  23. cur_size=$( du -B1 -s "${CHUNKER_WORKING_DIR}/current" | grep -oE "^[0-9]+" )
  24. # find every .warc.gz in the upload directory
  25. find "${INCOMING_UPLOADS_DIR}" -type f -regex ".+\.warc\.\(gz\|zst\)$" \
  26. | while read filename
  27. do
  28. # skip partial uploads
  29. if [[ "${filename}" =~ rsync-tmp ]]
  30. then
  31. continue
  32. fi
  33. cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" )))
  34. # move to the current/ directory
  35. echo "Moving ${filename}"
  36. mkdir -p "${CHUNKER_WORKING_DIR}/current"
  37. mv "${filename}" "${CHUNKER_WORKING_DIR}/current/"
  38. # if the current/ directory is large enough,
  39. # rename it to archive-XXXXX and start a new current/
  40. if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]
  41. then
  42. timestamp=$( date +'%Y%m%d%H%M%S' )
  43. uuid=$(cat /proc/sys/kernel/random/uuid | cut -d- -f1)
  44. echo "Current archive is full, moving to ${timestamp}_${uuid}."
  45. mv "${CHUNKER_WORKING_DIR}/current" "${PACKING_QUEUE_DIR}/${timestamp}_${uuid}"
  46. cur_size=0
  47. sleep 3
  48. fi
  49. done