You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

45 lines
1.4 KiB

  1. #!/bin/bash
  2. # Move uploaded .warc.gz files to an archive directory.
  3. # When the archive is large enough, make a tar and start with a
  4. # new archive.
  5. #
  6. # Be careful: this script assumes that any file in the upload directory
  7. # that has a name that ends with *.warc.gz is a fully uploaded file and
  8. # can be moved somewhere else. Remember this when running Rsync.
  9. #
  10. PATH_TO_UPLOADS=$1 # /home/archiveteam/uploads
  11. PATH_TO_TARGET=$2 # /home/archiveteam/processed
  12. MAX_MEGABYTES=$((1024*25))
  13. # find every .warc.gz in the upload directory
  14. find $PATH_TO_UPLOADS -type f -name "*.warc.gz" \
  15. | while read filename
  16. do
  17. # skip partial uploads
  18. if [[ $filename =~ rsync-tmp ]]
  19. then
  20. continue
  21. fi
  22. # move to the current/ directory
  23. echo "Moving ${filename}"
  24. mkdir -p ${PATH_TO_TARGET}/current
  25. mv ${filename} ${PATH_TO_TARGET}/current/
  26. # if the current/ directory is large enough,
  27. # rename it to archive-XXXXX and start a new current/
  28. cur_size=$( du -BM -s ${PATH_TO_TARGET}/current | grep -oE "^[0-9]+" )
  29. if [[ $cur_size -gt $MAX_MEGABYTES ]]
  30. then
  31. timestamp=$( date +'%Y%m%d%H%M%S' )
  32. echo "Current archive is full, moving to ${timestamp}."
  33. mkdir -p ${PATH_TO_TARGET}/archive
  34. mv ${PATH_TO_TARGET}/current ${PATH_TO_TARGET}/archive/${timestamp}
  35. # perhaps do something to the ${PATH_TO_TARGET}/archive/${timestamp}
  36. # e.g. ./make-tar-and-upload.sh ${timestamp}
  37. fi
  38. done