Browse Source

Support packing ZST megaWARCs.

master
arkiver 10 months ago
parent
commit
e6f8a5d1be
6 changed files with 10 additions and 6 deletions
  1. +1
    -0
      .gitignore
  2. +1
    -1
      chunker
  3. +3
    -0
      config.example.sh
  4. +1
    -1
      megawarc
  5. +1
    -1
      pack-one
  6. +3
    -3
      upload-one

+ 1
- 0
.gitignore View File

@@ -0,0 +1 @@
*~

+ 1
- 1
chunker View File

@@ -28,7 +28,7 @@ mkdir -p "$CHUNKER_WORKING_DIR/current" || exit 1
cur_size=$( du -B1 -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )

# find every .warc.gz in the upload directory
-find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \
+find "$INCOMING_UPLOADS_DIR" -type f -regex ".+\.warc\.\(gz\|zst\)$" \
| while read filename
do
# skip partial uploads


+ 3
- 0
config.example.sh View File

@@ -34,6 +34,9 @@ OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
# it is also possible to create a list of targets and the offloader will pick one at random and retry others on failure
# simply comment out the line above and put all rsync target urls separated by newline in a file called "offload_targets"

+# the API for requesting the ZSTD dictionaries
+ZST_DICTIONARY_API="API URL"

###############
# DIRECTORIES #
###############


+ 1
- 1
megawarc

@@ -1 +1 @@
-Subproject commit f77638dbf7d0c4a7dd301217ee04fbc6a3c3ebbf
+Subproject commit 5468d80e35b3dcb85d36624580c813326af706fe

+ 1
- 1
pack-one View File

@@ -84,7 +84,7 @@ echo "$( date ): Starting megawarc for item $ITEM" >> packer.log
mkdir -p $PACKER_WORKING_MEGAWARC_DIR/$ITEM
# megawarcs use relative paths
cd "$PACKER_WORKING_CHUNKS_DIR/"
-$MEGAWARC --verbose pack $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
+$MEGAWARC --verbose pack --server $ZST_DICTIONARY_API $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
result=$?
cd "$WORKING_DIR"



+ 3
- 3
upload-one View File

@@ -65,13 +65,13 @@ echo "$( date ): Start uploading for item $ITEM" >> uploader.log
# upload megawarc
size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
# (upload the large files first to optimise S3 snowballing)
-for ext in warc.gz tar json.gz
+find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.megawarc\.(warc\.(gz|zst)|tar|json\.gz)$" -printf "%f\n" \
+| while read filename
do
-test "${ext}" == "tar" && ! test -f "${FILE_PREFIX}${ITEM}.megawarc.${ext}" && continue # skip non-existing tar files
result=1
while [[ $result -ne 0 ]]
do
-filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}"
curl -v --location --fail \
--speed-limit 1 --speed-time 900 \
--header "x-archive-queue-derive:1" \


Loading…
Cancel
Save