@@ -0,0 +1 @@
+*~
@@ -28,7 +28,7 @@ mkdir -p "$CHUNKER_WORKING_DIR/current" || exit 1
 cur_size=$( du -B1 -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )
 # find every .warc.gz in the upload directory
-find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \
+find "$INCOMING_UPLOADS_DIR" -type f -regex ".+\.warc\.\(gz\|zst\)$" \
 | while read filename
 do
   # skip partial uploads
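A quick sanity check of the new pattern (a sketch; it relies on GNU find's default emacs regex syntax, hence the escaped grouping and alternation, and on -regex matching the whole path):

mkdir -p /tmp/chunker-regex-test
touch /tmp/chunker-regex-test/a.warc.gz /tmp/chunker-regex-test/b.warc.zst /tmp/chunker-regex-test/c.warc.gz.tmp
find /tmp/chunker-regex-test -type f -regex ".+\.warc\.\(gz\|zst\)$"
# should print only a.warc.gz and b.warc.zst; the partial upload c.warc.gz.tmp no longer matches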
@@ -34,6 +34,9 @@ OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
 # it is also possible to create a list of targets and the offloader will pick one at random and retry others on failure
 # simply comment out the line above and put all rsync target urls separated by newline in a file called "offload_targets"
+# the API for requesting the ZSTD dictionaries
+ZST_DICTIONARY_API="API URL"
+
 ###############
 # DIRECTORIES #
 ###############
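ZST_DICTIONARY_API is a placeholder the operator must fill in. For reference, the "offload_targets" file described above is just one rsync URL per line; with hypothetical hosts it would look like:

rsync://mirror-one.example.net:11234/module-name/directory/
rsync://mirror-two.example.net:11234/module-name/directory/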
@@ -1 +1 @@
-Subproject commit f77638dbf7d0c4a7dd301217ee04fbc6a3c3ebbf
+Subproject commit 5468d80e35b3dcb85d36624580c813326af706fe
@@ -84,7 +84,7 @@ echo "$( date ): Starting megawarc for item $ITEM" >> packer.log
 mkdir -p $PACKER_WORKING_MEGAWARC_DIR/$ITEM
 # megawarcs use relative paths
 cd "$PACKER_WORKING_CHUNKS_DIR/"
-$MEGAWARC --verbose pack $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
+$MEGAWARC --verbose pack --server $ZST_DICTIONARY_API $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
 result=$?
 cd "$WORKING_DIR"
@@ -65,13 +65,13 @@ echo "$( date ): Start uploading for item $ITEM" >> uploader.log
 # upload megawarc
 size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
 # (upload the large files first to optimise S3 snowballing)
-for ext in warc.gz tar json.gz
+find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.megawarc\.(warc\.(gz|zst)|tar|json\.gz)$" -printf "%f\n" \
+| while read filename
 do
-  test "${ext}" == "tar" && ! test -f "${FILE_PREFIX}${ITEM}.megawarc.${ext}" && continue # skip non-existing tar files
   result=1
   while [[ $result -ne 0 ]]
   do
-    filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}"
     curl -v --location --fail \
       --speed-limit 1 --speed-time 900 \
       --header "x-archive-queue-derive:1" \