Browse Source

Support packing ZST megaWARCs.

master
arkiver 10 months ago
parent
commit
e6f8a5d1be
6 changed files with 10 additions and 6 deletions
  1. +1
    -0
      .gitignore
  2. +1
    -1
      chunker
  3. +3
    -0
      config.example.sh
  4. +1
    -1
      megawarc
  5. +1
    -1
      pack-one
  6. +3
    -3
      upload-one

+ 1
- 0
.gitignore View File

@@ -0,0 +1 @@
*~

+ 1
- 1
chunker View File

@@ -28,7 +28,7 @@ mkdir -p "$CHUNKER_WORKING_DIR/current" || exit 1
cur_size=$( du -B1 -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )

# find every .warc.gz in the upload directory
-find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \
+find "$INCOMING_UPLOADS_DIR" -type f -regex ".+\.warc\.\(gz\|zst\)$" \
| while read filename
do
# skip partial uploads


+ 3
- 0
config.example.sh View File

@@ -34,6 +34,9 @@ OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
# it is also possible to create a list of targets and the offloader will pick one at random and retry others on failure
# simply comment out the line above and put all rsync target urls separated by newline in a file called "offload_targets"

+# the API for requesting the ZSTD dictionaries
+ZST_DICTIONARY_API="API URL"

###############
# DIRECTORIES #
###############


+ 1
- 1
megawarc

@@ -1 +1 @@
-Subproject commit f77638dbf7d0c4a7dd301217ee04fbc6a3c3ebbf
+Subproject commit 5468d80e35b3dcb85d36624580c813326af706fe

+ 1
- 1
pack-one View File

@@ -84,7 +84,7 @@ echo "$( date ): Starting megawarc for item $ITEM" >> packer.log
mkdir -p $PACKER_WORKING_MEGAWARC_DIR/$ITEM
# megawarcs use relative paths
cd "$PACKER_WORKING_CHUNKS_DIR/"
-$MEGAWARC --verbose pack $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
+$MEGAWARC --verbose pack --server $ZST_DICTIONARY_API $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
result=$?
cd "$WORKING_DIR"



+ 3
- 3
upload-one View File

@@ -65,13 +65,13 @@ echo "$( date ): Start uploading for item $ITEM" >> uploader.log
# upload megawarc
size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
# (upload the large files first to optimise S3 snowballing)
-for ext in warc.gz tar json.gz
+find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.megawarc\.(warc\.(gz|zst)|tar|json\.gz)$" -printf "%f\n" \
+| while read filename
do
-test "${ext}" == "tar" && ! test -f "${FILE_PREFIX}${ITEM}.megawarc.${ext}" && continue # skip non-existing tar files
result=1
while [[ $result -ne 0 ]]
do
-filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}"
curl -v --location --fail \
--speed-limit 1 --speed-time 900 \
--header "x-archive-queue-derive:1" \


Loading…
Cancel
Save