@@ -39,7 +39,23 @@ Filesystems 1 and 2 do not have to be the same. | |||||
Configuration | Configuration | ||||
------------- | ------------- | ||||
See each script for the configuration options and arguments. Part of the configuration is in the `config.sh` file that you should place in the working directory of each script. Another part of the configuration is in the script's arguments. The working directory itself is also important for some of the scripts. | |||||
Create a configuration file called `config.sh` and place it in the directory from which you start the scripts. See `config.example.sh` for more details. | |||||
Running | |||||
------- | |||||
In `screen`, `tmux` or something similar, run the scripts: | |||||
`./chunk-many` (run exactly one) | |||||
`./pack-many` (you may run more than one) | |||||
`./upload-many` (you may run more than one) | |||||
Run `touch RUN` before you start the scripts; each script keeps looping for as long as the `RUN` file exists. Use `rm RUN` to stop them gracefully. | |||||
Recovering from errors | |||||
---------------------- | |||||
The scripts are designed not to lose data. If a script dies, you can look in its working directory for in-progress items and move them back to the queue. | |||||
Requirements | Requirements | ||||
@@ -0,0 +1,17 @@ | |||||
#!/bin/bash
# Runs the chunker script repeatedly for as long as a file named RUN
# exists in the current working directory.
# See chunker for details.
#
# If chunker exits with a non-zero status, the date and the status are
# printed and this script exits with that same status. Otherwise the loop
# ends once the RUN file is gone.

# Directory that contains this script; chunker is invoked from there.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
  # Quoted so that a script directory containing whitespace is not
  # word-split into several arguments.
  "$SCRIPT_DIR/chunker"
  result=$?
  if [[ $result -ne 0 ]]
  then
    date
    echo "chunker exited with $result"
    exit "$result"
  fi
done
@@ -8,12 +8,22 @@ | |||||
# can be moved somewhere else. Remember this when running Rsync. | # can be moved somewhere else. Remember this when running Rsync. | ||||
# | # | ||||
PATH_TO_UPLOADS=$1 # /home/archiveteam/uploads | |||||
PATH_TO_TARGET=$2 # /home/archiveteam/processed | |||||
MAX_MEGABYTES=$((1024*25)) | |||||
INCOMING_UPLOADS_DIR=$1 # /home/archiveteam/uploads | |||||
CHUNKER_WORKING_DIR=$2 # /home/archiveteam/processed | |||||
PACKING_QUEUE_DIR="$CHUNKER_WORKING_DIR/archive" | |||||
MEGABYTES_PER_CHUNK=$((1024*25)) | |||||
# if the directories were not given as command-line arguments, load them from config.sh | |||||
if [ -z $INCOMING_UPLOADS_DIR ] | |||||
then | |||||
source ./config.sh || exit 1 | |||||
fi | |||||
mkdir -p "$CHUNKER_WORKING_DIR" || exit 1 | |||||
mkdir -p "$PACKING_QUEUE_DIR" || exit 1 | |||||
# find every .warc.gz in the upload directory | # find every .warc.gz in the upload directory | ||||
find "$PATH_TO_UPLOADS" -type f -name "*.warc.gz" \ | |||||
find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \ | |||||
| while read filename | | while read filename | ||||
do | do | ||||
# skip partial uploads | # skip partial uploads | ||||
@@ -24,18 +34,17 @@ do | |||||
# move to the current/ directory | # move to the current/ directory | ||||
echo "Moving ${filename}" | echo "Moving ${filename}" | ||||
mkdir -p "${PATH_TO_TARGET}/current" | |||||
mv "${filename}" "${PATH_TO_TARGET}/current/" | |||||
mkdir -p "$CHUNKER_WORKING_DIR/current" | |||||
mv "${filename}" "$CHUNKER_WORKING_DIR/current/" | |||||
# if the current/ directory is large enough, | # if the current/ directory is large enough, | ||||
# rename it to archive-XXXXX and start a new current/ | # rename it to archive-XXXXX and start a new current/ | ||||
cur_size=$( du -BM -s "${PATH_TO_TARGET}/current" | grep -oE "^[0-9]+" ) | |||||
if [[ $cur_size -gt $MAX_MEGABYTES ]] | |||||
cur_size=$( du -BM -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" ) | |||||
if [[ $cur_size -gt $MEGABYTES_PER_CHUNK ]] | |||||
then | then | ||||
timestamp=$( date +'%Y%m%d%H%M%S' ) | timestamp=$( date +'%Y%m%d%H%M%S' ) | ||||
echo "Current archive is full, moving to ${timestamp}." | echo "Current archive is full, moving to ${timestamp}." | ||||
mkdir -p "${PATH_TO_TARGET}/archive" | |||||
mv "${PATH_TO_TARGET}/current" "${PATH_TO_TARGET}/archive/${timestamp}" | |||||
mv "$CHUNKER_WORKING_DIR/current" "$PACKING_QUEUE_DIR/${timestamp}" | |||||
fi | fi | ||||
done | done | ||||
@@ -1,10 +1,16 @@ | |||||
#!/bin/bash | #!/bin/bash | ||||
# Create this config.sh and copy it to the working directories of the | |||||
# packing and upload scripts. | |||||
# Create a copy of this config.sh, customise it and place it in the | |||||
# working directory of the packing and upload scripts. | |||||
echo "config.sh not customised." | |||||
exit 1 | |||||
#################### | |||||
# CHUNKER SETTINGS # | |||||
#################### | |||||
# start a new chunk when the current chunk is at least this large | |||||
MEGABYTES_PER_CHUNK=$((1024*25)) | |||||
################### | |||||
# UPLOAD METADATA # | |||||
################### | |||||
# your Archive.org S3 keys | # your Archive.org S3 keys | ||||
IA_AUTH="ACCESS_KEY:SECRET" | IA_AUTH="ACCESS_KEY:SECRET" | ||||
@@ -21,5 +27,56 @@ IA_ITEM_PREFIX="archiveteam_todo_" | |||||
FILE_PREFIX="todo_" | FILE_PREFIX="todo_" | ||||
# the date field for the item | # the date field for the item | ||||
IA_ITEM_DATE="2013-04" | |||||
IA_ITEM_DATE=$( date +"%Y-%m" ) | |||||
############### | |||||
# DIRECTORIES # | |||||
############### | |||||
# Put your directories on one or two filesystems (see README). | |||||
FS1_BASE_DIR="/archiveteam/ssd/project" | |||||
FS2_BASE_DIR="/archiveteam/disk/project" | |||||
## THESE DIRECTORIES ON FILESYSTEM 1: for warcs | |||||
# the rsync upload directory | |||||
# (the chunker will package the .warc.gz files in this directory) | |||||
INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads" | |||||
# the chunker working directory | |||||
# (this directory will hold the current in-progress chunk) | |||||
CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work" | |||||
# the chunker output directory / the packer queue | |||||
# (this directory will hold the completed chunks) | |||||
PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue" | |||||
# the packer working directory - warc side | |||||
# (this directory will hold the current chunk) | |||||
PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in" | |||||
## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs | |||||
# the packer working directory - megawarc side | |||||
# (this directory will hold the current megawarc) | |||||
PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out" | |||||
# the packer output directory / the upload queue | |||||
# (this directory will hold the completed megawarcs) | |||||
UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue" | |||||
# the uploader working directory | |||||
# (this directory will hold the current megawarc) | |||||
UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work" | |||||
# the final destination for uploaded megawarcs | |||||
# leave this empty to remove megawarcs after uploading | |||||
COMPLETED_DIR="${FS2_BASE_DIR}/uploaded" | |||||
# remove this | |||||
echo "config.sh not customised." | |||||
exit 1 | |||||
@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | |||||
while [[ -f RUN ]] | while [[ -f RUN ]] | ||||
do | do | ||||
$SCRIPT_DIR/pack-one-without-upload $1 $2 $3 | |||||
$SCRIPT_DIR/pack-one-without-upload | |||||
result=$? | result=$? | ||||
if [[ $result -ne 0 ]] | if [[ $result -ne 0 ]] | ||||
then | then | ||||
@@ -1,33 +1,34 @@ | |||||
#!/bin/bash | #!/bin/bash | ||||
# Feeds the upload queue with megawarcs. | # Feeds the upload queue with megawarcs. | ||||
# (Needs a config.sh in the working directory.) | |||||
# | # | ||||
# ./pack-one PROCESSED_DIR TARGET_DIR UPLOAD_QUEUE_DIR | |||||
# ./pack-one | |||||
# | # | ||||
# 1. Grabs an item from PROCESSED_DIR | |||||
# 2. Reserves the item by moving the directory to the working directory | |||||
# 3. Makes a megawarc in the TARGET_DIR | |||||
# 1. Grabs an item from PACKING_QUEUE_DIR | |||||
# 2. Reserves the item by moving the directory to the | |||||
# PACKER_WORKING_CHUNKS_DIR | |||||
# 3. Makes a megawarc in the PACKER_WORKING_MEGAWARC_DIR | |||||
# 4. Removes the source files from the working directory | # 4. Removes the source files from the working directory | ||||
# 5. Moves the megawarc to UPLOAD_QUEUE_DIR | |||||
# 5. Moves the megawarc to the UPLOAD_QUEUE_DIR | |||||
# | # | ||||
# The program exits with 1 on any nontransient error. | # The program exits with 1 on any nontransient error. | ||||
# | # | ||||
# run from the packer directory /archiveteam/packer-1/ | |||||
# | |||||
# ./pack-one /archiveteam/processed/archive /archiveteam/ssd1/packer-1 /archiveteam/ssd1/upload-queue | |||||
# | |||||
PROCESSED_DIR=$1 | |||||
TARGET_DIR=$2 | |||||
UPLOAD_QUEUE_DIR=$3 | |||||
WORKING_DIR="$( pwd )" | |||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | ||||
MEGAWARC=$SCRIPT_DIR/megawarc/megawarc | MEGAWARC=$SCRIPT_DIR/megawarc/megawarc | ||||
if [ ! -f ./config.sh ] ; then | |||||
echo "config.sh not found in current directory." | |||||
if [[ ! -x $MEGAWARC ]] | |||||
then | |||||
echo "$MEGAWARC does not exist or is not executable." | |||||
exit 1 | exit 1 | ||||
fi | fi | ||||
source ./config.sh | |||||
source ./config.sh || exit 1 | |||||
mkdir -p "$PACKER_WORKING_CHUNKS_DIR" || exit 1 | |||||
mkdir -p "$PACKER_WORKING_MEGAWARC_DIR" || exit 1 | |||||
mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1 | |||||
function mayicontinue { | function mayicontinue { | ||||
@@ -37,9 +38,6 @@ function mayicontinue { | |||||
# echo | # echo | ||||
} | } | ||||
mkdir -p $TARGET_DIR | |||||
mkdir -p $UPLOAD_QUEUE_DIR | |||||
# check if the upload queue is empty | # check if the upload queue is empty | ||||
# if [ "$( ls -A $UPLOAD_QUEUE_DIR )" ] | # if [ "$( ls -A $UPLOAD_QUEUE_DIR )" ] | ||||
@@ -53,15 +51,15 @@ mkdir -p $UPLOAD_QUEUE_DIR | |||||
mayicontinue | mayicontinue | ||||
# try to grab a directory from /archiveteam/processed/archive/ | |||||
# try to grab a directory from the packing queue | |||||
ITEM=none | ITEM=none | ||||
while [[ $ITEM = none ]] | while [[ $ITEM = none ]] | ||||
do | do | ||||
possible_item=$( ls -1 $PROCESSED_DIR/ | grep 201 | sort | head -n 1 ) | |||||
possible_item=$( ls -1 "$PACKING_QUEUE_DIR/" | grep 201 | sort | head -n 1 ) | |||||
if [[ $possible_item =~ 201 ]] | if [[ $possible_item =~ 201 ]] | ||||
then | then | ||||
echo "Trying to grab $possible_item" | echo "Trying to grab $possible_item" | ||||
if mv $PROCESSED_DIR/$possible_item . | |||||
if mv "$PACKING_QUEUE_DIR/$possible_item" "$PACKER_WORKING_CHUNKS_DIR/" | |||||
then | then | ||||
ITEM=$possible_item | ITEM=$possible_item | ||||
else | else | ||||
@@ -83,9 +81,12 @@ mayicontinue | |||||
echo "$( date ): Starting megawarc for item $ITEM" >> packer.log | echo "$( date ): Starting megawarc for item $ITEM" >> packer.log | ||||
# construct a megawarc | # construct a megawarc | ||||
mkdir -p $TARGET_DIR/$ITEM | |||||
$MEGAWARC --verbose pack $TARGET_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM | |||||
mkdir -p $PACKER_WORKING_MEGAWARC_DIR/$ITEM | |||||
# megawarcs use relative paths | |||||
cd "$PACKER_WORKING_CHUNKS_DIR/" | |||||
$MEGAWARC --verbose pack $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM | |||||
result=$? | result=$? | ||||
cd "$WORKING_DIR" | |||||
if [[ $result -ne 0 ]] | if [[ $result -ne 0 ]] | ||||
then | then | ||||
@@ -102,7 +103,7 @@ mayicontinue | |||||
# remove files | # remove files | ||||
echo "megawarc OK, removing source files" | echo "megawarc OK, removing source files" | ||||
rm -rf $ITEM | |||||
rm -rf "$PACKER_WORKING_CHUNKS_DIR/$ITEM" | |||||
result=$? | result=$? | ||||
if [[ $result -ne 0 ]] | if [[ $result -ne 0 ]] | ||||
@@ -114,7 +115,7 @@ fi | |||||
echo "add to upload queue" | echo "add to upload queue" | ||||
mv $TARGET_DIR/$ITEM $UPLOAD_QUEUE_DIR | |||||
mv "$PACKER_WORKING_MEGAWARC_DIR/$ITEM" "$UPLOAD_QUEUE_DIR/" | |||||
exit 0 | exit 0 | ||||
@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | |||||
while [[ -f RUN ]] | while [[ -f RUN ]] | ||||
do | do | ||||
$SCRIPT_DIR/upload-one $1 | |||||
$SCRIPT_DIR/upload-one | |||||
result=$? | result=$? | ||||
if [[ $result -ne 0 ]] | if [[ $result -ne 0 ]] | ||||
then | then | ||||
@@ -1,31 +1,30 @@ | |||||
#!/bin/bash | #!/bin/bash | ||||
# Uploads megawarcs from the upload queue. | # Uploads megawarcs from the upload queue. | ||||
# (Needs a config.sh in the working directory.) | |||||
# | # | ||||
# ./upload-one UPLOAD_QUEUE_DIR | |||||
# ./upload-one | |||||
# | # | ||||
# 1. Grabs an item from UPLOAD_QUEUE_DIR | # 1. Grabs an item from UPLOAD_QUEUE_DIR | ||||
# 2. Reserves the item by moving the directory to the working directory | |||||
# 2. Reserves the item by moving the directory to the | |||||
# UPLOADER_WORKING_DIR | |||||
# 3. Uploads the item to s3.us.archive.org | # 3. Uploads the item to s3.us.archive.org | ||||
# 4. Removes the source files from the working directory | # 4. Removes the source files from the working directory | ||||
# If COMPLETED_DIR is set, uploaded files are moved there. | |||||
# | # | ||||
# The program exits with 1 on any nontransient error. | # The program exits with 1 on any nontransient error. | ||||
# | # | ||||
# run from the upload directory /archiveteam/ssd1/uploader-1/ | |||||
# | |||||
# ./upload-one /archiveteam/ssd1/upload-queue | |||||
# | |||||
# | |||||
UPLOAD_QUEUE_DIR=$1 | |||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | ||||
if [ ! -f ./config.sh ] ; then | |||||
echo "config.sh not found in current directory." | |||||
exit 1 | |||||
fi | |||||
source ./config.sh | |||||
source ./config.sh || exit 1 | |||||
mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1 | |||||
mkdir -p "$UPLOADER_WORKING_DIR" || exit 1 | |||||
if [ ! -z "$COMPLETED_DIR" ] | |||||
then | |||||
mkdir -p "$COMPLETED_DIR" || exit 1 | |||||
fi | |||||
function mayicontinue { | function mayicontinue { | ||||
echo | echo | ||||
@@ -41,11 +40,11 @@ mayicontinue | |||||
ITEM=none | ITEM=none | ||||
while [[ $ITEM = none ]] | while [[ $ITEM = none ]] | ||||
do | do | ||||
possible_item=$( ls -1 $UPLOAD_QUEUE_DIR | grep 201 | sort | head -n 1 ) | |||||
possible_item=$( ls -1 "$UPLOAD_QUEUE_DIR" | grep 201 | sort | head -n 1 ) | |||||
if [[ $possible_item =~ 201 ]] | if [[ $possible_item =~ 201 ]] | ||||
then | then | ||||
echo "Trying to grab $possible_item" | echo "Trying to grab $possible_item" | ||||
if mv $UPLOAD_QUEUE_DIR/$possible_item . | |||||
if mv "$UPLOAD_QUEUE_DIR/$possible_item" "$UPLOADER_WORKING_DIR/" | |||||
then | then | ||||
ITEM=$possible_item | ITEM=$possible_item | ||||
else | else | ||||
@@ -64,14 +63,14 @@ done | |||||
echo "$( date ): Start uploading for item $ITEM" >> uploader.log | echo "$( date ): Start uploading for item $ITEM" >> uploader.log | ||||
# upload megawarc | # upload megawarc | ||||
size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" ) | |||||
# (upload the large files first to optimise S3 snowballing) | # (upload the large files first to optimise S3 snowballing) | ||||
for ext in warc.gz tar json.gz | for ext in warc.gz tar json.gz | ||||
do | do | ||||
result=1 | result=1 | ||||
while [[ $result -ne 0 ]] | while [[ $result -ne 0 ]] | ||||
do | do | ||||
filename=${FILE_PREFIX}${ITEM}.megawarc.${ext} | |||||
size_hint=$( du --bytes -s ${ITEM}/${filename} | grep -oE "^[0-9]+" ) | |||||
filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}" | |||||
curl -v --location --fail \ | curl -v --location --fail \ | ||||
--speed-limit 1 --speed-time 900 \ | --speed-limit 1 --speed-time 900 \ | ||||
--header "x-archive-queue-derive:1" \ | --header "x-archive-queue-derive:1" \ | ||||
@@ -83,8 +82,8 @@ do | |||||
--header "x-archive-meta-language:eng" \ | --header "x-archive-meta-language:eng" \ | ||||
--header "x-archive-size-hint:$size_hint" \ | --header "x-archive-size-hint:$size_hint" \ | ||||
--header "authorization: LOW ${IA_AUTH}" \ | --header "authorization: LOW ${IA_AUTH}" \ | ||||
--upload-file ${ITEM}/${filename} \ | |||||
http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename} \ | |||||
--upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \ | |||||
"http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \ | |||||
> /dev/null | > /dev/null | ||||
result=$? | result=$? | ||||
if [[ $result -ne 0 ]] | if [[ $result -ne 0 ]] | ||||
@@ -105,17 +104,31 @@ echo "$( date ): Completed uploading for item $ITEM" >> uploader.log | |||||
mayicontinue | mayicontinue | ||||
# remove megawarc | |||||
rm -rf ${ITEM} | |||||
result=$? | |||||
if [[ $result -ne 0 ]] | |||||
# move or remove megawarc | |||||
if [ -z "$COMPLETED_DIR" ] | |||||
then | then | ||||
date | |||||
echo "rm -rf megawarc exited with $result for $ITEM" | |||||
exit 1 | |||||
fi | |||||
# remove | |||||
rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}" | |||||
result=$? | |||||
if [[ $result -ne 0 ]] | |||||
then | |||||
date | |||||
echo "rm -rf megawarc exited with $result for $ITEM" | |||||
exit 1 | |||||
fi | |||||
else | |||||
# move | |||||
mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/" | |||||
result=$? | |||||
if [[ $result -ne 0 ]] | |||||
then | |||||
date | |||||
echo "rm -rf megawarc exited with $result for $ITEM" | |||||
exit 1 | |||||
fi | |||||
fi | |||||
exit 0 | exit 0 | ||||