@@ -39,7 +39,23 @@ Filesystems 1 and 2 do not have to be the same. | |||
Configuration | |||
------------- | |||
See each script for the configuration options and arguments. Part of the configuration is in the `config.sh` file that you should place in the working directory of each script. Another part of the configuration is in the script's arguments. The working directory itself is also important for some of the scripts. | |||
Create a configuration file called `config.sh` and place it in the directory where you start the scripts. See the `config.example.sh` for more details. | |||
Running | |||
------- | |||
In `screen`, `tmux` or something similar, run the scripts: | |||
`./chunk-many` (run exactly one) | |||
`./pack-many` (you may run more than one) | |||
`./upload-many` (you may run more than one) | |||
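Run `touch RUN` in the directory where you start the scripts before you start them; the `-many` scripts keep looping for as long as that file exists. Use `rm RUN` to stop gracefully. | |||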
Recovering from errors | |||
---------------------- | |||
The scripts are designed not to lose data. If a script dies, you can look in its working directory for in-progress items and move them back to the queue. | |||
Requirements | |||
@@ -0,0 +1,17 @@ | |||
#!/bin/bash
# Loops the chunker script for as long as a file named RUN exists in the
# current working directory. The file is checked between chunker runs, so
# removing it (rm RUN) stops the loop once the run in progress has ended.
# See chunker for details.
#
# Exit status: 0 once RUN is gone; otherwise the non-zero status of the
# chunker run that failed.

# Directory containing this script; chunker is looked up next to it.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Runs $SCRIPT_DIR/chunker repeatedly while ./RUN exists.
# Globals:  SCRIPT_DIR (read)
# Outputs:  on failure, the date and the chunker's exit status (stdout)
# Returns:  0 when RUN no longer exists; exits the shell with the
#           chunker's status if a run fails.
run_chunker_loop() {
  local result
  while [[ -f RUN ]]
  do
    # Quoted so that an install path containing whitespace still works.
    "$SCRIPT_DIR/chunker"
    result=$?
    if [[ $result -ne 0 ]]
    then
      date
      echo "chunker exited with $result"
      exit "$result"
    fi
  done
}

run_chunker_loop
@@ -8,12 +8,22 @@ | |||
# can be moved somewhere else. Remember this when running Rsync. | |||
# | |||
PATH_TO_UPLOADS=$1 # /home/archiveteam/uploads | |||
PATH_TO_TARGET=$2 # /home/archiveteam/processed | |||
MAX_MEGABYTES=$((1024*25)) | |||
INCOMING_UPLOADS_DIR=$1 # /home/archiveteam/uploads | |||
CHUNKER_WORKING_DIR=$2 # /home/archiveteam/processed | |||
PACKING_QUEUE_DIR="$CHUNKER_WORKING_DIR/archive" | |||
MEGABYTES_PER_CHUNK=$((1024*25)) | |||
# Fall back to config.sh in the current directory when the incoming
# uploads directory was not given as the first command-line argument.
# The expansion is quoted so the test stays well-formed even when the
# path contains whitespace.
if [[ -z "$INCOMING_UPLOADS_DIR" ]]
then
  source ./config.sh || exit 1
fi
mkdir -p "$CHUNKER_WORKING_DIR" || exit 1 | |||
mkdir -p "$PACKING_QUEUE_DIR" || exit 1 | |||
# find every .warc.gz in the upload directory | |||
find "$PATH_TO_UPLOADS" -type f -name "*.warc.gz" \ | |||
find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \ | |||
| while read filename | |||
do | |||
# skip partial uploads | |||
@@ -24,18 +34,17 @@ do | |||
# move to the current/ directory | |||
echo "Moving ${filename}" | |||
mkdir -p "${PATH_TO_TARGET}/current" | |||
mv "${filename}" "${PATH_TO_TARGET}/current/" | |||
mkdir -p "$CHUNKER_WORKING_DIR/current" | |||
mv "${filename}" "$CHUNKER_WORKING_DIR/current/" | |||
# if the current/ directory is large enough, | |||
# rename it to archive-XXXXX and start a new current/ | |||
cur_size=$( du -BM -s "${PATH_TO_TARGET}/current" | grep -oE "^[0-9]+" ) | |||
if [[ $cur_size -gt $MAX_MEGABYTES ]] | |||
cur_size=$( du -BM -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" ) | |||
if [[ $cur_size -gt $MEGABYTES_PER_CHUNK ]] | |||
then | |||
timestamp=$( date +'%Y%m%d%H%M%S' ) | |||
echo "Current archive is full, moving to ${timestamp}." | |||
mkdir -p "${PATH_TO_TARGET}/archive" | |||
mv "${PATH_TO_TARGET}/current" "${PATH_TO_TARGET}/archive/${timestamp}" | |||
mv "$CHUNKER_WORKING_DIR/current" "$PACKING_QUEUE_DIR/${timestamp}" | |||
fi | |||
done | |||
@@ -1,10 +1,16 @@ | |||
#!/bin/bash | |||
# Create this config.sh and copy it to the working directories of the | |||
# packing and upload scripts. | |||
# Create a copy of this config.sh, customise it and place it in the | |||
# working directory of the chunker, packing and upload scripts. | |||
echo "config.sh not customised." | |||
exit 1 | |||
#################### | |||
# CHUNKER SETTINGS # | |||
#################### | |||
# start a new chunk when the current chunk is at least this large | |||
MEGABYTES_PER_CHUNK=$((1024*25)) | |||
################### | |||
# UPLOAD METADATA # | |||
################### | |||
# your Archive.org S3 keys | |||
IA_AUTH="ACCESS_KEY:SECRET" | |||
@@ -21,5 +27,56 @@ IA_ITEM_PREFIX="archiveteam_todo_" | |||
FILE_PREFIX="todo_" | |||
# the date field for the item | |||
IA_ITEM_DATE="2013-04" | |||
IA_ITEM_DATE=$( date +"%Y-%m" ) | |||
############### | |||
# DIRECTORIES # | |||
############### | |||
# Put your directories on one or two filesystems (see README). | |||
FS1_BASE_DIR="/archiveteam/ssd/project" | |||
FS2_BASE_DIR="/archiveteam/disk/project" | |||
## THESE DIRECTORIES ON FILESYSTEM 1: for warcs | |||
# the rsync upload directory | |||
# (the chunker will package the .warc.gz files in this directory) | |||
INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads" | |||
# the chunker working directory | |||
# (this directory will hold the current in-progress chunk) | |||
CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work" | |||
# the chunker output directory / the packer queue | |||
# (this directory will hold the completed chunks) | |||
PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue" | |||
# the packer working directory - warc side | |||
# (this directory will hold the current chunk) | |||
PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in" | |||
## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs | |||
# the packer working directory - megawarc side | |||
# (this directory will hold the current megawarc) | |||
PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out" | |||
# the packer output directory / the upload queue | |||
# (this directory will hold the completed megawarcs) | |||
UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue" | |||
# the uploader working directory | |||
# (this directory will hold the current megawarc) | |||
UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work" | |||
# the final destination for uploaded megawarcs | |||
# leave this empty to remove megawarcs after uploading | |||
COMPLETED_DIR="${FS2_BASE_DIR}/uploaded" | |||
# remove the two lines below once you have customised this file | |||
echo "config.sh not customised." | |||
exit 1 | |||
@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | |||
while [[ -f RUN ]] | |||
do | |||
$SCRIPT_DIR/pack-one-without-upload $1 $2 $3 | |||
$SCRIPT_DIR/pack-one-without-upload | |||
result=$? | |||
if [[ $result -ne 0 ]] | |||
then | |||
@@ -1,33 +1,34 @@ | |||
#!/bin/bash | |||
# Feeds the upload queue with megawarcs. | |||
# (Needs a config.sh in the working directory.) | |||
# | |||
# ./pack-one PROCESSED_DIR TARGET_DIR UPLOAD_QUEUE_DIR | |||
# ./pack-one | |||
# | |||
# 1. Grabs an item from PROCESSED_DIR | |||
# 2. Reserves the item by moving the directory to the working directory | |||
# 3. Makes a megawarc in the TARGET_DIR | |||
# 1. Grabs an item from PACKING_QUEUE_DIR | |||
# 2. Reserves the item by moving the directory to the | |||
# PACKER_WORKING_CHUNKS_DIR | |||
# 3. Makes a megawarc in the PACKER_WORKING_MEGAWARC_DIR | |||
# 4. Removes the source files from the working directory | |||
# 5. Moves the megawarc to UPLOAD_QUEUE_DIR | |||
# 5. Moves the megawarc to the UPLOAD_QUEUE_DIR | |||
# | |||
# The program exits with 1 on any nontransient error. | |||
# | |||
# run from the packer directory /archiveteam/packer-1/ | |||
# | |||
# ./pack-one /archiveteam/processed/archive /archiveteam/ssd1/packer-1 /archiveteam/ssd1/upload-queue | |||
# | |||
PROCESSED_DIR=$1 | |||
TARGET_DIR=$2 | |||
UPLOAD_QUEUE_DIR=$3 | |||
WORKING_DIR="$( pwd )" | |||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | |||
MEGAWARC=$SCRIPT_DIR/megawarc/megawarc | |||
if [ ! -f ./config.sh ] ; then | |||
echo "config.sh not found in current directory." | |||
if [[ ! -x $MEGAWARC ]] | |||
then | |||
echo "$MEGAWARC does not exist or is not executable." | |||
exit 1 | |||
fi | |||
source ./config.sh | |||
source ./config.sh || exit 1 | |||
mkdir -p "$PACKER_WORKING_CHUNKS_DIR" || exit 1 | |||
mkdir -p "$PACKER_WORKING_MEGAWARC_DIR" || exit 1 | |||
mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1 | |||
function mayicontinue { | |||
@@ -37,9 +38,6 @@ function mayicontinue { | |||
# echo | |||
} | |||
mkdir -p $TARGET_DIR | |||
mkdir -p $UPLOAD_QUEUE_DIR | |||
# check if the upload queue is empty | |||
# if [ "$( ls -A $UPLOAD_QUEUE_DIR )" ] | |||
@@ -53,15 +51,15 @@ mkdir -p $UPLOAD_QUEUE_DIR | |||
mayicontinue | |||
# try to grab a directory from /archiveteam/processed/archive/ | |||
# try to grab a directory from the packing queue | |||
ITEM=none | |||
while [[ $ITEM = none ]] | |||
do | |||
possible_item=$( ls -1 $PROCESSED_DIR/ | grep 201 | sort | head -n 1 ) | |||
possible_item=$( ls -1 "$PACKING_QUEUE_DIR/" | grep 201 | sort | head -n 1 ) | |||
if [[ $possible_item =~ 201 ]] | |||
then | |||
echo "Trying to grab $possible_item" | |||
if mv $PROCESSED_DIR/$possible_item . | |||
if mv "$PACKING_QUEUE_DIR/$possible_item" "$PACKER_WORKING_CHUNKS_DIR/" | |||
then | |||
ITEM=$possible_item | |||
else | |||
@@ -83,9 +81,12 @@ mayicontinue | |||
echo "$( date ): Starting megawarc for item $ITEM" >> packer.log | |||
# construct a megawarc | |||
mkdir -p $TARGET_DIR/$ITEM | |||
$MEGAWARC --verbose pack $TARGET_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM | |||
# Create the output directory for this item's megawarc.
mkdir -p "$PACKER_WORKING_MEGAWARC_DIR/$ITEM"
# megawarcs use relative paths: pack from inside the chunks directory and
# pass the item by its relative name.
if ! cd "$PACKER_WORKING_CHUNKS_DIR/"
then
  date
  echo "cd to $PACKER_WORKING_CHUNKS_DIR failed for $ITEM"
  exit 1
fi
"$MEGAWARC" --verbose pack "$PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM}" "$ITEM"
result=$?
# Return to the directory the script was started from, so that later
# relative paths are resolved there again.
if ! cd "$WORKING_DIR"
then
  date
  echo "cd to $WORKING_DIR failed for $ITEM (megawarc exited with $result)"
  exit 1
fi
if [[ $result -ne 0 ]] | |||
then | |||
@@ -102,7 +103,7 @@ mayicontinue | |||
# remove files | |||
echo "megawarc OK, removing source files" | |||
rm -rf $ITEM | |||
rm -rf "$PACKER_WORKING_CHUNKS_DIR/$ITEM" | |||
result=$? | |||
if [[ $result -ne 0 ]] | |||
@@ -114,7 +115,7 @@ fi | |||
echo "add to upload queue" | |||
mv $TARGET_DIR/$ITEM $UPLOAD_QUEUE_DIR | |||
mv "$PACKER_WORKING_MEGAWARC_DIR/$ITEM" "$UPLOAD_QUEUE_DIR/" | |||
exit 0 | |||
@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | |||
while [[ -f RUN ]] | |||
do | |||
$SCRIPT_DIR/upload-one $1 | |||
$SCRIPT_DIR/upload-one | |||
result=$? | |||
if [[ $result -ne 0 ]] | |||
then | |||
@@ -1,31 +1,30 @@ | |||
#!/bin/bash | |||
# Uploads megawarcs from the upload queue. | |||
# (Needs a config.sh in the working directory.) | |||
# | |||
# ./upload-one UPLOAD_QUEUE_DIR | |||
# ./upload-one | |||
# | |||
# 1. Grabs an item from UPLOAD_QUEUE_DIR | |||
# 2. Reserves the item by moving the directory to the working directory | |||
# 2. Reserves the item by moving the directory to the | |||
# UPLOADER_WORKING_DIR | |||
# 3. Uploads the item to s3.us.archive.org | |||
# 4. Removes the source files from the working directory | |||
# If COMPLETED_DIR is set, uploaded files are moved there. | |||
# | |||
# The program exits with 1 on any nontransient error. | |||
# | |||
# run from the upload directory /archiveteam/ssd1/uploader-1/ | |||
# | |||
# ./upload-one /archiveteam/ssd1/upload-queue | |||
# | |||
# | |||
UPLOAD_QUEUE_DIR=$1 | |||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | |||
if [ ! -f ./config.sh ] ; then | |||
echo "config.sh not found in current directory." | |||
exit 1 | |||
fi | |||
source ./config.sh | |||
source ./config.sh || exit 1 | |||
mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1 | |||
mkdir -p "$UPLOADER_WORKING_DIR" || exit 1 | |||
if [ ! -z "$COMPLETED_DIR" ] | |||
then | |||
mkdir -p "$COMPLETED_DIR" || exit 1 | |||
fi | |||
function mayicontinue { | |||
echo | |||
@@ -41,11 +40,11 @@ mayicontinue | |||
ITEM=none | |||
while [[ $ITEM = none ]] | |||
do | |||
possible_item=$( ls -1 $UPLOAD_QUEUE_DIR | grep 201 | sort | head -n 1 ) | |||
possible_item=$( ls -1 "$UPLOAD_QUEUE_DIR" | grep 201 | sort | head -n 1 ) | |||
if [[ $possible_item =~ 201 ]] | |||
then | |||
echo "Trying to grab $possible_item" | |||
if mv $UPLOAD_QUEUE_DIR/$possible_item . | |||
if mv "$UPLOAD_QUEUE_DIR/$possible_item" "$UPLOADER_WORKING_DIR/" | |||
then | |||
ITEM=$possible_item | |||
else | |||
@@ -64,14 +63,14 @@ done | |||
echo "$( date ): Start uploading for item $ITEM" >> uploader.log | |||
# upload megawarc | |||
size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" ) | |||
# (upload the large files first to optimise S3 snowballing) | |||
for ext in warc.gz tar json.gz | |||
do | |||
result=1 | |||
while [[ $result -ne 0 ]] | |||
do | |||
filename=${FILE_PREFIX}${ITEM}.megawarc.${ext} | |||
size_hint=$( du --bytes -s ${ITEM}/${filename} | grep -oE "^[0-9]+" ) | |||
filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}" | |||
curl -v --location --fail \ | |||
--speed-limit 1 --speed-time 900 \ | |||
--header "x-archive-queue-derive:1" \ | |||
@@ -83,8 +82,8 @@ do | |||
--header "x-archive-meta-language:eng" \ | |||
--header "x-archive-size-hint:$size_hint" \ | |||
--header "authorization: LOW ${IA_AUTH}" \ | |||
--upload-file ${ITEM}/${filename} \ | |||
http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename} \ | |||
--upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \ | |||
"http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \ | |||
> /dev/null | |||
result=$? | |||
if [[ $result -ne 0 ]] | |||
@@ -105,17 +104,31 @@ echo "$( date ): Completed uploading for item $ITEM" >> uploader.log | |||
mayicontinue | |||
# remove megawarc | |||
rm -rf ${ITEM} | |||
result=$? | |||
if [[ $result -ne 0 ]] | |||
# move or remove megawarc | |||
if [ -z "$COMPLETED_DIR" ] | |||
then | |||
date | |||
echo "rm -rf megawarc exited with $result for $ITEM" | |||
exit 1 | |||
fi | |||
# remove | |||
rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}" | |||
result=$? | |||
if [[ $result -ne 0 ]] | |||
then | |||
date | |||
echo "rm -rf megawarc exited with $result for $ITEM" | |||
exit 1 | |||
fi | |||
else | |||
# move the uploaded megawarc from the uploader working directory
# into COMPLETED_DIR
mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
result=$?
if [[ $result -ne 0 ]]
then
  date
  # Report the command that actually failed (mv, not rm -rf).
  echo "mv megawarc exited with $result for $ITEM"
  exit 1
fi
fi | |||
exit 0 | |||