From 7cfca954152352c234b0df2077ebdfcca872c093 Mon Sep 17 00:00:00 2001
From: Alard <alard@example.com>
Date: Sun, 21 Apr 2013 16:55:01 +0200
Subject: [PATCH] Move directory configuration to config file.

---
 README.md                    | 18 ++++++++-
 chunk-many                   | 17 +++++++++
 chunker                      | 29 ++++++++++-----
 config.example.sh            | 67 +++++++++++++++++++++++++++++++---
 pack-multiple-without-upload |  2 +-
 pack-one-without-upload      | 53 ++++++++++++++-------------
 upload-multiple              |  2 +-
 upload-one                   | 71 +++++++++++++++++++++---------------
 8 files changed, 186 insertions(+), 73 deletions(-)
 create mode 100644 chunk-many

diff --git a/README.md b/README.md
index b526888..78f8e9e 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,23 @@ Filesystems 1 and 2 do not have to be the same.
 
 Configuration
 -------------
-See each script for the configuration options and arguments. Part of the configuration is in the `config.sh` file that you should place in the working directory of each script. Another part of the configuration is in the script's arguments. The working directory itself is also important for some of the scripts.
+Copy `config.example.sh` to `config.sh`, customise it, and place it in the directory where you start the scripts. The comments in `config.example.sh` describe each setting.
+
+
+Running
+-------
+In `screen`, `tmux` or something similar, run the scripts:
+
+* `./chunk-many` (run exactly one)
+* `./pack-multiple-without-upload` (you may run more than one)
+* `./upload-multiple` (you may run more than one)
+
+`touch RUN` before you start the scripts. Use `rm RUN` to stop gracefully.
+
+
+Recovering from errors
+----------------------
+The scripts are designed not to lose data. If a script dies, you can look in its working directory for in-progress items and move them back to the queue.
 
 
 Requirements
diff --git a/chunk-many b/chunk-many
new file mode 100644
index 0000000..91f179d
--- /dev/null
+++ b/chunk-many
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Runs chunker repeatedly for as long as a file named RUN exists in the current
+# directory; exits with chunker's status if a run fails. See chunker for details.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+while [[ -f RUN ]]
+do
+  "$SCRIPT_DIR/chunker"
+  result=$?
+  if [[ $result -ne 0 ]]
+  then
+    date
+    echo "chunker exited with $result"
+    exit $result
+  fi
+  sleep 1 # pause between runs so an idle chunker is not restarted in a tight loop
+done
diff --git a/chunker b/chunker
index de0fbd2..ae979aa 100755
--- a/chunker
+++ b/chunker
@@ -8,12 +8,22 @@
 # can be moved somewhere else. Remember this when running Rsync.
 #
 
-PATH_TO_UPLOADS=$1 # /home/archiveteam/uploads
-PATH_TO_TARGET=$2  # /home/archiveteam/processed
-MAX_MEGABYTES=$((1024*25))
+INCOMING_UPLOADS_DIR=$1 # /home/archiveteam/uploads
+CHUNKER_WORKING_DIR=$2  # /home/archiveteam/processed
+PACKING_QUEUE_DIR="$CHUNKER_WORKING_DIR/archive"
+MEGABYTES_PER_CHUNK=$((1024*25))
+
+# no upload directory on the command line: take the settings from ./config.sh
+if [[ -z "$INCOMING_UPLOADS_DIR" ]]
+then
+  source ./config.sh || exit 1
+fi
+
+mkdir -p "$CHUNKER_WORKING_DIR" || exit 1
+mkdir -p "$PACKING_QUEUE_DIR" || exit 1
 
 # find every .warc.gz in the upload directory
-find "$PATH_TO_UPLOADS" -type f -name "*.warc.gz" \
+find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \
 | while read filename
 do
   # skip partial uploads
@@ -24,18 +34,17 @@ do
 
   # move to the current/ directory
   echo "Moving ${filename}"
-  mkdir -p "${PATH_TO_TARGET}/current"
-  mv "${filename}" "${PATH_TO_TARGET}/current/"
+  mkdir -p "$CHUNKER_WORKING_DIR/current"
+  mv "${filename}" "$CHUNKER_WORKING_DIR/current/"
 
   # if the current/ directory is large enough,
   # rename it to archive-XXXXX and start a new current/
-  cur_size=$( du -BM -s "${PATH_TO_TARGET}/current" | grep -oE "^[0-9]+" )
-  if [[ $cur_size -gt $MAX_MEGABYTES ]]
+  cur_size=$( du -BM -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )
+  if [[ $cur_size -gt $MEGABYTES_PER_CHUNK ]]
   then
     timestamp=$( date +'%Y%m%d%H%M%S' )
     echo "Current archive is full, moving to ${timestamp}."
-    mkdir -p "${PATH_TO_TARGET}/archive"
-    mv "${PATH_TO_TARGET}/current" "${PATH_TO_TARGET}/archive/${timestamp}"
+    mv "$CHUNKER_WORKING_DIR/current" "$PACKING_QUEUE_DIR/${timestamp}"
   fi
 done
 
diff --git a/config.example.sh b/config.example.sh
index 784d768..e03465e 100755
--- a/config.example.sh
+++ b/config.example.sh
@@ -1,10 +1,16 @@
 #!/bin/bash
-# Create this config.sh and copy it to the working directories of the
-# packing and upload scripts.
+# Copy this file to config.sh, customise it and place it in the directory
+# from which you start the chunker, packing and upload scripts.
 
-echo "config.sh not customised."
-exit 1
+####################
+# CHUNKER SETTINGS #
+####################
+# start a new chunk when the current chunk is at least this large
+MEGABYTES_PER_CHUNK=$((1024*25))
 
+###################
+# UPLOAD METADATA #
+###################
 # your Archive.org S3 keys
 IA_AUTH="ACCESS_KEY:SECRET"
 
@@ -21,5 +27,56 @@ IA_ITEM_PREFIX="archiveteam_todo_"
 FILE_PREFIX="todo_"
 
 # the date field for the item
-IA_ITEM_DATE="2013-04"
+IA_ITEM_DATE=$( date +"%Y-%m" )
+
+
+
+###############
+# DIRECTORIES #
+###############
+# Put your directories on one or two filesystems (see README).
+FS1_BASE_DIR="/archiveteam/ssd/project"
+FS2_BASE_DIR="/archiveteam/disk/project"
+
+## THESE DIRECTORIES ON FILESYSTEM 1: for warcs
+
+# the rsync upload directory
+# (the chunker will package the .warc.gz files in this directory)
+INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"
+
+# the chunker working directory
+# (this directory will hold the current in-progress chunk)
+CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"
+
+# the chunker output directory / the packer queue
+# (this directory will hold the completed chunks)
+PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"
+
+# the packer working directory - warc side
+# (this directory will hold the current chunk)
+PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"
+
+## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs
+
+# the packer working directory - megawarc side
+# (this directory will hold the current megawarc)
+PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"
+
+# the packer output directory / the upload queue
+# (this directory will hold the completed megawarcs)
+UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"
+
+# the uploader working directory
+# (this directory will hold the current megawarc)
+UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"
+
+# the final destination for uploaded megawarcs
+# leave this empty to remove megawarcs after uploading
+COMPLETED_DIR="${FS2_BASE_DIR}/uploaded"
+
+
+# Remove the two lines below once you have customised this file.
+echo "config.sh not customised."
+exit 1
+
 
diff --git a/pack-multiple-without-upload b/pack-multiple-without-upload
index 0ec18e7..f976e50 100755
--- a/pack-multiple-without-upload
+++ b/pack-multiple-without-upload
@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 while [[ -f RUN ]]
 do
-  $SCRIPT_DIR/pack-one-without-upload $1 $2 $3
+  "$SCRIPT_DIR/pack-one-without-upload"
   result=$?
   if [[ $result -ne 0 ]]
   then
diff --git a/pack-one-without-upload b/pack-one-without-upload
index b4bd748..63d40e7 100755
--- a/pack-one-without-upload
+++ b/pack-one-without-upload
@@ -1,33 +1,34 @@
 #!/bin/bash
 # Feeds the upload queue with megawarcs.
+# (Needs a config.sh in the working directory.)
 #
-#   ./pack-one PROCESSED_DIR TARGET_DIR UPLOAD_QUEUE_DIR
+#   ./pack-one
 #
-# 1. Grabs an item from PROCESSED_DIR
-# 2. Reserves the item by moving the directory to the working directory
-# 3. Makes a megawarc in the TARGET_DIR
+# 1. Grabs an item from PACKING_QUEUE_DIR
+# 2. Reserves the item by moving the directory to the
+#    PACKER_WORKING_CHUNKS_DIR
+# 3. Makes a megawarc in the PACKER_WORKING_MEGAWARC_DIR
 # 4. Removes the source files from the working directory
-# 5. Moves the megawarc to UPLOAD_QUEUE_DIR
+# 5. Moves the megawarc to the UPLOAD_QUEUE_DIR
 #
 # The program exits with 1 on any nontransient error.
 #
-# run from the packer directory /archiveteam/packer-1/
-# 
-#   ./pack-one /archiveteam/processed/archive /archiveteam/ssd1/packer-1 /archiveteam/ssd1/upload-queue
-#
-
-PROCESSED_DIR=$1
-TARGET_DIR=$2
-UPLOAD_QUEUE_DIR=$3
 
+WORKING_DIR="$( pwd )"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 MEGAWARC=$SCRIPT_DIR/megawarc/megawarc
 
-if [ ! -f ./config.sh ] ; then
-  echo "config.sh not found in current directory."
+if [[ ! -x $MEGAWARC ]]
+then
+  echo "$MEGAWARC does not exist or is not executable."
   exit 1
 fi
-source ./config.sh
+
+source ./config.sh || exit 1
+
+mkdir -p "$PACKER_WORKING_CHUNKS_DIR" || exit 1
+mkdir -p "$PACKER_WORKING_MEGAWARC_DIR" || exit 1
+mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1
 
 
 function mayicontinue {
@@ -37,9 +38,6 @@ function mayicontinue {
 # echo
 }
 
-mkdir -p $TARGET_DIR
-mkdir -p $UPLOAD_QUEUE_DIR
-
 
 # check if the upload queue is empty
 # if [ "$( ls -A $UPLOAD_QUEUE_DIR )" ]
@@ -53,15 +51,15 @@ mkdir -p $UPLOAD_QUEUE_DIR
 mayicontinue
 
 
-# try to grab a directory from /archiveteam/processed/archive/
+# try to grab a directory from the packing queue
 ITEM=none
 while [[ $ITEM = none ]]
 do
-  possible_item=$( ls -1 $PROCESSED_DIR/ | grep 201 | sort | head -n 1 )
+  possible_item=$( ls -1 "$PACKING_QUEUE_DIR/" | grep 201 | sort | head -n 1 )
   if [[ $possible_item =~ 201 ]]
   then
     echo "Trying to grab $possible_item"
-    if mv $PROCESSED_DIR/$possible_item .
+    if mv "$PACKING_QUEUE_DIR/$possible_item" "$PACKER_WORKING_CHUNKS_DIR/"
     then
       ITEM=$possible_item
     else
@@ -83,9 +81,12 @@ mayicontinue
 echo "$( date ): Starting megawarc for item $ITEM" >> packer.log
 
 # construct a megawarc
-mkdir -p $TARGET_DIR/$ITEM
-$MEGAWARC --verbose pack $TARGET_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
+mkdir -p "$PACKER_WORKING_MEGAWARC_DIR/$ITEM"
+# megawarcs use relative paths: run from the chunks directory so $ITEM is relative
+cd "$PACKER_WORKING_CHUNKS_DIR/" || exit 1
+"$MEGAWARC" --verbose pack "$PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM}" "$ITEM"
 result=$?
+cd "$WORKING_DIR"
 
 if [[ $result -ne 0 ]]
 then
@@ -102,7 +103,7 @@ mayicontinue
 
 # remove files
 echo "megawarc OK, removing source files"
-rm -rf $ITEM
+rm -rf "$PACKER_WORKING_CHUNKS_DIR/$ITEM"
 result=$?
 
 if [[ $result -ne 0 ]]
@@ -114,7 +115,7 @@ fi
 
 
 echo "add to upload queue"
-mv $TARGET_DIR/$ITEM $UPLOAD_QUEUE_DIR
+mv "$PACKER_WORKING_MEGAWARC_DIR/$ITEM" "$UPLOAD_QUEUE_DIR/"
 
 
 exit 0
diff --git a/upload-multiple b/upload-multiple
index c2707fa..1a12f4d 100755
--- a/upload-multiple
+++ b/upload-multiple
@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 while [[ -f RUN ]]
 do
-  $SCRIPT_DIR/upload-one $1
+  "$SCRIPT_DIR/upload-one"
   result=$?
   if [[ $result -ne 0 ]]
   then
diff --git a/upload-one b/upload-one
index 7e2ed88..d0cad86 100755
--- a/upload-one
+++ b/upload-one
@@ -1,31 +1,30 @@
 #!/bin/bash
 # Uploads megawarcs from the upload queue.
+# (Needs a config.sh in the working directory.)
 #
-#   ./upload-one UPLOAD_QUEUE_DIR
+#   ./upload-one
 #
 # 1. Grabs an item from UPLOAD_QUEUE_DIR
-# 2. Reserves the item by moving the directory to the working directory
+# 2. Reserves the item by moving the directory to the
+#    UPLOADER_WORKING_DIR
 # 3. Uploads the item to s3.us.archive.org
 # 4. Removes the source files from the working directory
+#    If COMPLETED_DIR is set, uploaded files are moved there.
 #
 # The program exits with 1 on any nontransient error.
 #
-# run from the upload directory /archiveteam/ssd1/uploader-1/
-# 
-#   ./upload-one /archiveteam/ssd1/upload-queue
-#
-#
-
-UPLOAD_QUEUE_DIR=$1
 
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
-if [ ! -f ./config.sh ] ; then
-  echo "config.sh not found in current directory."
-  exit 1
-fi
-source ./config.sh
+source ./config.sh || exit 1
 
+mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1
+mkdir -p "$UPLOADER_WORKING_DIR" || exit 1
+
+if [[ -n "$COMPLETED_DIR" ]]
+then
+  mkdir -p "$COMPLETED_DIR" || exit 1
+fi
 
 function mayicontinue {
   echo
@@ -41,11 +40,11 @@ mayicontinue
 ITEM=none
 while [[ $ITEM = none ]]
 do
-  possible_item=$( ls -1 $UPLOAD_QUEUE_DIR | grep 201 | sort | head -n 1 )
+  possible_item=$( ls -1 "$UPLOAD_QUEUE_DIR" | grep 201 | sort | head -n 1 )
   if [[ $possible_item =~ 201 ]]
   then
     echo "Trying to grab $possible_item"
-    if mv $UPLOAD_QUEUE_DIR/$possible_item .
+    if mv "$UPLOAD_QUEUE_DIR/$possible_item" "$UPLOADER_WORKING_DIR/"
     then
       ITEM=$possible_item
     else
@@ -64,14 +63,14 @@ done
 echo "$( date ): Start uploading for item $ITEM" >> uploader.log
 
 # upload megawarc
+size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
 # (upload the large files first to optimise S3 snowballing)
 for ext in warc.gz tar json.gz
 do
   result=1
   while [[ $result -ne 0 ]]
   do
-    filename=${FILE_PREFIX}${ITEM}.megawarc.${ext}
-    size_hint=$( du --bytes -s ${ITEM}/${filename} | grep -oE "^[0-9]+" )
+    filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}"
     curl -v --location --fail \
       --speed-limit 1 --speed-time 900 \
       --header "x-archive-queue-derive:1" \
@@ -83,8 +82,8 @@ do
       --header "x-archive-meta-language:eng" \
       --header "x-archive-size-hint:$size_hint" \
       --header "authorization: LOW ${IA_AUTH}" \
-      --upload-file ${ITEM}/${filename} \
-      http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename} \
+      --upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \
+      "http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
       > /dev/null
     result=$?
     if [[ $result -ne 0 ]]
@@ -105,17 +104,31 @@ echo "$( date ): Completed uploading for item $ITEM" >> uploader.log
 mayicontinue
 
 
-# remove megawarc
-rm -rf ${ITEM}
-result=$?
-
-if [[ $result -ne 0 ]]
+# move or remove megawarc
+if [ -z "$COMPLETED_DIR" ]
 then
-  date
-  echo "rm -rf megawarc exited with $result for $ITEM"
-  exit 1
-fi
+  # remove
+  rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}"
+  result=$?
 
+  if [[ $result -ne 0 ]]
+  then
+    date
+    echo "rm -rf megawarc exited with $result for $ITEM"
+    exit 1
+  fi
+else
+  # move the uploaded megawarc to COMPLETED_DIR instead of deleting it
+  mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
+  result=$?
+
+  if [[ $result -ne 0 ]]
+  then
+    date
+    echo "mv megawarc exited with $result for $ITEM"
+    exit 1
+  fi
+fi
 
 exit 0