Move directory configuration to config file.

11 years ago · 7cfca95415
--- a/README.md
+++ b/README.md
@@ -39,7 +39,23 @@ Filesystems 1 and 2 do not have to be the same.

 Configuration
 -------------
 See each script for the configuration options and arguments. Part of the configuration is in the `config.sh` file that you should place in the working directory of each script. Another part of the configuration is in the script's arguments. The working directory itself is also important for some of the scripts.
 Create a configuration file called `config.sh` and place it in the directory where you start the scripts. See `config.example.sh` for more details.


 Running
 -------
 In `screen`, `tmux` or something similar, run the scripts:

 `./chunk-many` (run exactly one)
 `./pack-many` (you may run more than one)
 `./upload-many` (you may run more than one)

 `touch RUN` before you start the scripts. Use `rm RUN` to stop gracefully.


 Recovering from errors
 ----------------------
 The scripts are designed not to lose data. If a script dies, you can look in its working directory for in-progress items and move them back to the queue.


 Requirements
--- a/+ 17
+++ b/+ 17
@@ -0,0 +1,17 @@
 #!/bin/bash
 # Runs the chunker script repeatedly for as long as a file named RUN
 # exists in the current working directory (`touch RUN` to enable,
 # `rm RUN` to stop once the current pass has finished).
 # See chunker for details.
 #
 # Exits with the chunker's exit status as soon as one pass fails.

 # Directory containing this script; chunker is looked up next to it.
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

 while [[ -f RUN ]]
 do
  # Quoted so an install path containing whitespace is not word-split.
  "$SCRIPT_DIR/chunker"
  result=$?
  if [[ $result -ne 0 ]]
  then
    date
    echo "chunker exited with $result"
    exit "$result"
  fi
 done

--- a/+ 19
+++ b/+ 19
@@ -8,12 +8,22 @@
 # can be moved somewhere else. Remember this when running Rsync.
 #

 PATH_TO_UPLOADS=$1 # /home/archiveteam/uploads
 PATH_TO_TARGET=$2  # /home/archiveteam/processed
 MAX_MEGABYTES=$((1024*25))
 INCOMING_UPLOADS_DIR=$1 # /home/archiveteam/uploads
 CHUNKER_WORKING_DIR=$2  # /home/archiveteam/processed
 PACKING_QUEUE_DIR="$CHUNKER_WORKING_DIR/archive"
 MEGABYTES_PER_CHUNK=$((1024*25))

 # Fall back to config.sh in the current directory when the uploads
 # directory was not given as a command-line argument.
 # The expansion is quoted so that a path containing spaces cannot turn
 # the test into a malformed `[` expression.
 if [[ -z "$INCOMING_UPLOADS_DIR" ]]
 then
  source ./config.sh || exit 1
 fi

 # Make sure the working and queue directories exist before files are moved.
 mkdir -p "$CHUNKER_WORKING_DIR" || exit 1
 mkdir -p "$PACKING_QUEUE_DIR" || exit 1

 # find every .warc.gz in the upload directory
 find "$PATH_TO_UPLOADS" -type f -name "*.warc.gz" \
 find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \
 | while read filename
 do
  # skip partial uploads
@@ -24,18 +34,17 @@ do

  # move to the current/ directory
  echo "Moving ${filename}"
  mkdir -p "${PATH_TO_TARGET}/current"
  mv "${filename}" "${PATH_TO_TARGET}/current/"
  mkdir -p "$CHUNKER_WORKING_DIR/current"
  mv "${filename}" "$CHUNKER_WORKING_DIR/current/"

  # if the current/ directory is large enough,
  # rename it to archive-XXXXX and start a new current/
  cur_size=$( du -BM -s "${PATH_TO_TARGET}/current" | grep -oE "^[0-9]+" )
  if [[ $cur_size -gt $MAX_MEGABYTES ]]
  cur_size=$( du -BM -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )
  if [[ $cur_size -gt $MEGABYTES_PER_CHUNK ]]
  then
    timestamp=$( date +'%Y%m%d%H%M%S' )
    echo "Current archive is full, moving to ${timestamp}."
    mkdir -p "${PATH_TO_TARGET}/archive"
    mv "${PATH_TO_TARGET}/current" "${PATH_TO_TARGET}/archive/${timestamp}"
    mv "$CHUNKER_WORKING_DIR/current" "$PACKING_QUEUE_DIR/${timestamp}"
  fi
 done

--- a/config.example.sh
+++ b/config.example.sh
@@ -1,10 +1,16 @@
 #!/bin/bash
 # Create this config.sh and copy it to the working directories of the
 # packing and upload scripts.
 # Copy this file to config.sh, customise it and place it in the
 # working directory of the chunker, packing and upload scripts.

 echo "config.sh not customised."
 exit 1
 ####################
 # CHUNKER SETTINGS #
 ####################
 # start a new chunk when the current chunk is at least this large
 MEGABYTES_PER_CHUNK=$((1024*25))

 ###################
 # UPLOAD METADATA #
 ###################
 # your Archive.org S3 keys
 IA_AUTH="ACCESS_KEY:SECRET"

@@ -21,5 +27,56 @@ IA_ITEM_PREFIX="archiveteam_todo_"
 FILE_PREFIX="todo_"

 # the date field for the item
 IA_ITEM_DATE="2013-04"
 IA_ITEM_DATE=$( date +"%Y-%m" )



 ###############
 # DIRECTORIES #
 ###############
 # Put your directories on one or two filesystems (see README).
 FS1_BASE_DIR="/archiveteam/ssd/project"
 FS2_BASE_DIR="/archiveteam/disk/project"

 ## THESE DIRECTORIES ON FILESYSTEM 1: for warcs

 # the rsync upload directory
 # (the chunker will package the .warc.gz files in this directory)
 INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"

 # the chunker working directory
 # (this directory will hold the current in-progress chunk)
 CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"

 # the chunker output directory / the packer queue
 # (this directory will hold the completed chunks)
 PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"

 # the packer working directory - warc side
 # (this directory will hold the current chunk)
 PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"

 ## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs

 # the packer working directory - megawarc side
 # (this directory will hold the current megawarc)
 PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"

 # the packer output directory / the upload queue
 # (this directory will hold the completed megawarcs)
 UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"

 # the uploader working directory
 # (this directory will hold the current megawarc)
 UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"

 # the final destination for uploaded megawarcs
 # leave this empty to remove megawarcs after uploading
 COMPLETED_DIR="${FS2_BASE_DIR}/uploaded"


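 # Remove the two lines below once you have customised this file;
 # until then every script that sources config.sh stops here.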
 echo "config.sh not customised."
 exit 1


--- a/+ 1
+++ b/+ 1
@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

 while [[ -f RUN ]]
 do
  $SCRIPT_DIR/pack-one-without-upload $1 $2 $3
  $SCRIPT_DIR/pack-one-without-upload
  result=$?
  if [[ $result -ne 0 ]]
  then
--- a/+ 27
+++ b/+ 27
@@ -1,33 +1,34 @@
 #!/bin/bash
 # Feeds the upload queue with megawarcs.
 # (Needs a config.sh in the working directory.)
 #
 #   ./pack-one PROCESSED_DIR TARGET_DIR UPLOAD_QUEUE_DIR
 #   ./pack-one
 #
 # 1. Grabs an item from PROCESSED_DIR
 # 2. Reserves the item by moving the directory to the working directory
 # 3. Makes a megawarc in the TARGET_DIR
 # 1. Grabs an item from PACKING_QUEUE_DIR
 # 2. Reserves the item by moving the directory to the
 #    PACKER_WORKING_CHUNKS_DIR
 # 3. Makes a megawarc in the PACKER_WORKING_MEGAWARC_DIR
 # 4. Removes the source files from the working directory
 # 5. Moves the megawarc to UPLOAD_QUEUE_DIR
 # 5. Moves the megawarc to the UPLOAD_QUEUE_DIR
 #
 # The program exits with 1 on any nontransient error.
 #
 # run from the packer directory /archiveteam/packer-1/
 # 
 #   ./pack-one /archiveteam/processed/archive /archiveteam/ssd1/packer-1 /archiveteam/ssd1/upload-queue
 #

 PROCESSED_DIR=$1
 TARGET_DIR=$2
 UPLOAD_QUEUE_DIR=$3

 WORKING_DIR="$( pwd )"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 MEGAWARC=$SCRIPT_DIR/megawarc/megawarc

 if [ ! -f ./config.sh ] ; then
  echo "config.sh not found in current directory."
 if [[ ! -x $MEGAWARC ]]
 then
  echo "$MEGAWARC does not exist or is not executable."
  exit 1
 fi
 source ./config.sh

 source ./config.sh || exit 1

 mkdir -p "$PACKER_WORKING_CHUNKS_DIR" || exit 1
 mkdir -p "$PACKER_WORKING_MEGAWARC_DIR" || exit 1
 mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1


 function mayicontinue {
@@ -37,9 +38,6 @@ function mayicontinue {
 # echo
 }

 mkdir -p $TARGET_DIR
 mkdir -p $UPLOAD_QUEUE_DIR


 # check if the upload queue is empty
 # if [ "$( ls -A $UPLOAD_QUEUE_DIR )" ]
@@ -53,15 +51,15 @@ mkdir -p $UPLOAD_QUEUE_DIR
 mayicontinue


 # try to grab a directory from /archiveteam/processed/archive/
 # try to grab a directory from the packing queue
 ITEM=none
 while [[ $ITEM = none ]]
 do
  possible_item=$( ls -1 $PROCESSED_DIR/ | grep 201 | sort | head -n 1 )
  possible_item=$( ls -1 "$PACKING_QUEUE_DIR/" | grep 201 | sort | head -n 1 )
  if [[ $possible_item =~ 201 ]]
  then
    echo "Trying to grab $possible_item"
    if mv $PROCESSED_DIR/$possible_item .
    if mv "$PACKING_QUEUE_DIR/$possible_item" "$PACKER_WORKING_CHUNKS_DIR/"
    then
      ITEM=$possible_item
    else
@@ -83,9 +81,12 @@ mayicontinue
 echo "$( date ): Starting megawarc for item $ITEM" >> packer.log

 # construct a megawarc
 mkdir -p $TARGET_DIR/$ITEM
 $MEGAWARC --verbose pack $TARGET_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
 # Create the output directory for this item's megawarc.
 mkdir -p "$PACKER_WORKING_MEGAWARC_DIR/$ITEM"
 # megawarcs use relative paths, so pack from inside the chunks directory
 # and refer to the item by its bare name. Only run megawarc when the cd
 # succeeded; otherwise it would look for $ITEM in the wrong directory.
 if cd "$PACKER_WORKING_CHUNKS_DIR/"
 then
  "$MEGAWARC" --verbose pack "$PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM}" "$ITEM"
  result=$?
 else
  # treated like a megawarc failure by the result check that follows
  result=1
 fi
 # Return to $WORKING_DIR.
 cd "$WORKING_DIR"

 if [[ $result -ne 0 ]]
 then
@@ -102,7 +103,7 @@ mayicontinue

 # remove files
 echo "megawarc OK, removing source files"
 rm -rf $ITEM
 rm -rf "$PACKER_WORKING_CHUNKS_DIR/$ITEM"
 result=$?

 if [[ $result -ne 0 ]]
@@ -114,7 +115,7 @@ fi


 echo "add to upload queue"
 mv $TARGET_DIR/$ITEM $UPLOAD_QUEUE_DIR
 mv "$PACKER_WORKING_MEGAWARC_DIR/$ITEM" "$UPLOAD_QUEUE_DIR/"


 exit 0
--- a/+ 1
+++ b/+ 1
@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

 while [[ -f RUN ]]
 do
  $SCRIPT_DIR/upload-one $1
  $SCRIPT_DIR/upload-one
  result=$?
  if [[ $result -ne 0 ]]
  then
--- a/+ 42
+++ b/+ 42
@@ -1,31 +1,30 @@
 #!/bin/bash
 # Uploads megawarcs from the upload queue.
 # (Needs a config.sh in the working directory.)
 #
 #   ./upload-one UPLOAD_QUEUE_DIR
 #   ./upload-one
 #
 # 1. Grabs an item from UPLOAD_QUEUE_DIR
 # 2. Reserves the item by moving the directory to the working directory
 # 2. Reserves the item by moving the directory to the
 #    UPLOADER_WORKING_DIR
 # 3. Uploads the item to s3.us.archive.org
 # 4. Removes the source files from the working directory
 #    If COMPLETED_DIR is set, uploaded files are moved there.
 #
 # The program exits with 1 on any nontransient error.
 #
 # run from the upload directory /archiveteam/ssd1/uploader-1/
 # 
 #   ./upload-one /archiveteam/ssd1/upload-queue
 #
 #

 UPLOAD_QUEUE_DIR=$1

 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

 if [ ! -f ./config.sh ] ; then
  echo "config.sh not found in current directory."
  exit 1
 fi
 source ./config.sh
 source ./config.sh || exit 1

 mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1
 mkdir -p "$UPLOADER_WORKING_DIR" || exit 1

 if [ ! -z "$COMPLETED_DIR" ]
 then
  mkdir -p "$COMPLETED_DIR" || exit 1
 fi

 function mayicontinue {
  echo
@@ -41,11 +40,11 @@ mayicontinue
 ITEM=none
 while [[ $ITEM = none ]]
 do
  possible_item=$( ls -1 $UPLOAD_QUEUE_DIR | grep 201 | sort | head -n 1 )
  possible_item=$( ls -1 "$UPLOAD_QUEUE_DIR" | grep 201 | sort | head -n 1 )
  if [[ $possible_item =~ 201 ]]
  then
    echo "Trying to grab $possible_item"
    if mv $UPLOAD_QUEUE_DIR/$possible_item .
    if mv "$UPLOAD_QUEUE_DIR/$possible_item" "$UPLOADER_WORKING_DIR/"
    then
      ITEM=$possible_item
    else
@@ -64,14 +63,14 @@ done
 echo "$( date ): Start uploading for item $ITEM" >> uploader.log

 # upload megawarc
 size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
 # (upload the large files first to optimise S3 snowballing)
 for ext in warc.gz tar json.gz
 do
  result=1
  while [[ $result -ne 0 ]]
  do
    filename=${FILE_PREFIX}${ITEM}.megawarc.${ext}
    size_hint=$( du --bytes -s ${ITEM}/${filename} | grep -oE "^[0-9]+" )
    filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}"
    curl -v --location --fail \
      --speed-limit 1 --speed-time 900 \
      --header "x-archive-queue-derive:1" \
@@ -83,8 +82,8 @@ do
      --header "x-archive-meta-language:eng" \
      --header "x-archive-size-hint:$size_hint" \
      --header "authorization: LOW ${IA_AUTH}" \
      --upload-file ${ITEM}/${filename} \
      http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename} \
      --upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \
      "http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
      > /dev/null
    result=$?
    if [[ $result -ne 0 ]]
@@ -105,17 +104,31 @@ echo "$( date ): Completed uploading for item $ITEM" >> uploader.log
 mayicontinue


 # remove megawarc
 rm -rf ${ITEM}
 result=$?

 if [[ $result -ne 0 ]]
 # move or remove megawarc
 if [ -z "$COMPLETED_DIR" ]
 then
  date
  echo "rm -rf megawarc exited with $result for $ITEM"
  exit 1
 fi
  # remove
  rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}"
  result=$?

  if [[ $result -ne 0 ]]
  then
    date
    echo "rm -rf megawarc exited with $result for $ITEM"
    exit 1
  fi
 else
  # move: keep the uploaded megawarc by relocating it into COMPLETED_DIR
  mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
  result=$?

  if [[ $result -ne 0 ]]
  then
    date
    # name the command that actually failed in this branch (mv, not rm -rf)
    echo "mv megawarc exited with $result for $ITEM"
    exit 1
  fi
 fi

 exit 0