
Move directory configuration to config file.

master
Alard, 11 years ago
parent commit 7cfca95415
8 changed files with 186 additions and 73 deletions
  1. README.md                      +17  -1
  2. chunk-many                     +17  -0
  3. chunker                        +19  -10
  4. config.example.sh              +62  -5
  5. pack-multiple-without-upload   +1   -1
  6. pack-one-without-upload        +27  -26
  7. upload-multiple                +1   -1
  8. upload-one                     +42  -29

README.md  +17 -1

@@ -39,7 +39,23 @@ Filesystems 1 and 2 do not have to be the same.

Configuration
-------------
See each script for the configuration options and arguments. Part of the configuration is in the `config.sh` file that you should place in the working directory of each script. Another part of the configuration is in the script's arguments. The working directory itself is also important for some of the scripts.
Create a configuration file called `config.sh` and place it in the directory where you start the scripts. See the `config.example.sh` for more details.


Running
-------
In `screen`, `tmux` or something similar, run the scripts:

`./chunk-many` (run exactly one)
`./pack-many` (you may run more than one)
`./upload-many` (you may run more than one)

`touch RUN` before you start the scripts. Use `rm RUN` to stop gracefully.


Recovering from errors
----------------------
The scripts are designed not to lose data. If a script dies, you can look in its working directory for in-progress items and move them back to the queue.


Requirements
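The Running section above relies on a `RUN` sentinel file that every `*-many` loop checks between iterations. A minimal start-up sketch, assuming the scripts are invoked from the directory that holds `config.sh`; the placeholder path and tmux session names are illustrative, not part of the repository:

```bash
cd /path/to/working-dir      # directory containing config.sh; RUN will live here
touch RUN                    # the *-many loops keep going while this file exists

tmux new-session -d -s chunk  -c "$PWD" './chunk-many'    # run exactly one
tmux new-session -d -s pack   -c "$PWD" './pack-many'     # you may run several
tmux new-session -d -s upload -c "$PWD" './upload-many'   # you may run several

# graceful stop: each loop exits after finishing its current iteration
rm RUN
```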


chunk-many  +17 -0

@@ -0,0 +1,17 @@
#!/bin/bash
# This loops the chunker script while the RUN file exists.
# See chunker for details.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
$SCRIPT_DIR/chunker
result=$?
if [[ $result -ne 0 ]]
then
date
echo "chunker exited with $result"
exit $result
fi
done


chunker  +19 -10

@@ -8,12 +8,22 @@
# can be moved somewhere else. Remember this when running Rsync.
#

PATH_TO_UPLOADS=$1 # /home/archiveteam/uploads
PATH_TO_TARGET=$2 # /home/archiveteam/processed
MAX_MEGABYTES=$((1024*25))
INCOMING_UPLOADS_DIR=$1 # /home/archiveteam/uploads
CHUNKER_WORKING_DIR=$2 # /home/archiveteam/processed
PACKING_QUEUE_DIR="$CHUNKER_WORKING_DIR/archive"
MEGABYTES_PER_CHUNK=$((1024*25))

# if not specified in command-line arguments
if [ -z $INCOMING_UPLOADS_DIR ]
then
source ./config.sh || exit 1
fi

mkdir -p "$CHUNKER_WORKING_DIR" || exit 1
mkdir -p "$PACKING_QUEUE_DIR" || exit 1

# find every .warc.gz in the upload directory
find "$PATH_TO_UPLOADS" -type f -name "*.warc.gz" \
find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \
| while read filename
do
# skip partial uploads
@@ -24,18 +34,17 @@ do

# move to the current/ directory
echo "Moving ${filename}"
mkdir -p "${PATH_TO_TARGET}/current"
mv "${filename}" "${PATH_TO_TARGET}/current/"
mkdir -p "$CHUNKER_WORKING_DIR/current"
mv "${filename}" "$CHUNKER_WORKING_DIR/current/"

# if the current/ directory is large enough,
# rename it to archive-XXXXX and start a new current/
cur_size=$( du -BM -s "${PATH_TO_TARGET}/current" | grep -oE "^[0-9]+" )
if [[ $cur_size -gt $MAX_MEGABYTES ]]
cur_size=$( du -BM -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )
if [[ $cur_size -gt $MEGABYTES_PER_CHUNK ]]
then
timestamp=$( date +'%Y%m%d%H%M%S' )
echo "Current archive is full, moving to ${timestamp}."
mkdir -p "${PATH_TO_TARGET}/archive"
mv "${PATH_TO_TARGET}/current" "${PATH_TO_TARGET}/archive/${timestamp}"
mv "$CHUNKER_WORKING_DIR/current" "$PACKING_QUEUE_DIR/${timestamp}"
fi
done
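With this change the chunker reads its directories either from the command line, as before, or from `./config.sh`. A brief usage sketch; the explicit paths are the examples from the script's own comments:

```bash
# explicit arguments: incoming uploads dir and chunker working dir
./chunker /home/archiveteam/uploads /home/archiveteam/processed

# no arguments: ./config.sh is sourced and must define INCOMING_UPLOADS_DIR,
# CHUNKER_WORKING_DIR, PACKING_QUEUE_DIR and MEGABYTES_PER_CHUNK
./chunker
```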


config.example.sh  +62 -5

@@ -1,10 +1,16 @@
#!/bin/bash
# Create this config.sh and copy it to the working directories of the
# packing and upload scripts.
# Create a copy of this config.sh, customise it and place it in the
# working directory of the packing and upload scripts.

echo "config.sh not customised."
exit 1
####################
# CHUNKER SETTINGS #
####################
# start a new chunk when the current chunk is at least this large
MEGABYTES_PER_CHUNK=$((1024*25))

###################
# UPLOAD METADATA #
###################
# your Archive.org S3 keys
IA_AUTH="ACCESS_KEY:SECRET"

@@ -21,5 +27,56 @@ IA_ITEM_PREFIX="archiveteam_todo_"
FILE_PREFIX="todo_"

# the date field for the item
IA_ITEM_DATE="2013-04"
IA_ITEM_DATE=$( date +"%Y-%m" )



###############
# DIRECTORIES #
###############
# Put your directories on one or two filesystems (see README).
FS1_BASE_DIR="/archiveteam/ssd/project"
FS2_BASE_DIR="/archiveteam/disk/project"

## THESE DIRECTORIES ON FILESYSTEM 1: for warcs

# the rsync upload directory
# (the chunker will package the .warc.gz files in this directory)
INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"

# the chunker working directory
# (this directory will hold the current in-progress chunk)
CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"

# the chunker output directory / the packer queue
# (this directory will hold the completed chunks)
PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"

# the packer working directory - warc side
# (this directory will hold the current chunk)
PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"

## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs

# the packer working directory - megawarc side
# (this directory will hold the current megawarc)
PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"

# the packer output directory / the upload queue
# (this directory will hold the completed megawarcs)
UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"

# the uploader working directory
# (this directory will hold the current megawarc)
UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"

# the final destination for uploaded megawarcs
# leave this empty to remove megawarcs after uploading
COMPLETED_DIR="${FS2_BASE_DIR}/uploaded"


# remove this
echo "config.sh not customised."
exit 1
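Pulled together, an abridged filled-in `config.sh` based on this template might look as follows. Every key and path is a placeholder, and the item-metadata settings that fall outside this diff are omitted:

```bash
#!/bin/bash
# abridged example config.sh -- all values are placeholders

# chunker
MEGABYTES_PER_CHUNK=$((1024*25))

# upload metadata
IA_AUTH="MYACCESSKEY:MYSECRET"
IA_ITEM_PREFIX="archiveteam_todo_"
FILE_PREFIX="todo_"
IA_ITEM_DATE=$( date +"%Y-%m" )

# directories: filesystem 1 holds warcs, filesystem 2 holds megawarcs
FS1_BASE_DIR="/archiveteam/ssd/project"
FS2_BASE_DIR="/archiveteam/disk/project"

INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"
CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"
PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"
PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"

PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"
UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"
UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"
COMPLETED_DIR=""    # empty: remove megawarcs after a successful upload
```

Note that the trailing `echo`/`exit 1` guard from the template is dropped here, which is what the `# remove this` comment asks for.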



pack-multiple-without-upload  +1 -1

@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
$SCRIPT_DIR/pack-one-without-upload $1 $2 $3
$SCRIPT_DIR/pack-one-without-upload
result=$?
if [[ $result -ne 0 ]]
then


pack-one-without-upload  +27 -26

@@ -1,33 +1,34 @@
#!/bin/bash
# Feeds the upload queue with megawarcs.
# (Needs a config.sh in the working directory.)
#
# ./pack-one PROCESSED_DIR TARGET_DIR UPLOAD_QUEUE_DIR
# ./pack-one
#
# 1. Grabs an item from PROCESSED_DIR
# 2. Reserves the item by moving the directory to the working directory
# 3. Makes a megawarc in the TARGET_DIR
# 1. Grabs an item from PACKING_QUEUE_DIR
# 2. Reserves the item by moving the directory to the
# PACKER_WORKING_CHUNKS_DIR
# 3. Makes a megawarc in the PACKER_WORKING_MEGAWARC_DIR
# 4. Removes the source files from the working directory
# 5. Moves the megawarc to UPLOAD_QUEUE_DIR
# 5. Moves the megawarc to the UPLOAD_QUEUE_DIR
#
# The program exits with 1 on any nontransient error.
#
# run from the packer directory /archiveteam/packer-1/
#
# ./pack-one /archiveteam/processed/archive /archiveteam/ssd1/packer-1 /archiveteam/ssd1/upload-queue
#

PROCESSED_DIR=$1
TARGET_DIR=$2
UPLOAD_QUEUE_DIR=$3

WORKING_DIR="$( pwd )"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
MEGAWARC=$SCRIPT_DIR/megawarc/megawarc

if [ ! -f ./config.sh ] ; then
echo "config.sh not found in current directory."
if [[ ! -x $MEGAWARC ]]
then
echo "$MEGAWARC does not exist or is not executable."
exit 1
fi
source ./config.sh

source ./config.sh || exit 1

mkdir -p "$PACKER_WORKING_CHUNKS_DIR" || exit 1
mkdir -p "$PACKER_WORKING_MEGAWARC_DIR" || exit 1
mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1


function mayicontinue {
@@ -37,9 +38,6 @@ function mayicontinue {
# echo
}

mkdir -p $TARGET_DIR
mkdir -p $UPLOAD_QUEUE_DIR


# check if the upload queue is empty
# if [ "$( ls -A $UPLOAD_QUEUE_DIR )" ]
@@ -53,15 +51,15 @@ mkdir -p $UPLOAD_QUEUE_DIR
mayicontinue


# try to grab a directory from /archiveteam/processed/archive/
# try to grab a directory from the packing queue
ITEM=none
while [[ $ITEM = none ]]
do
possible_item=$( ls -1 $PROCESSED_DIR/ | grep 201 | sort | head -n 1 )
possible_item=$( ls -1 "$PACKING_QUEUE_DIR/" | grep 201 | sort | head -n 1 )
if [[ $possible_item =~ 201 ]]
then
echo "Trying to grab $possible_item"
if mv $PROCESSED_DIR/$possible_item .
if mv "$PACKING_QUEUE_DIR/$possible_item" "$PACKER_WORKING_CHUNKS_DIR/"
then
ITEM=$possible_item
else
@@ -83,9 +81,12 @@ mayicontinue
echo "$( date ): Starting megawarc for item $ITEM" >> packer.log

# construct a megawarc
mkdir -p $TARGET_DIR/$ITEM
$MEGAWARC --verbose pack $TARGET_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
mkdir -p $PACKER_WORKING_MEGAWARC_DIR/$ITEM
# megawarcs use relative paths
cd "$PACKER_WORKING_CHUNKS_DIR/"
$MEGAWARC --verbose pack $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
result=$?
cd "$WORKING_DIR"

if [[ $result -ne 0 ]]
then
@@ -102,7 +103,7 @@ mayicontinue

# remove files
echo "megawarc OK, removing source files"
rm -rf $ITEM
rm -rf "$PACKER_WORKING_CHUNKS_DIR/$ITEM"
result=$?

if [[ $result -ne 0 ]]
@@ -114,7 +115,7 @@ fi


echo "add to upload queue"
mv $TARGET_DIR/$ITEM $UPLOAD_QUEUE_DIR
mv "$PACKER_WORKING_MEGAWARC_DIR/$ITEM" "$UPLOAD_QUEUE_DIR/"


exit 0
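For orientation, after one successful pack the upload queue holds a directory per item containing the three megawarc files that `upload-one` later iterates over. An illustrative layout, assuming `FILE_PREFIX="todo_"` and a chunk stamped `20130401123456` by the chunker (both hypothetical values):

```
upload-queue/
  20130401123456/
    todo_20130401123456.megawarc.warc.gz
    todo_20130401123456.megawarc.tar
    todo_20130401123456.megawarc.json.gz
```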


upload-multiple  +1 -1

@@ -5,7 +5,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
$SCRIPT_DIR/upload-one $1
$SCRIPT_DIR/upload-one
result=$?
if [[ $result -ne 0 ]]
then


upload-one  +42 -29

@@ -1,31 +1,30 @@
#!/bin/bash
# Uploads megawarcs from the upload queue.
# (Needs a config.sh in the working directory.)
#
# ./upload-one UPLOAD_QUEUE_DIR
# ./upload-one
#
# 1. Grabs an item from UPLOAD_QUEUE_DIR
# 2. Reserves the item by moving the directory to the working directory
# 2. Reserves the item by moving the directory to the
# UPLOADER_WORKING_DIR
# 3. Uploads the item to s3.us.archive.org
# 4. Removes the source files from the working directory
# If COMPLETED_DIR is set, uploaded files are moved there.
#
# The program exits with 1 on any nontransient error.
#
# run from the upload directory /archiveteam/ssd1/uploader-1/
#
# ./upload-one /archiveteam/ssd1/upload-queue
#
#

UPLOAD_QUEUE_DIR=$1

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

if [ ! -f ./config.sh ] ; then
echo "config.sh not found in current directory."
exit 1
fi
source ./config.sh
source ./config.sh || exit 1

mkdir -p "$UPLOAD_QUEUE_DIR" || exit 1
mkdir -p "$UPLOADER_WORKING_DIR" || exit 1

if [ ! -z "$COMPLETED_DIR" ]
then
mkdir -p "$COMPLETED_DIR" || exit 1
fi

function mayicontinue {
echo
@@ -41,11 +40,11 @@ mayicontinue
ITEM=none
while [[ $ITEM = none ]]
do
possible_item=$( ls -1 $UPLOAD_QUEUE_DIR | grep 201 | sort | head -n 1 )
possible_item=$( ls -1 "$UPLOAD_QUEUE_DIR" | grep 201 | sort | head -n 1 )
if [[ $possible_item =~ 201 ]]
then
echo "Trying to grab $possible_item"
if mv $UPLOAD_QUEUE_DIR/$possible_item .
if mv "$UPLOAD_QUEUE_DIR/$possible_item" "$UPLOADER_WORKING_DIR/"
then
ITEM=$possible_item
else
@@ -64,14 +63,14 @@ done
echo "$( date ): Start uploading for item $ITEM" >> uploader.log

# upload megawarc
size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
# (upload the large files first to optimise S3 snowballing)
for ext in warc.gz tar json.gz
do
result=1
while [[ $result -ne 0 ]]
do
filename=${FILE_PREFIX}${ITEM}.megawarc.${ext}
size_hint=$( du --bytes -s ${ITEM}/${filename} | grep -oE "^[0-9]+" )
filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}"
curl -v --location --fail \
--speed-limit 1 --speed-time 900 \
--header "x-archive-queue-derive:1" \
@@ -83,8 +82,8 @@ do
--header "x-archive-meta-language:eng" \
--header "x-archive-size-hint:$size_hint" \
--header "authorization: LOW ${IA_AUTH}" \
--upload-file ${ITEM}/${filename} \
http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename} \
--upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \
"http://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
> /dev/null
result=$?
if [[ $result -ne 0 ]]
@@ -105,17 +104,31 @@ echo "$( date ): Completed uploading for item $ITEM" >> uploader.log
mayicontinue


# remove megawarc
rm -rf ${ITEM}
result=$?

if [[ $result -ne 0 ]]
# move or remove megawarc
if [ -z "$COMPLETED_DIR" ]
then
date
echo "rm -rf megawarc exited with $result for $ITEM"
exit 1
fi
# remove
rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}"
result=$?

if [[ $result -ne 0 ]]
then
date
echo "rm -rf megawarc exited with $result for $ITEM"
exit 1
fi
else
# move
mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
result=$?

if [[ $result -ne 0 ]]
then
date
echo "rm -rf megawarc exited with $result for $ITEM"
exit 1
fi
fi

exit 0
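To make the curl loop concrete: the target item name is `IA_ITEM_PREFIX` plus the chunk timestamp, so with the example prefixes `archiveteam_todo_` and `todo_` and a hypothetical item `20130401123456`, the three PUT requests go to URLs of this shape:

```
http://s3.us.archive.org/archiveteam_todo_20130401123456/todo_20130401123456.megawarc.warc.gz
http://s3.us.archive.org/archiveteam_todo_20130401123456/todo_20130401123456.megawarc.tar
http://s3.us.archive.org/archiveteam_todo_20130401123456/todo_20130401123456.megawarc.json.gz
```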

