
Make chunker much faster on big current/ directories by not du'ing it for every file

master
Ivan Kozik, 10 years ago
commit 28ba3a7685
1 changed file with 9 additions and 2 deletions
chunker

@@ -19,9 +19,14 @@ then
 	source ./config.sh || exit 1
 fi
 
+BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK))
+
 mkdir -p "$CHUNKER_WORKING_DIR" || exit 1
 mkdir -p "$PACKING_QUEUE_DIR" || exit 1
 
+mkdir -p "$CHUNKER_WORKING_DIR/current" || exit 1
+cur_size=$( du -B1 -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )
+
 # find every .warc.gz in the upload directory
 find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \
 | while read filename
@@ -32,6 +37,8 @@ do
 		continue
 	fi
 
+	cur_size=$((cur_size + $( du -B1 -s "$filename" | grep -oE "^[0-9]+" )))
+
 	# move to the current/ directory
 	echo "Moving ${filename}"
 	mkdir -p "$CHUNKER_WORKING_DIR/current"
@@ -39,12 +46,12 @@ do
 
 	# if the current/ directory is large enough,
 	# rename it to archive-XXXXX and start a new current/
-	cur_size=$( du -BM -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )
-	if [[ $cur_size -gt $MEGABYTES_PER_CHUNK ]]
+	if [[ $cur_size -gt $BYTES_PER_CHUNK ]]
 	then
 		timestamp=$( date +'%Y%m%d%H%M%S' )
 		echo "Current archive is full, moving to ${timestamp}."
 		mv "$CHUNKER_WORKING_DIR/current" "$PACKING_QUEUE_DIR/${timestamp}"
+		cur_size=0
 	fi
 done
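
For context on the speedup: the old loop re-ran du over the whole current/ directory after every file it moved in, so each iteration re-walked everything accumulated so far and the total work grew roughly quadratically with the number of files per chunk. The new code scans the directory once at startup and afterwards only adds each incoming file's own size to a running byte counter. A minimal standalone sketch of the same pattern follows; WORK_DIR, LIMIT_BYTES, and the incoming/*.warc.gz input set are illustrative assumptions, not names from the real script.

#!/bin/bash
# Sketch only: measure the directory once, then keep the size current by
# adding each file's size as it arrives (no repeated whole-tree scans).

WORK_DIR="./current"               # hypothetical working directory
LIMIT_BYTES=$((64 * 1024 * 1024))  # hypothetical 64 MiB chunk limit

mkdir -p "$WORK_DIR"

# One full scan at startup; after this, cur_size is maintained incrementally.
cur_size=$( du -B1 -s "$WORK_DIR" | grep -oE "^[0-9]+" )

for f in ./incoming/*.warc.gz      # hypothetical input files
do
	[ -e "$f" ] || continue    # skip the unexpanded glob if dir is empty

	# du on one file is cheap; du on the whole directory is not
	cur_size=$((cur_size + $( du -B1 -s "$f" | grep -oE "^[0-9]+" )))
	mv "$f" "$WORK_DIR/"

	if [[ $cur_size -gt $LIMIT_BYTES ]]
	then
		mv "$WORK_DIR" "./packed-$( date +'%Y%m%d%H%M%S' )"
		mkdir -p "$WORK_DIR"
		cur_size=0         # the fresh current/ starts empty
	fi
done

Note the matching unit change in the diff above: the threshold is now BYTES_PER_CHUNK, precomputed in bytes, because the running counter is kept in bytes (du -B1) rather than re-derived in megabytes with du -BM on every pass.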

