From 28ba3a7685f1208468c99cee4c92937750764059 Mon Sep 17 00:00:00 2001 From: Ivan Kozik Date: Mon, 24 Jun 2013 19:19:49 +0000 Subject: [PATCH] Make chunker much faster on big current/ directories by not du'ing it for every file --- chunker | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/chunker b/chunker index ae979aa..12ef5d3 100755 --- a/chunker +++ b/chunker @@ -19,9 +19,14 @@ then source ./config.sh || exit 1 fi +BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK)) + mkdir -p "$CHUNKER_WORKING_DIR" || exit 1 mkdir -p "$PACKING_QUEUE_DIR" || exit 1 +mkdir -p "$CHUNKER_WORKING_DIR/current" || exit 1 +cur_size=$( du -B1 -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" ) + # find every .warc.gz in the upload directory find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \ | while read filename @@ -32,6 +37,8 @@ do continue fi + cur_size=$((cur_size + $( du -B1 -s $filename | grep -oE "^[0-9]+" ))) + # move to the current/ directory echo "Moving ${filename}" mkdir -p "$CHUNKER_WORKING_DIR/current" @@ -39,12 +46,12 @@ do # if the current/ directory is large enough, # rename it to archive-XXXXX and start a new current/ - cur_size=$( du -BM -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" ) - if [[ $cur_size -gt $MEGABYTES_PER_CHUNK ]] + if [[ $cur_size -gt $BYTES_PER_CHUNK ]] then timestamp=$( date +'%Y%m%d%H%M%S' ) echo "Current archive is full, moving to ${timestamp}." mv "$CHUNKER_WORKING_DIR/current" "$PACKING_QUEUE_DIR/${timestamp}" + cur_size=0 fi done