#!/bin/bash defaultFormat='{url}' function usage_exit { echo 's3-bucket-list-qwarc [options] BUCKETURL [MARKER...]' >&2 echo >&2 echo 'List the contents of an open S3 (or S3-like) bucket, possibly in parallel using multiple markers' >&2 echo >&2 echo 'Options:' >&2 echo ' --concurrency N Start qwarc with a concurrency of N; the default is to run all markers at once' >&2 echo " --format FORMAT Modify the output format; FORMAT defaults to '${defaultFormat}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2 echo ' --jsonl Enable JSONL output; cannot be used with --format' >&2 echo ' --no-start-marker Disable the automatic marker to start from the beginning of the list' >&2 echo ' --no-end-marker Disable the automatic marker to scan to the end of the list' >&2 echo ' --with-list-urls Enables printing the list URLs retrieved to FD 3' >&2 exit $1 } concurrency= listUrls= noStartMarker= noEndMarker= format= jsonl= cmd="$(printf '%q ' 's3-bucket-list-qwarc' "$@" | sed 's, $,,')" while [[ $# -gt 0 ]] do if [[ "$1" == '--help' || "$1" == '-h' ]] then usage_exit 0 elif [[ "$1" == '--concurrency' ]] then declare -i concurrency="$2" shift elif [[ "$1" == '--format' ]] then format="$2" shift elif [[ "$1" == '--jsonl' ]] then jsonl=1 elif [[ "$1" == '--no-start-marker' ]] then noStartMarker=1 elif [[ "$1" == '--no-end-marker' ]] then noEndMarker=1 elif [[ "$1" == '--with-list-urls' ]] then listUrls='yes' else break fi shift done bucketUrl="$1" shift # Remaining arguments are markers if [[ -z "${concurrency}" ]] then declare -i concurrency=$# if [[ -z "${noStartMarker}" ]]; then concurrency+=1; fi if [[ -z "${noEndMarker}" ]]; then concurrency+=1; fi concurrency+=-1 # Because the obvious -= doesn't work... fi if [[ "${listUrls}" ]] && ! 
{ command >&3; } 2>/dev/null then echo 'Error: --with-list-urls requires FD 3 to be open' >&2 exit 1 fi if [[ "${jsonl}" && "${format}" ]] then echo 'Error: --jsonl and --format options are mutually exclusive' >&2 exit 1 fi if [[ -z "${format}" ]] then format="${defaultFormat}" fi # Validate and process bucket URL if [[ "${bucketUrl}" != http://* && "${bucketUrl}" != https://* ]] then echo 'Invalid bucket URL: must be HTTP or HTTPS' >&2 exit 1 fi if [[ "${bucketUrl}" == *'?'* ]] then echo 'Invalid bucket URL: must not have a query' >&2 exit 1 fi if [[ "${bucketUrl}" != *://*/* || "${bucketUrl}" != */ ]] then bucketUrl="${bucketUrl}/" fi # Construct prefix for files and output prefix="${bucketUrl#*://}" # Remove protocol while [[ "${prefix}" == */ ]]; do prefix="${prefix%/}"; done # Remove trailing slashes prefix="${prefix//\//_}" # Replace slashes with underscores # Ensure no collisions if [[ -e s3-bucket-list-qwarc ]] then echo 'Error: s3-bucket-list-qwarc exists in this directory.' >&2 exit 1 fi if [[ "$(shopt -s nullglob; echo "${prefix}"*)" ]] then echo "Error: there already exist files with the prefix '${prefix}' in this directory." >&2 exit 1 fi # Write the qwarc spec file # Indentation... Inspired by https://stackoverflow.com/a/33817423 readarray code <InternalErrorWe encountered an internal error. Please try again.' in body: self.logger.error(f'Got internal error on {url} on attempt {attempt}; {"retrying" if attempt < 10 else "aborting"}') if attempt >= 10: if 'marker' in params: self.logger.error(f'To retry, use marker {marker!r}') break attempt += 1 continue if not body.startswith(b'\n') and \ not body.startswith(b""): self.logger.error(f'Invalid body on marker {marker!r}: {body[:200]}...') break if b'' in body[:200] and marker is not None: self.logger.error('Marker loop (empty marker in response despite providing one)') break # No risk, no fun! 
contents = body.split(b'') assert all(content.startswith(b'') for content in contents[1:]) assert all(content.endswith(b'') for content in contents[1:-1]) assert contents[-1].endswith(b'') contents[-1] = contents[-1][:-len('')] for content in contents[1:]: key = html.unescape(content[5 : content.index(b'')].decode('utf-8')) # 5 = len(b'') if marker2 is not None and key >= marker2: break fields = {} url = f'{bucketUrl}{urllib.parse.quote(key)}' fields['URL'] = url tags = content.split(b'>') assert len(tags) % 2 == 0 assert tags[-1] == b'' assert tags[-2] == b''.join(openTags).decode('utf-8') assert k not in fields, f'{k!r} encountered twice (previous value: {fields[k]!r})' fields[k] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8')) openTags.pop() continue assert False if 'Size' in fields: fields['Size'] = int(fields['Size']) if jsonl: s = json.dumps(fields) else: s = format.format(**fields, key = key, url = url, size = fields.get('Size')) self.logger.info(f'Output: {s!r}') print(s) lastKey = key if marker2 is not None and lastKey >= marker2: break truncated = True if b'true' in body else (False if b'false' in body else None) assert truncated in (True, False) if not truncated: break if marker is not None and marker == lastKey: self.logger.error('Marker loop (same last key as previous marker)') break marker = lastKey attempt = 1 specDependencies = qwarc.utils.SpecDependencies( files = ['s3-bucket-list-qwarc', os.environ['S3_MARKERS_FILENAME']], extra = {'env': {k: os.environ.get(k) for k in ('S3BL_CMD', 'S3_FORMAT', 'S3_BUCKET_URL', 'S3_MARKERS_FILENAME', 'S3_WITH_LIST_URLS')}} ) EOF printf '%s' "${code[@]# }" >"${prefix}.py" # That's a tab character after the hash. 
# Generate the markers file

# print_markers [MARKER...]
#
# Writes the marker list to stdout, one marker per line. Unless disabled via
# the noStartMarker/noEndMarker globals (set by --no-start-marker and
# --no-end-marker), an empty line is written before/after the explicit
# markers; the empty line represents the automatic start/end marker.
#
# Globals:   noStartMarker (read), noEndMarker (read)
# Arguments: the explicit markers, in order; may be none
# Outputs:   the marker lines on stdout
function print_markers {
	if [[ -z "${noStartMarker}" ]]; then echo; fi
	# printf emits its format once even when it is given no arguments, so an
	# unguarded call would add a spurious empty marker line when no explicit
	# markers were passed. That would also make the number of marker lines
	# disagree with the default concurrency, which is derived from $# plus the
	# automatic markers.
	if [[ $# -gt 0 ]]; then printf '%s\n' "$@"; fi
	if [[ -z "${noEndMarker}" ]]; then echo; fi
}

# The remaining positional arguments are the markers.
print_markers "$@" >"${prefix}-markers"

# Copy this script into the working directory under a fixed name
rsync -a "$0" s3-bucket-list-qwarc

# Collect the environment variables handed to qwarc
envvars=(
	S3BL_CMD="${cmd}"
	S3_FORMAT="${format}"
	S3_JSONL="${jsonl}"
	S3_BUCKET_URL="${bucketUrl}"
	S3_MARKERS_FILENAME="${prefix}-markers"
)
# S3_WITH_LIST_URLS is only exported when --with-list-urls was requested.
if [[ "${listUrls}" ]]; then envvars+=(S3_WITH_LIST_URLS="${listUrls}"); fi

# Lift-off! This stays the last command so that its exit status becomes the script's.
env "${envvars[@]}" qwarc --concurrency "${concurrency}" --database "${prefix}.db" --log "${prefix}.log" --warc "${prefix}" "${prefix}.py"