Last active
January 10, 2025 12:07
-
-
Save SinclairCoder/bfcf64176711cf86f8551eaef0280387 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Downloads the WARC files of one Common Crawl snapshot.
# The snapshot is picked from DATES using SLURM_PROCID as the index, so each
# SLURM task works on a different snapshot. Index 0 is used when
# SLURM_PROCID is not set.

# Common URL prefix prepended to every relative path in the listing.
PREFIX="https://data.commoncrawl.org/"
# Counter to keep track of the number of downloaded files
# (not referenced anywhere else in this script).
COUNTER=0
# Line counter incremented by the read loop further down.
LINE_NUMBER=0
DATES=("2014-35" "2014-41" "2014-42" "2014-49" "2014-52" "2015-06" "2015-11" "2015-14" "2015-18" "2015-22")

# Validate the task index before using it: an out-of-range index would
# otherwise produce an empty DATE and a bogus "CC-MAIN-" working directory.
NODE_ID=${SLURM_PROCID:-0}
if [[ ! "$NODE_ID" =~ ^[0-9]+$ ]] || ((10#$NODE_ID >= ${#DATES[@]})); then
  printf 'error: SLURM_PROCID=%s is not a valid index into DATES (0-%d)\n' \
    "$NODE_ID" "$((${#DATES[@]} - 1))" >&2
  exit 1
fi
# Force base 10 so a value with leading zeros is not read as octal.
NODE_ID=$((10#$NODE_ID))
DATE=${DATES[$NODE_ID]}

# Everything below runs inside the per-snapshot directory; stop if it cannot
# be created or entered so files never land in the wrong place.
mkdir -p "CC-MAIN-${DATE}" || exit 1
cd "CC-MAIN-${DATE}" || exit 1
#######################################
# Download a single file with axel, retrying on failure.
# Globals:   PREFIX (read) - base URL prepended to the relative path
# Arguments: $1 - path of the file relative to PREFIX
# Outputs:   a diagnostic on stderr when every attempt has failed
# Returns:   0 as soon as one attempt succeeds, 1 if all attempts fail
#######################################
download_file() {
  local url="${PREFIX}${1}"
  local -r max_attempts=3
  local -r user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
  local attempt

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # -n 128 lets axel open up to 128 connections to speed up the download;
    # -U sets the User-Agent header.
    if axel -n 128 -U "$user_agent" "$url"; then
      return 0
    fi
    # Wait before retrying; sleeping after the final attempt would only
    # delay the failure report.
    if ((attempt < max_attempts)); then
      sleep 60s
    fi
  done

  printf 'download_file: giving up on %s after %d attempts\n' \
    "$url" "$max_attempts" >&2
  return 1
}
# Make the function and its base URL visible to the bash processes that
# xargs spawns.
export -f download_file
export PREFIX

# The listing lives next to the per-snapshot directory we are currently in.
PATHS_FILE="../CC-MAIN-${DATE}_warc.paths"
if [[ ! -r "$PATHS_FILE" ]]; then
  printf 'error: cannot read path listing %s\n' "$PATHS_FILE" >&2
  exit 1
fi

# Read each line from the listing and hand it to xargs, which runs up to two
# downloads in parallel. The "|| [[ -n ... ]]" keeps a final line that has no
# trailing newline.
while IFS= read -r LINE || [[ -n "$LINE" ]]; do
  ((LINE_NUMBER++))
  # Uncomment the following to skip lines past a given line number.
  # if [ $LINE_NUMBER -gt 10001 ]; then
  #   continue
  # fi
  printf '%s\n' "$LINE"
done < "$PATHS_FILE" | xargs -I {} -P 2 bash -c 'download_file "$@"' _ {}
# Alternative final stage with a line-count progress meter and 10 workers:
# done < "$PATHS_FILE" | pv -l | xargs -I {} -P 10 bash -c 'download_file "$@"' _ {}

# The pipeline's status is that of xargs, which is non-zero when any
# invocation failed.
status=$?
if ((status != 0)); then
  printf 'error: some downloads failed (xargs exit status %d)\n' "$status" >&2
  exit "$status"
fi
echo "Downloaded files"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment