Last active
May 23, 2021 16:59
-
-
Save jimmygle/f679dc814254074b05f1f6a09c3e68f5 to your computer and use it in GitHub Desktop.
wget one-liners and helper functions for mass file downloads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Annotated mirror command. Replace "URL" with the address to download.
# The options live in an array because a comment cannot follow a
# line-continuation backslash: "\ # ..." escapes the space, not the newline,
# so the command would end on that line and the remaining options would be
# run as separate commands.
wget_args=(
  --mirror                # Recursive download (infinite depth)
  --no-parent             # Don't ascend to parent directory
  --continue              # Resume partially downloaded files
  --user-agent="thanks"   # Sets user agent seen by server
  --wait=2                # Wait n seconds between requests
  --reject="index.html*"  # Rejected file patterns
)
wget "${wget_args[@]}" "URL"
# One-liner: the same mirror command written with short options.
wget -m -np -c -U "thanks" -w 2 -R "index.html*" "URL"
# Download in parallel in the background with outputs to the log.
# xargs starts one wget per line of urls.txt; -b makes each wget detach
# immediately, so the downloads overlap. -I{} replaces the deprecated -i.
xargs -I{} wget -m -np -b -c -U "thanks" -R "index.html*" "{}" < urls.txt
# Get a raw look into the status of wget-log* files (last 5 lines of each).
tail -n 5 wget-log*
# [WIP] Gets formatted status from wget-log* files.
#
# Current behaviour: reads lines from stdin and prints only the per-file
# header lines that `tail` emits for wget logs ("==> wget-log.N <==").
# The formatted status shown below is the intended output, not yet implemented.
#
# Usage:
#   tail -n 5 wget-log* | wgetstatus
#
# Intended output example:
#   wget-log.2
#     Status: FINISHED | 2021-05-23 05:30:24 | 60 files, 26G in 5h 51m 12s (1.25 MB/s)
#     Last: https://www.google.com/robots.txt
#   wget-log.4
#     Status: 90%
#     Current: https://www.google.com/sitemap.xml
#   wget-log.5
#     Status: ERROR
#     Message: Cannot write to ‘~/Downloads/Google/index.html.tmp’ (Bad file descriptor).
#
# Inputs:  stdin - log text, one line at a time
# Outputs: stdout - every input line containing a wget-log header marker
# Returns: 0
wgetstatus() {
  local line
  # Accept both "==> wget-log" (what tail prints) and the space-less
  # "==>wget-log" spelling the original pattern used.
  local -r header_re='==>[[:space:]]*wget-log'

  #tail -n 5 wget-log*
  # IFS= keeps leading/trailing whitespace; the extra -n test handles a final
  # line that is not newline-terminated.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ $line =~ $header_re ]]; then
      # printf rather than echo, so lines such as "-n" are printed verbatim.
      printf '%s\n' "$line"
    fi
  done
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment