Skip to content

Instantly share code, notes, and snippets.

@songouyang
Created May 18, 2019 10:27
Show Gist options
  • Save songouyang/01c3e8db141f7cbb4b5589f9c99e94e4 to your computer and use it in GitHub Desktop.
Save songouyang/01c3e8db141f7cbb4b5589f9c99e94e4 to your computer and use it in GitHub Desktop.
多线程下载器
import re
from os import path
import requests
from threading import Lock
from six.moves.urllib.parse import unquote, urlparse
from concurrent.futures import ThreadPoolExecutor, wait
class Downloader(object):
    """Download a file over HTTP by fetching byte ranges in parallel threads.

    ``run`` asks the server for the file name and size with HEAD requests,
    pre-allocates the target file (relative to the current working
    directory) and lets each worker fill in its own slice of the file with
    an HTTP ``Range`` request.
    """

    # Name used when neither the headers nor the URL yield a file name.
    DEFAULT_FILE_NAME = 'download'
    # Number of bytes read from the response body per write.
    CHUNK_SIZE = 64 * 1024

    # Grammar fragments for the Content-Disposition header (RFC 6266),
    # extended parameter values (RFC 5987) and language tags (RFC 5646).
    # Raw strings keep the regex escapes intact.
    _TOKEN = r"[-!#-'*+.\dA-Z^-z|~]+"
    _QDTEXT = r"[]-~\t !#-[]"
    _MIME_CHARSET = r"[-!#-&+\dA-Z^-z]+"
    _LANGUAGE = (
        r"(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}(?:-[A-Za-z]{3}){,2})?|[A-Za-z]{4,8})"
        r"(?:-[A-Za-z]{4})?"
        # The region subtag is optional in RFC 5646 ("en" is a valid tag).
        r"(?:-(?:[A-Za-z]{2}|\d{3}))?"
        r"(?:-(?:[\dA-Za-z]{5,8}|\d[\dA-Za-z]{3}))*"
        r"(?:-[\dA-WY-Za-wy-z](?:-[\dA-Za-z]{2,8})+)*"
        r"(?:-[Xx](?:-[\dA-Za-z]{1,8})+)?"
        r"|[Xx](?:-[\dA-Za-z]{1,8})+"
        r"|[Ee][Nn]-[Gg][Bb]-[Oo][Ee][Dd]"
        r"|[Ii]-[Aa][Mm][Ii]"
        r"|[Ii]-[Bb][Nn][Nn]"
        r"|[Ii]-[Dd][Ee][Ff][Aa][Uu][Ll][Tt]"
        r"|[Ii]-[Ee][Nn][Oo][Cc][Hh][Ii][Aa][Nn]"
        r"|[Ii]-[Hh][Aa][Kk]"
        r"|[Ii]-[Kk][Ll][Ii][Nn][Gg][Oo][Nn]"
        r"|[Ii]-[Ll][Uu][Xx]"
        r"|[Ii]-[Mm][Ii][Nn][Gg][Oo]"
        r"|[Ii]-[Nn][Aa][Vv][Aa][Jj][Oo]"
        r"|[Ii]-[Pp][Ww][Nn]"
        r"|[Ii]-[Tt][Aa][Oo]"
        r"|[Ii]-[Tt][Aa][Yy]"
        r"|[Ii]-[Tt][Ss][Uu]"
        r"|[Ss][Gg][Nn]-[Bb][Ee]-[Ff][Rr]"
        r"|[Ss][Gg][Nn]-[Bb][Ee]-[Nn][Ll]"
        r"|[Ss][Gg][Nn]-[Cc][Hh]-[Dd][Ee]"
    )
    # Percent-encoded octets may use upper- or lower-case hex digits.
    _VALUE_CHARS = r"(?:%[\dA-Fa-f][\dA-Fa-f]|[-!#$&+.\dA-Z^-z|~])*"
    # One disposition parameter. Capture groups, in order:
    #   1 filename as token, 2 filename as quoted-string,
    #   3 filename* charset, 4 filename* percent-encoded value.
    _DISPOSITION_PARM = (
        r'[Ff][Ii][Ll][Ee][Nn][Aa][Mm][Ee]\s*=\s*'
        r'(?:({token})|"((?:{qdtext}|\\[\t !-~])*)")'
        r"|[Ff][Ii][Ll][Ee][Nn][Aa][Mm][Ee]\*\s*=\s*"
        r"({mime_charset})'(?:{language})?'({value_chars})"
        r'|{token}\s*=\s*(?:{token}|"(?:{qdtext}|\\[\t !-~])*")'
        r"|{token}\*\s*=\s*{mime_charset}'(?:{language})?'{value_chars}"
    ).format(
        token=_TOKEN,
        qdtext=_QDTEXT,
        mime_charset=_MIME_CHARSET,
        language=_LANGUAGE,
        value_chars=_VALUE_CHARS,
    )
    # The parameter pattern appears twice, so the first parameter owns
    # groups 1-4 and the last of the following parameters owns groups 5-8.
    _CONTENT_DISPOSITION_RE = re.compile(
        r"(?:{token}\s*;\s*)?(?:{parm})(?:\s*;\s*(?:{parm}))*|{token}".format(
            token=_TOKEN, parm=_DISPOSITION_PARM))
    # A backslash-escaped character inside a quoted-string.
    _QUOTED_PAIR_RE = re.compile(r'\\(.)')

    def __init__(self, workers_num=8):
        """Create the HTTP session and a pool of ``workers_num`` threads."""
        self.session = requests.session()
        self.pool = ThreadPoolExecutor(max_workers=workers_num)
        self.workers_num = workers_num

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Shut down the worker pool and close the HTTP session."""
        self.pool.shutdown(wait=True)
        self.session.close()

    @staticmethod
    def _decode_ext_value(value, charset):
        """Percent-decode an RFC 5987 value; return None if it cannot be decoded."""
        try:
            try:
                # Python 3: unquote decodes the octets with the given charset.
                return unquote(value, encoding=charset, errors='strict')
            except TypeError:
                # Python 2: unquote returns a byte string to decode manually.
                return unquote(value).decode(charset)
        except (LookupError, UnicodeError):
            # Unknown charset or invalid byte sequence.
            return None

    def _name_from_match(self, match):
        """Return the preferred file name captured by the header regex, or None."""
        # Extended ``filename*`` values take precedence over plain ones, and
        # later parameters (groups 5-8) over the first one (groups 1-4).
        for value_group, charset_group in ((8, 7), (4, 3)):
            value = match.group(value_group)
            if value is not None:
                decoded = self._decode_ext_value(
                    value, match.group(charset_group))
                if decoded:
                    return decoded
        for quoted_group, token_group in ((6, 5), (2, 1)):
            quoted = match.group(quoted_group)
            if quoted is not None:
                # Undo quoted-pair escaping: \" -> " and \\ -> \
                return self._QUOTED_PAIR_RE.sub(r'\1', quoted)
            token = match.group(token_group)
            if token is not None:
                return token
        return None

    def get_file_name(self, url):
        """Return the local file name to use for ``url``.

        The name is taken from the ``Content-Disposition`` header of a HEAD
        response when it carries a usable ``filename``/``filename*``
        parameter, otherwise from the last segment of the URL path, and
        finally from ``DEFAULT_FILE_NAME``. Only the base name is returned,
        so directory components sent by the server are dropped.
        """
        rsp = self.session.head(url, allow_redirects=True)
        name = None
        header = rsp.headers.get('Content-Disposition')
        if header is not None:
            match = self._CONTENT_DISPOSITION_RE.match(header)
            if match:
                name = self._name_from_match(match)
        if name:
            name = path.basename(name)
        if not name:
            name = path.basename(unquote(urlparse(url).path))
        return name or self.DEFAULT_FILE_NAME

    def get_file_size(self, url):
        """Return the size in bytes announced by the server for ``url``.

        Redirects are followed by the HTTP session itself, which also
        resolves relative ``Location`` headers and limits redirect chains.

        Raises:
            KeyError: if the response has no ``content-length`` header.
        """
        rsp = self.session.head(url, allow_redirects=True)
        return int(rsp.headers["content-length"])

    def handler(self, url, file_name, start, end):
        """Download bytes ``start``..``end`` (inclusive) into ``file_name``.

        The file must already exist; the data is written at offset ``start``.

        Raises:
            IOError: if the server does not answer with a partial response
                for a range that does not start at the beginning of the file.
        """
        headers = {'Range': 'bytes={}-{}'.format(start, end)}
        rsp = self.session.get(url, headers=headers, stream=True)
        try:
            rsp.raise_for_status()
            # A 200 reply carries the whole file; writing it at a non-zero
            # offset would corrupt the output.
            if rsp.status_code != 206 and start != 0:
                raise IOError(
                    'server ignored range request (status {})'.format(
                        rsp.status_code))
            with open(file_name, 'rb+') as f:
                f.seek(start)
                # Stream in chunks so a part is never held fully in memory.
                for chunk in rsp.iter_content(chunk_size=self.CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
        finally:
            rsp.close()

    @staticmethod
    def _split_ranges(file_size, parts):
        """Split ``file_size`` bytes into at most ``parts`` inclusive ranges."""
        if file_size <= 0:
            return []
        # Never create more parts than there are bytes.
        parts = max(1, min(parts, file_size))
        part = file_size // parts
        ranges = []
        for i in range(parts):
            start = i * part
            if i == parts - 1:
                # HTTP ranges are inclusive; the last byte is file_size - 1.
                end = file_size - 1
            else:
                end = start + part - 1
            ranges.append((start, end))
        return ranges

    def run(self, url):
        """Download ``url`` into the current directory using all workers.

        Exceptions raised inside a worker are re-raised here once every
        worker has finished.
        """
        file_name = self.get_file_name(url)
        file_size = self.get_file_size(url)
        # Pre-allocate the file so workers can write at their own offsets.
        with open(file_name, 'wb') as f:
            f.truncate(file_size)
        futures = [
            self.pool.submit(self.handler, url, file_name, start, end)
            for start, end in self._split_ranges(file_size, self.workers_num)
        ]
        wait(futures)
        for future in futures:
            # Surface worker failures instead of silently dropping them.
            future.result()
if __name__ == "__main__":
    import time

    # Demo: download a few sample files one after another and print the
    # total wall-clock time in seconds.
    demo_urls = (
        "http://demo.borland.com/testsite/downloads/downloadfile.php?file=Small.zip&cd=attachment+filename",
        "http://static-aliyun-doc.oss-cn-hangzhou.aliyuncs.com/download/pdf/DNHCS_MGW1842487_zh-CN_cn_181112170048_public_92408e650bbaaab8b146f371082a0ac3.pdf",
        "https://p.pstatp.com/origin/ff670000482866725305",
    )
    downloader = Downloader()
    started_at = time.time()
    for demo_url in demo_urls:
        downloader.run(demo_url)
    print(time.time() - started_at)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment