Skip to content

Instantly share code, notes, and snippets.

@zhangxigithub
Last active August 29, 2015 14:05
Show Gist options
  • Save zhangxigithub/7887286af657a3cc8ab9 to your computer and use it in GitHub Desktop.
Save zhangxigithub/7887286af657a3cc8ab9 to your computer and use it in GitHub Desktop.
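[Improved: multithreaded download] Downloads the pictures from 豆瓣妹子 (Douban Meizi, http://dbmeizi.com). Usage: running `python db.py` creates a "dbmeizi" folder in the current directory and downloads all images into it.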
#!/usr/bin/python
# -*- coding: utf-8 -*-
import Queue
import threading
import os
import time
import random
import urlparse,urllib,urllib2,os,time,threading,Queue
from bs4 import BeautifulSoup
def downloadImage(imageURL):
url = urlparse.urlparse(imageURL)
i = len(url.path) - 1
while i > 0:
if url.path[i] == '/':
break
i = i - 1
filename = url.path[i+1:len(url.path)]
urllib.urlretrieve(imageURL,"./dbmeizi/"+filename);
print filename+"..... done"
def findMM(startIndex, length):
    """Collect the full-size image URLs from a range of dbmeizi.com pages.

    Pages ``startIndex`` up to ``startIndex + length - 1`` are fetched one
    after another from ``http://www.dbmeizi.com/?p=<index>``. On every page
    each ``<div class="pic">`` is inspected and the ``data-bigimg`` attribute
    of its first ``<img>`` is collected. A ``find page <index>`` line is
    printed after each page.

    Args:
        startIndex: Number of the first page to scan.
        length: How many consecutive pages to scan. Zero or a negative value
            scans nothing.

    Returns:
        list: The image URLs in page order (empty when nothing was found).

    Raises:
        Exception: Errors raised by ``urllib2.urlopen`` or while reading the
            response are propagated unchanged.
    """
    picList = []
    for index in range(startIndex, startIndex + length):
        response = urllib2.urlopen("http://www.dbmeizi.com/?p=" + str(index))
        try:
            htmlString = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        soup = BeautifulSoup(htmlString)
        for person in soup.findAll("div", {"class": "pic"}):
            mz = person.find("img")
            if mz is None:
                # A "pic" container without an image: nothing to download.
                continue
            picURL = mz.get("data-bigimg")
            if picURL:
                picList.append(picURL)
        print("find page " + str(index))
    return picList
class Download(threading.Thread):
    """Worker thread that downloads image URLs taken from a shared queue.

    ``run`` never returns, so instances are meant to be started as daemon
    threads and waited for through the queue's ``join()`` rather than
    through ``Thread.join()``.
    """

    # NOTE: no class-level ``name`` attribute is defined here on purpose. It
    # would shadow the ``threading.Thread.name`` property, and names assigned
    # by callers would then never reach the real thread name.

    def __init__(self, que):
        """Remember the queue this worker consumes.

        Args:
            que: Queue of image URLs; must provide ``get()`` and
                ``task_done()``.
        """
        threading.Thread.__init__(self)
        self.que = que

    def run(self):
        """Download queued URLs one at a time, forever."""
        while True:
            host = self.que.get()
            try:
                downloadImage(host)
            except Exception as err:
                # Worker boundary: one broken URL must not kill the thread,
                # otherwise the remaining queue items are never processed.
                print("download failed: %r (%r)" % (host, err))
            finally:
                # Always acknowledge the item; a missing task_done() makes
                # the producer's Queue.join() block forever.
                self.que.task_done()
def main(startIndex=0, length=900, threadCount=10):
    """Scrape the image URLs and download them with a pool of worker threads.

    First all image URLs of the requested pages are collected with
    ``findMM``. Then ``threadCount`` daemon ``Download`` workers are started,
    every URL is put on a shared queue, and the call blocks until each queued
    URL has been acknowledged by a worker.

    Args:
        startIndex: First page number passed to ``findMM`` (default 0).
        length: Number of pages passed to ``findMM`` (default 900).
        threadCount: Number of download threads to start (default 10).

    Raises:
        ValueError: If ``threadCount`` is smaller than 1; without any worker
            the final ``join()`` would never return.
    """
    if threadCount < 1:
        raise ValueError("threadCount must be at least 1")
    theList = findMM(startIndex, length)
    q = Queue.Queue(0)  # maxsize 0: the queue is unbounded
    for x in range(threadCount):
        d = Download(q)
        d.name = str(x)
        # Daemon threads: the workers loop forever and must not keep the
        # process alive once the queue has been drained.
        d.daemon = True
        d.start()
    for url in theList:
        q.put(url)
    q.join()


# Run the crawl only when executed as a script ("python db.py"), not when the
# module is merely imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment