Created
November 21, 2013 09:27
-
-
Save dengshilong/7578553 to your computer and use it in GitHub Desktop.
抓取百度风云榜的热词。在 urls.txt 中每行写入一个需要抓取的二级分类链接(例如 http://top.baidu.com/buzz?b=18),抓取到的热词会按每个链接一行、以制表符分隔的形式写入 words.txt。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: UTF-8 -*- | |
import urllib2 | |
import re | |
from datetime import date | |
def get_page(url, timeout=30):
    """Fetch the raw content of a web page.

    The request is sent with a browser-like User-Agent and a Baidu Referer
    header. Any failure (malformed URL, network error, timeout, ...) is
    printed and reported to the caller as ``None`` instead of raising, so a
    crawl over many URLs can keep going.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on blocking network operations before
            giving up. Without it a stalled server would hang the crawl.

    Returns:
        The undecoded response body, or ``None`` if the page could not be
        retrieved.
    """
    # urllib2 became urllib.request in Python 3; resolve whichever exists so
    # the function works on both interpreters.
    try:
        import urllib2 as request_lib
    except ImportError:
        import urllib.request as request_lib

    try:
        print("crawling %s" % url)
        headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
                   "Referer": 'http://www.baidu.com'}
        req = request_lib.Request(url, headers=headers)  # attach the headers
        # To go through a proxy:
        # req.set_proxy("125.216.144.199:8080", 'http')
        response = request_lib.urlopen(req, timeout=timeout)
        try:
            return response.read()
        finally:
            # Release the underlying connection even if read() fails.
            response.close()
    except Exception as e:
        # Deliberately broad: fetching is best-effort and callers only
        # distinguish "got content" from "got None".
        print(e)
        print("can't get page %s" % url)
        return None
def get_sub_categorys(url):
    """Collect the second-level category links of a first-level category page.

    Args:
        url: A first-level category link such as
            http://top.baidu.com/category?c=1.

    Returns:
        A list of ``(link, category name)`` tuples, where each link looks
        like http://top.baidu.com/buzz?b=338. Returns ``None`` when the page
        could not be downloaded or has no ``<div id="flist"`` section.
    """
    content = get_page(url)
    if content is None:
        # get_page() reports a failed download as None; passing that to
        # re.search() would raise TypeError.
        return None
    m = re.search(r'<div id="flist"([\s\S]+?)</div>', content)
    if not m:
        return None
    # Only look for links inside the category list section.
    links = re.findall(r'(buzz\?b=[\d]+)[^>]+>([^<]+)', m.group(1))
    root = 'http://top.baidu.com/'
    return [(root + path, name) for path, name in links]
def get_buzz_word(url):
    """Extract the hot words listed on a second-level category page.

    Args:
        url: A second-level category link such as
            http://top.baidu.com/buzz?b=18.

    Returns:
        A list with the text of every ``list-title`` link on the page. The
        list is empty when nothing matches or when the page could not be
        downloaded.
    """
    content = get_page(url)
    if content is None:
        # get_page() reports a failed download as None; return an empty
        # list so callers can still join/iterate the result.
        return []
    return re.findall(r'<a class="list-title" target="_blank" href=".+?">(.+?)</a>', content)
if __name__ == "__main__":
    # Read one second-level category URL per line from urls.txt and write the
    # hot words of each page to words.txt as a single tab-separated line.
    # Both files are managed by `with` so they are closed even if a page
    # fails midway through the run.
    with open('words.txt', 'w') as fw:
        with open('urls.txt', 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not URLs.
                    continue
                words = get_buzz_word(line)
                fw.write('\t'.join(words) + '\n')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
http://top.baidu.com/buzz?b=258 | |
http://top.baidu.com/buzz?b=3 | |
http://top.baidu.com/buzz?b=22 | |
http://top.baidu.com/buzz?b=18 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment