2012 updated sxsw scraper
import urllib, mechanize
import time
import simplejson
import os
import sys
from multiprocessing import Pool
import pickle
def crawl_sxsw(cookies, user_id):
    """Fetch one SXSW social profile with the pickled login cookies and dump the
    scraped fields to <data_path>/<user_id>.json. data_path and url_root are module
    globals set in __main__ and inherited by the forked pool workers."""
    already_crawled = os.path.exists(data_path + str(user_id) + ".json")
    be_nice = True
    if not already_crawled:
        skip = False
        url = url_root + '/users/' + str(user_id)
        print "\tmaking request for user " + str(user_id)
        # rebuild an authenticated browser in this worker from the pickled cookie jar
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        br._ua_handlers['_cookies'].cookiejar = pickle.loads(cookies)
        try:
            r = br.open(url)
        except:
            print "\t404 for " + str(user_id)
            if be_nice:
                time.sleep(1)
            skip = True
        if not skip:
            s = r.read()
            print "\tparsing user: " + str(user_id)
            # each field is scraped with plain string splits;
            # missing markup just yields an empty value
            try:
                name = s.split("<p class='name'>")[1]\
                        .split("</p>")[0]
            except:
                name = ''
            try:
                company = s.split("<p class='company'>")[1]\
                           .split("<p class='location'>")[0]\
                           .replace('</p>', '')
                company_url, company = ' '.join(company.split()).split('">')
                company = company.replace('</a>', '')
                company_url = company_url.replace('<a href="', '')
            except:
                company = ''
                company_url = ''
            try:
                hometown = s.split("<p class='location'>")[1]\
                            .split("</p>")[0]
            except:
                hometown = ''
            try:
                photo = s.split('id="badge_photo"')[1]\
                         .split('" />')[0].replace(' src="', 'http://sxsocial.sxsw.com')
            except:
                photo = ''
            try:
                bio = s.split("<div class='bio'>")[1]\
                       .split("<p>")[1]\
                       .split("</p>")[0]
            except:
                bio = ''
            try:
                links = s.split("<ul class='social'>")[1]\
                         .split("</ul>")[0]
                links_str = ' '.join(links.split()).split('<li>')
                links = []
                for l in links_str:
                    try:
                        links.append(l.split('<a href="')[1].split('"')[0])
                    except:
                        pass
            except:
                links = []
            try:
                registrant_type = s.split("<p class='registation'>")[1]\
                                   .split("</p>")[0]
                registrant_type = registrant_type.strip()
            except:
                registrant_type = ''
            sxsw_user = {
                'name': name,
                'user_id': user_id,
                'user_url': url,
                'registrant_type': registrant_type,
                'company': company,
                'company_url': company_url,
                'hometown': hometown,
                'photo': photo,
                'bio': bio,
                'links': links,
            }
            # one JSON file per user under data_path (see the loader sketch after the script)
            print "\twriting dump for " + name
            f = open(data_path + str(user_id) + ".json", "w")
            simplejson.dump(sxsw_user, f)
            f.close()
            # record the last crawled id so a later run can resume from here
            f = open(data_path + "placeholder", "w")
            f.write(str(user_id))
            f.close()
            if be_nice:
                time.sleep(4)
    else:
        print "\talready crawled " + str(user_id)
    return "success"
if __name__ == '__main__':
    url_root = 'http://sxsocial.sxsw.com'
    data_path = './data/'
    be_nice = True
    username = os.getenv("USERNAME")
    password = os.getenv("PASSWORD")
    if not username or not password:
        print "please pass USERNAME and PASSWORD in as env variables"
        sys.exit()
    # make sure the output directory exists before any worker tries to write to it
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    # have to guess and fudge these
    # try finding the lowest uid here:
    #   http://sxsocial.sxsw.com/users/290 (play around)
    # then the highest:
    #   http://sxsocial.sxsw.com/users/28000 (play around)
    # (see the probing sketch after this script)
    start_uid = 11999  # 500
    end_uid = 12001    # 28000
    # resume from the last user id recorded by a previous run, if any
    try:
        placeholder = int(open(data_path + "placeholder", 'r').read())
    except:
        placeholder = 0
    print "resuming from user id " + str(placeholder)
    if placeholder > start_uid:
        start_uid = placeholder
    # log in once in the parent process
    print "Logging in"
    br = mechanize.Browser()
    br.open(url_root + '/user_session/new')
    br.select_form(nr=0)
    br["user_session[username]"] = username
    br["user_session[password]"] = password
    r = br.submit()
    #assert username in r.get_data()
    # establish session: pickle the cookie jar so each worker can reuse the login
    cookies = pickle.dumps(br._ua_handlers['_cookies'].cookiejar)
    print "crawling range " + str(start_uid) + " to " + str(end_uid)
    # the forked workers also inherit url_root and data_path from this process
    pool = Pool(processes=10)
    for user_id in range(start_uid, end_uid):
        # note: calling .get() inside the loop waits for each task to finish,
        # so the pool effectively crawls one user at a time
        result = pool.apply_async(crawl_sxsw, [cookies, user_id])
        print result.get()
    #pool.join()
    #for user_id in range(start_uid, end_uid):
    #    crawl_sxsw(cookies, user_id)
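The uid bounds in the script are guesswork, as its comments say. Below is a minimal sketch of one way to bracket them automatically. It assumes you pass in a logged-in mechanize.Browser built exactly like the one in the script (redirects disabled, cookies loaded), so that a missing or inaccessible profile raises on open; the starting guesses and step size are illustrative only, and since uids can be sparse the result is just a rough bracket to refine by hand.

import mechanize

def uid_exists(br, uid, url_root='http://sxsocial.sxsw.com'):
    # br is assumed to be a logged-in mechanize.Browser with redirects disabled,
    # so a missing or inaccessible profile raises instead of bouncing to the login page
    try:
        br.open(url_root + '/users/' + str(uid))
        return True
    except Exception:
        return False

def probe_bounds(br, low_guess=290, high_guess=28000, step=500):
    # walk outward in coarse steps to bracket the range of valid user ids;
    # uids can be sparse, so this only gives a rough bracket
    lowest = low_guess
    while lowest - step > 0 and uid_exists(br, lowest - step):
        lowest -= step
    highest = high_guess
    while uid_exists(br, highest + step):
        highest += step
    return lowest, highest

Run it once with the browser from the login block, e.g. print probe_bounds(br), and plug the result into start_uid and end_uid.

Once a crawl finishes, each profile lives in its own <uid>.json file under ./data/, next to the placeholder file. A small sketch for reading them back into one list, assuming that directory layout:

import os
import simplejson

def load_profiles(data_path='./data/'):
    # read every <uid>.json dump written by crawl_sxsw into a single list of dicts
    profiles = []
    for fname in sorted(os.listdir(data_path)):
        if not fname.endswith('.json'):
            continue  # skips the placeholder file
        f = open(os.path.join(data_path, fname))
        profiles.append(simplejson.load(f))
        f.close()
    return profiles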