2012 updated sxsw scraper
import urllib, mechanize
import time
import simplejson
import os
import sys
from multiprocessing import Pool
import pickle
def crawl_sxsw(cookies, user_id):
    """Fetch one SXSW social profile with the pickled login cookies and dump the
    scraped fields to <data_path>/<user_id>.json. data_path and url_root are module
    globals set in __main__ and inherited by the forked pool workers."""
    already_crawled = os.path.exists(data_path + str(user_id) + ".json")
    be_nice = True
    if not already_crawled:
        skip = False
        url = url_root + '/users/' + str(user_id)
        print "\tmaking request for user " + str(user_id)
        # rebuild an authenticated browser in this worker from the pickled cookie jar
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        br._ua_handlers['_cookies'].cookiejar = pickle.loads(cookies)
        try:
            r = br.open(url)
        except:
            print "\t404 for " + str(user_id)
            if be_nice:
                time.sleep(1)
            skip = True
        if not skip:
            s = r.read()
            print "\tparsing user: " + str(user_id)
            # each field is scraped with plain string splits;
            # missing markup just yields an empty value
            try:
                name = s.split("<p class='name'>")[1]\
                        .split("</p>")[0]
            except:
                name = ''
            try:
                company = s.split("<p class='company'>")[1]\
                           .split("<p class='location'>")[0]\
                           .replace('</p>', '')
                company_url, company = ' '.join(company.split()).split('">')
                company = company.replace('</a>', '')
                company_url = company_url.replace('<a href="', '')
            except:
                company = ''
                company_url = ''
            try:
                hometown = s.split("<p class='location'>")[1]\
                            .split("</p>")[0]
            except:
                hometown = ''
            try:
                photo = s.split('id="badge_photo"')[1]\
                         .split('" />')[0].replace(' src="', 'http://sxsocial.sxsw.com')
            except:
                photo = ''
            try:
                bio = s.split("<div class='bio'>")[1]\
                       .split("<p>")[1]\
                       .split("</p>")[0]
            except:
                bio = ''
            try:
                links = s.split("<ul class='social'>")[1]\
                         .split("</ul>")[0]
                links_str = ' '.join(links.split()).split('<li>')
                links = []
                for l in links_str:
                    try:
                        links.append(l.split('<a href="')[1].split('"')[0])
                    except:
                        pass
            except:
                links = []
            try:
                registrant_type = s.split("<p class='registation'>")[1]\
                                   .split("</p>")[0]
                registrant_type = registrant_type.strip()
            except:
                registrant_type = ''
            sxsw_user = {
                'name': name,
                'user_id': user_id,
                'user_url': url,
                'registrant_type': registrant_type,
                'company': company,
                'company_url': company_url,
                'hometown': hometown,
                'photo': photo,
                'bio': bio,
                'links': links,
            }
            # one JSON file per user under data_path (see the loader sketch after the script)
            print "\twriting dump for " + name
            f = open(data_path + str(user_id) + ".json", "w")
            simplejson.dump(sxsw_user, f)
            f.close()
            # record the last crawled id so a later run can resume from here
            f = open(data_path + "placeholder", "w")
            f.write(str(user_id))
            f.close()
            if be_nice:
                time.sleep(4)
    else:
        print "\talready crawled " + str(user_id)
    return "success"
if __name__ == '__main__':
    url_root = 'http://sxsocial.sxsw.com'
    data_path = './data/'
    be_nice = True
    username = os.getenv("USERNAME")
    password = os.getenv("PASSWORD")
    if not username or not password:
        print "please pass USERNAME and PASSWORD in as env variables"
        sys.exit()
    # make sure the output directory exists before any worker tries to write to it
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    # have to guess and fudge these
    # try finding the lowest uid here:
    #   http://sxsocial.sxsw.com/users/290 (play around)
    # then the highest:
    #   http://sxsocial.sxsw.com/users/28000 (play around)
    # (see the probing sketch after this script)
    start_uid = 11999  # 500
    end_uid = 12001    # 28000
    # resume from the last user id recorded by a previous run, if any
    try:
        placeholder = int(open(data_path + "placeholder", 'r').read())
    except:
        placeholder = 0
    print "resuming from user id " + str(placeholder)
    if placeholder > start_uid:
        start_uid = placeholder
    # log in once in the parent process
    print "Logging in"
    br = mechanize.Browser()
    br.open(url_root + '/user_session/new')
    br.select_form(nr=0)
    br["user_session[username]"] = username
    br["user_session[password]"] = password
    r = br.submit()
    #assert username in r.get_data()
    # establish session: pickle the cookie jar so each worker can reuse the login
    cookies = pickle.dumps(br._ua_handlers['_cookies'].cookiejar)
    print "crawling range " + str(start_uid) + " to " + str(end_uid)
    # the forked workers also inherit url_root and data_path from this process
    pool = Pool(processes=10)
    for user_id in range(start_uid, end_uid):
        # note: calling .get() inside the loop waits for each task to finish,
        # so the pool effectively crawls one user at a time
        result = pool.apply_async(crawl_sxsw, [cookies, user_id])
        print result.get()
    #pool.join()
    #for user_id in range(start_uid, end_uid):
    #    crawl_sxsw(cookies, user_id)
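The uid bounds in the script are guesswork, as its comments say. Below is a minimal sketch of one way to bracket them automatically. It assumes you pass in a logged-in mechanize.Browser built exactly like the one in the script (redirects disabled, cookies loaded), so that a missing or inaccessible profile raises on open; the starting guesses and step size are illustrative only, and since uids can be sparse the result is just a rough bracket to refine by hand.

import mechanize

def uid_exists(br, uid, url_root='http://sxsocial.sxsw.com'):
    # br is assumed to be a logged-in mechanize.Browser with redirects disabled,
    # so a missing or inaccessible profile raises instead of bouncing to the login page
    try:
        br.open(url_root + '/users/' + str(uid))
        return True
    except Exception:
        return False

def probe_bounds(br, low_guess=290, high_guess=28000, step=500):
    # walk outward in coarse steps to bracket the range of valid user ids;
    # uids can be sparse, so this only gives a rough bracket
    lowest = low_guess
    while lowest - step > 0 and uid_exists(br, lowest - step):
        lowest -= step
    highest = high_guess
    while uid_exists(br, highest + step):
        highest += step
    return lowest, highest

Run it once with the browser from the login block, e.g. print probe_bounds(br), and plug the result into start_uid and end_uid.

Once a crawl finishes, each profile lives in its own <uid>.json file under ./data/, next to the placeholder file. A small sketch for reading them back into one list, assuming that directory layout:

import os
import simplejson

def load_profiles(data_path='./data/'):
    # read every <uid>.json dump written by crawl_sxsw into a single list of dicts
    profiles = []
    for fname in sorted(os.listdir(data_path)):
        if not fname.endswith('.json'):
            continue  # skips the placeholder file
        f = open(os.path.join(data_path, fname))
        profiles.append(simplejson.load(f))
        f.close()
    return profiles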