Created January 13, 2014 09:49
Crawler and data extractor for airbnb.com website
MAX_THREADS = 5
delay = 0.5  # seconds to wait between requests, per thread

import psycopg2
import re
import time
import threading
import urllib2
import urlparse

from bs4 import BeautifulSoup

start_url = 'http://airbnb.com'
room_regex = re.compile(r'^http://airbnb\.com/rooms/[0-9]+$')

connection = psycopg2.connect(database="urls",
                              host="100.86.226.62",
                              port="5432",
                              user="postgres",
                              password="pgpassword")
cursor = connection.cursor()


class Worker(threading.Thread):
    def __init__(self, n, Q):
        threading.Thread.__init__(self)
        self.n = n
        self.Q = Q
        self.visited = set()
        # Each worker gets its own connection: psycopg2 connections
        # must not be shared across threads without locking.
        self.connection = psycopg2.connect(database="urls",
                                           host="100.86.226.62",
                                           port="5432",
                                           user="postgres",
                                           password="pgpassword")
        self.cursor = self.connection.cursor()

    def process_url(self, url):
        self.visited.add(url)
        try:
            soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(url)))
            page_urls = soup.findAll('a', href=True)
            try:
                self.cursor.execute("UPDATE airbnb_urls SET crawled = %s WHERE url = %s;",
                                    (True, url))
                self.connection.commit()
            except psycopg2.Error:
                # A failed statement aborts the transaction; roll back so
                # the connection stays usable.
                self.connection.rollback()
            for tag in page_urls:
                new_url = urlparse.urljoin(url, tag['href'])
                if (start_url in new_url and new_url not in self.visited
                        and '?' not in new_url and '#' not in new_url):
                    try:
                        is_room = re.match(room_regex, new_url) is not None
                        self.cursor.execute("INSERT INTO airbnb_urls (url, crawled, is_room) VALUES (%s, %s, %s)",
                                            (new_url, False, is_room))
                        self.connection.commit()
                        self.visited.add(new_url)
                    except psycopg2.Error:
                        # Typically a duplicate url; discard it and move on.
                        self.connection.rollback()
        except urllib2.HTTPError as e:
            print "something wrong with {}".format(url)
            print "{} : {}".format(e.code, e.reason)

    def run(self):
        while len(self.Q) > 0:
            u = self.Q.pop()
            print "Thread #", self.n
            print "Popped ", u
            print "In queue: ", len(self.Q)
            if u not in self.visited:
                self.visited.add(u)
                self.process_url(u)
            time.sleep(delay)
        print "Queue is empty\nFINISHED"


while True:
    cursor.execute("SELECT COUNT(url) FROM airbnb_urls WHERE crawled = FALSE;")
    if cursor.fetchone()[0] == 0:
        print "Nothing to crawl"
        break
    JOBS = []
    for i in xrange(MAX_THREADS):
        # Hand each worker its own 1000-row slice of the uncrawled frontier.
        cursor.execute("SELECT url FROM airbnb_urls WHERE crawled = FALSE LIMIT 1000 OFFSET {};".format(i * 1000))
        Q = [row[0] for row in cursor.fetchall()]
        if len(Q) > 0:
            j = Worker(i, Q)
            JOBS.append(j)
            j.start()
    for j in JOBS:
        j.join()
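Both files assume an airbnb_urls table that the gist never defines. Below is a minimal sketch of a schema that fits the columns the code touches (url, crawled, is_room); the PRIMARY KEY on url is what makes duplicate INSERTs fail, which the worker handles by rolling back. The exact schema is an assumption, not part of the original.

import psycopg2

# Assumed schema; the real table definition isn't part of the gist.
connection = psycopg2.connect(database="urls", host="100.86.226.62",
                              port="5432", user="postgres",
                              password="pgpassword")
cursor = connection.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS airbnb_urls (
        url     text    PRIMARY KEY,
        crawled boolean NOT NULL DEFAULT FALSE,
        is_room boolean NOT NULL DEFAULT FALSE
    );
""")
# Seed the frontier so the crawler's first SELECT finds work.
cursor.execute("INSERT INTO airbnb_urls (url, crawled, is_room) VALUES (%s, %s, %s)",
               ('http://airbnb.com', False, False))
connection.commit()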
The second file extracts price, address, rating, and room details from the crawled room pages.
import psycopg2
import sys
import time
import threading
import urllib2

from datetime import datetime
from bs4 import BeautifulSoup

DELAY = 0.5  # seconds to wait between requests, per thread

connection = psycopg2.connect(database="urls",
                              host="100.86.226.62",
                              port="5432",
                              user="postgres",
                              password="pgpassword")
cursor = connection.cursor()

cursor.execute('SELECT url FROM airbnb_urls WHERE is_room = TRUE LIMIT 1 OFFSET 1234;')
room = cursor.fetchone()
print room

# Element ids on the room page that hold each price field.
price_pattern = {
    'day_l': 'price_amount',
    'day_h': 'price_amount',
    'week_l': 'weekly_price_string',
    'week_h': 'weekly_price_string',
    'month_l': 'monthly_price_string',
    'month_h': 'monthly_price_string',
}

address_pattern = {
    'address': 'display-address',
    'zip-code': None,
}


def get_data_by_pattern(pattern, html):
    data = {}
    for name, element_id in pattern.iteritems():
        try:
            data[name] = html.find(id=element_id).string
        except AttributeError:
            data[name] = None
    return data


def get_address_from_airbnb(html):
    address = {
        'address': None,
        'zip-code': None,
    }
    try:
        a = html.find(id='display-address')['data-location']
        if a[-13:] == 'United States':
            address['zip-code'] = a[-20:-15]
        address['address'] = a
    except (TypeError, KeyError):
        pass
    return address


def get_rating_from_airbnb(html):
    rating = {'reviews': None,
              'rating': None}
    try:
        rating['reviews'] = int(html.find(id='action-buttons').find('a', class_='icon').string)
    except (AttributeError, TypeError, ValueError):
        pass
    try:
        if rating['reviews'] > 0:
            # Full stars count 1, half stars count 0.5.
            stars = html.find('div', class_='star-rating')
            rating['rating'] = (len(stars.find_all(class_='icon icon-pink icon-star'))
                                + 0.5 * len(stars.find_all(class_='icon icon-pink icon-star-half')))
    except AttributeError:
        pass
    return rating


def get_detail_from_airbnb(html, label):
    # Look up a numeric field (e.g. 'Bedrooms:') in the details table.
    r = None
    try:
        for td in html.find(id='description_details').find_all('td'):
            if td.string == label:
                r = int(td.parent.find_all('td')[1].string)
    except (AttributeError, TypeError, ValueError):
        pass
    return r


def get_accommodates_from_airbnb(html):
    return get_detail_from_airbnb(html, 'Accommodates:')


def get_bedrooms_from_airbnb(html):
    return get_detail_from_airbnb(html, 'Bedrooms:')


def get_bathrooms_from_airbnb(html):
    return get_detail_from_airbnb(html, 'Bathrooms:')


def get_or_create_ppage(room):
    # ParsedPage is assumed to be defined elsewhere; it behaves like a
    # MongoEngine Document (.objects(...), .first(), .save()).
    ppage = ParsedPage.objects(url=room).first()
    if ppage:
        ppage.updated = datetime.now()
        return ppage
    return ParsedPage(url=room, updated=datetime.now())


def parse_room(room):
    print room
    ppage = get_or_create_ppage(room)
    try:
        soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(room)))
        ppage.rating = get_rating_from_airbnb(soup)
        ppage.price = get_data_by_pattern(price_pattern, soup)
        ppage.address = get_address_from_airbnb(soup)
        # Room details gathered from the description table.
        ppage.rooms = {
            'accommodates': get_accommodates_from_airbnb(soup),
            'bedrooms': get_bedrooms_from_airbnb(soup),
            'bathrooms': get_bathrooms_from_airbnb(soup),
        }
    except urllib2.HTTPError as e:
        print "something wrong with {}".format(room)
        print "{} : {}".format(e.code, e.reason)
    except Exception:
        print 'Some parsing error: {}'.format(sys.exc_info()[0])
    return ppage


def save_parsed_room(ppage):
    try:
        ppage.updated = datetime.now()
        ppage.save()
    except Exception:
        print 'Error on saving {}'.format(ppage.url)
        print 'Details: {}'.format(sys.exc_info()[0])


class Worker(threading.Thread):
    def __init__(self, n):
        threading.Thread.__init__(self)
        self.n = n

    def run(self):
        # Workers drain the shared global list of room URLs; only thread 0
        # prints progress to keep the output readable.
        global urls
        while len(urls) > 0:
            url = urls.pop()
            if self.n == 0:
                print '#####'
                print 'urls left: {}'.format(len(urls))
                print '#####'
            save_parsed_room(parse_room(url))
            time.sleep(DELAY)
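The gist does not include the driver that fills the shared urls list or starts these workers. A minimal sketch, assuming the same airbnb_urls table as the crawler above; the query and the thread count are guesses, not part of the original.

MAX_THREADS = 5

# Hypothetical driver: load every room URL and let the workers drain
# the shared list. Checking len(urls) and then pop() can race between
# threads, but for a one-off script the worst case is an IndexError
# near the very end of the list.
cursor.execute('SELECT url FROM airbnb_urls WHERE is_room = TRUE;')
urls = [row[0] for row in cursor.fetchall()]

workers = [Worker(n) for n in xrange(MAX_THREADS)]
for w in workers:
    w.start()
for w in workers:
    w.join()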
Hi Azazeo,
nice code!
Today I wrote an Airbnb crawler with Scrapy in order to build a rate-shopping tool and track how prices in my area develop over time.
As long as I was parsing just the general property pages, everything worked correctly.
But when I tried to fetch rates for a couple of weeks, I started getting a 503 Server Error after a few seconds.
Do you think my IP has been blocked? Did you experience anything similar?

@mrd1no yeah, I'm having the same problem. It gives me a 503 after a few seconds.
I was using Python with Tornado and connecting through a proxy, but to no avail.
Have you managed to solve it?
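The 503s described in this thread look like server-side rate limiting. A minimal sketch of the usual mitigations, in the same urllib2 style as the gist: send a browser-like User-Agent and back off exponentially when a 503 arrives. The header string and timings here are arbitrary assumptions, nothing Airbnb documents.

import time
import urllib2

def fetch_with_backoff(url, retries=5, base_delay=2.0):
    # Identify the client with a browser-like User-Agent and back off
    # exponentially between retries on a 503; other HTTP errors are
    # re-raised unchanged.
    request = urllib2.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
    })
    for attempt in xrange(retries):
        try:
            return urllib2.urlopen(request).read()
        except urllib2.HTTPError as e:
            if e.code != 503:
                raise
            wait = base_delay * (2 ** attempt)
            print "503 on {}, retrying in {}s".format(url, wait)
            time.sleep(wait)
    return None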