Last active
January 31, 2022 03:21
-
-
Save xtream1101/5d7ea9e2672a162aa565c46d17feabc0 to your computer and use it in GitHub Desktop.
Instacart scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import time | |
import requests | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.firefox.options import Options as FirefoxOptions | |
from webdriverdownloader import GeckoDriverDownloader | |
# NOTE(review): this downloads/installs geckodriver as a module-level side
# effect, so importing this file hits the network. download_and_install()
# returns a tuple of paths; login() uses geckodriver[1] as the executable path.
gdd = GeckoDriverDownloader()
geckodriver = gdd.download_and_install()
# Browser-like User-Agent sent with the plain `requests` calls below so the
# API treats them like the Selenium session that produced the cookies.
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
}
def login(email, password):
    """Log into Instacart via a headless Firefox session.

    Drives the login form with Selenium, then harvests the session cookies
    so the rest of the scraper can use plain ``requests``.

    Args:
        email: Account email address.
        password: Account password.

    Returns:
        dict: Cookie ``name -> value`` pairs suitable for ``requests``.
    """
    print("Logging in...")
    login_url = 'https://www.instacart.com/accounts/login'
    options = FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(firefox_options=options,
                               executable_path=geckodriver[1])
    # try/finally guarantees the browser process is torn down even if the
    # page structure changes and a locator below raises (the original
    # leaked a headless Firefox on every call).
    try:
        driver.get(login_url)
        time.sleep(2)
        # Landing page shows a "Log in" button that reveals the form.
        # find_element(By.XPATH, ...) is the API that survives Selenium 4;
        # the find_element_by_* helpers were removed.
        login_link = driver.find_element(
            By.XPATH, '//a[@class="ic-btn ic-btn-success no-underline"]')
        login_link.click()
        time.sleep(2)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((
                By.XPATH, '//input[@type="email"]')))
        email_input = driver.find_element(By.XPATH, '//input[@type="email"]')
        password_input = driver.find_element(
            By.XPATH, '//input[@type="password"]')
        email_input.clear()
        email_input.send_keys(email)
        password_input.clear()
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)
        time.sleep(2)
        # Requests does not need all this info in the cookies, strip it out
        return {cookie['name']: cookie['value']
                for cookie in driver.get_cookies()}
    finally:
        driver.quit()
def get_pdp(cookies, product_id):
    """Fetch the product-detail-page JSON for one product.

    Args:
        cookies: Session cookie dict produced by ``login``.
        product_id: Numeric product id string (no ``item_`` prefix).

    Returns:
        dict | None: Parsed JSON payload on HTTP 200; otherwise prints a
        diagnostic and returns None (best-effort, callers must cope).
    """
    print(f"Getting product {product_id}...")
    url = f'https://www.instacart.com/v3/containers/items/item_{product_id}'
    # timeout= keeps a stalled connection from hanging the scrape forever.
    r = requests.get(url,
                     cookies=cookies,
                     headers=headers,
                     timeout=30)
    if r.status_code == 200:
        return r.json()
    print(f"Failed to get product {product_id} {r}")
def process_pdp(pdp_data):
    """Extract key fields from a product payload and save it to disk.

    Rather than just saving the raw data blindly, pull out the fields we
    care about and write the full payload to ``products/product-<id>.json``.

    Args:
        pdp_data: Parsed JSON dict from ``get_pdp``, or None on fetch
            failure (skipped with a message instead of crashing).
    """
    # get_pdp returns None on a failed fetch; the original crashed here
    # with a TypeError when that happened.
    if pdp_data is None:
        print("Skipping: no product data to process")
        return
    product_title = pdp_data['container']['title']
    product_id = pdp_data['container']['tracking_params']['item_id']
    print(f"Processing: {product_id} - {product_title}")
    # Create the output folder on first use (resolves the old TODO).
    os.makedirs('products', exist_ok=True)
    with open(f'products/product-{product_id}.json', 'w') as outfile:
        json.dump(pdp_data, outfile, sort_keys=True, indent=4)
def search(cookies, term):
    """Run a store search and return the parsed JSON results.

    Args:
        cookies: Session cookie dict produced by ``login``.
        term: Search term; URL-encoded here so spaces/punctuation are safe.

    Returns:
        dict | None: Parsed JSON payload on HTTP 200; otherwise prints a
        diagnostic and returns None.
    """
    print(f"Searching: {term}...")
    # quote() makes the term URL-path safe (resolves the old TODO); plain
    # alphanumeric terms pass through unchanged.
    url = f'https://www.instacart.com/v3/containers/kroger/search_v3/{quote(term)}?source=web&per=50'
    # timeout= keeps a stalled connection from hanging the scrape forever.
    r = requests.get(url,
                     cookies=cookies,
                     headers=headers,
                     timeout=30)
    if r.status_code == 200:
        return r.json()
    print(f"Failed to search for {term} {r}")
def extract_product_ids(search_results):
    """Return the numeric product ids found in a search response.

    Scans the response's modules for the first one carrying an ``items``
    list, then strips the ``item_`` prefix from each entry's id.
    """
    modules = search_results['container']['modules']
    # First module whose data block has items wins; none found -> empty.
    items = next(
        (module['data']['items'] for module in modules
         if 'items' in module['data']),
        [])
    return [entry['id'].split('_')[1] for entry in items]
# Log in once and reuse the session cookies for every request below.
session_cookies = login("<email>", "<password>")

terms = ['cookies']
for term in terms:
    # Search the term, then pull the product ids out of the results.
    results = search(session_cookies, term)
    product_ids = extract_product_ids(results)
    print(f"Found {len(product_ids)} products for the term {term}")
    # Fetch and save the detail page for each product found.
    for product_id in product_ids:
        details = get_pdp(session_cookies, product_id)
        process_pdp(details)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment