Created
November 18, 2019 15:22
-
-
Save yurisasuke/feff1400eb270c60f3655b188f14b0e2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import sys

import requests
from bs4 import BeautifulSoup
def harvest_data():
    """Download the Jumia Kenya home page and extract its category menu.

    Returns:
        A list with one item per ``li.menu-item`` element found inside
        ``ul.menu-items``; each item is whatever ``process_entry`` returns
        for that element. The list is empty when the page contains no
        ``ul.menu-items`` container.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    req = requests.get('https://www.jumia.co.ke', timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, 'html.parser')
    menu_elem = soup.find('ul', {'class': 'menu-items'})
    if menu_elem is None:
        # find() yields None when nothing matches (e.g. the markup changed).
        return []
    menu_entries = menu_elem.find_all('li', {'class': 'menu-item'})
    return [process_entry(elem) for elem in menu_entries]
def process_entry(menu_elem):
    """Extract one top-level menu entry together with its category tree.

    Args:
        menu_elem: A parsed menu item element exposing BeautifulSoup's
            ``find`` / ``find_all`` interface.

    Returns:
        ``((title, main_href), categories)`` where ``categories`` maps each
        category name to ``(category_href, [(sub_name, sub_href), ...])``.
        ``title`` and ``main_href`` fall back to ``''`` when the
        corresponding element is absent, and ``categories`` is empty when
        the entry has no ``div.submenu``.
    """
    name_span = menu_elem.find('span', {'class': 'nav-subTxt'})
    title = name_span.text if name_span is not None else ''

    # href=True restricts the match to anchors that carry an href attribute,
    # so a non-None result can always be indexed with ['href'].
    href_anchor = menu_elem.find('a', {'class': 'main-category'}, href=True)
    main_href = href_anchor['href'] if href_anchor is not None else ''

    categories = {}
    sub_menu_elem = menu_elem.find('div', {'class': 'submenu'})
    if sub_menu_elem is None:
        return ((title, main_href), categories)

    for elem in sub_menu_elem.find_all('div', {'class': 'categories'}):
        cat_elem = elem.find('a', {'class': 'category'}, href=True)
        if cat_elem is None:
            # A group without a linked category heading is skipped.
            continue
        sub_elems = elem.find_all('a', {'class': 'subcategory'}, href=True)
        sub_categories = [
            (sub_elem.text, sub_elem['href']) for sub_elem in sub_elems
        ]
        # Keyed by name: a later group with the same name replaces an
        # earlier one.
        categories[cat_elem.text] = (cat_elem['href'], sub_categories)
    return ((title, main_href), categories)
def write_data(data, outpath):
    """Flatten the harvested menu tree into a CSV file.

    Args:
        data: Iterable of ``((title, href), entries)`` items, where
            ``entries`` maps a category name to
            ``(category_href, [(sub_name, sub_href), ...])``.
        outpath: Path of the CSV file to create or overwrite.

    The file gets a header row ``id, title, parent_id, category_url``
    followed by one row per title, category and sub-category. Title ids
    count up from 1, category ids from 1000 and sub-category ids from
    10000; ``parent_id`` is empty for titles and otherwise holds the id of
    the enclosing title or category.
    """
    rows = []
    title_id = 1
    category_id = 1000
    sub_category_id = 10000
    for (title, href), entries in data:
        rows.append((title_id, title, '', href))
        for category, (cat_href, sub_categories) in entries.items():
            rows.append((category_id, category, title_id, cat_href))
            for sub_cat, sub_href in sub_categories:
                rows.append((sub_category_id, sub_cat, category_id, sub_href))
                sub_category_id += 1
            category_id += 1
        title_id += 1

    headers = ['id', 'title', 'parent_id', 'category_url']
    # newline='' lets the csv module control line endings (otherwise Windows
    # output gets a blank line after every row); an explicit encoding keeps
    # non-ASCII titles independent of the machine's locale.
    with open(outpath, 'w', newline='', encoding='utf-8') as data_file:
        writer = csv.writer(data_file)
        writer.writerow(headers)
        writer.writerows(rows)
if __name__ == '__main__':
    # Script entry point: scrape the menu and write it out as CSV.
    # The output path may be passed as the first command-line argument; the
    # original hard-coded location is kept as the default so that running
    # the script without arguments behaves as before.
    default_path = '/Users/keithwacira/go/src/data/categories.csv'
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    data = harvest_data()
    write_data(data, path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment