Created
July 19, 2024 18:07
-
-
Save skrawcz/5893ae14e7a056a79b9c221acf886b70 to your computer and use it in GitHub Desktop.
Gist for guest post on blog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python 3.12 | |
""" | |
Hamilton demo. Runs the Hamilton code. | |
""" | |
import sys | |
import pprint | |
from hamilton import driver | |
import toyscriptiii as ts # defined below | |
# Build a Hamilton driver from the functions defined in toyscriptiii.
dr = driver.Builder().with_modules(ts).build()

# Render the dataflow graph to ts.png.
# NOTE(review): 'BR' does not look like one of the usual Graphviz rankdir
# values (TB, LR, BT, RL) -- confirm the intended orientation.
dr.display_all_functions(
    "ts.png",
    deduplicate_inputs=True,
    keep_dot=True,
    orient='BR',
)

# Names of the functions whose results the demo asks the driver for.
requested_outputs = [
    'parsed_data',
    'data_with_wikipedia',
    'data_with_company',
    'info_output',
    'commodity_word_counts',
    'colloquial_company_word_counts',
    'info_dict_merged',
    'wikipedia_report',
]
results = dr.execute(requested_outputs, inputs={'datafile': 'data.csv'})

# Show the merged dictionary, then the two values returned for the
# report-writing functions.
pprint.pprint(results['info_dict_merged'])
print(results['info_output'])
print(results['wikipedia_report'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python 3.12 | |
""" | |
Toy script. | |
Takes some input from a csv file on big American | |
mines and looks at Wikipedia text for some extra | |
context. | |
""" | |
import copy
import csv
import pprint
import re
import sys
from urllib import request

from bs4 import BeautifulSoup
def parsed_data(datafile:str) -> dict:
    """
    Get csv data into a dictionary keyed on mine name.

    The first row of the file supplies the column headers. Every later
    row becomes a {header: value} dictionary stored under the value of
    its first column. Whitespace around headers and values is stripped,
    quoted fields may contain commas, and blank rows are skipped.

    datafile: path of the csv file to read.

    Returns the dictionary of row dictionaries; it is empty when the
    file has no header row.
    """
    retval = {}
    # newline='' lets the csv module deal with line endings itself.
    with open(datafile, 'r', newline='') as f:
        # skipinitialspace allows 'a, "b, c"' to be read as two fields.
        reader = csv.reader(f, skipinitialspace=True)
        header_row = next(reader, None)
        if header_row is not None:
            headers = [x.strip() for x in header_row]
            for row in reader:
                vals = [x.strip() for x in row]
                # Blank lines would otherwise create an entry keyed on ''.
                if not any(vals):
                    continue
                retval[vals[0]] = dict(zip(headers, vals))
    pprint.pprint(retval)
    return retval
def data_with_wikipedia(parsed_data:dict) -> dict:
    """
    Connect to wikipedia sites and fill in
    raw html data.

    For every mine the URL stored under 'wikipedia page' is fetched,
    the page text is extracted with BeautifulSoup, newlines are removed
    and the result is stored under 'wikipediatext'.

    parsed_data: dictionary keyed on mine name; it is not modified.

    Return dictionary (a deep copy of the input with the text added).
    """
    retval = copy.deepcopy(parsed_data)
    for minex in retval:
        # The context manager closes the HTTP response even when
        # reading fails, so connections are not leaked.
        with request.urlopen(retval[minex]['wikipedia page']) as obj:
            html = obj.read()
        soup = BeautifulSoup(html, 'html.parser')
        print(soup.title)
        # Text from html and strip out newlines.
        newstring = soup.get_text().replace('\n', '')
        retval[minex]['wikipediatext'] = newstring
    return retval
def data_with_company(data_with_wikipedia:dict) -> dict:
    """
    Fetches company ownership for mine out of
    Wikipedia text dump.

    The company name is taken to start right after the first
    '<lower case letter>Company' in the text and to end at the first
    lower case letter that is directly followed by an upper case
    letter. When no such end is found the name runs to the end of the
    text. When the start is not found no 'company' entry is added.

    data_with_wikipedia: dictionary keyed on mine name whose values
    hold a 'wikipediatext' entry; it is not modified.

    Returns a new dictionary with the company name
    without the big wikipedia text dump.
    """
    # Wikipedia setup for mine company name.
    COMPANYPAT = r'[a-z]Company'
    # Lower case followed by upper case heuristic.
    ENDCOMPANYPAT = '[a-z][A-Z]'
    companypat = re.compile(COMPANYPAT)
    endcompanypat = re.compile(ENDCOMPANYPAT)
    retval = {}
    for minex, minedata in data_with_wikipedia.items():
        print(minex)
        text = minedata['wikipediatext']
        # Copy everything except the big text dump, so that the large
        # string is never duplicated only to be thrown away.
        record = {
            key: copy.deepcopy(val)
            for key, val in minedata.items()
            if key != 'wikipediatext'
        }
        match = companypat.search(text)
        if match:
            print('Company match span = ', match.span())
            tail = text[match.end():]
            match2 = endcompanypat.search(tail)
            if match2:
                print('End Company match span = ', match2.span())
                # Keep the lower case letter that closes the name.
                record['company'] = tail[:match2.start() + 1]
            else:
                # No lower/upper boundary follows: use the rest of the text
                # instead of failing on a missing match.
                record['company'] = tail
        retval[minex] = record
    return retval
def info_output(data_with_company:dict) -> str:
    """
    Prints some output text to a file for each
    mine in the data_with_company dictionary.

    Every mine gets one descriptive sentence. The ownership sentence
    is written only for mines that have a 'company' entry.

    data_with_company: dictionary keyed on mine name whose values hold
    'mine', 'commodity' and 'state' entries and optionally 'company'.

    Returns string filename of output.
    """
    INFOLINEFMT = 'The {mine:s} mine is a big {commodity:s} mine in the State of {state:s} in the US.'
    COMPANYLINEFMT = '\n {company:s} owns the mine.\n\n'
    retval = 'mine_info.txt'
    # Company names may contain non-ASCII characters, so the encoding is
    # fixed instead of depending on the platform default.
    with open(retval, 'w', encoding='utf-8') as f:
        for minedata in data_with_company.values():
            print(INFOLINEFMT.format(**minedata), file=f)
            # Without this check a mine lacking a company would abort
            # the report half way through with a KeyError.
            if 'company' in minedata:
                print(COMPANYLINEFMT.format(**minedata), file=f)
    return retval
def commodity_word_counts(data_with_wikipedia:dict, data_with_company:dict) -> dict:
    """
    Return dictionary keyed on mine with counts of
    commodity (e.g., zinc etc.) mentions on Wikipedia
    page (excluding ones in the company name).

    The commodity is matched as a whole word with either case for its
    first letter. When the commodity is also a word of the company
    name, one hit is subtracted per occurrence of the company name in
    the text; the result is never negative. A mine without a 'company'
    entry, or with an empty one, gets the raw count, and an empty
    commodity gives a count of 0.

    data_with_wikipedia: dictionary keyed on mine name whose values
    hold 'commodity' and 'wikipediatext' entries.
    data_with_company: dictionary keyed on mine name whose values may
    hold a 'company' entry.
    """
    retval = {}
    # This will probably miss some occurrences at mashed together
    # word boundaries. It is a rough estimate.
    # '\b[Gg]old\b'
    commoditypatfmt = r'\b[{0:s}{1:s}]{2:s}\b'
    for minex, minedata in data_with_wikipedia.items():
        print(minex)
        commodity = minedata['commodity']
        text = minedata['wikipediatext']
        if not commodity:
            # Nothing to look for; indexing the first letter would fail.
            retval[minex] = 0
            continue
        # Escape the pieces so that punctuation in the commodity name is
        # matched literally instead of being read as regex syntax.
        commoditypat = commoditypatfmt.format(re.escape(commodity[0].upper()),
                                              re.escape(commodity[0]),
                                              re.escape(commodity[1:]))
        print(commoditypat)
        nummatchesraw = len(re.findall(commoditypat, text))
        print('Initial length of commoditymatches is {0:d}.'.format(nummatchesraw))
        company = data_with_company[minex].get('company', '')
        # The company name is literal text, not a pattern, so count it
        # with str.count; names such as 'Corp. (US)' would otherwise be
        # misread as regular expressions.
        numcompanymatches = text.count(company) if company else 0
        print('Length of companymatches is {0:d}.'.format(numcompanymatches))
        # Is the commodity name part of the company name?
        print('commoditypat = ', commoditypat)
        print(company)
        commoditymatchcompany = re.search(commoditypat, company)
        if commoditymatchcompany:
            print('commoditymatchcompany.span() = ', commoditymatchcompany.span())
            # A count cannot be negative, whatever the heuristic yields.
            retval[minex] = max(0, nummatchesraw - numcompanymatches)
        else:
            retval[minex] = nummatchesraw
    return retval
def colloquial_company_word_counts(data_with_wikipedia:dict) -> dict:
    """
    Find the number of times the company you associate with
    the property/mine (very subjective) is within the
    text of the mine's wikipedia article.

    data_with_wikipedia: dictionary keyed on mine name whose values
    hold 'colloquial association' and 'wikipediatext' entries.

    Returns a dictionary keyed on mine name with the number of
    matches; an empty association counts as 0.
    """
    retval = {}
    for minex, minedata in data_with_wikipedia.items():
        # The association is handed to re.findall as a regular expression.
        colloquial_pat = minedata['colloquial association']
        print(minex)
        if colloquial_pat:
            nummatches = len(re.findall(colloquial_pat, minedata['wikipediatext']))
        else:
            # An empty pattern matches at every position of the text and
            # would report len(text) + 1 meaningless hits.
            nummatches = 0
        print('{0:d} matches for colloquial association {1:s}.'.format(nummatches, colloquial_pat))
        retval[minex] = nummatches
    return retval
def info_dict_merged(data_with_company:dict,
                     commodity_word_counts:dict,
                     colloquial_company_word_counts:dict) -> dict:
    """
    Combine the per-mine company data with both word counts.

    Works on a deep copy of data_with_company, so the input is left
    untouched. Each mine's record gains the entries
    'colloquial association count' and 'commodity word count', looked
    up by mine name in the two count dictionaries.
    """
    merged = copy.deepcopy(data_with_company)
    for mine_name, record in merged.items():
        record.update({
            'colloquial association count': colloquial_company_word_counts[mine_name],
            'commodity word count': commodity_word_counts[mine_name],
        })
    return merged
def wikipedia_report(info_dict_merged:dict) -> str:
    """
    Write the word counts for every mine to a text file as prose.

    Two sentences are written per mine: one for the colloquial
    association count and one for the commodity word count.

    Returns the name of the file that was written.
    """
    colloqfmt = 'The {0:s} mine has {1:d} occurrences of colloquial association {2:s} in its Wikipedia article text.\n'
    commodfmt = 'The {0:s} mine has {1:d} occurrences of commodity name {2:s} in its Wikipedia article text.\n\n'
    report_name = 'wikipedia_info.txt'
    with open(report_name, 'w') as report:
        for record in info_dict_merged.values():
            colloq_sentence = colloqfmt.format(
                record['mine'],
                record['colloquial association count'],
                record['colloquial association'],
            )
            print(colloq_sentence, file=report)
            commod_sentence = commodfmt.format(
                record['mine'],
                record['commodity word count'],
                record['commodity'],
            )
            print(commod_sentence, file=report)
    return report_name
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment