skrawcz · July 19, 2024 18:07
diff --git a/run.py b/run.py
 # python 3.12

 """
 Hamilton demo. Runs the Hamilton code.
 """

 import sys

 import pprint

 from hamilton import driver

 import toyscriptiii as ts # defined below

 # Build a Hamilton driver over the functions defined in the toyscriptiii module.
 dr = driver.Builder().with_modules(ts).build()

 # Draw the function graph to ts.png.
 # NOTE(review): Graphviz rank directions are normally TB/LR/BT/RL; 'BR' does
 # not look like one of them -- confirm the intended orientation.
 dr.display_all_functions("ts.png", deduplicate_inputs=True, keep_dot=True, orient='BR')

 # Every node whose value should be returned by the run.
 requested_nodes = [
     'parsed_data',
     'data_with_wikipedia',
     'data_with_company',
     'info_output',
     'commodity_word_counts',
     'colloquial_company_word_counts',
     'info_dict_merged',
     'wikipedia_report',
 ]
 results = dr.execute(requested_nodes, inputs={'datafile':'data.csv'})

 # Show the merged dictionary, then the two values returned by the
 # file-writing nodes.
 pprint.pprint(results['info_dict_merged'])
 for node_name in ('info_output', 'wikipedia_report'):
     print(results[node_name])
diff --git a/toyscriptiii.py b/toyscriptiii.py
 # python 3.12

 """
 Toy script.

 Takes some input from a csv file on big American
 mines and looks at Wikipedia text for some extra
 context.
 """

 import copy

 import pprint

 import sys

 from urllib import request

 import re

 from bs4 import BeautifulSoup

 def parsed_data(datafile: str) -> dict:
     """
     Get csv data into a dictionary keyed on mine name.

     datafile -- path of a comma separated file whose first
                 row holds the column headers.

     Returns a dictionary mapping the first column of each
     row to a {header: value} dictionary for that row. All
     headers and values are stripped of surrounding
     whitespace. Blank rows are skipped, and an empty file
     yields an empty dictionary.
     """
     # Imported here so the block does not rely on a new file-level import.
     import csv

     retval = {}
     # newline='' lets the csv module handle line endings itself.
     with open(datafile, 'r', newline='') as f:
         # csv.reader copes with quoted fields that contain commas;
         # skipinitialspace lets a quote follow ", " in the file.
         reader = csv.reader(f, skipinitialspace=True)
         headers = [x.strip() for x in next(reader, [])]
         for row in reader:
             vals = [x.strip() for x in row]
             if not any(vals):
                 # Blank line (or only separators): not a record.
                 continue
             retval[vals[0]] = dict(zip(headers, vals))
     pprint.pprint(retval)
     return retval
        
 def data_with_wikipedia(parsed_data: dict) -> dict:
     """
     Connect to wikipedia sites and fill in
     raw html data.

     parsed_data -- dictionary keyed on mine; each record
                    must hold the page URL under the
                    'wikipedia page' key.

     Returns a deep copy of parsed_data in which every
     record has an added 'wikipediatext' entry: the text of
     the page with all newlines removed. The input is not
     modified.
     """
     retval = copy.deepcopy(parsed_data)
     for record in retval.values():
         # The context manager closes the HTTP response even if read() fails.
         with request.urlopen(record['wikipedia page']) as response:
             html = response.read()
         soup = BeautifulSoup(html, 'html.parser')
         print(soup.title)
         # Text from html and strip out newlines.
         record['wikipediatext'] = soup.get_text().replace('\n', '')
     return retval

 def data_with_company(data_with_wikipedia: dict) -> dict:
     """
     Fetches company ownership for mine out of
     Wikipedia text dump.

     data_with_wikipedia -- dictionary keyed on mine; each
                            record must have a
                            'wikipediatext' entry.

     Returns a new dictionary (deep copy) with the company
     name under 'company' and without the big wikipedia text
     dump. When the company marker is not found in the text,
     the record gets no 'company' entry. The input is not
     modified.
     """
     # A lower case letter directly followed by "Company" marks where the
     # company name starts in the newline-stripped text.
     companypat = re.compile(r'[a-z]Company')
     # Lower case followed by upper case heuristic for the end of the name.
     endcompanypat = re.compile('[a-z][A-Z]')
     retval = copy.deepcopy(data_with_wikipedia)
     for minex, record in retval.items():
         print(minex)
         # Get rid of big text dump in return value.
         text = record.pop('wikipediatext')
         match = companypat.search(text)
         if match is None:
             continue
         print('Company match span = ', match.span())
         tail = text[match.end():]
         match2 = endcompanypat.search(tail)
         if match2 is None:
             # No lower/upper boundary after the marker: the name runs to
             # the end of the text instead of crashing on match2.span().
             record['company'] = tail
         else:
             print('End Company match span = ', match2.span())
             # Keep the lower case letter that ends the name.
             record['company'] = tail[:match2.start() + 1]
     return retval

 def info_output(data_with_company: dict) -> str:
     """
     Prints some output text to a file for each
     mine in the data_with_company dictionary.

     data_with_company -- dictionary keyed on mine; each
                          record needs 'mine', 'commodity'
                          and 'state'. 'company' is optional:
                          without it the ownership sentence
                          is replaced by a blank line.

     Returns string filename of output.
     Raises KeyError when a required entry is missing; the
     output file is left untouched in that case.
     """
     INFOLINEFMT = 'The {mine:s} mine is a big {commodity:s} mine in the State of {state:s} in the US.'
     COMPANYLINEFMT = '\n    {company:s} owns the mine.\n\n'
     retval = 'mine_info.txt'
     # Format everything before opening the file, so a bad record cannot
     # leave a truncated, half-written file behind.
     chunks = []
     for record in data_with_company.values():
         chunks.append(INFOLINEFMT.format(**record) + '\n')
         if 'company' in record:
             chunks.append(COMPANYLINEFMT.format(**record) + '\n')
         else:
             # No company was extracted for this mine: keep the entries
             # separated but skip the ownership sentence.
             chunks.append('\n')
     with open(retval, 'w') as f:
         f.writelines(chunks)
     return retval

 def commodity_word_counts(data_with_wikipedia: dict, data_with_company: dict) -> dict:
     """
     Return dictionary keyed on mine with counts of
     commodity (e.g., zinc etc.) mentions on Wikipedia
     page (excluding ones in the company name).

     data_with_wikipedia -- dictionary keyed on mine; each
                            record needs 'commodity' and
                            'wikipediatext'.
     data_with_company -- dictionary keyed on mine; the
                          'company' entry is optional and
                          nothing is excluded without it.

     A mention is a whole-word match of the commodity name
     with its first letter in either case. A mention is
     excluded when it lies inside an occurrence of the
     company name in the text.

     This will probably miss some occurrences at mashed
     together word boundaries. It is a rough estimate.
     """
     retval = {}
     for minex, record in data_with_wikipedia.items():
         print(minex)
         commodity = record['commodity']
         text = record['wikipediatext']
         if not commodity:
             # Nothing to search for.
             retval[minex] = 0
             continue
         # e.g. r'\b[Gg]old\b'; escaped so punctuation in the name is literal.
         commoditypat = r'\b[{0:s}{1:s}]{2:s}\b'.format(re.escape(commodity[0].upper()),
                                                        re.escape(commodity[0].lower()),
                                                        re.escape(commodity[1:]))
         print(commoditypat)
         commoditymatches = list(re.finditer(commoditypat, text))
         print('Initial length of commoditymatches is {0:d}.'.format(len(commoditymatches)))
         company = data_with_company.get(minex, {}).get('company', '')
         # The company name is literal text, not a regex, so escape it.
         companyspans = []
         if company:
             companyspans = [m.span() for m in re.finditer(re.escape(company), text)]
         print('Length of companymatches is {0:d}.'.format(len(companyspans)))
         # Drop only the commodity matches that sit inside a company-name
         # occurrence. Subtracting the number of company occurrences would
         # over-subtract where the name is mashed against the next word and
         # the commodity therefore did not match there in the first place.
         retval[minex] = sum(
             1 for m in commoditymatches
             if not any(start <= m.start() and m.end() <= end
                        for start, end in companyspans))
     return retval

 def colloquial_company_word_counts(data_with_wikipedia:dict) -> dict:
     """
     Count, for each mine, how often the company colloquially
     associated with the property/mine (very subjective)
     shows up in the text of the mine's wikipedia article.

     The 'colloquial association' value is handed to the re
     module as a pattern. Returns a dictionary keyed on mine
     with the number of matches.
     """
     counts = {}
     for name, record in data_with_wikipedia.items():
         association = record['colloquial association']
         print(name)
         article = record['wikipediatext']
         hits = sum(1 for _ in re.finditer(association, article))
         print('{0:d} matches for colloquial association {1:s}.'.format(hits, association))
         counts[name] = hits
     return counts

 def info_dict_merged(data_with_company:dict,
                      commodity_word_counts:dict,
                      colloquial_company_word_counts:dict) -> dict:
     """
     Combine the per-mine records with both word counts.

     Returns a deep copy of data_with_company in which each
     mine's record also carries 'colloquial association count'
     and 'commodity word count'. The big Wikipedia text dump
     is not part of the result.
     """
     merged = copy.deepcopy(data_with_company)
     for name, record in merged.items():
         record.update({
             'colloquial association count': colloquial_company_word_counts[name],
             'commodity word count': commodity_word_counts[name],
         })
     return merged

 def wikipedia_report(info_dict_merged:dict) -> str:
     """
     Write the Wikipedia word counts for every mine to a
     text file as prose sentences.

     Returns the name of the file written, as a string.
     """
     colloqfmt = 'The {0:s} mine has {1:d} occurrences of colloquial association {2:s} in its Wikipedia article text.\n'
     commodfmt = 'The {0:s} mine has {1:d} occurrences of commodity name {2:s} in its Wikipedia article text.\n\n'
     outfile = 'wikipedia_info.txt'
     with open(outfile, 'w') as report:
         for record in info_dict_merged.values():
             colloqline = colloqfmt.format(record['mine'],
                                           record['colloquial association count'],
                                           record['colloquial association'])
             # The extra newline mirrors what print() appends.
             report.write(colloqline + '\n')
             commodline = commodfmt.format(record['mine'],
                                           record['commodity word count'],
                                           record['commodity'])
             report.write(commodline + '\n')
     return outfile
	# python 3.12

	"""
	Hamilton demo. Runs the Hamilton code.
	"""

	import sys

	import pprint

	from hamilton import driver

	import toyscriptiii as ts # defined below

	# Build a Hamilton driver over the functions defined in the toyscriptiii module.
	dr = driver.Builder().with_modules(ts).build()

	# Draw the function graph to ts.png.
	# NOTE(review): Graphviz rank directions are normally TB/LR/BT/RL; 'BR' does
	# not look like one of them -- confirm the intended orientation.
	dr.display_all_functions("ts.png", deduplicate_inputs=True, keep_dot=True, orient='BR')

	# Every node whose value should be returned by the run.
	requested_nodes = [
	    'parsed_data',
	    'data_with_wikipedia',
	    'data_with_company',
	    'info_output',
	    'commodity_word_counts',
	    'colloquial_company_word_counts',
	    'info_dict_merged',
	    'wikipedia_report',
	]
	results = dr.execute(requested_nodes, inputs={'datafile':'data.csv'})

	# Show the merged dictionary, then the two values returned by the
	# file-writing nodes.
	pprint.pprint(results['info_dict_merged'])
	for node_name in ('info_output', 'wikipedia_report'):
	    print(results[node_name])
	# python 3.12

	"""
	Toy script.

	Takes some input from a csv file on big American
	mines and looks at Wikipedia text for some extra
	context.
	"""

	import copy

	import pprint

	import sys

	from urllib import request

	import re

	from bs4 import BeautifulSoup

	def parsed_data(datafile: str) -> dict:
	    """
	    Get csv data into a dictionary keyed on mine name.

	    datafile -- path of a comma separated file whose first
	                row holds the column headers.

	    Returns a dictionary mapping the first column of each
	    row to a {header: value} dictionary for that row. All
	    headers and values are stripped of surrounding
	    whitespace. Blank rows are skipped, and an empty file
	    yields an empty dictionary.
	    """
	    # Imported here so the block does not rely on a new file-level import.
	    import csv

	    retval = {}
	    # newline='' lets the csv module handle line endings itself.
	    with open(datafile, 'r', newline='') as f:
	        # csv.reader copes with quoted fields that contain commas;
	        # skipinitialspace lets a quote follow ", " in the file.
	        reader = csv.reader(f, skipinitialspace=True)
	        headers = [x.strip() for x in next(reader, [])]
	        for row in reader:
	            vals = [x.strip() for x in row]
	            if not any(vals):
	                # Blank line (or only separators): not a record.
	                continue
	            retval[vals[0]] = dict(zip(headers, vals))
	    pprint.pprint(retval)
	    return retval

	def data_with_wikipedia(parsed_data: dict) -> dict:
	    """
	    Connect to wikipedia sites and fill in
	    raw html data.

	    parsed_data -- dictionary keyed on mine; each record
	                   must hold the page URL under the
	                   'wikipedia page' key.

	    Returns a deep copy of parsed_data in which every
	    record has an added 'wikipediatext' entry: the text of
	    the page with all newlines removed. The input is not
	    modified.
	    """
	    retval = copy.deepcopy(parsed_data)
	    for record in retval.values():
	        # The context manager closes the HTTP response even if read() fails.
	        with request.urlopen(record['wikipedia page']) as response:
	            html = response.read()
	        soup = BeautifulSoup(html, 'html.parser')
	        print(soup.title)
	        # Text from html and strip out newlines.
	        record['wikipediatext'] = soup.get_text().replace('\n', '')
	    return retval

	def data_with_company(data_with_wikipedia: dict) -> dict:
	    """
	    Fetches company ownership for mine out of
	    Wikipedia text dump.

	    data_with_wikipedia -- dictionary keyed on mine; each
	                           record must have a
	                           'wikipediatext' entry.

	    Returns a new dictionary (deep copy) with the company
	    name under 'company' and without the big wikipedia text
	    dump. When the company marker is not found in the text,
	    the record gets no 'company' entry. The input is not
	    modified.
	    """
	    # A lower case letter directly followed by "Company" marks where the
	    # company name starts in the newline-stripped text.
	    companypat = re.compile(r'[a-z]Company')
	    # Lower case followed by upper case heuristic for the end of the name.
	    endcompanypat = re.compile('[a-z][A-Z]')
	    retval = copy.deepcopy(data_with_wikipedia)
	    for minex, record in retval.items():
	        print(minex)
	        # Get rid of big text dump in return value.
	        text = record.pop('wikipediatext')
	        match = companypat.search(text)
	        if match is None:
	            continue
	        print('Company match span = ', match.span())
	        tail = text[match.end():]
	        match2 = endcompanypat.search(tail)
	        if match2 is None:
	            # No lower/upper boundary after the marker: the name runs to
	            # the end of the text instead of crashing on match2.span().
	            record['company'] = tail
	        else:
	            print('End Company match span = ', match2.span())
	            # Keep the lower case letter that ends the name.
	            record['company'] = tail[:match2.start() + 1]
	    return retval

	def info_output(data_with_company: dict) -> str:
	    """
	    Prints some output text to a file for each
	    mine in the data_with_company dictionary.

	    data_with_company -- dictionary keyed on mine; each
	                         record needs 'mine', 'commodity'
	                         and 'state'. 'company' is optional:
	                         without it the ownership sentence
	                         is replaced by a blank line.

	    Returns string filename of output.
	    Raises KeyError when a required entry is missing; the
	    output file is left untouched in that case.
	    """
	    INFOLINEFMT = 'The {mine:s} mine is a big {commodity:s} mine in the State of {state:s} in the US.'
	    COMPANYLINEFMT = '\n {company:s} owns the mine.\n\n'
	    retval = 'mine_info.txt'
	    # Format everything before opening the file, so a bad record cannot
	    # leave a truncated, half-written file behind.
	    chunks = []
	    for record in data_with_company.values():
	        chunks.append(INFOLINEFMT.format(**record) + '\n')
	        if 'company' in record:
	            chunks.append(COMPANYLINEFMT.format(**record) + '\n')
	        else:
	            # No company was extracted for this mine: keep the entries
	            # separated but skip the ownership sentence.
	            chunks.append('\n')
	    with open(retval, 'w') as f:
	        f.writelines(chunks)
	    return retval

	def commodity_word_counts(data_with_wikipedia: dict, data_with_company: dict) -> dict:
	    """
	    Return dictionary keyed on mine with counts of
	    commodity (e.g., zinc etc.) mentions on Wikipedia
	    page (excluding ones in the company name).

	    data_with_wikipedia -- dictionary keyed on mine; each
	                           record needs 'commodity' and
	                           'wikipediatext'.
	    data_with_company -- dictionary keyed on mine; the
	                         'company' entry is optional and
	                         nothing is excluded without it.

	    A mention is a whole-word match of the commodity name
	    with its first letter in either case. A mention is
	    excluded when it lies inside an occurrence of the
	    company name in the text.

	    This will probably miss some occurrences at mashed
	    together word boundaries. It is a rough estimate.
	    """
	    retval = {}
	    for minex, record in data_with_wikipedia.items():
	        print(minex)
	        commodity = record['commodity']
	        text = record['wikipediatext']
	        if not commodity:
	            # Nothing to search for.
	            retval[minex] = 0
	            continue
	        # e.g. r'\b[Gg]old\b'; escaped so punctuation in the name is literal.
	        commoditypat = r'\b[{0:s}{1:s}]{2:s}\b'.format(re.escape(commodity[0].upper()),
	                                                       re.escape(commodity[0].lower()),
	                                                       re.escape(commodity[1:]))
	        print(commoditypat)
	        commoditymatches = list(re.finditer(commoditypat, text))
	        print('Initial length of commoditymatches is {0:d}.'.format(len(commoditymatches)))
	        company = data_with_company.get(minex, {}).get('company', '')
	        # The company name is literal text, not a regex, so escape it.
	        companyspans = []
	        if company:
	            companyspans = [m.span() for m in re.finditer(re.escape(company), text)]
	        print('Length of companymatches is {0:d}.'.format(len(companyspans)))
	        # Drop only the commodity matches that sit inside a company-name
	        # occurrence. Subtracting the number of company occurrences would
	        # over-subtract where the name is mashed against the next word and
	        # the commodity therefore did not match there in the first place.
	        retval[minex] = sum(
	            1 for m in commoditymatches
	            if not any(start <= m.start() and m.end() <= end
	                       for start, end in companyspans))
	    return retval

	def colloquial_company_word_counts(data_with_wikipedia: dict) -> dict:
	    """
	    Find the number of times the company you associate with
	    the property/mine (very subjective) is within the
	    text of the mine's wikipedia article.

	    data_with_wikipedia -- dictionary keyed on mine; each
	                           record needs 'colloquial
	                           association' and 'wikipediatext'.

	    The 'colloquial association' value is handed to the re
	    module as a pattern. Returns a dictionary keyed on mine
	    with the number of matches.
	    """
	    retval = {}
	    for minex, record in data_with_wikipedia.items():
	        colloquial_pat = record['colloquial association']
	        print(minex)
	        nummatches = len(re.findall(colloquial_pat, record['wikipediatext']))
	        print('{0:d} matches for colloquial association {1:s}.'.format(nummatches, colloquial_pat))
	        retval[minex] = nummatches
	    return retval

	def info_dict_merged(data_with_company: dict,
	                     commodity_word_counts: dict,
	                     colloquial_company_word_counts: dict) -> dict:
	    """
	    Get a dictionary with all the collected information
	    in it minus the big Wikipedia text dump.

	    Returns a deep copy of data_with_company in which each
	    mine's record also carries 'colloquial association count'
	    and 'commodity word count', looked up by mine in the two
	    count dictionaries. The inputs are not modified.
	    """
	    retval = copy.deepcopy(data_with_company)
	    for minex, record in retval.items():
	        record['colloquial association count'] = colloquial_company_word_counts[minex]
	        record['commodity word count'] = commodity_word_counts[minex]
	    return retval

	def wikipedia_report(info_dict_merged: dict) -> str:
	    """
	    Writes out Wikipedia information (word counts)
	    to file in prose; returns string filename.

	    info_dict_merged -- dictionary keyed on mine; each record
	                        needs 'mine', 'commodity',
	                        'colloquial association' and the two
	                        integer count entries.
	    """
	    retval = 'wikipedia_info.txt'
	    colloqfmt = 'The {0:s} mine has {1:d} occurrences of colloquial association {2:s} in its Wikipedia article text.\n'
	    commodfmt = 'The {0:s} mine has {1:d} occurrences of commodity name {2:s} in its Wikipedia article text.\n\n'
	    with open(retval, 'w') as f:
	        for record in info_dict_merged.values():
	            print(colloqfmt.format(record['mine'],
	                                   record['colloquial association count'],
	                                   record['colloquial association']), file=f)
	            print(commodfmt.format(record['mine'],
	                                   record['commodity word count'],
	                                   record['commodity']), file=f)
	    return retval