Created
June 7, 2015 19:10
-
-
Save jackschultz/38c8462d8c3b6d74f422 to your computer and use it in GitHub Desktop.
Analysis of Nobel Prize winners and their ages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import unicodedata | |
import matplotlib.mlab as mlab | |
import matplotlib.pyplot as plt | |
from scipy.stats import norm | |
class Prize(object):
    """A single Nobel Prize award to one laureate.

    Attributes:
        name: Laureate name reduced to plain ASCII text.
        age: Age of the laureate when the prize was awarded.
        year: Year the prize was awarded.
        prize_type: Prize category text.
        description: Full description text of the award, or None if not given.
    """

    def __init__(self, name, age, year, prize_type, description=None):
        """Store the award details.

        Args:
            name: Laureate name as unicode text; accented characters are
                reduced to their closest ASCII form.
            age: Age at the time of the award.
            year: Year of the award.
            prize_type: Prize category text.
            description: Optional full description of the award. Defaults to
                None so callers that only know the category can omit it.
        """
        # NFKD splits accented letters into base letter + combining mark; the
        # ASCII encode with 'ignore' then drops the marks (umlaut issues).
        # Decoding again keeps the name as text rather than bytes, so string
        # concatenation/formatting works on Python 3 as well as Python 2.
        ascii_bytes = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')
        self.name = ascii_bytes.decode('ascii')
        self.age = age
        self.year = year
        self.prize_type = prize_type
        self.description = description

    def __str__(self):
        """Return a one-line human-readable summary of the award."""
        return '{} won {} at age {} in {}'.format(
            self.name, self.prize_type, self.age, self.year)
# Read the saved page. The context manager guarantees the file handle is
# closed once the contents have been read.
with open('nobel_laureates_by_age.html', 'r') as f:
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers happen to be installed.
    html = BeautifulSoup(f.read(), 'html.parser')

winners = []
prize_types = set()
nobel_prize_string = "The Nobel Prize in "

# The children of the "nobel-age-info" div are walked in document order:
# an <h3> sets the age that applies to the following entries, and each <div>
# after it describes one winner of that age.
for tag in html.find("div", id="nobel-age-info").children:
    if tag.name is None:
        # Bare text nodes (e.g. whitespace between tags) have no tag name.
        continue
    if tag.name == 'h3':
        # The age is the last space-separated token of the heading text.
        current_age = int(tag.text.split(" ")[-1])
    elif tag.name == 'div':
        name = tag.find("h6").text  # winner's name
        # Link text of the first paragraph: "<prize type> <year>".
        description = tag.find_all("p")[0].find("a").text
        words = description.split(' ')
        year = int(words[-1])
        prize_type = ' '.join(words[:-1])
        prize_types.add(prize_type)
        # Pass the description explicitly; Prize.__init__ declares it as a
        # parameter and the original call omitted it.
        prize = Prize(name, current_age, year, prize_type, description)
        winners.append(prize)
all_prize_string = "All Prizes"
ts = list(prize_types)
ts.append(all_prize_string)  # pseudo-type that selects every winner

# One CSV-style row is printed and one histogram PNG is saved per prize type.
print("Type, Number of Winners, Mean Age, Variance of Ages")
for prize_type in ts:
    ages = [p.age for p in winners
            if prize_type == all_prize_string or p.prize_type == prize_type]
    if not ages:
        # Nothing to fit or plot (only possible when no winners were parsed).
        continue

    # One bin per year of age range. max/min avoids relying on the list being
    # sorted, and the floor of 1 keeps hist() valid when all ages are equal.
    num_bins = max(max(ages) - min(ages), 1)

    fig = plt.figure()
    # density=True normalizes the histogram to unit area so the fitted pdf
    # can be overlaid on the same scale (replaces the removed normed= option).
    _counts, bins, _patches = plt.hist(
        ages, num_bins, density=True, facecolor='green', alpha=0.2)

    # norm.fit returns (mean, standard deviation); square the latter to get
    # the variance that the title and the printed column actually report.
    mean, std = norm.fit(ages)
    var = std ** 2

    # scipy's norm.pdf replaces matplotlib.mlab.normpdf, which was removed
    # from matplotlib; both take the standard deviation as the scale.
    y = norm.pdf(bins, mean, std)
    plt.plot(bins, y, 'r--')
    # The histogram is normalized, so the axis shows density, not counts.
    plt.ylabel('Probability Density')
    plt.xlabel('Age')
    plt.title(prize_type + '. Mean: ' + str(round(mean, 2)) + ', Var: ' + str(round(var, 2)))
    fig.savefig('nobel_hist_' + prize_type.lower().replace(' ', '_') + '.png', dpi=500, format='png')
    # Release the figure so memory does not grow with every prize type.
    plt.close(fig)

    print(', '.join([prize_type, str(len(ages)), str(round(mean, 2)), str(round(var, 2))]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment