Last active
May 15, 2024 16:08
-
-
Save vinayakg/fa92188fff3b68bea2078697633a9500 to your computer and use it in GitHub Desktop.
demography analyzer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def _percent_true(mask):
    """Return the percentage of True entries in the boolean Series *mask*.

    Raises ZeroDivisionError when *mask* is empty.
    """
    return int(mask.sum()) / len(mask) * 100


def calculate_demographic_data(
    print_data=True,
    *,
    data_path="boilerplate-demographic-data-analyzer/adult.data.csv",
):
    """Compute demographic statistics from a census CSV file.

    Args:
        print_data: When true, print every statistic before returning.
        data_path: Path of the CSV file to analyse. The file must provide
            the columns ``race``, ``sex``, ``age``, ``education``,
            ``salary``, ``hours-per-week``, ``native-country`` and
            ``occupation``. Defaults to the location this function
            previously hard-coded.

    Returns:
        A dict with the keys ``race_count``, ``average_age_men``,
        ``percentage_bachelors``, ``higher_education_rich``,
        ``lower_education_rich``, ``min_work_hours``, ``rich_percentage``,
        ``highest_earning_country``, ``highest_earning_country_percentage``
        and ``top_IN_occupation``. Percentages are on a 0-100 scale and
        are not rounded.

    Raises:
        ZeroDivisionError: If the file has no rows, or if one of the
            education groups is empty.
    """
    # Read data from file
    df = pd.read_csv(data_path)

    # Row mask shared by every ">50K" statistic below.
    earns_over_50k = df["salary"] == ">50K"

    # How many people of each race are represented in this dataset?
    # A Pandas series with race names as the index labels.
    race_count = df["race"].value_counts()

    # What is the average age of men?
    average_age_men = df.loc[df["sex"] == "Male", "age"].mean()

    # What is the percentage of people who have a Bachelor's degree?
    percentage_bachelors = _percent_true(df["education"] == "Bachelors")

    # What percentage of people with / without advanced education
    # (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
    has_advanced_education = df["education"].isin(
        ["Bachelors", "Masters", "Doctorate"]
    )
    higher_education_rich = _percent_true(earns_over_50k[has_advanced_education])
    lower_education_rich = _percent_true(earns_over_50k[~has_advanced_education])

    # What is the minimum number of hours a person works per week?
    min_work_hours = df["hours-per-week"].min()

    # What percentage of the people who work the minimum number of hours
    # per week have a salary of more than 50K?
    works_min_hours = df["hours-per-week"] == min_work_hours
    rich_percentage = _percent_true(earns_over_50k[works_min_hours])

    # What country has the highest percentage of people that earn >50K,
    # and what is that percentage?
    people_per_country = df["native-country"].value_counts()
    rich_per_country = df.loc[earns_over_50k, "native-country"].value_counts()
    # Countries without any >50K earner are missing from rich_per_country and
    # therefore become NaN here; idxmax()/max() skip NaN by default.
    rich_share_per_country = (rich_per_country / people_per_country) * 100
    highest_earning_country = rich_share_per_country.idxmax()
    highest_earning_country_percentage = rich_share_per_country.max()

    # Identify the most popular occupation for those who earn >50K in India.
    rich_in_india = earns_over_50k & (df["native-country"] == "India")
    top_IN_occupation = df.loc[rich_in_india, "occupation"].value_counts().idxmax()

    # DO NOT MODIFY BELOW THIS LINE
    if print_data:
        print("Number of each race:\n", race_count)
        print("Average age of men:", average_age_men)
        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(
            f"Percentage with higher education that earn >50K: {higher_education_rich}%"
        )
        print(
            f"Percentage without higher education that earn >50K: {lower_education_rich}%"
        )
        print(f"Min work time: {min_work_hours} hours/week")
        print(
            f"Percentage of rich among those who work fewest hours: {rich_percentage}%"
        )
        print("Country with highest percentage of rich:", highest_earning_country)
        print(
            f"Highest percentage of rich people in country: {highest_earning_country_percentage}%"
        )
        print("Top occupations in India:", top_IN_occupation)

    return {
        "race_count": race_count,
        "average_age_men": average_age_men,
        "percentage_bachelors": percentage_bachelors,
        "higher_education_rich": higher_education_rich,
        "lower_education_rich": lower_education_rich,
        "min_work_hours": min_work_hours,
        "rich_percentage": rich_percentage,
        "highest_earning_country": highest_earning_country,
        "highest_earning_country_percentage": highest_earning_country_percentage,
        "top_IN_occupation": top_IN_occupation,
    }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment