Skip to content

Instantly share code, notes, and snippets.

@minhkhoablieu
Created July 2, 2022 03:50
Show Gist options
  • Save minhkhoablieu/4238e5509bc483b45c7df0c12dd3df08 to your computer and use it in GitHub Desktop.
Save minhkhoablieu/4238e5509bc483b45c7df0c12dd3df08 to your computer and use it in GitHub Desktop.
Basic_statistics
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import itertools\n",
"\n",
"from scipy.spatial import distance\n",
"from currency_converter import CurrencyConverter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Python310\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3444: DtypeWarning: Columns (29) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
]
}
],
"source": [
"data = pd.read_csv('../../data/workers/data.csv')\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(102822, 30)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>username</th>\n",
" <th>registration_date</th>\n",
" <th>all</th>\n",
" <th>complete</th>\n",
" <th>incomplete</th>\n",
" <th>public_name</th>\n",
" <th>tagline</th>\n",
" <th>profile_description</th>\n",
" <th>overall</th>\n",
" <th>...</th>\n",
" <th>recommendations</th>\n",
" <th>hourly_rate</th>\n",
" <th>top_skills</th>\n",
" <th>experience_duration_by_year</th>\n",
" <th>experience_duration_by_month</th>\n",
" <th>education_duration_by_year</th>\n",
" <th>skills</th>\n",
" <th>categories</th>\n",
" <th>main category</th>\n",
" <th>super_star</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29037031</td>\n",
" <td>programmatis</td>\n",
" <td>4/5/2018 7:58</td>\n",
" <td>55</td>\n",
" <td>55</td>\n",
" <td>0</td>\n",
" <td>Programmatis</td>\n",
" <td>SaaS Progressive Web App Specialists</td>\n",
" <td>Programmatis started out as just Joe. His work...</td>\n",
" <td>5.000000</td>\n",
" <td>...</td>\n",
" <td>326</td>\n",
" <td>500.0</td>\n",
" <td>['PHP', 'JavaScript', 'MySQL', 'HTML5', 'Datab...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>['Websites, IT &amp; Software', 'Websites, IT &amp; So...</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>27245644</td>\n",
" <td>DavidJG23</td>\n",
" <td>11/10/2017 1:45</td>\n",
" <td>118</td>\n",
" <td>118</td>\n",
" <td>0</td>\n",
" <td>David J.</td>\n",
" <td>iOS Developer</td>\n",
" <td>Welcome to my profile! \\n\\nMy name is David. I...</td>\n",
" <td>5.000000</td>\n",
" <td>...</td>\n",
" <td>8</td>\n",
" <td>50.0</td>\n",
" <td>['Mobile App Development', 'iPhone', 'Swift', ...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>['Mobile Phones &amp; Computing', 'Mobile Phones &amp;...</td>\n",
" <td>Mobile Phones &amp; Computing</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2239716</td>\n",
" <td>mikehurley</td>\n",
" <td>3/14/2011 22:12</td>\n",
" <td>7314</td>\n",
" <td>7284</td>\n",
" <td>30</td>\n",
" <td>Elite Information Tech</td>\n",
" <td>White Hat SEO, SMO, Google Ads</td>\n",
" <td>We’re Elite Information Tech, and we thank you...</td>\n",
" <td>4.898098</td>\n",
" <td>...</td>\n",
" <td>400</td>\n",
" <td>20.0</td>\n",
" <td>['SEO', 'Link Building', 'Internet Marketing',...</td>\n",
" <td>14.0</td>\n",
" <td>3.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>['Websites, IT &amp; Software', 'Websites, IT &amp; So...</td>\n",
" <td>Sales &amp; Marketing</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10408748</td>\n",
" <td>christiansoterod</td>\n",
" <td>3/3/2014 20:48</td>\n",
" <td>374</td>\n",
" <td>373</td>\n",
" <td>1</td>\n",
" <td>Christian D.</td>\n",
" <td>High End Architect | 3D Designer | Preferred Team</td>\n",
" <td>Preferred freelancer leading a team of archite...</td>\n",
" <td>4.982189</td>\n",
" <td>...</td>\n",
" <td>45</td>\n",
" <td>75.0</td>\n",
" <td>['3D Rendering', 'Building Architecture', '3D ...</td>\n",
" <td>9.0</td>\n",
" <td>3.0</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" <td>['Design, Media &amp; Architecture ', 'Design, Med...</td>\n",
" <td>Design, Media &amp; Architecture</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>21404907</td>\n",
" <td>akkastech</td>\n",
" <td>9/29/2016 16:57</td>\n",
" <td>104</td>\n",
" <td>102</td>\n",
" <td>2</td>\n",
" <td>Renesis Tech PVT LTD</td>\n",
" <td>Blockchain | Web/Mobile App Development</td>\n",
" <td>With offices in Lahore, Pakistan, and Toronto,...</td>\n",
" <td>4.934415</td>\n",
" <td>...</td>\n",
" <td>20</td>\n",
" <td>25.0</td>\n",
" <td>['Android', 'Mobile App Development', 'iPhone'...</td>\n",
" <td>7.0</td>\n",
" <td>8.0</td>\n",
" <td>4.0</td>\n",
" <td>NaN</td>\n",
" <td>['Mobile Phones &amp; Computing', 'Mobile Phones &amp;...</td>\n",
" <td>Mobile Phones &amp; Computing</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 30 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id username registration_date all complete incomplete \\\n",
"0 29037031 programmatis 4/5/2018 7:58 55 55 0 \n",
"1 27245644 DavidJG23 11/10/2017 1:45 118 118 0 \n",
"2 2239716 mikehurley 3/14/2011 22:12 7314 7284 30 \n",
"3 10408748 christiansoterod 3/3/2014 20:48 374 373 1 \n",
"4 21404907 akkastech 9/29/2016 16:57 104 102 2 \n",
"\n",
" public_name tagline \\\n",
"0 Programmatis SaaS Progressive Web App Specialists \n",
"1 David J. iOS Developer \n",
"2 Elite Information Tech White Hat SEO, SMO, Google Ads \n",
"3 Christian D. High End Architect | 3D Designer | Preferred Team \n",
"4 Renesis Tech PVT LTD Blockchain | Web/Mobile App Development \n",
"\n",
" profile_description overall ... \\\n",
"0 Programmatis started out as just Joe. His work... 5.000000 ... \n",
"1 Welcome to my profile! \\n\\nMy name is David. I... 5.000000 ... \n",
"2 We’re Elite Information Tech, and we thank you... 4.898098 ... \n",
"3 Preferred freelancer leading a team of archite... 4.982189 ... \n",
"4 With offices in Lahore, Pakistan, and Toronto,... 4.934415 ... \n",
"\n",
" recommendations hourly_rate \\\n",
"0 326 500.0 \n",
"1 8 50.0 \n",
"2 400 20.0 \n",
"3 45 75.0 \n",
"4 20 25.0 \n",
"\n",
" top_skills \\\n",
"0 ['PHP', 'JavaScript', 'MySQL', 'HTML5', 'Datab... \n",
"1 ['Mobile App Development', 'iPhone', 'Swift', ... \n",
"2 ['SEO', 'Link Building', 'Internet Marketing',... \n",
"3 ['3D Rendering', 'Building Architecture', '3D ... \n",
"4 ['Android', 'Mobile App Development', 'iPhone'... \n",
"\n",
" experience_duration_by_year experience_duration_by_month \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 14.0 3.0 \n",
"3 9.0 3.0 \n",
"4 7.0 8.0 \n",
"\n",
" education_duration_by_year skills \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 5.0 NaN \n",
"4 4.0 NaN \n",
"\n",
" categories \\\n",
"0 ['Websites, IT & Software', 'Websites, IT & So... \n",
"1 ['Mobile Phones & Computing', 'Mobile Phones &... \n",
"2 ['Websites, IT & Software', 'Websites, IT & So... \n",
"3 ['Design, Media & Architecture ', 'Design, Med... \n",
"4 ['Mobile Phones & Computing', 'Mobile Phones &... \n",
"\n",
" main category super_star \n",
"0 Websites, IT & Software True \n",
"1 Mobile Phones & Computing True \n",
"2 Sales & Marketing True \n",
"3 Design, Media & Architecture True \n",
"4 Mobile Phones & Computing True \n",
"\n",
"[5 rows x 30 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def getMostCommonCategory(categories):\n",
" if(len(categories) > 0):\n",
" return max(set(categories), key=categories.count)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Basic statistics on workers profiles:\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Design, Media & Architecture 35746\n",
"Websites, IT & Software 26394\n",
"Writing & Content 15124\n",
"Data Entry & Admin 8774\n",
"Engineering & Science 4538\n",
"Sales & Marketing 3608\n",
"Translation & Languages 3442\n",
"Business, Accounting, Human Resources & Legal 2812\n",
"Mobile Phones & Computing 1126\n",
"Jobs for Anyone 362\n",
"Trades & Services 233\n",
"Education 196\n",
"Product Sourcing & Manufacturing 119\n",
"Series([], ) 81\n",
"Freight, Shipping & Transportation 71\n",
"Telecommunications 13\n",
"Health & Medicine 4\n",
"Name: main category, dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['main category'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data[data['main category'] == 'Series([], )'][['top_skills', 'categories']].reset_index().to_csv('debug.csv')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# data['main category'].value_counts().reset_index().to_excel('main category.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>main category</th>\n",
" <th>Business, Accounting, Human Resources &amp; Legal</th>\n",
" <th>Data Entry &amp; Admin</th>\n",
" <th>Design, Media &amp; Architecture</th>\n",
" <th>Education</th>\n",
" <th>Engineering &amp; Science</th>\n",
" <th>Freight, Shipping &amp; Transportation</th>\n",
" <th>Health &amp; Medicine</th>\n",
" <th>Jobs for Anyone</th>\n",
" <th>Mobile Phones &amp; Computing</th>\n",
" <th>Product Sourcing &amp; Manufacturing</th>\n",
" <th>Sales &amp; Marketing</th>\n",
" <th>Series([], )</th>\n",
" <th>Telecommunications</th>\n",
" <th>Trades &amp; Services</th>\n",
" <th>Translation &amp; Languages</th>\n",
" <th>Websites, IT &amp; Software</th>\n",
" <th>Writing &amp; Content</th>\n",
" </tr>\n",
" <tr>\n",
" <th>country</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>18</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>133</td>\n",
" <td>0</td>\n",
" <td>15</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>13</td>\n",
" <td>75</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam</th>\n",
" <td>264</td>\n",
" <td>1009</td>\n",
" <td>2389</td>\n",
" <td>34</td>\n",
" <td>514</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>40</td>\n",
" <td>148</td>\n",
" <td>31</td>\n",
" <td>426</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>25</td>\n",
" <td>691</td>\n",
" <td>2476</td>\n",
" <td>1687</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Virgin Islands, U.S.</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Yemen</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>162 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
"main category Business, Accounting, Human Resources & Legal \\\n",
"country \n",
"Afghanistan 1 \n",
"Albania 1 \n",
"Algeria 0 \n",
"Andorra 0 \n",
"Angola 0 \n",
"... ... \n",
"Vietnam 264 \n",
"Virgin Islands, U.S. 1 \n",
"Yemen 0 \n",
"Zambia 1 \n",
"Zimbabwe 0 \n",
"\n",
"main category Data Entry & Admin Design, Media & Architecture \\\n",
"country \n",
"Afghanistan 2 4 \n",
"Albania 3 18 \n",
"Algeria 1 133 \n",
"Andorra 0 0 \n",
"Angola 0 2 \n",
"... ... ... \n",
"Vietnam 1009 2389 \n",
"Virgin Islands, U.S. 0 0 \n",
"Yemen 0 0 \n",
"Zambia 3 5 \n",
"Zimbabwe 3 5 \n",
"\n",
"main category Education Engineering & Science \\\n",
"country \n",
"Afghanistan 0 0 \n",
"Albania 0 4 \n",
"Algeria 0 15 \n",
"Andorra 0 0 \n",
"Angola 0 0 \n",
"... ... ... \n",
"Vietnam 34 514 \n",
"Virgin Islands, U.S. 0 0 \n",
"Yemen 0 0 \n",
"Zambia 0 1 \n",
"Zimbabwe 0 3 \n",
"\n",
"main category Freight, Shipping & Transportation Health & Medicine \\\n",
"country \n",
"Afghanistan 0 0 \n",
"Albania 0 0 \n",
"Algeria 0 0 \n",
"Andorra 0 0 \n",
"Angola 0 0 \n",
"... ... ... \n",
"Vietnam 15 1 \n",
"Virgin Islands, U.S. 0 0 \n",
"Yemen 0 0 \n",
"Zambia 0 0 \n",
"Zimbabwe 0 0 \n",
"\n",
"main category Jobs for Anyone Mobile Phones & Computing \\\n",
"country \n",
"Afghanistan 0 0 \n",
"Albania 0 0 \n",
"Algeria 0 3 \n",
"Andorra 0 0 \n",
"Angola 0 0 \n",
"... ... ... \n",
"Vietnam 40 148 \n",
"Virgin Islands, U.S. 0 0 \n",
"Yemen 1 0 \n",
"Zambia 0 0 \n",
"Zimbabwe 0 1 \n",
"\n",
"main category Product Sourcing & Manufacturing Sales & Marketing \\\n",
"country \n",
"Afghanistan 0 0 \n",
"Albania 0 9 \n",
"Algeria 0 2 \n",
"Andorra 0 0 \n",
"Angola 0 0 \n",
"... ... ... \n",
"Vietnam 31 426 \n",
"Virgin Islands, U.S. 0 0 \n",
"Yemen 0 1 \n",
"Zambia 0 0 \n",
"Zimbabwe 0 0 \n",
"\n",
"main category Series([], ) Telecommunications Trades & Services \\\n",
"country \n",
"Afghanistan 0 0 0 \n",
"Albania 0 0 0 \n",
"Algeria 0 0 0 \n",
"Andorra 0 0 0 \n",
"Angola 0 0 0 \n",
"... ... ... ... \n",
"Vietnam 9 5 25 \n",
"Virgin Islands, U.S. 0 0 0 \n",
"Yemen 0 0 0 \n",
"Zambia 0 0 0 \n",
"Zimbabwe 0 0 0 \n",
"\n",
"main category Translation & Languages Websites, IT & Software \\\n",
"country \n",
"Afghanistan 1 4 \n",
"Albania 2 26 \n",
"Algeria 13 75 \n",
"Andorra 0 0 \n",
"Angola 1 1 \n",
"... ... ... \n",
"Vietnam 691 2476 \n",
"Virgin Islands, U.S. 0 0 \n",
"Yemen 1 4 \n",
"Zambia 0 3 \n",
"Zimbabwe 0 9 \n",
"\n",
"main category Writing & Content \n",
"country \n",
"Afghanistan 1 \n",
"Albania 3 \n",
"Algeria 14 \n",
"Andorra 1 \n",
"Angola 0 \n",
"... ... \n",
"Vietnam 1687 \n",
"Virgin Islands, U.S. 1 \n",
"Yemen 1 \n",
"Zambia 1 \n",
"Zimbabwe 4 \n",
"\n",
"[162 rows x 17 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.groupby(['country', 'main category']).size().unstack(fill_value=0)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# data.groupby(['country', 'main category']).size().unstack(fill_value=0).reset_index().to_excel('workers by country.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### First questions to be analysed based on a skill analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### specialisation"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# data['skill pairs'] = data['categories'].apply((lambda x: list(itertools.combinations(x, 2))))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### general skills"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"skills = data[['top_skills', 'main category']]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\minhk\\AppData\\Local\\Temp/ipykernel_9532/2409612083.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" skills['top_skills'] = skills['top_skills'].apply(eval)\n"
]
}
],
"source": [
"skills['top_skills'] = skills['top_skills'].apply(eval)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"skills = skills.apply(pd.Series.explode).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(488629, 2)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"skills.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"main category top_skills \n",
"Business, Accounting, Human Resources & Legal .NET 5\n",
" .NET Core 1\n",
" 2D Animation 1\n",
" 2D Drawing 1\n",
" 3D Animation 1\n",
" ..\n",
"Writing & Content ePub 4\n",
" iMovie 2\n",
" iPad 2\n",
" iPhone 28\n",
" phpMyAdmin 1\n",
"Length: 8555, dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"skills.groupby(['main category', 'top_skills']).size()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# skills.groupby(['main category', 'top_skills']).size().unstack(fill_value=0).reset_index().to_excel('general purpose skills.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Design, Media & Architecture ', 'Design, Media & Architecture ', 'Design, Media & Architecture ', 'Design, Media & Architecture ', 'Design, Media & Architecture '] 19016\n",
"['Websites, IT & Software', 'Websites, IT & Software', 'Websites, IT & Software', 'Websites, IT & Software', 'Websites, IT & Software'] 9649\n",
"['Writing & Content', 'Writing & Content', 'Writing & Content', 'Writing & Content', 'Writing & Content'] 5376\n",
"['Data Entry & Admin', 'Data Entry & Admin', 'Data Entry & Admin', 'Data Entry & Admin', 'Data Entry & Admin'] 1160\n",
"['Design, Media & Architecture '] 988\n",
" ... \n",
"['Sales & Marketing', 'Design, Media & Architecture ', 'Sales & Marketing', 'Writing & Content', 'Data Entry & Admin'] 1\n",
"['Websites, IT & Software', 'Trades & Services', 'Sales & Marketing', 'Design, Media & Architecture ', 'Websites, IT & Software'] 1\n",
"['Sales & Marketing', 'Design, Media & Architecture ', 'Websites, IT & Software', 'Websites, IT & Software', 'Mobile Phones & Computing'] 1\n",
"['Websites, IT & Software', 'Websites, IT & Software', 'Writing & Content', 'Mobile Phones & Computing', 'Sales & Marketing'] 1\n",
"['Design, Media & Architecture ', 'Product Sourcing & Manufacturing', 'Websites, IT & Software', 'Data Entry & Admin', 'Design, Media & Architecture '] 1\n",
"Name: categories, Length: 14140, dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['categories'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part A: characterising the market"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"42812"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[data['overall'] == 0]['overall'].count()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"data['overall_greater_than_0'] = data['overall'] > 0"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"country overall_greater_than_0\n",
"Afghanistan True 13\n",
"Albania True 65\n",
" False 1\n",
"Algeria True 251\n",
" False 5\n",
" ... \n",
"Virgin Islands, U.S. True 2\n",
"Yemen True 8\n",
"Zambia True 13\n",
" False 1\n",
"Zimbabwe True 25\n",
"Name: overall_greater_than_0, Length: 238, dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.groupby(['country'])['overall_greater_than_0'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# data.groupby(['country'])['overall_greater_than_0'].value_counts().unstack(fill_value=0).reset_index().to_excel('Employment rate by coutries.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"US Dollar 70001\n",
"Euro 11488\n",
"Indian Rupee 11154\n",
"Canadian Dollar 8343\n",
"British Pounds 814\n",
"Australian Dollar 703\n",
"Singapore Dollar 200\n",
"HongKong Dollar 60\n",
"New Zealand Dollar 58\n",
"Pakistani Rupee 1\n",
"Name: primary_currency, dtype: int64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['primary_currency'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 102822.000000\n",
"mean 22.585631\n",
"std 42.679727\n",
"min 0.000000\n",
"25% 10.000000\n",
"50% 15.000000\n",
"75% 25.000000\n",
"max 1000.000000\n",
"Name: hourly_rate, dtype: float64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['hourly_rate'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"c = CurrencyConverter()\n",
"def convert_currency_to_usd(primary_currency, hourly_rate):\n",
" currency = 'USD'\n",
" if(primary_currency == 'Euro'):\n",
" currency = 'EUR'\n",
" if(primary_currency == 'Indian Rupee'):\n",
" currency = 'INR'\n",
" if(primary_currency == 'Canadian Dollar'):\n",
" currency = 'CAD'\n",
" if(primary_currency == 'British Pounds'):\n",
" currency = 'GBP'\n",
" if(primary_currency == 'Australian Dollar'):\n",
" currency = 'AUD'\n",
" if(primary_currency == 'Singapore Dollar'):\n",
" currency = 'SGD'\n",
" if(primary_currency == 'HongKong Dollar'):\n",
" currency = 'HKD'\n",
" if(primary_currency == 'New Zealand Dollar'):\n",
" currency = 'NZD'\n",
" \n",
" return c.convert(hourly_rate, currency, 'USD')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"data['convert_currency_to_usd'] = data.apply(lambda x: convert_currency_to_usd(x['primary_currency'], x['hourly_rate']), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 500.000000\n",
"1 55.505000\n",
"2 20.000000\n",
"3 75.000000\n",
"4 25.000000\n",
" ... \n",
"102817 11.982945\n",
"102818 6.390904\n",
"102819 11.982945\n",
"102820 23.965889\n",
"102821 19.971575\n",
"Name: convert_currency_to_usd, Length: 102822, dtype: float64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['convert_currency_to_usd']"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>country</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan</th>\n",
" <td>13.0</td>\n",
" <td>14.384615</td>\n",
" <td>6.305268</td>\n",
" <td>5.0</td>\n",
" <td>10.00</td>\n",
" <td>15.0</td>\n",
" <td>20.00</td>\n",
" <td>25.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania</th>\n",
" <td>66.0</td>\n",
" <td>23.833333</td>\n",
" <td>15.260389</td>\n",
" <td>2.0</td>\n",
" <td>13.50</td>\n",
" <td>20.0</td>\n",
" <td>30.00</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria</th>\n",
" <td>256.0</td>\n",
" <td>18.699219</td>\n",
" <td>13.523232</td>\n",
" <td>2.0</td>\n",
" <td>10.00</td>\n",
" <td>15.0</td>\n",
" <td>25.00</td>\n",
" <td>150.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra</th>\n",
" <td>1.0</td>\n",
" <td>25.000000</td>\n",
" <td>NaN</td>\n",
" <td>25.0</td>\n",
" <td>25.00</td>\n",
" <td>25.0</td>\n",
" <td>25.00</td>\n",
" <td>25.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola</th>\n",
" <td>4.0</td>\n",
" <td>20.000000</td>\n",
" <td>9.128709</td>\n",
" <td>10.0</td>\n",
" <td>13.75</td>\n",
" <td>20.0</td>\n",
" <td>26.25</td>\n",
" <td>30.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam</th>\n",
" <td>9779.0</td>\n",
" <td>19.177941</td>\n",
" <td>53.646487</td>\n",
" <td>1.0</td>\n",
" <td>5.00</td>\n",
" <td>10.0</td>\n",
" <td>20.00</td>\n",
" <td>1000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Virgin Islands, U.S.</th>\n",
" <td>2.0</td>\n",
" <td>56.500000</td>\n",
" <td>9.192388</td>\n",
" <td>50.0</td>\n",
" <td>53.25</td>\n",
" <td>56.5</td>\n",
" <td>59.75</td>\n",
" <td>63.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Yemen</th>\n",
" <td>8.0</td>\n",
" <td>17.250000</td>\n",
" <td>8.224528</td>\n",
" <td>5.0</td>\n",
" <td>10.00</td>\n",
" <td>20.0</td>\n",
" <td>20.75</td>\n",
" <td>30.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia</th>\n",
" <td>14.0</td>\n",
" <td>90.142857</td>\n",
" <td>262.035481</td>\n",
" <td>7.0</td>\n",
" <td>15.00</td>\n",
" <td>20.0</td>\n",
" <td>28.75</td>\n",
" <td>1000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe</th>\n",
" <td>25.0</td>\n",
" <td>15.120000</td>\n",
" <td>10.268560</td>\n",
" <td>2.0</td>\n",
" <td>9.00</td>\n",
" <td>12.0</td>\n",
" <td>20.00</td>\n",
" <td>40.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>162 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% 75% \\\n",
"country \n",
"Afghanistan 13.0 14.384615 6.305268 5.0 10.00 15.0 20.00 \n",
"Albania 66.0 23.833333 15.260389 2.0 13.50 20.0 30.00 \n",
"Algeria 256.0 18.699219 13.523232 2.0 10.00 15.0 25.00 \n",
"Andorra 1.0 25.000000 NaN 25.0 25.00 25.0 25.00 \n",
"Angola 4.0 20.000000 9.128709 10.0 13.75 20.0 26.25 \n",
"... ... ... ... ... ... ... ... \n",
"Vietnam 9779.0 19.177941 53.646487 1.0 5.00 10.0 20.00 \n",
"Virgin Islands, U.S. 2.0 56.500000 9.192388 50.0 53.25 56.5 59.75 \n",
"Yemen 8.0 17.250000 8.224528 5.0 10.00 20.0 20.75 \n",
"Zambia 14.0 90.142857 262.035481 7.0 15.00 20.0 28.75 \n",
"Zimbabwe 25.0 15.120000 10.268560 2.0 9.00 12.0 20.00 \n",
"\n",
" max \n",
"country \n",
"Afghanistan 25.0 \n",
"Albania 100.0 \n",
"Algeria 150.0 \n",
"Andorra 25.0 \n",
"Angola 30.0 \n",
"... ... \n",
"Vietnam 1000.0 \n",
"Virgin Islands, U.S. 63.0 \n",
"Yemen 30.0 \n",
"Zambia 1000.0 \n",
"Zimbabwe 40.0 \n",
"\n",
"[162 rows x 8 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.groupby(['country'])['hourly_rate'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# data.groupby(['country'])['hourly_rate'].describe().reset_index().to_excel('Wage.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>country</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan</th>\n",
" <td>13.0</td>\n",
" <td>14.384615</td>\n",
" <td>6.305268</td>\n",
" <td>5.000000</td>\n",
" <td>10.0000</td>\n",
" <td>15.0000</td>\n",
" <td>20.0000</td>\n",
" <td>25.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania</th>\n",
" <td>66.0</td>\n",
" <td>24.183652</td>\n",
" <td>15.489520</td>\n",
" <td>2.000000</td>\n",
" <td>13.5000</td>\n",
" <td>22.2020</td>\n",
" <td>30.0000</td>\n",
" <td>100.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria</th>\n",
" <td>256.0</td>\n",
" <td>19.034636</td>\n",
" <td>13.675736</td>\n",
" <td>2.000000</td>\n",
" <td>10.0000</td>\n",
" <td>16.6515</td>\n",
" <td>25.0000</td>\n",
" <td>150.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra</th>\n",
" <td>1.0</td>\n",
" <td>27.752500</td>\n",
" <td>NaN</td>\n",
" <td>27.752500</td>\n",
" <td>27.7525</td>\n",
" <td>27.7525</td>\n",
" <td>27.7525</td>\n",
" <td>27.7525</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola</th>\n",
" <td>4.0</td>\n",
" <td>20.000000</td>\n",
" <td>9.128709</td>\n",
" <td>10.000000</td>\n",
" <td>13.7500</td>\n",
" <td>20.0000</td>\n",
" <td>26.2500</td>\n",
" <td>30.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam</th>\n",
" <td>9779.0</td>\n",
" <td>19.145326</td>\n",
" <td>53.649341</td>\n",
" <td>0.105555</td>\n",
" <td>5.0000</td>\n",
" <td>10.0000</td>\n",
" <td>20.0000</td>\n",
" <td>1000.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Virgin Islands, U.S.</th>\n",
" <td>2.0</td>\n",
" <td>56.500000</td>\n",
" <td>9.192388</td>\n",
" <td>50.000000</td>\n",
" <td>53.2500</td>\n",
" <td>56.5000</td>\n",
" <td>59.7500</td>\n",
" <td>63.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Yemen</th>\n",
" <td>8.0</td>\n",
" <td>17.250000</td>\n",
" <td>8.224528</td>\n",
" <td>5.000000</td>\n",
" <td>10.0000</td>\n",
" <td>20.0000</td>\n",
" <td>20.7500</td>\n",
" <td>30.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia</th>\n",
" <td>14.0</td>\n",
" <td>90.300143</td>\n",
" <td>261.990797</td>\n",
" <td>7.000000</td>\n",
" <td>15.0000</td>\n",
" <td>21.1010</td>\n",
" <td>28.7500</td>\n",
" <td>1000.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe</th>\n",
" <td>25.0</td>\n",
" <td>15.120000</td>\n",
" <td>10.268560</td>\n",
" <td>2.000000</td>\n",
" <td>9.0000</td>\n",
" <td>12.0000</td>\n",
" <td>20.0000</td>\n",
" <td>40.0000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>162 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% \\\n",
"country \n",
"Afghanistan 13.0 14.384615 6.305268 5.000000 10.0000 \n",
"Albania 66.0 24.183652 15.489520 2.000000 13.5000 \n",
"Algeria 256.0 19.034636 13.675736 2.000000 10.0000 \n",
"Andorra 1.0 27.752500 NaN 27.752500 27.7525 \n",
"Angola 4.0 20.000000 9.128709 10.000000 13.7500 \n",
"... ... ... ... ... ... \n",
"Vietnam 9779.0 19.145326 53.649341 0.105555 5.0000 \n",
"Virgin Islands, U.S. 2.0 56.500000 9.192388 50.000000 53.2500 \n",
"Yemen 8.0 17.250000 8.224528 5.000000 10.0000 \n",
"Zambia 14.0 90.300143 261.990797 7.000000 15.0000 \n",
"Zimbabwe 25.0 15.120000 10.268560 2.000000 9.0000 \n",
"\n",
" 50% 75% max \n",
"country \n",
"Afghanistan 15.0000 20.0000 25.0000 \n",
"Albania 22.2020 30.0000 100.0000 \n",
"Algeria 16.6515 25.0000 150.0000 \n",
"Andorra 27.7525 27.7525 27.7525 \n",
"Angola 20.0000 26.2500 30.0000 \n",
"... ... ... ... \n",
"Vietnam 10.0000 20.0000 1000.0000 \n",
"Virgin Islands, U.S. 56.5000 59.7500 63.0000 \n",
"Yemen 20.0000 20.7500 30.0000 \n",
"Zambia 21.1010 28.7500 1000.0000 \n",
"Zimbabwe 12.0000 20.0000 40.0000 \n",
"\n",
"[162 rows x 8 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.groupby(['country'])['convert_currency_to_usd'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# data.groupby(['country'])['convert_currency_to_usd'].describe().reset_index().to_excel('Wage (USD).xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### global supply of skills"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>top_skills</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>['PHP', 'JavaScript', 'MySQL', 'HTML5', 'Datab...</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>['Mobile App Development', 'iPhone', 'Swift', ...</td>\n",
" <td>Spain</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>['SEO', 'Link Building', 'Internet Marketing',...</td>\n",
" <td>India</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>['3D Rendering', 'Building Architecture', '3D ...</td>\n",
" <td>Portugal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>['Android', 'Mobile App Development', 'iPhone'...</td>\n",
" <td>Pakistan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102817</th>\n",
" <td>['Application Performance Monitoring', 'Articl...</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102818</th>\n",
" <td>['German']</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102819</th>\n",
" <td>['Mobile App Testing', 'Testing / QA']</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102820</th>\n",
" <td>['3D Design', '3D Modelling', '3ds Max', 'Arts...</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102821</th>\n",
" <td>['Apache Maven', 'Selenium', 'Selenium Webdriv...</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>102822 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" top_skills country\n",
"0 ['PHP', 'JavaScript', 'MySQL', 'HTML5', 'Datab... United States\n",
"1 ['Mobile App Development', 'iPhone', 'Swift', ... Spain\n",
"2 ['SEO', 'Link Building', 'Internet Marketing',... India\n",
"3 ['3D Rendering', 'Building Architecture', '3D ... Portugal\n",
"4 ['Android', 'Mobile App Development', 'iPhone'... Pakistan\n",
"... ... ...\n",
"102817 ['Application Performance Monitoring', 'Articl... Canada\n",
"102818 ['German'] Canada\n",
"102819 ['Mobile App Testing', 'Testing / QA'] Canada\n",
"102820 ['3D Design', '3D Modelling', '3ds Max', 'Arts... Canada\n",
"102821 ['Apache Maven', 'Selenium', 'Selenium Webdriv... Canada\n",
"\n",
"[102822 rows x 2 columns]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"supply_skills = data[['top_skills', 'country']]\n",
"supply_skills"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\minhk\\AppData\\Local\\Temp/ipykernel_9532/2592974012.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" supply_skills['top_skills'] = supply_skills['top_skills'].apply(eval)\n"
]
}
],
"source": [
"supply_skills['top_skills'] = supply_skills['top_skills'].apply(eval)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>top_skills</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PHP</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>JavaScript</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MySQL</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>HTML5</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Database Development</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488624</th>\n",
" <td>Chinese Tutoring</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488625</th>\n",
" <td>Apache Maven</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488626</th>\n",
" <td>Selenium</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488627</th>\n",
" <td>Selenium Webdriver</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488628</th>\n",
" <td>Testing / QA</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>488629 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" top_skills country\n",
"0 PHP United States\n",
"1 JavaScript United States\n",
"2 MySQL United States\n",
"3 HTML5 United States\n",
"4 Database Development United States\n",
"... ... ...\n",
"488624 Chinese Tutoring Canada\n",
"488625 Apache Maven Canada\n",
"488626 Selenium Canada\n",
"488627 Selenium Webdriver Canada\n",
"488628 Testing / QA Canada\n",
"\n",
"[488629 rows x 2 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"supply_skills = supply_skills.apply(pd.Series.explode).reset_index(drop=True)\n",
"supply_skills"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Graphic Design 21045\n",
"Logo Design 14569\n",
"Photoshop 12923\n",
"Data Entry 12071\n",
"Article Writing 11062\n",
" ... \n",
"Bracket Installation 1\n",
"Workday Security 1\n",
"Actimize 1\n",
"Employee Engagement 1\n",
"Plumbing 1\n",
"Name: top_skills, Length: 2075, dtype: int64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# unique count of skills\n",
"supply_skills['top_skills'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"country top_skills \n",
"Afghanistan 2D Animation 1\n",
" 2D Animation Explainer Video 1\n",
" 3D Animation 1\n",
" 3D Design 2\n",
" 3D Rendering 1\n",
" ..\n",
"Zimbabwe Website Design 1\n",
" Word 1\n",
" WordPress 1\n",
" eCommerce 1\n",
" iPhone 1\n",
"Length: 32023, dtype: int64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"supply_skills.groupby(['country', 'top_skills']).size()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# supply_skills.groupby(['country', 'top_skills']).size().unstack(fill_value=0).reset_index().to_excel('global supply of skills.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### multiskilling"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"data['categories'] = data['categories'].apply(eval)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"data['top_skills'] = data['top_skills'].apply(eval)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def countDifferentCategories(categories):\n",
" return len(set(categories))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"def countDiffSkills(top_skills):\n",
" return len(set(top_skills))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['PHP', 'JavaScript', 'MySQL', 'HTML5', 'Database Development']"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['top_skills'][0]"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"data['countDifferentSkills'] = data['top_skills'].apply(countDiffSkills)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"data['countDifferentCategories'] = data['categories'].apply(countDifferentCategories)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(99955, 34)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[data['countDifferentSkills'] >= 2].shape"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"country countDifferentSkills\n",
"Afghanistan 5 13\n",
"Albania 5 64\n",
" 1 1\n",
" 4 1\n",
"Algeria 5 241\n",
" ... \n",
"Yemen 5 8\n",
"Zambia 5 13\n",
" 4 1\n",
"Zimbabwe 5 24\n",
" 4 1\n",
"Name: countDifferentSkills, Length: 442, dtype: int64"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.groupby(['country'])['countDifferentSkills'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"# data.groupby(['country'])['countDifferentSkills'].value_counts().unstack(fill_value=0).reset_index().to_excel('multiskilling by coutries by skills.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(58334, 34)"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[data['countDifferentCategories'] >= 2].shape"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"country countDifferentCategories\n",
"Afghanistan 3 6\n",
" 1 5\n",
" 4 2\n",
"Albania 1 24\n",
" 2 20\n",
" ..\n",
"Zambia 3 3\n",
"Zimbabwe 1 11\n",
" 2 8\n",
" 3 5\n",
" 4 1\n",
"Name: countDifferentCategories, Length: 629, dtype: int64"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.groupby(['country'])['countDifferentCategories'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# data.groupby(['country'])['countDifferentCategories'].value_counts().unstack(fill_value=0).reset_index().to_excel('multiskilling by coutries.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Top 100 skills each category"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"top_100_skills = data[['top_skills', 'categories']]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"# top_100_skills['top_skills'] = top_100_skills['top_skills'].apply(eval)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"top_100_skills = top_100_skills.apply(pd.Series.explode).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>top_skills</th>\n",
" <th>categories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PHP</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>JavaScript</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MySQL</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>HTML5</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Database Development</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488624</th>\n",
" <td>Chinese Tutoring</td>\n",
" <td>Education</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488625</th>\n",
" <td>Apache Maven</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488626</th>\n",
" <td>Selenium</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488627</th>\n",
" <td>Selenium Webdriver</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488628</th>\n",
" <td>Testing / QA</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>488629 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" top_skills categories\n",
"0 PHP Websites, IT & Software\n",
"1 JavaScript Websites, IT & Software\n",
"2 MySQL Websites, IT & Software\n",
"3 HTML5 Websites, IT & Software\n",
"4 Database Development Websites, IT & Software\n",
"... ... ...\n",
"488624 Chinese Tutoring Education\n",
"488625 Apache Maven Websites, IT & Software\n",
"488626 Selenium Websites, IT & Software\n",
"488627 Selenium Webdriver Websites, IT & Software\n",
"488628 Testing / QA Websites, IT & Software\n",
"\n",
"[488629 rows x 2 columns]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_100_skills"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"# top_100_skills.reset_index().to_excel('skill with category.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"categories top_skills \n",
"Business, Accounting, Human Resources & Legal A/R Collections 30\n",
" A/R Management 48\n",
" A/R analysis 40\n",
" Account Management 271\n",
" Account Payables Management 100\n",
" ... \n",
"Writing & Content Web Page Writer 14\n",
" Wikipedia 51\n",
" Word Processing 35\n",
" Writing 139\n",
" eBooks 501\n",
"Length: 2075, dtype: int64"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_100_skills.groupby(['categories', 'top_skills']).size()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"data_jsd = pd.read_csv('../../data/JSD_percent.csv',index_col='country')"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PHP</th>\n",
" <th>Graphic Design</th>\n",
" <th>Website Design</th>\n",
" <th>HTML</th>\n",
" <th>JavaScript</th>\n",
" <th>Photoshop</th>\n",
" <th>WordPress</th>\n",
" <th>Mobile App Development</th>\n",
" <th>Data Entry</th>\n",
" <th>Software Architecture</th>\n",
" <th>...</th>\n",
" <th>System Admin</th>\n",
" <th>Banner Design</th>\n",
" <th>iPad</th>\n",
" <th>Anything Goes</th>\n",
" <th>Spanish</th>\n",
" <th>CAD/CAM</th>\n",
" <th>Objective C</th>\n",
" <th>Social Networking</th>\n",
" <th>Software Testing</th>\n",
" <th>Distance</th>\n",
" </tr>\n",
" <tr>\n",
" <th>country</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Global demand</th>\n",
" <td>0.068271</td>\n",
" <td>0.058671</td>\n",
" <td>0.053399</td>\n",
" <td>0.052171</td>\n",
" <td>0.027036</td>\n",
" <td>0.026281</td>\n",
" <td>0.024745</td>\n",
" <td>0.023990</td>\n",
" <td>0.023245</td>\n",
" <td>0.023163</td>\n",
" <td>...</td>\n",
" <td>0.002918</td>\n",
" <td>0.002873</td>\n",
" <td>0.002809</td>\n",
" <td>0.002791</td>\n",
" <td>0.002791</td>\n",
" <td>0.002709</td>\n",
" <td>0.002691</td>\n",
" <td>0.002673</td>\n",
" <td>0.002654</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>India</th>\n",
" <td>0.079670</td>\n",
" <td>0.061113</td>\n",
" <td>0.056782</td>\n",
" <td>0.066741</td>\n",
" <td>0.033559</td>\n",
" <td>0.030441</td>\n",
" <td>0.040463</td>\n",
" <td>0.023621</td>\n",
" <td>0.018809</td>\n",
" <td>0.015085</td>\n",
" <td>...</td>\n",
" <td>0.002573</td>\n",
" <td>0.009582</td>\n",
" <td>0.003703</td>\n",
" <td>0.001046</td>\n",
" <td>0.000021</td>\n",
" <td>0.002992</td>\n",
" <td>0.002992</td>\n",
" <td>0.000502</td>\n",
" <td>0.002218</td>\n",
" <td>0.173795</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Romania</th>\n",
" <td>0.048501</td>\n",
" <td>0.075749</td>\n",
" <td>0.038147</td>\n",
" <td>0.043052</td>\n",
" <td>0.021798</td>\n",
" <td>0.051226</td>\n",
" <td>0.023433</td>\n",
" <td>0.004905</td>\n",
" <td>0.028338</td>\n",
" <td>0.014714</td>\n",
" <td>...</td>\n",
" <td>0.003270</td>\n",
" <td>0.018529</td>\n",
" <td>0.000545</td>\n",
" <td>0.002180</td>\n",
" <td>0.002180</td>\n",
" <td>0.004360</td>\n",
" <td>0.000000</td>\n",
" <td>0.001090</td>\n",
" <td>0.001635</td>\n",
" <td>0.212352</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Pakistan</th>\n",
" <td>0.049188</td>\n",
" <td>0.075598</td>\n",
" <td>0.041595</td>\n",
" <td>0.045081</td>\n",
" <td>0.019675</td>\n",
" <td>0.042718</td>\n",
" <td>0.031167</td>\n",
" <td>0.013235</td>\n",
" <td>0.032939</td>\n",
" <td>0.008449</td>\n",
" <td>...</td>\n",
" <td>0.001684</td>\n",
" <td>0.023220</td>\n",
" <td>0.001270</td>\n",
" <td>0.000532</td>\n",
" <td>0.000059</td>\n",
" <td>0.002245</td>\n",
" <td>0.001004</td>\n",
" <td>0.000561</td>\n",
" <td>0.001123</td>\n",
" <td>0.212856</td>\n",
" </tr>\n",
" <tr>\n",
" <th>United Kingdom</th>\n",
" <td>0.042535</td>\n",
" <td>0.048177</td>\n",
" <td>0.030816</td>\n",
" <td>0.038194</td>\n",
" <td>0.024740</td>\n",
" <td>0.026476</td>\n",
" <td>0.020399</td>\n",
" <td>0.004774</td>\n",
" <td>0.015625</td>\n",
" <td>0.014757</td>\n",
" <td>...</td>\n",
" <td>0.002604</td>\n",
" <td>0.006944</td>\n",
" <td>0.000434</td>\n",
" <td>0.001302</td>\n",
" <td>0.001302</td>\n",
" <td>0.005208</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.002604</td>\n",
" <td>0.249822</td>\n",
" </tr>\n",
" <tr>\n",
" <th>United Arab Emirates</th>\n",
" <td>0.040000</td>\n",
" <td>0.060870</td>\n",
" <td>0.038261</td>\n",
" <td>0.038261</td>\n",
" <td>0.015652</td>\n",
" <td>0.041739</td>\n",
" <td>0.027826</td>\n",
" <td>0.015652</td>\n",
" <td>0.020870</td>\n",
" <td>0.010435</td>\n",
" <td>...</td>\n",
" <td>0.003478</td>\n",
" <td>0.006957</td>\n",
" <td>0.003478</td>\n",
" <td>0.003478</td>\n",
" <td>0.001739</td>\n",
" <td>0.003478</td>\n",
" <td>0.003478</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.262151</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Australia</th>\n",
" <td>0.035928</td>\n",
" <td>0.047156</td>\n",
" <td>0.023204</td>\n",
" <td>0.029192</td>\n",
" <td>0.018713</td>\n",
" <td>0.031437</td>\n",
" <td>0.020210</td>\n",
" <td>0.008234</td>\n",
" <td>0.021707</td>\n",
" <td>0.015719</td>\n",
" <td>...</td>\n",
" <td>0.004491</td>\n",
" <td>0.005988</td>\n",
" <td>0.001497</td>\n",
" <td>0.002994</td>\n",
" <td>0.000000</td>\n",
" <td>0.006737</td>\n",
" <td>0.000749</td>\n",
" <td>0.000749</td>\n",
" <td>0.000000</td>\n",
" <td>0.264786</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Poland</th>\n",
" <td>0.057692</td>\n",
" <td>0.061966</td>\n",
" <td>0.043803</td>\n",
" <td>0.047009</td>\n",
" <td>0.047009</td>\n",
" <td>0.033120</td>\n",
" <td>0.018162</td>\n",
" <td>0.012821</td>\n",
" <td>0.014957</td>\n",
" <td>0.016026</td>\n",
" <td>...</td>\n",
" <td>0.008547</td>\n",
" <td>0.008547</td>\n",
" <td>0.000000</td>\n",
" <td>0.001068</td>\n",
" <td>0.001068</td>\n",
" <td>0.003205</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.001068</td>\n",
" <td>0.265190</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Serbia</th>\n",
" <td>0.043579</td>\n",
" <td>0.085415</td>\n",
" <td>0.027891</td>\n",
" <td>0.042998</td>\n",
" <td>0.037769</td>\n",
" <td>0.067984</td>\n",
" <td>0.017432</td>\n",
" <td>0.008135</td>\n",
" <td>0.018013</td>\n",
" <td>0.012202</td>\n",
" <td>...</td>\n",
" <td>0.002905</td>\n",
" <td>0.008716</td>\n",
" <td>0.000581</td>\n",
" <td>0.006973</td>\n",
" <td>0.000581</td>\n",
" <td>0.003486</td>\n",
" <td>0.000000</td>\n",
" <td>0.001162</td>\n",
" <td>0.000581</td>\n",
" <td>0.268509</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Greece</th>\n",
" <td>0.041591</td>\n",
" <td>0.052441</td>\n",
" <td>0.032550</td>\n",
" <td>0.034358</td>\n",
" <td>0.021700</td>\n",
" <td>0.043400</td>\n",
" <td>0.025316</td>\n",
" <td>0.005425</td>\n",
" <td>0.028933</td>\n",
" <td>0.021700</td>\n",
" <td>...</td>\n",
" <td>0.003617</td>\n",
" <td>0.005425</td>\n",
" <td>0.000000</td>\n",
" <td>0.007233</td>\n",
" <td>0.000000</td>\n",
" <td>0.009042</td>\n",
" <td>0.000000</td>\n",
" <td>0.001808</td>\n",
" <td>0.001808</td>\n",
" <td>0.274675</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows × 101 columns</p>\n",
"</div>"
],
"text/plain": [
" PHP Graphic Design Website Design HTML \\\n",
"country \n",
"Global demand 0.068271 0.058671 0.053399 0.052171 \n",
"India 0.079670 0.061113 0.056782 0.066741 \n",
"Romania 0.048501 0.075749 0.038147 0.043052 \n",
"Pakistan 0.049188 0.075598 0.041595 0.045081 \n",
"United Kingdom 0.042535 0.048177 0.030816 0.038194 \n",
"United Arab Emirates 0.040000 0.060870 0.038261 0.038261 \n",
"Australia 0.035928 0.047156 0.023204 0.029192 \n",
"Poland 0.057692 0.061966 0.043803 0.047009 \n",
"Serbia 0.043579 0.085415 0.027891 0.042998 \n",
"Greece 0.041591 0.052441 0.032550 0.034358 \n",
"\n",
" JavaScript Photoshop WordPress \\\n",
"country \n",
"Global demand 0.027036 0.026281 0.024745 \n",
"India 0.033559 0.030441 0.040463 \n",
"Romania 0.021798 0.051226 0.023433 \n",
"Pakistan 0.019675 0.042718 0.031167 \n",
"United Kingdom 0.024740 0.026476 0.020399 \n",
"United Arab Emirates 0.015652 0.041739 0.027826 \n",
"Australia 0.018713 0.031437 0.020210 \n",
"Poland 0.047009 0.033120 0.018162 \n",
"Serbia 0.037769 0.067984 0.017432 \n",
"Greece 0.021700 0.043400 0.025316 \n",
"\n",
" Mobile App Development Data Entry \\\n",
"country \n",
"Global demand 0.023990 0.023245 \n",
"India 0.023621 0.018809 \n",
"Romania 0.004905 0.028338 \n",
"Pakistan 0.013235 0.032939 \n",
"United Kingdom 0.004774 0.015625 \n",
"United Arab Emirates 0.015652 0.020870 \n",
"Australia 0.008234 0.021707 \n",
"Poland 0.012821 0.014957 \n",
"Serbia 0.008135 0.018013 \n",
"Greece 0.005425 0.028933 \n",
"\n",
" Software Architecture ... System Admin Banner Design \\\n",
"country ... \n",
"Global demand 0.023163 ... 0.002918 0.002873 \n",
"India 0.015085 ... 0.002573 0.009582 \n",
"Romania 0.014714 ... 0.003270 0.018529 \n",
"Pakistan 0.008449 ... 0.001684 0.023220 \n",
"United Kingdom 0.014757 ... 0.002604 0.006944 \n",
"United Arab Emirates 0.010435 ... 0.003478 0.006957 \n",
"Australia 0.015719 ... 0.004491 0.005988 \n",
"Poland 0.016026 ... 0.008547 0.008547 \n",
"Serbia 0.012202 ... 0.002905 0.008716 \n",
"Greece 0.021700 ... 0.003617 0.005425 \n",
"\n",
" iPad Anything Goes Spanish CAD/CAM \\\n",
"country \n",
"Global demand 0.002809 0.002791 0.002791 0.002709 \n",
"India 0.003703 0.001046 0.000021 0.002992 \n",
"Romania 0.000545 0.002180 0.002180 0.004360 \n",
"Pakistan 0.001270 0.000532 0.000059 0.002245 \n",
"United Kingdom 0.000434 0.001302 0.001302 0.005208 \n",
"United Arab Emirates 0.003478 0.003478 0.001739 0.003478 \n",
"Australia 0.001497 0.002994 0.000000 0.006737 \n",
"Poland 0.000000 0.001068 0.001068 0.003205 \n",
"Serbia 0.000581 0.006973 0.000581 0.003486 \n",
"Greece 0.000000 0.007233 0.000000 0.009042 \n",
"\n",
" Objective C Social Networking Software Testing \\\n",
"country \n",
"Global demand 0.002691 0.002673 0.002654 \n",
"India 0.002992 0.000502 0.002218 \n",
"Romania 0.000000 0.001090 0.001635 \n",
"Pakistan 0.001004 0.000561 0.001123 \n",
"United Kingdom 0.000000 0.000000 0.002604 \n",
"United Arab Emirates 0.003478 0.000000 0.000000 \n",
"Australia 0.000749 0.000749 0.000000 \n",
"Poland 0.000000 0.000000 0.001068 \n",
"Serbia 0.000000 0.001162 0.000581 \n",
"Greece 0.000000 0.001808 0.001808 \n",
"\n",
" Distance \n",
"country \n",
"Global demand 0.000000 \n",
"India 0.173795 \n",
"Romania 0.212352 \n",
"Pakistan 0.212856 \n",
"United Kingdom 0.249822 \n",
"United Arab Emirates 0.262151 \n",
"Australia 0.264786 \n",
"Poland 0.265190 \n",
"Serbia 0.268509 \n",
"Greece 0.274675 \n",
"\n",
"[10 rows x 101 columns]"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_jsd.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"jsdGlobalDemand = data_jsd.loc['Global demand'].to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n",
"0.2860571190815358\n",
"0.3246167845329701\n",
"0.3253215748458352\n",
"0.3607408860684481\n",
"0.3716516635688743\n",
"0.37481113322059917\n",
"0.3741457529980596\n",
"0.37763046558805335\n",
"0.3827520047487007\n",
"0.3841925704525008\n",
"0.38613200649425383\n",
"0.39171328706584846\n",
"0.39148205371771955\n",
"0.3917431163930851\n",
"0.39178346294554034\n",
"0.3925646310317158\n",
"0.39654750338672623\n",
"0.3975308548149135\n",
"0.39939654609137576\n",
"0.4003805394025181\n",
"0.4023853627795285\n",
"0.40302258344855135\n",
"0.4082472619921704\n",
"0.4075682636862357\n",
"0.4110313897503889\n",
"0.41691650996638024\n",
"0.41842780367747456\n",
"0.4174225730360051\n",
"0.41941463881362845\n",
"0.4213495375328997\n",
"0.4214496095583851\n",
"0.42188783431432053\n",
"0.4254923515138011\n",
"0.42413795322031494\n",
"0.42391977781606754\n",
"0.4265128640255013\n",
"0.4277254085485339\n",
"0.4317260195779134\n",
"0.43395225276633775\n",
"0.43436433969098454\n",
"0.43800000183787957\n",
"0.44089549335981415\n",
"0.44287304461186705\n",
"0.44340944612319677\n",
"0.44533017187289753\n",
"0.4487004735798567\n",
"0.44770491953420666\n",
"0.44834474323139584\n",
"0.45568652145405314\n",
"0.4542325725390619\n",
"0.4574408566374616\n",
"0.46089851829561646\n",
"0.46210139600478595\n",
"0.46473466004875785\n",
"0.4620871419876771\n",
"0.46457670791509453\n",
"0.46433163712422604\n",
"0.46889180045931483\n",
"0.465957053859102\n",
"0.46927139824044367\n",
"0.4715639466639418\n",
"0.4759959740268172\n",
"0.4755988862004194\n",
"0.47949995590696237\n",
"0.47980278996538894\n",
"0.4782509009352861\n",
"0.4830444190048917\n",
"0.4867645727279896\n",
"0.49015234515979994\n",
"0.48735136909135546\n",
"0.4899661524690199\n",
"0.49154962420182613\n",
"0.49461637926358537\n",
"0.5028420720155724\n",
"0.5017966397656894\n",
"0.5033988218919287\n",
"0.5039071326447193\n",
"0.5062660891895264\n",
"0.5110674689148276\n",
"0.5134538471703968\n",
"0.5194699286069008\n",
"0.521393730724888\n",
"0.5296501934620801\n",
"0.5255530912004522\n",
"0.5299544963542663\n",
"0.5357705148867817\n",
"0.5349289888142745\n",
"0.5442126396998821\n",
"0.5368796817977181\n",
"0.5382836832850767\n",
"0.5421161597637003\n",
"0.543996080215923\n",
"0.5454637942278867\n",
"0.5455291864649752\n",
"0.5525228820562225\n",
"0.5507960547478844\n",
"0.5531027449709519\n",
"0.5525109430519447\n",
"0.5518122388534176\n",
"0.5538333585683701\n",
"0.5609311944507676\n",
"0.5585401344427846\n",
"0.5595011229415492\n",
"0.5634627991440393\n",
"0.5633553192445999\n",
"0.5653091748905074\n",
"0.5770999906678096\n",
"0.5803872884952452\n",
"0.5913717133331765\n",
"0.5921664838355585\n",
"0.5966574186263832\n",
"0.5998987414070343\n",
"0.6015413542080362\n",
"0.600663295734713\n",
"0.6034598484140714\n",
"0.6062562271249721\n",
"0.6116007478946871\n",
"0.6198027820736538\n",
"0.6315010409318101\n",
"0.6345118041220162\n",
"0.6366831814436417\n",
"0.6481555535015878\n",
"0.6537175402172252\n",
"0.653677687602692\n",
"0.6557213787871455\n",
"0.6590289055341005\n",
"0.6621329740346206\n",
"0.6666141121412799\n",
"0.6681284341136481\n",
"0.6714780068290991\n",
"0.6743578889948241\n",
"0.6887131935189001\n",
"0.6955589143310991\n",
"0.6963394054396262\n",
"0.7033904598933083\n",
"0.7075455620214487\n",
"0.7118255272706894\n",
"0.7198060313658307\n",
"0.7217366563966879\n",
"0.7237168598011875\n",
"0.7297317460615703\n",
"0.7310461474991554\n",
"0.7310461474991554\n",
"0.7325977917723518\n",
"0.7322061687308511\n",
"0.735189079551855\n",
"0.7418350837457091\n",
"0.7550026196170765\n",
"0.7725223783860382\n",
"0.7757281717725366\n",
"0.7760811099027868\n",
"0.7781693245155246\n",
"0.7823750155905416\n",
"0.7849949157855389\n",
"0.7877688539107567\n",
"0.7899738003753038\n",
"0.7936250980710134\n",
"0.7958704681738475\n",
"0.8028712694144905\n",
"0.8047965596911127\n",
"0.8066551833456976\n",
"0.8221737373896092\n"
]
}
],
"source": [
"for index, row in data_jsd.iterrows():\n",
" print(distance.jensenshannon(jsdGlobalDemand, row.to_numpy()))"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 True\n",
"2 True\n",
"3 True\n",
"4 True\n",
" ... \n",
"102817 False\n",
"102818 False\n",
"102819 False\n",
"102820 False\n",
"102821 False\n",
"Name: super_star, Length: 102822, dtype: object"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['super_star']"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>super_star</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>country</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan</th>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania</th>\n",
" <td>60</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria</th>\n",
" <td>239</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam</th>\n",
" <td>9725</td>\n",
" <td>46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Virgin Islands, U.S.</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Yemen</th>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia</th>\n",
" <td>13</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe</th>\n",
" <td>23</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>162 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
"super_star False True\n",
"country \n",
"Afghanistan 13 0\n",
"Albania 60 6\n",
"Algeria 239 17\n",
"Andorra 1 0\n",
"Angola 4 0\n",
"... ... ...\n",
"Vietnam 9725 46\n",
"Virgin Islands, U.S. 2 0\n",
"Yemen 6 2\n",
"Zambia 13 1\n",
"Zimbabwe 23 2\n",
"\n",
"[162 rows x 2 columns]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.groupby(['country', 'super_star']).size().unstack(fill_value=0)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# data.groupby(['country', 'super_star']).size().unstack(fill_value=0).reset_index().to_excel('super_star.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"super_star = data[data['super_star'] == 1]"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3956, 34)"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"super_star.shape"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"country main category \n",
"Albania Design, Media & Architecture 1\n",
" Engineering & Science 2\n",
" Websites, IT & Software 3\n",
"Algeria Design, Media & Architecture 10\n",
" Engineering & Science 1\n",
" ..\n",
"Yemen Translation & Languages 1\n",
" Websites, IT & Software 1\n",
"Zambia Design, Media & Architecture 1\n",
"Zimbabwe Engineering & Science 1\n",
" Websites, IT & Software 1\n",
"Length: 409, dtype: int64"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"super_star.groupby(['country', 'main category']).size()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# super_star.groupby(['country', 'main category']).size().unstack(fill_value=0).reset_index().to_excel('super_star main_category.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"super_star_categories = super_star[['categories','country']]"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>categories</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[Websites, IT &amp; Software, Websites, IT &amp; Softw...</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[Mobile Phones &amp; Computing, Mobile Phones &amp; Co...</td>\n",
" <td>Spain</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[Websites, IT &amp; Software, Websites, IT &amp; Softw...</td>\n",
" <td>India</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[Design, Media &amp; Architecture , Design, Media ...</td>\n",
" <td>Portugal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[Mobile Phones &amp; Computing, Mobile Phones &amp; Co...</td>\n",
" <td>Pakistan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95569</th>\n",
" <td>[Mobile Phones &amp; Computing, Design, Media &amp; Ar...</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95619</th>\n",
" <td>[Design, Media &amp; Architecture , Design, Media ...</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95633</th>\n",
" <td>[Business, Accounting, Human Resources &amp; Legal...</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95641</th>\n",
" <td>[Writing &amp; Content, Writing &amp; Content, Sales &amp;...</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97072</th>\n",
" <td>[Design, Media &amp; Architecture , Websites, IT &amp;...</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3956 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" categories country\n",
"0 [Websites, IT & Software, Websites, IT & Softw... United States\n",
"1 [Mobile Phones & Computing, Mobile Phones & Co... Spain\n",
"2 [Websites, IT & Software, Websites, IT & Softw... India\n",
"3 [Design, Media & Architecture , Design, Media ... Portugal\n",
"4 [Mobile Phones & Computing, Mobile Phones & Co... Pakistan\n",
"... ... ...\n",
"95569 [Mobile Phones & Computing, Design, Media & Ar... Canada\n",
"95619 [Design, Media & Architecture , Design, Media ... Canada\n",
"95633 [Business, Accounting, Human Resources & Legal... Canada\n",
"95641 [Writing & Content, Writing & Content, Sales &... Canada\n",
"97072 [Design, Media & Architecture , Websites, IT &... Canada\n",
"\n",
"[3956 rows x 2 columns]"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"super_star_categories"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"super_star_categories = super_star_categories.apply(pd.Series.explode).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>categories</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19609</th>\n",
" <td>Design, Media &amp; Architecture</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19610</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19611</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19612</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19613</th>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>Canada</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19614 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" categories country\n",
"0 Websites, IT & Software United States\n",
"1 Websites, IT & Software United States\n",
"2 Websites, IT & Software United States\n",
"3 Websites, IT & Software United States\n",
"4 Websites, IT & Software United States\n",
"... ... ...\n",
"19609 Design, Media & Architecture Canada\n",
"19610 Websites, IT & Software Canada\n",
"19611 Websites, IT & Software Canada\n",
"19612 Websites, IT & Software Canada\n",
"19613 Websites, IT & Software Canada\n",
"\n",
"[19614 rows x 2 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"super_star_categories"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"country categories \n",
"Albania Design, Media & Architecture 4\n",
" Engineering & Science 11\n",
" Jobs for Anyone 1\n",
" Websites, IT & Software 14\n",
"Algeria Data Entry & Admin 1\n",
" ..\n",
"Yemen Writing & Content 2\n",
"Zambia Design, Media & Architecture 4\n",
"Zimbabwe Data Entry & Admin 1\n",
" Engineering & Science 5\n",
" Websites, IT & Software 4\n",
"Length: 661, dtype: int64"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"super_star_categories.groupby(['country', 'categories']).size()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"# super_star_categories.groupby(['country', 'categories']).size().unstack(fill_value=0).reset_index().to_excel('super_star categories.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## reviews"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"def f_review_range(overall):\n",
" if(overall >= 1 and overall < 2):\n",
" return '1-2'\n",
" elif(overall >= 2 and overall < 3):\n",
" return '2-3'\n",
" elif(overall >= 3 and overall < 4):\n",
" return '3-4'\n",
" elif(overall >= 4 and overall < 5):\n",
" return '4-5'\n",
" elif(overall >= 5):\n",
" return '5'\n",
" else:\n",
" return '0'\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"data['review_range'] = data['overall'].apply(f_review_range)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# data.groupby(['country', 'review_range']).size().unstack(fill_value=0).reset_index().to_excel('overall.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### health"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"health = data"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"health = health.apply(pd.Series.explode).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>username</th>\n",
" <th>registration_date</th>\n",
" <th>all</th>\n",
" <th>complete</th>\n",
" <th>incomplete</th>\n",
" <th>public_name</th>\n",
" <th>tagline</th>\n",
" <th>profile_description</th>\n",
" <th>overall</th>\n",
" <th>...</th>\n",
" <th>education_duration_by_year</th>\n",
" <th>skills</th>\n",
" <th>categories</th>\n",
" <th>main category</th>\n",
" <th>super_star</th>\n",
" <th>overall_greater_than_0</th>\n",
" <th>convert_currency_to_usd</th>\n",
" <th>countDifferentSkills</th>\n",
" <th>countDifferentCategories</th>\n",
" <th>review_range</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>88469</th>\n",
" <td>11858472</td>\n",
" <td>tulasikrishna</td>\n",
" <td>8/2/2014 1:33</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>Tulasi Krishna S.</td>\n",
" <td>Certified Fitness Trainer, Nutritional Therapist</td>\n",
" <td>This is Tulasi Krishna.\\n\\nI am a Certified Fi...</td>\n",
" <td>4.950448</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Data Entry &amp; Admin</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>0.039583</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>4-5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150497</th>\n",
" <td>49669059</td>\n",
" <td>diegovergara549</td>\n",
" <td>9/8/2020 14:32</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Diego V.</td>\n",
" <td>Profesor de Educación Física.</td>\n",
" <td>Licenciado en Educación en la Universidad Cató...</td>\n",
" <td>5.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Sales &amp; Marketing</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>10.000000</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>168291</th>\n",
" <td>49354667</td>\n",
" <td>katarinap2002</td>\n",
" <td>8/26/2020 2:37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Katarina P.</td>\n",
" <td>Reliable web designer for all types of websites!</td>\n",
" <td>Hello everyone!\\n\\nMy name is Katarina and I a...</td>\n",
" <td>5.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Translation &amp; Languages</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>10.000000</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>175020</th>\n",
" <td>51528131</td>\n",
" <td>DJLPO</td>\n",
" <td>12/6/2020 19:49</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Damien P.</td>\n",
" <td>Bilingual (French / English) Business Law Jurist</td>\n",
" <td>Hi, \\n\\nI am Damien, I am Native French. I hav...</td>\n",
" <td>5.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Writing &amp; Content</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>15.541400</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>208918</th>\n",
" <td>55757071</td>\n",
" <td>Lestatst</td>\n",
" <td>6/14/2021 22:32</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>Lestat S.</td>\n",
" <td>Reliable and Informative.</td>\n",
" <td>I have I BEd in Foundation phase Education and...</td>\n",
" <td>5.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Engineering &amp; Science</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>10.000000</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>482910</th>\n",
" <td>58998272</td>\n",
" <td>zadroznyjordan</td>\n",
" <td>11/9/2021 13:00</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Jordan Z.</td>\n",
" <td>Cover Letters</td>\n",
" <td>Hello,\\nMy name is Jordan Zadrozny and I am a ...</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Sales &amp; Marketing</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>19.971575</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>484549</th>\n",
" <td>57437475</td>\n",
" <td>kfirth98</td>\n",
" <td>8/25/2021 0:23</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Kiana F.</td>\n",
" <td>I am a designer</td>\n",
" <td>I have a strong background in graphic design, ...</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Design, Media &amp; Architecture</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>15.977260</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>485553</th>\n",
" <td>58522528</td>\n",
" <td>jbgochico</td>\n",
" <td>10/13/2021 2:22</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>John Benhar G.</td>\n",
" <td>I'm Unemployed for now, while waiting my papers.</td>\n",
" <td>I graduated as a Bachelors Science in informat...</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Writing &amp; Content</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>7.988630</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>487037</th>\n",
" <td>51039999</td>\n",
" <td>SteveChuFitness</td>\n",
" <td>11/11/2020 21:21</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Steve C.</td>\n",
" <td>Filmmaker and Body Builder</td>\n",
" <td>My top skills:\\n- Creating cinematic and comme...</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Jobs for Anyone</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>19.971575</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>487507</th>\n",
" <td>59211845</td>\n",
" <td>NotoriousNitram</td>\n",
" <td>11/22/2021 8:30</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Abigail M.</td>\n",
" <td>Anything goes!</td>\n",
" <td>I am a musician, yarn artist, and can do much ...</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Health &amp; Medicine</td>\n",
" <td>Jobs for Anyone</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>11.982945</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>111 rows × 35 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id username registration_date all complete \\\n",
"88469 11858472 tulasikrishna 8/2/2014 1:33 8 8 \n",
"150497 49669059 diegovergara549 9/8/2020 14:32 1 1 \n",
"168291 49354667 katarinap2002 8/26/2020 2:37 1 1 \n",
"175020 51528131 DJLPO 12/6/2020 19:49 1 1 \n",
"208918 55757071 Lestatst 6/14/2021 22:32 2 2 \n",
"... ... ... ... ... ... \n",
"482910 58998272 zadroznyjordan 11/9/2021 13:00 0 0 \n",
"484549 57437475 kfirth98 8/25/2021 0:23 0 0 \n",
"485553 58522528 jbgochico 10/13/2021 2:22 0 0 \n",
"487037 51039999 SteveChuFitness 11/11/2020 21:21 0 0 \n",
"487507 59211845 NotoriousNitram 11/22/2021 8:30 0 0 \n",
"\n",
" incomplete public_name \\\n",
"88469 0 Tulasi Krishna S. \n",
"150497 0 Diego V. \n",
"168291 0 Katarina P. \n",
"175020 0 Damien P. \n",
"208918 0 Lestat S. \n",
"... ... ... \n",
"482910 0 Jordan Z. \n",
"484549 0 Kiana F. \n",
"485553 0 John Benhar G. \n",
"487037 0 Steve C. \n",
"487507 0 Abigail M. \n",
"\n",
" tagline \\\n",
"88469 Certified Fitness Trainer, Nutritional Therapist \n",
"150497 Profesor de Educación Física. \n",
"168291 Reliable web designer for all types of websites! \n",
"175020 Bilingual (French / English) Business Law Jurist \n",
"208918 Reliable and Informative. \n",
"... ... \n",
"482910 Cover Letters \n",
"484549 I am a designer \n",
"485553 I'm Unemployed for now, while waiting my papers. \n",
"487037 Filmmaker and Body Builder \n",
"487507 Anything goes! \n",
"\n",
" profile_description overall ... \\\n",
"88469 This is Tulasi Krishna.\\n\\nI am a Certified Fi... 4.950448 ... \n",
"150497 Licenciado en Educación en la Universidad Cató... 5.000000 ... \n",
"168291 Hello everyone!\\n\\nMy name is Katarina and I a... 5.000000 ... \n",
"175020 Hi, \\n\\nI am Damien, I am Native French. I hav... 5.000000 ... \n",
"208918 I have I BEd in Foundation phase Education and... 5.000000 ... \n",
"... ... ... ... \n",
"482910 Hello,\\nMy name is Jordan Zadrozny and I am a ... 0.000000 ... \n",
"484549 I have a strong background in graphic design, ... 0.000000 ... \n",
"485553 I graduated as a Bachelors Science in informat... 0.000000 ... \n",
"487037 My top skills:\\n- Creating cinematic and comme... 0.000000 ... \n",
"487507 I am a musician, yarn artist, and can do much ... 0.000000 ... \n",
"\n",
" education_duration_by_year skills categories \\\n",
"88469 NaN NaN Health & Medicine \n",
"150497 NaN NaN Health & Medicine \n",
"168291 NaN NaN Health & Medicine \n",
"175020 NaN NaN Health & Medicine \n",
"208918 NaN NaN Health & Medicine \n",
"... ... ... ... \n",
"482910 NaN NaN Health & Medicine \n",
"484549 NaN NaN Health & Medicine \n",
"485553 NaN NaN Health & Medicine \n",
"487037 NaN NaN Health & Medicine \n",
"487507 NaN NaN Health & Medicine \n",
"\n",
" main category super_star overall_greater_than_0 \\\n",
"88469 Data Entry & Admin False True \n",
"150497 Sales & Marketing False True \n",
"168291 Translation & Languages False True \n",
"175020 Writing & Content False True \n",
"208918 Engineering & Science False True \n",
"... ... ... ... \n",
"482910 Sales & Marketing False False \n",
"484549 Design, Media & Architecture False False \n",
"485553 Writing & Content False False \n",
"487037 Jobs for Anyone False False \n",
"487507 Jobs for Anyone False False \n",
"\n",
" convert_currency_to_usd countDifferentSkills countDifferentCategories \\\n",
"88469 0.039583 5 4 \n",
"150497 10.000000 5 4 \n",
"168291 10.000000 5 4 \n",
"175020 15.541400 5 4 \n",
"208918 10.000000 5 5 \n",
"... ... ... ... \n",
"482910 19.971575 4 4 \n",
"484549 15.977260 5 4 \n",
"485553 7.988630 5 3 \n",
"487037 19.971575 2 2 \n",
"487507 11.982945 5 4 \n",
"\n",
" review_range \n",
"88469 4-5 \n",
"150497 5 \n",
"168291 5 \n",
"175020 5 \n",
"208918 5 \n",
"... ... \n",
"482910 0 \n",
"484549 0 \n",
"485553 0 \n",
"487037 0 \n",
"487507 0 \n",
"\n",
"[111 rows x 35 columns]"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"health[health['categories'] == 'Health & Medicine']"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"# health[health['categories'] == 'Health & Medicine'].reset_index().to_excel('Health.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### telecom"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"telecom = data"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"telecom = telecom.apply(pd.Series.explode).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>username</th>\n",
" <th>registration_date</th>\n",
" <th>all</th>\n",
" <th>complete</th>\n",
" <th>incomplete</th>\n",
" <th>public_name</th>\n",
" <th>tagline</th>\n",
" <th>profile_description</th>\n",
" <th>overall</th>\n",
" <th>...</th>\n",
" <th>education_duration_by_year</th>\n",
" <th>skills</th>\n",
" <th>categories</th>\n",
" <th>main category</th>\n",
" <th>super_star</th>\n",
" <th>overall_greater_than_0</th>\n",
" <th>convert_currency_to_usd</th>\n",
" <th>countDifferentSkills</th>\n",
" <th>countDifferentCategories</th>\n",
" <th>review_range</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>87340</th>\n",
" <td>52138794</td>\n",
" <td>atrivedi12</td>\n",
" <td>1/5/2021 23:45</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Abhishek T.</td>\n",
" <td>Seasoned IT Network infrastructure Professional</td>\n",
" <td>WiFi ✔️Security✔️NAC✔️LAN✔️ Hotspot✔️\\nI have ...</td>\n",
" <td>5.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Telecommunications</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0.659721</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87341</th>\n",
" <td>52138794</td>\n",
" <td>atrivedi12</td>\n",
" <td>1/5/2021 23:45</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Abhishek T.</td>\n",
" <td>Seasoned IT Network infrastructure Professional</td>\n",
" <td>WiFi ✔️Security✔️NAC✔️LAN✔️ Hotspot✔️\\nI have ...</td>\n",
" <td>5.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Telecommunications</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0.659721</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99511</th>\n",
" <td>56392987</td>\n",
" <td>stiaanx</td>\n",
" <td>7/11/2021 15:39</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Christiaan Johan L.</td>\n",
" <td>Reliable and dependent value adding guru.</td>\n",
" <td>Meticulous professional leader that is dead li...</td>\n",
" <td>5.0</td>\n",
" <td>...</td>\n",
" <td>7.0</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Translation &amp; Languages</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>10.000000</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110550</th>\n",
" <td>27587620</td>\n",
" <td>rashidhafiz96</td>\n",
" <td>12/8/2017 22:59</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Hafiz R.</td>\n",
" <td>Economics , statistics , business plan</td>\n",
" <td>I have expertise and skills in following filed...</td>\n",
" <td>5.0</td>\n",
" <td>...</td>\n",
" <td>4.0</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Business, Accounting, Human Resources &amp; Legal</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>10.000000</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111230</th>\n",
" <td>52303336</td>\n",
" <td>RubyKarthik</td>\n",
" <td>1/13/2021 1:06</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>Sankareswari R.</td>\n",
" <td>BSI Certified Lead Auditor | Six Sigma Green Belt</td>\n",
" <td>• BSI Certified Lead Auditor with 11+ years of...</td>\n",
" <td>5.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Business, Accounting, Human Resources &amp; Legal</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>0.197916</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>481159</th>\n",
" <td>58729922</td>\n",
" <td>Nazbanooo</td>\n",
" <td>10/24/2021 12:23</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Elnaz T.</td>\n",
" <td>Paralegal &amp; Team Lead Client Services &amp; QA</td>\n",
" <td>I am a very intelligent, fast paced, detail or...</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Education</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>15.977260</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>481229</th>\n",
" <td>59230602</td>\n",
" <td>singhkuljit7</td>\n",
" <td>11/23/2021 10:47</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Kuljit S.</td>\n",
" <td>data entry</td>\n",
" <td>Proficient in typing and transcription.\\nVerba...</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Data Entry &amp; Admin</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>15.178397</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>485315</th>\n",
" <td>59263128</td>\n",
" <td>trenyncejones14</td>\n",
" <td>11/25/2021 3:49</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Trenynce J.</td>\n",
" <td>Very helpful</td>\n",
" <td>Skills\\n• voice representation \\n• good Englis...</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Telecommunications</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>12.781808</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>487329</th>\n",
" <td>57924641</td>\n",
" <td>aaronkemp171</td>\n",
" <td>9/14/2021 20:46</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Aaron K.</td>\n",
" <td>C Programmer</td>\n",
" <td>C Programmer with five years of experience wri...</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>27.960204</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488182</th>\n",
" <td>56366616</td>\n",
" <td>Antonnette08</td>\n",
" <td>7/10/2021 10:04</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Ma. Antonnette F.</td>\n",
" <td>Dedicated To Help You</td>\n",
" <td>I am highly organized.\\nAn organized candidate...</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Telecommunications</td>\n",
" <td>Trades &amp; Services</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>11.982945</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 35 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id username registration_date all complete \\\n",
"87340 52138794 atrivedi12 1/5/2021 23:45 4 4 \n",
"87341 52138794 atrivedi12 1/5/2021 23:45 4 4 \n",
"99511 56392987 stiaanx 7/11/2021 15:39 4 4 \n",
"110550 27587620 rashidhafiz96 12/8/2017 22:59 3 3 \n",
"111230 52303336 RubyKarthik 1/13/2021 1:06 2 2 \n",
"... ... ... ... ... ... \n",
"481159 58729922 Nazbanooo 10/24/2021 12:23 0 0 \n",
"481229 59230602 singhkuljit7 11/23/2021 10:47 0 0 \n",
"485315 59263128 trenyncejones14 11/25/2021 3:49 0 0 \n",
"487329 57924641 aaronkemp171 9/14/2021 20:46 0 0 \n",
"488182 56366616 Antonnette08 7/10/2021 10:04 0 0 \n",
"\n",
" incomplete public_name \\\n",
"87340 0 Abhishek T. \n",
"87341 0 Abhishek T. \n",
"99511 0 Christiaan Johan L. \n",
"110550 0 Hafiz R. \n",
"111230 0 Sankareswari R. \n",
"... ... ... \n",
"481159 0 Elnaz T. \n",
"481229 0 Kuljit S. \n",
"485315 0 Trenynce J. \n",
"487329 0 Aaron K. \n",
"488182 0 Ma. Antonnette F. \n",
"\n",
" tagline \\\n",
"87340 Seasoned IT Network infrastructure Professional \n",
"87341 Seasoned IT Network infrastructure Professional \n",
"99511 Reliable and dependent value adding guru. \n",
"110550 Economics , statistics , business plan \n",
"111230 BSI Certified Lead Auditor | Six Sigma Green Belt \n",
"... ... \n",
"481159 Paralegal & Team Lead Client Services & QA \n",
"481229 data entry \n",
"485315 Very helpful \n",
"487329 C Programmer \n",
"488182 Dedicated To Help You \n",
"\n",
" profile_description overall ... \\\n",
"87340 WiFi ✔️Security✔️NAC✔️LAN✔️ Hotspot✔️\\nI have ... 5.0 ... \n",
"87341 WiFi ✔️Security✔️NAC✔️LAN✔️ Hotspot✔️\\nI have ... 5.0 ... \n",
"99511 Meticulous professional leader that is dead li... 5.0 ... \n",
"110550 I have expertise and skills in following filed... 5.0 ... \n",
"111230 • BSI Certified Lead Auditor with 11+ years of... 5.0 ... \n",
"... ... ... ... \n",
"481159 I am a very intelligent, fast paced, detail or... 0.0 ... \n",
"481229 Proficient in typing and transcription.\\nVerba... 0.0 ... \n",
"485315 Skills\\n• voice representation \\n• good Englis... 0.0 ... \n",
"487329 C Programmer with five years of experience wri... 0.0 ... \n",
"488182 I am highly organized.\\nAn organized candidate... 0.0 ... \n",
"\n",
" education_duration_by_year skills categories \\\n",
"87340 NaN NaN Telecommunications \n",
"87341 NaN NaN Telecommunications \n",
"99511 7.0 NaN Telecommunications \n",
"110550 4.0 NaN Telecommunications \n",
"111230 NaN NaN Telecommunications \n",
"... ... ... ... \n",
"481159 5.0 NaN Telecommunications \n",
"481229 NaN NaN Telecommunications \n",
"485315 NaN NaN Telecommunications \n",
"487329 NaN NaN Telecommunications \n",
"488182 NaN NaN Telecommunications \n",
"\n",
" main category super_star \\\n",
"87340 Telecommunications True \n",
"87341 Telecommunications True \n",
"99511 Translation & Languages False \n",
"110550 Business, Accounting, Human Resources & Legal False \n",
"111230 Business, Accounting, Human Resources & Legal False \n",
"... ... ... \n",
"481159 Education False \n",
"481229 Data Entry & Admin False \n",
"485315 Telecommunications False \n",
"487329 Websites, IT & Software False \n",
"488182 Trades & Services False \n",
"\n",
" overall_greater_than_0 convert_currency_to_usd countDifferentSkills \\\n",
"87340 True 0.659721 5 \n",
"87341 True 0.659721 5 \n",
"99511 True 10.000000 5 \n",
"110550 True 10.000000 5 \n",
"111230 True 0.197916 5 \n",
"... ... ... ... \n",
"481159 False 15.977260 5 \n",
"481229 False 15.178397 5 \n",
"485315 False 12.781808 2 \n",
"487329 False 27.960204 5 \n",
"488182 False 11.982945 5 \n",
"\n",
" countDifferentCategories review_range \n",
"87340 3 5 \n",
"87341 3 5 \n",
"99511 3 5 \n",
"110550 3 5 \n",
"111230 3 5 \n",
"... ... ... \n",
"481159 3 0 \n",
"481229 4 0 \n",
"485315 2 0 \n",
"487329 2 0 \n",
"488182 4 0 \n",
"\n",
"[100 rows x 35 columns]"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"telecom[telecom['categories'] == 'Telecommunications']"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"# telecom[telecom['categories'] == 'Telecommunications'].reset_index().to_excel('Telecommunications.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>top_skills</th>\n",
" <th>main category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PHP</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>JavaScript</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MySQL</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>HTML5</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Database Development</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488624</th>\n",
" <td>Chinese Tutoring</td>\n",
" <td>Design, Media &amp; Architecture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488625</th>\n",
" <td>Apache Maven</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488626</th>\n",
" <td>Selenium</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488627</th>\n",
" <td>Selenium Webdriver</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488628</th>\n",
" <td>Testing / QA</td>\n",
" <td>Websites, IT &amp; Software</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>488629 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" top_skills main category\n",
"0 PHP Websites, IT & Software\n",
"1 JavaScript Websites, IT & Software\n",
"2 MySQL Websites, IT & Software\n",
"3 HTML5 Websites, IT & Software\n",
"4 Database Development Websites, IT & Software\n",
"... ... ...\n",
"488624 Chinese Tutoring Design, Media & Architecture \n",
"488625 Apache Maven Websites, IT & Software\n",
"488626 Selenium Websites, IT & Software\n",
"488627 Selenium Webdriver Websites, IT & Software\n",
"488628 Testing / QA Websites, IT & Software\n",
"\n",
"[488629 rows x 2 columns]"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"skills"
]
}
],
"metadata": {
"interpreter": {
"hash": "369f2c481f4da34e4445cda3fffd2e751bd1c4d706f27375911949ba6bb62e1c"
},
"kernelspec": {
"display_name": "Python 3.10.0 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment