Created
February 4, 2018 18:43
-
-
Save chendaniely/bf72ccc8d9974e6aa04e8584d075dca7 to your computer and use it in GitHub Desktop.
Columns containing multiple bits of information
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"toc": "true" | |
}, | |
"source": [ | |
"# Table of Contents\n", | |
" <p><div class=\"lev1 toc-item\"><a href=\"#Concat-with-expand\" data-toc-modified-id=\"Concat-with-expand-1\"><span class=\"toc-item-num\">1 </span>Concat with expand</a></div><div class=\"lev1 toc-item\"><a href=\"#Zip\" data-toc-modified-id=\"Zip-2\"><span class=\"toc-item-num\">2 </span>Zip</a></div><div class=\"lev1 toc-item\"><a href=\"#differing-num-splits\" data-toc-modified-id=\"differing-num-splits-3\"><span class=\"toc-item-num\">3 </span>differing num splits</a></div>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Different ways of dealing with a column that contains multiple bits of information." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.085179Z", | |
"start_time": "2018-02-04T18:42:12.569897Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.093513Z", | |
"start_time": "2018-02-04T18:42:13.087487Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame({\n", | |
" 'name': ['Daniel', 'Julia', 'Eric'],\n", | |
" 'tags': ['male,student', 'female,nurse', 'male,accountant']\n", | |
" })" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.139725Z", | |
"start_time": "2018-02-04T18:42:13.096442Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>tags</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Daniel</td>\n", | |
" <td>male,student</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Julia</td>\n", | |
" <td>female,nurse</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Eric</td>\n", | |
" <td>male,accountant</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name tags\n", | |
"0 Daniel male,student\n", | |
"1 Julia female,nurse\n", | |
"2 Eric male,accountant" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Concat with expand" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.149011Z", | |
"start_time": "2018-02-04T18:42:13.143132Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"col_split_expand = df['tags'].str.split(',', expand=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.158738Z", | |
"start_time": "2018-02-04T18:42:13.153747Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df_concat = pd.concat([df, col_split_expand], axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.179610Z", | |
"start_time": "2018-02-04T18:42:13.162810Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>tags</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Daniel</td>\n", | |
" <td>male,student</td>\n", | |
" <td>male</td>\n", | |
" <td>student</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Julia</td>\n", | |
" <td>female,nurse</td>\n", | |
" <td>female</td>\n", | |
" <td>nurse</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Eric</td>\n", | |
" <td>male,accountant</td>\n", | |
" <td>male</td>\n", | |
" <td>accountant</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name tags 0 1\n", | |
"0 Daniel male,student male student\n", | |
"1 Julia female,nurse female nurse\n", | |
"2 Eric male,accountant male accountant" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_concat" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:28:21.347512Z", | |
"start_time": "2018-02-04T18:28:21.341501Z" | |
} | |
}, | |
"source": [ | |
"# Zip" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.190003Z", | |
"start_time": "2018-02-04T18:42:13.183231Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame({\n", | |
" 'name': ['Daniel', 'Julia', 'Eric'],\n", | |
" 'tags': ['male,student', 'female,nurse', 'male,accountant']\n", | |
" })\n", | |
"col_split = df['tags'].str.split(',')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.199644Z", | |
"start_time": "2018-02-04T18:42:13.193101Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df['sex'], df['occupation'] = zip(*col_split)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.226734Z", | |
"start_time": "2018-02-04T18:42:13.204293Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>tags</th>\n", | |
" <th>sex</th>\n", | |
" <th>occupation</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Daniel</td>\n", | |
" <td>male,student</td>\n", | |
" <td>male</td>\n", | |
" <td>student</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Julia</td>\n", | |
" <td>female,nurse</td>\n", | |
" <td>female</td>\n", | |
" <td>nurse</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Eric</td>\n", | |
" <td>male,accountant</td>\n", | |
" <td>male</td>\n", | |
" <td>accountant</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name tags sex occupation\n", | |
"0 Daniel male,student male student\n", | |
"1 Julia female,nurse female nurse\n", | |
"2 Eric male,accountant male accountant" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:34:03.826602Z", | |
"start_time": "2018-02-04T18:34:03.823486Z" | |
} | |
}, | |
"source": [ | |
"# differing num splits" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.233696Z", | |
"start_time": "2018-02-04T18:42:13.229294Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame({\n", | |
" 'name': ['Daniel', 'Julia', 'Eric'],\n", | |
" 'tags': ['male,student,reserach associate,data engineer', 'female,student,nurse', 'male,accountant']\n", | |
" })" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.247175Z", | |
"start_time": "2018-02-04T18:42:13.237616Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>tags</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Daniel</td>\n", | |
" <td>male,student,reserach associate,data engineer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Julia</td>\n", | |
" <td>female,student,nurse</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Eric</td>\n", | |
" <td>male,accountant</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name tags\n", | |
"0 Daniel male,student,reserach associate,data engineer\n", | |
"1 Julia female,student,nurse\n", | |
"2 Eric male,accountant" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.264964Z", | |
"start_time": "2018-02-04T18:42:13.250462Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>male</td>\n", | |
" <td>student</td>\n", | |
" <td>reserach associate</td>\n", | |
" <td>data engineer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>female</td>\n", | |
" <td>student</td>\n", | |
" <td>nurse</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>male</td>\n", | |
" <td>accountant</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1 2 3\n", | |
"0 male student reserach associate data engineer\n", | |
"1 female student nurse None\n", | |
"2 male accountant None None" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"split_expand = df['tags'].str.split(',', expand=True)\n", | |
"split_expand" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.273385Z", | |
"start_time": "2018-02-04T18:42:13.267263Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RangeIndex(start=0, stop=4, step=1)" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"split_expand.columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.282327Z", | |
"start_time": "2018-02-04T18:42:13.276386Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"split_expand.columns = ['tag_{}'.format(x) for x in range(len(split_expand.columns))]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.290191Z", | |
"start_time": "2018-02-04T18:42:13.285133Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df_diff_concat = pd.concat([df, split_expand], axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-02-04T18:42:13.309831Z", | |
"start_time": "2018-02-04T18:42:13.294209Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>tags</th>\n", | |
" <th>tag_0</th>\n", | |
" <th>tag_1</th>\n", | |
" <th>tag_2</th>\n", | |
" <th>tag_3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Daniel</td>\n", | |
" <td>male,student,reserach associate,data engineer</td>\n", | |
" <td>male</td>\n", | |
" <td>student</td>\n", | |
" <td>reserach associate</td>\n", | |
" <td>data engineer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Julia</td>\n", | |
" <td>female,student,nurse</td>\n", | |
" <td>female</td>\n", | |
" <td>student</td>\n", | |
" <td>nurse</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Eric</td>\n", | |
" <td>male,accountant</td>\n", | |
" <td>male</td>\n", | |
" <td>accountant</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name tags tag_0 tag_1 \\\n", | |
"0 Daniel male,student,reserach associate,data engineer male student \n", | |
"1 Julia female,student,nurse female student \n", | |
"2 Eric male,accountant male accountant \n", | |
"\n", | |
" tag_2 tag_3 \n", | |
"0 reserach associate data engineer \n", | |
"1 nurse None \n", | |
"2 None None " | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_diff_concat" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python (root)", | |
"language": "python", | |
"name": "other-env" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
}, | |
"toc": { | |
"colors": { | |
"hover_highlight": "#DAA520", | |
"navigate_num": "#000000", | |
"navigate_text": "#333333", | |
"running_highlight": "#FF0000", | |
"selected_highlight": "#FFD700", | |
"sidebar_border": "#EEEEEE", | |
"wrapper_background": "#FFFFFF" | |
}, | |
"moveMenuLeft": true, | |
"nav_menu": { | |
"height": "66px", | |
"width": "252px" | |
}, | |
"navigate_menu": true, | |
"number_sections": true, | |
"sideBar": true, | |
"threshold": 4, | |
"toc_cell": true, | |
"toc_section_display": "block", | |
"toc_window_display": true, | |
"widenNotebook": false | |
}, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment