Skip to content

Instantly share code, notes, and snippets.

@chendaniely
Created February 4, 2018 18:43
Show Gist options
  • Save chendaniely/bf72ccc8d9974e6aa04e8584d075dca7 to your computer and use it in GitHub Desktop.
Save chendaniely/bf72ccc8d9974e6aa04e8584d075dca7 to your computer and use it in GitHub Desktop.
Columns containing multiple bits of information
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"toc": "true"
},
"source": [
"# Table of Contents\n",
" <p><div class=\"lev1 toc-item\"><a href=\"#Concat-with-expand\" data-toc-modified-id=\"Concat-with-expand-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Concat with expand</a></div><div class=\"lev1 toc-item\"><a href=\"#Zip\" data-toc-modified-id=\"Zip-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Zip</a></div><div class=\"lev1 toc-item\"><a href=\"#differing-num-splits\" data-toc-modified-id=\"differing-num-splits-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>differing num splits</a></div>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Different ways of dealing with a column that contains multiple bits of information."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.085179Z",
"start_time": "2018-02-04T18:42:12.569897Z"
}
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.093513Z",
"start_time": "2018-02-04T18:42:13.087487Z"
}
},
"outputs": [],
"source": [
"df = pd.DataFrame({\n",
" 'name': ['Daniel', 'Julia', 'Eric'],\n",
" 'tags': ['male,student', 'female,nurse', 'male,accountant']\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.139725Z",
"start_time": "2018-02-04T18:42:13.096442Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>tags</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Daniel</td>\n",
" <td>male,student</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Julia</td>\n",
" <td>female,nurse</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Eric</td>\n",
" <td>male,accountant</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name tags\n",
"0 Daniel male,student\n",
"1 Julia female,nurse\n",
"2 Eric male,accountant"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Concat with expand"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.149011Z",
"start_time": "2018-02-04T18:42:13.143132Z"
}
},
"outputs": [],
"source": [
"col_split_expand = df['tags'].str.split(',', expand=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.158738Z",
"start_time": "2018-02-04T18:42:13.153747Z"
}
},
"outputs": [],
"source": [
"df_concat = pd.concat([df, col_split_expand], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.179610Z",
"start_time": "2018-02-04T18:42:13.162810Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>tags</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Daniel</td>\n",
" <td>male,student</td>\n",
" <td>male</td>\n",
" <td>student</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Julia</td>\n",
" <td>female,nurse</td>\n",
" <td>female</td>\n",
" <td>nurse</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Eric</td>\n",
" <td>male,accountant</td>\n",
" <td>male</td>\n",
" <td>accountant</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name tags 0 1\n",
"0 Daniel male,student male student\n",
"1 Julia female,nurse female nurse\n",
"2 Eric male,accountant male accountant"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_concat"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:28:21.347512Z",
"start_time": "2018-02-04T18:28:21.341501Z"
}
},
"source": [
"# Zip"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.190003Z",
"start_time": "2018-02-04T18:42:13.183231Z"
}
},
"outputs": [],
"source": [
"df = pd.DataFrame({\n",
" 'name': ['Daniel', 'Julia', 'Eric'],\n",
" 'tags': ['male,student', 'female,nurse', 'male,accountant']\n",
" })\n",
"col_split = df['tags'].str.split(',')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.199644Z",
"start_time": "2018-02-04T18:42:13.193101Z"
}
},
"outputs": [],
"source": [
"df['sex'], df['occupation'] = zip(*col_split)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.226734Z",
"start_time": "2018-02-04T18:42:13.204293Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>tags</th>\n",
" <th>sex</th>\n",
" <th>occupation</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Daniel</td>\n",
" <td>male,student</td>\n",
" <td>male</td>\n",
" <td>student</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Julia</td>\n",
" <td>female,nurse</td>\n",
" <td>female</td>\n",
" <td>nurse</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Eric</td>\n",
" <td>male,accountant</td>\n",
" <td>male</td>\n",
" <td>accountant</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name tags sex occupation\n",
"0 Daniel male,student male student\n",
"1 Julia female,nurse female nurse\n",
"2 Eric male,accountant male accountant"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:34:03.826602Z",
"start_time": "2018-02-04T18:34:03.823486Z"
}
},
"source": [
"# differing num splits"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.233696Z",
"start_time": "2018-02-04T18:42:13.229294Z"
}
},
"outputs": [],
"source": [
"df = pd.DataFrame({\n",
" 'name': ['Daniel', 'Julia', 'Eric'],\n",
" 'tags': ['male,student,reserach associate,data engineer', 'female,student,nurse', 'male,accountant']\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.247175Z",
"start_time": "2018-02-04T18:42:13.237616Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>tags</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Daniel</td>\n",
" <td>male,student,reserach associate,data engineer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Julia</td>\n",
" <td>female,student,nurse</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Eric</td>\n",
" <td>male,accountant</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name tags\n",
"0 Daniel male,student,reserach associate,data engineer\n",
"1 Julia female,student,nurse\n",
"2 Eric male,accountant"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.264964Z",
"start_time": "2018-02-04T18:42:13.250462Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>male</td>\n",
" <td>student</td>\n",
" <td>reserach associate</td>\n",
" <td>data engineer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>female</td>\n",
" <td>student</td>\n",
" <td>nurse</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>male</td>\n",
" <td>accountant</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3\n",
"0 male student reserach associate data engineer\n",
"1 female student nurse None\n",
"2 male accountant None None"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"split_expand = df['tags'].str.split(',', expand=True)\n",
"split_expand"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.273385Z",
"start_time": "2018-02-04T18:42:13.267263Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"RangeIndex(start=0, stop=4, step=1)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"split_expand.columns"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.282327Z",
"start_time": "2018-02-04T18:42:13.276386Z"
}
},
"outputs": [],
"source": [
"split_expand.columns = ['tag_{}'.format(x) for x in range(len(split_expand.columns))]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.290191Z",
"start_time": "2018-02-04T18:42:13.285133Z"
}
},
"outputs": [],
"source": [
"df_diff_concat = pd.concat([df, split_expand], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2018-02-04T18:42:13.309831Z",
"start_time": "2018-02-04T18:42:13.294209Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>tags</th>\n",
" <th>tag_0</th>\n",
" <th>tag_1</th>\n",
" <th>tag_2</th>\n",
" <th>tag_3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Daniel</td>\n",
" <td>male,student,reserach associate,data engineer</td>\n",
" <td>male</td>\n",
" <td>student</td>\n",
" <td>reserach associate</td>\n",
" <td>data engineer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Julia</td>\n",
" <td>female,student,nurse</td>\n",
" <td>female</td>\n",
" <td>student</td>\n",
" <td>nurse</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Eric</td>\n",
" <td>male,accountant</td>\n",
" <td>male</td>\n",
" <td>accountant</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name tags tag_0 tag_1 \\\n",
"0 Daniel male,student,reserach associate,data engineer male student \n",
"1 Julia female,student,nurse female student \n",
"2 Eric male,accountant male accountant \n",
"\n",
" tag_2 tag_3 \n",
"0 reserach associate data engineer \n",
"1 nurse None \n",
"2 None None "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_diff_concat"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (root)",
"language": "python",
"name": "other-env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"toc": {
"colors": {
"hover_highlight": "#DAA520",
"navigate_num": "#000000",
"navigate_text": "#333333",
"running_highlight": "#FF0000",
"selected_highlight": "#FFD700",
"sidebar_border": "#EEEEEE",
"wrapper_background": "#FFFFFF"
},
"moveMenuLeft": true,
"nav_menu": {
"height": "66px",
"width": "252px"
},
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 4,
"toc_cell": true,
"toc_section_display": "block",
"toc_window_display": true,
"widenNotebook": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment