chendaniely · February 4, 2018 18:43
diff --git a/pandas_cols_multi_variable.ipynb b/pandas_cols_multi_variable.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": "true"
   },
   "source": [
    "# Table of Contents\n",
    " <p><div class=\"lev1 toc-item\"><a href=\"#Concat-with-expand\" data-toc-modified-id=\"Concat-with-expand-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Concat with expand</a></div><div class=\"lev1 toc-item\"><a href=\"#Zip\" data-toc-modified-id=\"Zip-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Zip</a></div><div class=\"lev1 toc-item\"><a href=\"#differing-num-splits\" data-toc-modified-id=\"differing-num-splits-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>differing num splits</a></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Different ways of dealing with a column that contains multiple bits of information."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.085179Z",
     "start_time": "2018-02-04T18:42:12.569897Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.093513Z",
     "start_time": "2018-02-04T18:42:13.087487Z"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\n",
    "    'name': ['Daniel', 'Julia', 'Eric'],\n",
    "    'tags': ['male,student', 'female,nurse', 'male,accountant']\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.139725Z",
     "start_time": "2018-02-04T18:42:13.096442Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Daniel</td>\n",
       "      <td>male,student</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Julia</td>\n",
       "      <td>female,nurse</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Eric</td>\n",
       "      <td>male,accountant</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name             tags\n",
       "0  Daniel     male,student\n",
       "1   Julia     female,nurse\n",
       "2    Eric  male,accountant"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Concat with expand"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.149011Z",
     "start_time": "2018-02-04T18:42:13.143132Z"
    }
   },
   "outputs": [],
   "source": [
    "col_split_expand = df['tags'].str.split(',', expand=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.158738Z",
     "start_time": "2018-02-04T18:42:13.153747Z"
    }
   },
   "outputs": [],
   "source": [
    "df_concat = pd.concat([df, col_split_expand], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.179610Z",
     "start_time": "2018-02-04T18:42:13.162810Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>tags</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Daniel</td>\n",
       "      <td>male,student</td>\n",
       "      <td>male</td>\n",
       "      <td>student</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Julia</td>\n",
       "      <td>female,nurse</td>\n",
       "      <td>female</td>\n",
       "      <td>nurse</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Eric</td>\n",
       "      <td>male,accountant</td>\n",
       "      <td>male</td>\n",
       "      <td>accountant</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name             tags       0           1\n",
       "0  Daniel     male,student    male     student\n",
       "1   Julia     female,nurse  female       nurse\n",
       "2    Eric  male,accountant    male  accountant"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_concat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:28:21.347512Z",
     "start_time": "2018-02-04T18:28:21.341501Z"
    }
   },
   "source": [
    "# Zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.190003Z",
     "start_time": "2018-02-04T18:42:13.183231Z"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\n",
    "    'name': ['Daniel', 'Julia', 'Eric'],\n",
    "    'tags': ['male,student', 'female,nurse', 'male,accountant']\n",
    "    })\n",
    "col_split = df['tags'].str.split(',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.199644Z",
     "start_time": "2018-02-04T18:42:13.193101Z"
    }
   },
   "outputs": [],
   "source": [
    "df['sex'], df['occupation'] = zip(*col_split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.226734Z",
     "start_time": "2018-02-04T18:42:13.204293Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>tags</th>\n",
       "      <th>sex</th>\n",
       "      <th>occupation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Daniel</td>\n",
       "      <td>male,student</td>\n",
       "      <td>male</td>\n",
       "      <td>student</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Julia</td>\n",
       "      <td>female,nurse</td>\n",
       "      <td>female</td>\n",
       "      <td>nurse</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Eric</td>\n",
       "      <td>male,accountant</td>\n",
       "      <td>male</td>\n",
       "      <td>accountant</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name             tags     sex  occupation\n",
       "0  Daniel     male,student    male     student\n",
       "1   Julia     female,nurse  female       nurse\n",
       "2    Eric  male,accountant    male  accountant"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:34:03.826602Z",
     "start_time": "2018-02-04T18:34:03.823486Z"
    }
   },
   "source": [
    "# differing num splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.233696Z",
     "start_time": "2018-02-04T18:42:13.229294Z"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\n",
    "    'name': ['Daniel', 'Julia', 'Eric'],\n",
    "    'tags': ['male,student,reserach associate,data engineer', 'female,student,nurse', 'male,accountant']\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.247175Z",
     "start_time": "2018-02-04T18:42:13.237616Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Daniel</td>\n",
       "      <td>male,student,reserach associate,data engineer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Julia</td>\n",
       "      <td>female,student,nurse</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Eric</td>\n",
       "      <td>male,accountant</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name                                           tags\n",
       "0  Daniel  male,student,reserach associate,data engineer\n",
       "1   Julia                           female,student,nurse\n",
       "2    Eric                                male,accountant"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.264964Z",
     "start_time": "2018-02-04T18:42:13.250462Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>male</td>\n",
       "      <td>student</td>\n",
       "      <td>reserach associate</td>\n",
       "      <td>data engineer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>female</td>\n",
       "      <td>student</td>\n",
       "      <td>nurse</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>male</td>\n",
       "      <td>accountant</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        0           1                   2              3\n",
       "0    male     student  reserach associate  data engineer\n",
       "1  female     student               nurse           None\n",
       "2    male  accountant                None           None"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "split_expand = df['tags'].str.split(',', expand=True)\n",
    "split_expand"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.273385Z",
     "start_time": "2018-02-04T18:42:13.267263Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=4, step=1)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "split_expand.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.282327Z",
     "start_time": "2018-02-04T18:42:13.276386Z"
    }
   },
   "outputs": [],
   "source": [
    "split_expand.columns = ['tag_{}'.format(x) for x in range(len(split_expand.columns))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.290191Z",
     "start_time": "2018-02-04T18:42:13.285133Z"
    }
   },
   "outputs": [],
   "source": [
    "df_diff_concat = pd.concat([df, split_expand], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-04T18:42:13.309831Z",
     "start_time": "2018-02-04T18:42:13.294209Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>tags</th>\n",
       "      <th>tag_0</th>\n",
       "      <th>tag_1</th>\n",
       "      <th>tag_2</th>\n",
       "      <th>tag_3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Daniel</td>\n",
       "      <td>male,student,reserach associate,data engineer</td>\n",
       "      <td>male</td>\n",
       "      <td>student</td>\n",
       "      <td>reserach associate</td>\n",
       "      <td>data engineer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Julia</td>\n",
       "      <td>female,student,nurse</td>\n",
       "      <td>female</td>\n",
       "      <td>student</td>\n",
       "      <td>nurse</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Eric</td>\n",
       "      <td>male,accountant</td>\n",
       "      <td>male</td>\n",
       "      <td>accountant</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name                                           tags   tag_0       tag_1  \\\n",
       "0  Daniel  male,student,reserach associate,data engineer    male     student   \n",
       "1   Julia                           female,student,nurse  female     student   \n",
       "2    Eric                                male,accountant    male  accountant   \n",
       "\n",
       "                tag_2          tag_3  \n",
       "0  reserach associate  data engineer  \n",
       "1               nurse           None  \n",
       "2                None           None  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_diff_concat"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (root)",
   "language": "python",
   "name": "other-env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "66px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": true,
   "toc_section_display": "block",
   "toc_window_display": true,
   "widenNotebook": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"toc": "true"
	},
	"source": [
	"# Table of Contents\n",
	" <p><div class=\"lev1 toc-item\"><a href=\"#Concat-with-expand\" data-toc-modified-id=\"Concat-with-expand-1\"><span class=\"toc-item-num\">1  </span>Concat with expand</a></div><div class=\"lev1 toc-item\"><a href=\"#Zip\" data-toc-modified-id=\"Zip-2\"><span class=\"toc-item-num\">2  </span>Zip</a></div><div class=\"lev1 toc-item\"><a href=\"#differing-num-splits\" data-toc-modified-id=\"differing-num-splits-3\"><span class=\"toc-item-num\">3  </span>differing num splits</a></div>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Different ways of dealing with a column that contains multiple bits of information."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.085179Z",
	"start_time": "2018-02-04T18:42:12.569897Z"
	}
	},
	"outputs": [],
	"source": [
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.093513Z",
	"start_time": "2018-02-04T18:42:13.087487Z"
	}
	},
	"outputs": [],
	"source": [
	"df = pd.DataFrame({\n",
	" 'name': ['Daniel', 'Julia', 'Eric'],\n",
	" 'tags': ['male,student', 'female,nurse', 'male,accountant']\n",
	" })"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.139725Z",
	"start_time": "2018-02-04T18:42:13.096442Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>name</th>\n",
	" <th>tags</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>Daniel</td>\n",
	" <td>male,student</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Julia</td>\n",
	" <td>female,nurse</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>Eric</td>\n",
	" <td>male,accountant</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" name tags\n",
	"0 Daniel male,student\n",
	"1 Julia female,nurse\n",
	"2 Eric male,accountant"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Concat with expand"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.149011Z",
	"start_time": "2018-02-04T18:42:13.143132Z"
	}
	},
	"outputs": [],
	"source": [
	"col_split_expand = df['tags'].str.split(',', expand=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.158738Z",
	"start_time": "2018-02-04T18:42:13.153747Z"
	}
	},
	"outputs": [],
	"source": [
	"df_concat = pd.concat([df, col_split_expand], axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.179610Z",
	"start_time": "2018-02-04T18:42:13.162810Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>name</th>\n",
	" <th>tags</th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>Daniel</td>\n",
	" <td>male,student</td>\n",
	" <td>male</td>\n",
	" <td>student</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Julia</td>\n",
	" <td>female,nurse</td>\n",
	" <td>female</td>\n",
	" <td>nurse</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>Eric</td>\n",
	" <td>male,accountant</td>\n",
	" <td>male</td>\n",
	" <td>accountant</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" name tags 0 1\n",
	"0 Daniel male,student male student\n",
	"1 Julia female,nurse female nurse\n",
	"2 Eric male,accountant male accountant"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_concat"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:28:21.347512Z",
	"start_time": "2018-02-04T18:28:21.341501Z"
	}
	},
	"source": [
	"# Zip"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.190003Z",
	"start_time": "2018-02-04T18:42:13.183231Z"
	}
	},
	"outputs": [],
	"source": [
	"df = pd.DataFrame({\n",
	" 'name': ['Daniel', 'Julia', 'Eric'],\n",
	" 'tags': ['male,student', 'female,nurse', 'male,accountant']\n",
	" })\n",
	"col_split = df['tags'].str.split(',')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.199644Z",
	"start_time": "2018-02-04T18:42:13.193101Z"
	}
	},
	"outputs": [],
	"source": [
	"df['sex'], df['occupation'] = zip(*col_split)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.226734Z",
	"start_time": "2018-02-04T18:42:13.204293Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>name</th>\n",
	" <th>tags</th>\n",
	" <th>sex</th>\n",
	" <th>occupation</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>Daniel</td>\n",
	" <td>male,student</td>\n",
	" <td>male</td>\n",
	" <td>student</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Julia</td>\n",
	" <td>female,nurse</td>\n",
	" <td>female</td>\n",
	" <td>nurse</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>Eric</td>\n",
	" <td>male,accountant</td>\n",
	" <td>male</td>\n",
	" <td>accountant</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" name tags sex occupation\n",
	"0 Daniel male,student male student\n",
	"1 Julia female,nurse female nurse\n",
	"2 Eric male,accountant male accountant"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:34:03.826602Z",
	"start_time": "2018-02-04T18:34:03.823486Z"
	}
	},
	"source": [
	"# differing num splits"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.233696Z",
	"start_time": "2018-02-04T18:42:13.229294Z"
	}
	},
	"outputs": [],
	"source": [
	"df = pd.DataFrame({\n",
	" 'name': ['Daniel', 'Julia', 'Eric'],\n",
	" 'tags': ['male,student,reserach associate,data engineer', 'female,student,nurse', 'male,accountant']\n",
	" })"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.247175Z",
	"start_time": "2018-02-04T18:42:13.237616Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>name</th>\n",
	" <th>tags</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>Daniel</td>\n",
	" <td>male,student,reserach associate,data engineer</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Julia</td>\n",
	" <td>female,student,nurse</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>Eric</td>\n",
	" <td>male,accountant</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" name tags\n",
	"0 Daniel male,student,reserach associate,data engineer\n",
	"1 Julia female,student,nurse\n",
	"2 Eric male,accountant"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.264964Z",
	"start_time": "2018-02-04T18:42:13.250462Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" <th>3</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>male</td>\n",
	" <td>student</td>\n",
	" <td>reserach associate</td>\n",
	" <td>data engineer</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>female</td>\n",
	" <td>student</td>\n",
	" <td>nurse</td>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>male</td>\n",
	" <td>accountant</td>\n",
	" <td>None</td>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" 0 1 2 3\n",
	"0 male student reserach associate data engineer\n",
	"1 female student nurse None\n",
	"2 male accountant None None"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"split_expand = df['tags'].str.split(',', expand=True)\n",
	"split_expand"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.273385Z",
	"start_time": "2018-02-04T18:42:13.267263Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RangeIndex(start=0, stop=4, step=1)"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"split_expand.columns"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.282327Z",
	"start_time": "2018-02-04T18:42:13.276386Z"
	}
	},
	"outputs": [],
	"source": [
	"split_expand.columns = ['tag_{}'.format(x) for x in range(len(split_expand.columns))]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.290191Z",
	"start_time": "2018-02-04T18:42:13.285133Z"
	}
	},
	"outputs": [],
	"source": [
	"df_diff_concat = pd.concat([df, split_expand], axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-02-04T18:42:13.309831Z",
	"start_time": "2018-02-04T18:42:13.294209Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>name</th>\n",
	" <th>tags</th>\n",
	" <th>tag_0</th>\n",
	" <th>tag_1</th>\n",
	" <th>tag_2</th>\n",
	" <th>tag_3</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>Daniel</td>\n",
	" <td>male,student,reserach associate,data engineer</td>\n",
	" <td>male</td>\n",
	" <td>student</td>\n",
	" <td>reserach associate</td>\n",
	" <td>data engineer</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Julia</td>\n",
	" <td>female,student,nurse</td>\n",
	" <td>female</td>\n",
	" <td>student</td>\n",
	" <td>nurse</td>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>Eric</td>\n",
	" <td>male,accountant</td>\n",
	" <td>male</td>\n",
	" <td>accountant</td>\n",
	" <td>None</td>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" name tags tag_0 tag_1 \\\n",
	"0 Daniel male,student,reserach associate,data engineer male student \n",
	"1 Julia female,student,nurse female student \n",
	"2 Eric male,accountant male accountant \n",
	"\n",
	" tag_2 tag_3 \n",
	"0 reserach associate data engineer \n",
	"1 nurse None \n",
	"2 None None "
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_diff_concat"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python (root)",
	"language": "python",
	"name": "other-env"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.4"
	},
	"toc": {
	"colors": {
	"hover_highlight": "#DAA520",
	"navigate_num": "#000000",
	"navigate_text": "#333333",
	"running_highlight": "#FF0000",
	"selected_highlight": "#FFD700",
	"sidebar_border": "#EEEEEE",
	"wrapper_background": "#FFFFFF"
	},
	"moveMenuLeft": true,
	"nav_menu": {
	"height": "66px",
	"width": "252px"
	},
	"navigate_menu": true,
	"number_sections": true,
	"sideBar": true,
	"threshold": 4,
	"toc_cell": true,
	"toc_section_display": "block",
	"toc_window_display": true,
	"widenNotebook": false
	},
	"varInspector": {
	"cols": {
	"lenName": 16,
	"lenType": 16,
	"lenVar": 40
	},
	"kernels_config": {
	"python": {
	"delete_cmd_postfix": "",
	"delete_cmd_prefix": "del ",
	"library": "var_list.py",
	"varRefreshCmd": "print(var_dic_list())"
	},
	"r": {
	"delete_cmd_postfix": ") ",
	"delete_cmd_prefix": "rm(",
	"library": "var_list.r",
	"varRefreshCmd": "cat(var_dic_list()) "
	}
	},
	"types_to_exclude": [
	"module",
	"function",
	"builtin_function_or_method",
	"instance",
	"_Feature"
	],
	"window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}