Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save e-roux/8b59bb6a04404e5903b0598ff25dda5d to your computer and use it in GitHub Desktop.
Save e-roux/8b59bb6a04404e5903b0598ff25dda5d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"button": false,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"### About dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"This dataset is about past loans. The __Loan_train.csv__ data set includes details of 346 customers whose loans are already paid off or defaulted. It includes the following fields:\n",
"\n",
"| Field | Description |\n",
"|----------------|---------------------------------------------------------------------------------------|\n",
"| Loan_status | Whether a loan is paid off or in collection |\n",
"| Principal | Basic principal loan amount at origination |\n",
"| Terms | Origination terms which can be weekly (7 days), biweekly, and monthly payoff schedule |\n",
"| Effective_date | When the loan was originated and took effect |\n",
"| Due_date | Since it’s a one-time payoff schedule, each loan has one single due date |\n",
"| Age | Age of applicant |\n",
"| Education | Education of applicant |\n",
"| Gender | The gender of applicant |"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"Let's download the dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"button": false,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"!curl -sLo loan_train.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv\n",
"!curl -sLo loan_test.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"!pip install -U --force scikit-learn watermark seaborn >/dev/null"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"button": false,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2020-07-30 16:07:59 \n",
"\n",
"CPython 3.8.3\n",
"IPython 7.14.0\n",
"\n",
"sklearn 0.0\n",
"numpy 1.19.1\n",
"pandas 1.1.0\n",
"matplotlib 3.3.0\n",
"seaborn 0.10.1\n"
]
}
],
"source": [
"from itertools import product\n",
"\n",
"from pathlib import Path\n",
"from typing import (List, Tuple, Union)\n",
"\n",
"import numpy as np\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import pandas as pd\n",
"from pandas import DataFrame, Series\n",
"from numpy import ndarray\n",
"\n",
"import matplotlib.ticker as ticker\n",
"from sklearn.base import (BaseEstimator, TransformerMixin)\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import (LabelEncoder, StandardScaler)\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"# NOTE: accuracy_score and jaccard_score are different metrics. They only coincided\n",
"# (binary/multiclass case) for the deprecated jaccard_similarity_score function.\n",
"from sklearn.metrics import (\n",
" accuracy_score,\n",
" jaccard_score,\n",
" f1_score,\n",
" log_loss\n",
")\n",
"\n",
"%load_ext watermark\n",
"%watermark -dvtp sklearn,numpy,pandas,matplotlib,seaborn"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Ignore DataConversionWarning from sklearn\n",
"import warnings\n",
"from sklearn.exceptions import DataConversionWarning\n",
"warnings.filterwarnings(action='ignore', category=DataConversionWarning)"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"### Load Data From CSV File "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"button": false,
"new_sheet": false,
"run_control": {
"read_only": false
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>loan_status</th>\n",
" <th>principal</th>\n",
" <th>terms</th>\n",
" <th>effective_date</th>\n",
" <th>due_date</th>\n",
" <th>age</th>\n",
" <th>education</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PAIDOFF</td>\n",
" <td>1000</td>\n",
" <td>30</td>\n",
" <td>2016-09-08</td>\n",
" <td>2016-10-07</td>\n",
" <td>45</td>\n",
" <td>High School or Below</td>\n",
" <td>male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>PAIDOFF</td>\n",
" <td>1000</td>\n",
" <td>30</td>\n",
" <td>2016-09-08</td>\n",
" <td>2016-10-07</td>\n",
" <td>33</td>\n",
" <td>Bechalor</td>\n",
" <td>female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>PAIDOFF</td>\n",
" <td>1000</td>\n",
" <td>15</td>\n",
" <td>2016-09-08</td>\n",
" <td>2016-09-22</td>\n",
" <td>27</td>\n",
" <td>college</td>\n",
" <td>male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>PAIDOFF</td>\n",
" <td>1000</td>\n",
" <td>30</td>\n",
" <td>2016-09-09</td>\n",
" <td>2016-10-08</td>\n",
" <td>28</td>\n",
" <td>college</td>\n",
" <td>female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>PAIDOFF</td>\n",
" <td>1000</td>\n",
" <td>30</td>\n",
" <td>2016-09-09</td>\n",
" <td>2016-10-08</td>\n",
" <td>29</td>\n",
" <td>college</td>\n",
" <td>male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" loan_status principal terms effective_date due_date age \\\n",
"0 PAIDOFF 1000 30 2016-09-08 2016-10-07 45 \n",
"1 PAIDOFF 1000 30 2016-09-08 2016-10-07 33 \n",
"2 PAIDOFF 1000 15 2016-09-08 2016-09-22 27 \n",
"3 PAIDOFF 1000 30 2016-09-09 2016-10-08 28 \n",
"4 PAIDOFF 1000 30 2016-09-09 2016-10-08 29 \n",
"\n",
" education gender \n",
"0 High School or Below male \n",
"1 Bechalor female \n",
"2 college male \n",
"3 college female \n",
"4 college male "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\n",
" 'loan_train.csv',\n",
" parse_dates=['due_date', 'effective_date'],\n",
" dtype={\n",
" 'loan_status': 'category',\n",
" 'education': 'category',\n",
" 'Gender': 'category',\n",
" },\n",
" usecols=lambda x: \"Unnamed\" not in x)\n",
"# Make all columns names to lower case\n",
"df.columns = df.columns.str.lower()\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 346 entries, 0 to 345\n",
"Data columns (total 8 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 loan_status 346 non-null category \n",
" 1 principal 346 non-null int64 \n",
" 2 terms 346 non-null int64 \n",
" 3 effective_date 346 non-null datetime64[ns]\n",
" 4 due_date 346 non-null datetime64[ns]\n",
" 5 age 346 non-null int64 \n",
" 6 education 346 non-null category \n",
" 7 gender 346 non-null category \n",
"dtypes: category(3), datetime64[ns](2), int64(3)\n",
"memory usage: 15.0 KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"path = Union[str, Path]\n",
"\n",
"class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Select the model features from a loan DataFrame and encode them numerically.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    weekend_on : int, default=4\n",
"        First value of ``effective_date.dt.dayofweek`` (Monday=0 ... Sunday=6)\n",
"        that is flagged as weekend. The default of 4 (Friday) reproduces the\n",
"        ``dayofweek > 3`` threshold that used to be hard-coded in ``transform``\n",
"        while the parameter itself was silently ignored.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, weekend_on: int = 4):\n",
"        self.weekend_on = weekend_on\n",
"\n",
"    def fit(self, X: DataFrame, y=None) -> 'DataFrameSelector':\n",
"        \"\"\"Nothing is learnt from the data; return ``self`` so the step can sit in a Pipeline.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, X: DataFrame) -> ndarray:\n",
"        \"\"\"Return the encoded feature matrix for ``X``.\n",
"\n",
"        Reads the columns ``principal``, ``terms``, ``age``, ``gender``,\n",
"        ``education`` and ``effective_date``; ``X`` itself is left untouched.\n",
"        The result is ``DataFrame.values`` of the one-hot encoded features.\n",
"        \"\"\"\n",
"        attributes: List[str] = ['principal', 'terms', 'age', 'gender', 'education']\n",
"        # Work on a copy so the caller's frame is never modified.\n",
"        frame: DataFrame = X.copy()\n",
"        # New feature 'loan taken at the end of the week', driven by weekend_on.\n",
"        dayofweek: Series = frame['effective_date'].dt.dayofweek\n",
"        frame['weekend'] = (dayofweek >= self.weekend_on).astype(int)\n",
"        # Relabel the gender categories by assignment: a chained\n",
"        # `frame['gender'].replace(..., inplace=True)` does not write back to the\n",
"        # frame under pandas Copy-on-Write.\n",
"        frame['gender'] = (\n",
"            frame['gender']\n",
"            .astype('category')\n",
"            .cat.rename_categories({'male': 0, 'female': 1})\n",
"        )\n",
"\n",
"        features: DataFrame = frame[attributes + ['weekend']]\n",
"\n",
"        # NOTE(review): the dummy columns follow the categories present in `X`;\n",
"        # confirm that train and test frames carry the same categories.\n",
"        encoded_features: DataFrame = pd.get_dummies(features).select_dtypes(exclude=['category'])\n",
"        return encoded_features.values\n",
"\n",
"def read_csv(p: path) -> DataFrame:\n",
"    \"\"\"Load a loan CSV file.\n",
"\n",
"    Dates are parsed, the label/education/gender columns are read as\n",
"    categoricals, columns whose name contains \"Unnamed\" are dropped and all\n",
"    column names are lower-cased.\n",
"    \"\"\"\n",
"    categorical_columns = dict.fromkeys(['loan_status', 'education', 'Gender'], 'category')\n",
"\n",
"    def _is_named(column_name):\n",
"        # Skip the index-like columns pandas would call 'Unnamed: ...'.\n",
"        return \"Unnamed\" not in column_name\n",
"\n",
"    frame = pd.read_csv(\n",
"        p,\n",
"        parse_dates=['due_date', 'effective_date'],\n",
"        dtype=categorical_columns,\n",
"        usecols=_is_named,\n",
"    )\n",
"    # Make all column names lower case\n",
"    frame.columns = frame.columns.str.lower()\n",
"    return frame"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"# Classification "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df_train = read_csv('./loan_train.csv')\n",
"df_test = read_csv('./loan_test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Learn the label encoding on the training labels, then reuse it unchanged for the test labels.\n",
"y_encoder = LabelEncoder()\n",
"y_train = y_encoder.fit_transform(df_train['loan_status'].values)\n",
"y_test = y_encoder.transform(df_test['loan_status'].values)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# K Nearest Neighbor(KNN)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## K Nearest Neighbor(KNN): find best K by train/test split"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Hold out 20% of the training file to score each candidate number of neighbours.\n",
"_X_train, _X_test, _y_train, _y_test = train_test_split(\n",
"    df_train, df_train['loan_status'].values, test_size=0.2, random_state=42)\n",
"\n",
"\n",
"def _holdout_accuracy(n_neighbors):\n",
"    \"\"\"Fit a KNN pipeline with ``n_neighbors`` on the split and return its hold-out accuracy.\"\"\"\n",
"    model = Pipeline(steps=[\n",
"        ('selector', DataFrameSelector()),\n",
"        ('std_scaler', StandardScaler()),\n",
"        ('knc', KNeighborsClassifier(n_neighbors=n_neighbors)),\n",
"    ])\n",
"    model.fit(_X_train, _y_train)\n",
"    return np.mean(_y_test == model.predict(_X_test))\n",
"\n",
"\n",
"with plt.style.context('ggplot'):\n",
"    fig, ax = plt.subplots(1)\n",
"    x = range(1, 10)\n",
"    y = list(map(_holdout_accuracy, x))\n",
"    ax.plot(x, y)\n",
"    ax.set(ylim=[0, 1], xlabel='Number of neighbours', ylabel='Accuracy on test samples')\n",
"    # x starts at 1, so the best k is the arg-max position plus one.\n",
"    ax.axvline(np.argmax(y) + 1, c='blue')\n",
"    plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**7** seems to be the best number of neighbours with this random seed. Let's try a GridSearchCV."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## K Nearest Neighbor(KNN): GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(steps=[('selector', DataFrameSelector(weekend_on=4)),\n",
" ('std_scaler', StandardScaler()),\n",
" ('knc', KNeighborsClassifier())])\n",
"Best parameter (CV score=0.703):\n",
"Train set Accuracy: 0.815028901734104\n",
"Jaccard similarity score: 0.7\n",
"F1-score: 0.7001989201477693\n"
]
}
],
"source": [
"# Parameters of pipelines can be set using ‘__’ separated parameter names:\n",
"param_grid = {\n",
" 'knc__n_neighbors': np.arange(1, 10),\n",
"}\n",
"\n",
"knn_pipeline = Pipeline(steps=[\n",
" ('selector', DataFrameSelector(weekend_on=4)),\n",
" ('std_scaler', StandardScaler()),\n",
" ('knc', KNeighborsClassifier(n_neighbors=4))\n",
"])\n",
"\n",
"search = GridSearchCV(knn_pipeline, param_grid, cv=3, n_jobs=-1)\n",
"search.fit(df_train, y_train)\n",
"\n",
"knn_pipeline = estimator = search.best_estimator_\n",
"\n",
"print(estimator)\n",
"print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
"# NOTE: accuracy_score and jaccard_score are different metrics. They only coincided\n",
"# (binary/multiclass case) for the deprecated jaccard_similarity_score function.\n",
"print(\"Train set Accuracy: \", accuracy_score(y_train, estimator.predict(df_train)))\n",
"\n",
"y_hat = estimator.predict(df_test)\n",
"print(\"Jaccard similarity score: \", jaccard_score(y_test, y_hat))\n",
"print(\"F1-score: \", f1_score(y_test, y_hat, average='weighted'))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"with plt.style.context('ggplot'):\n",
"    fig, ax = plt.subplots(1)\n",
"    # One grid point per candidate value of n_neighbors.\n",
"    x = [_res['knc__n_neighbors'] for _res in search.cv_results_['params']]\n",
"    y_mean = search.cv_results_['mean_test_score']\n",
"    sigma = search.cv_results_['std_test_score']\n",
"    ax.plot(x, y_mean)\n",
"    # Shade +/- one standard deviation of the score across the CV folds.\n",
"    ax.fill_between(x, y_mean - sigma, y_mean + sigma, facecolor='blue', alpha=0.25)\n",
"    # GridSearchCV was run without `scoring`, so these are the classifier's default\n",
"    # score (mean accuracy), not the Jaccard index.\n",
"    ax.set(ylim=[0, 1], xlabel='Number of neighbours', ylabel='Mean CV accuracy')\n",
"    plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Not convinced that 7 is an optimum, but let's take 7."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Refit the KNN pipeline on the full training set with the number of neighbours fixed to 7.\n",
"_knn_steps = [\n",
"    ('selector', DataFrameSelector()),\n",
"    ('std_scaler', StandardScaler()),\n",
"    ('knc', KNeighborsClassifier(n_neighbors=7)),\n",
"]\n",
"knn_pipeline = Pipeline(steps=_knn_steps).fit(df_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Decision Tree"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I'm using a `DecisionTreeClassifier(criterion=\"entropy\", max_depth=1)` model. The GridSearchCV finds the best `max_depth` by itself."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(steps=[('selector', DataFrameSelector(weekend_on=4)),\n",
" ('std_scaler', StandardScaler()),\n",
" ('dt',\n",
" DecisionTreeClassifier(criterion='entropy', max_depth=3))])\n",
"Best parameter (CV score=0.674):\n",
"Train set Accuracy: 0.7514450867052023\n",
"Jaccard similarity score: 0.7407407407407407\n",
"F1-score: 0.6304176516942475\n"
]
}
],
"source": [
"# Parameters of pipelines can be set using ‘__’ separated parameter names:\n",
"param_grid = {\n",
" 'dt__max_depth': range(1, 10)\n",
"}\n",
"\n",
"dt_pipeline = Pipeline(steps=[\n",
" ('selector', DataFrameSelector(weekend_on=4)),\n",
" ('std_scaler', StandardScaler()),\n",
" ('dt', DecisionTreeClassifier(criterion=\"entropy\", max_depth=1))\n",
"])\n",
"\n",
"search = GridSearchCV(dt_pipeline, param_grid, cv=3, n_jobs=-1)\n",
"search.fit(df_train, y_train)\n",
"\n",
"dt_pipeline = estimator = search.best_estimator_\n",
"\n",
"print(estimator)\n",
"print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
"# NOTE: accuracy_score and jaccard_score are different metrics. They only coincided\n",
"# (binary/multiclass case) for the deprecated jaccard_similarity_score function.\n",
"print(\"Train set Accuracy: \", accuracy_score(y_train, estimator.predict(df_train)))\n",
"\n",
"y_hat = estimator.predict(df_test)\n",
"print(\"Jaccard similarity score: \", jaccard_score(y_test, y_hat))\n",
"print(\"F1-score: \", f1_score(y_test, y_hat, average='weighted'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Support Vector Machine"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(steps=[('selector', DataFrameSelector(weekend_on=4)),\n",
" ('std_scaler', StandardScaler()),\n",
" ('svc', SVC(C=1.2222222222222223))])\n",
"Best parameter (CV score=0.674):\n",
"Train set Accuracy: 0.7774566473988439\n",
"Jaccard similarity score: 0.7222222222222222\n",
"F1-score: 0.6212664277180406\n"
]
}
],
"source": [
"# Parameters of pipelines can be set using ‘__’ separated parameter names:\n",
"param_grid = {\n",
" 'svc__C': np.linspace(1, 3, 10),\n",
"}\n",
"\n",
"svc_pipeline = Pipeline(steps=[\n",
" ('selector', DataFrameSelector(weekend_on=4)),\n",
" ('std_scaler', StandardScaler()),\n",
" ('svc', SVC(C=0.01, kernel='rbf'))\n",
"])\n",
"\n",
"search = GridSearchCV(svc_pipeline, param_grid, cv=3, n_jobs=-1)\n",
"search.fit(df_train.copy(), y_train.copy())\n",
"\n",
"svc_pipeline = estimator = search.best_estimator_\n",
"\n",
"print(estimator)\n",
"print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
"# NOTE: accuracy_score and jaccard_score are different metrics. They only coincided\n",
"# (binary/multiclass case) for the deprecated jaccard_similarity_score function.\n",
"print(\"Train set Accuracy: \", accuracy_score(y_train, estimator.predict(df_train)))\n",
"\n",
"y_hat = estimator.predict(df_test)\n",
"print(\"Jaccard similarity score: \", jaccard_score(y_test, y_hat))\n",
"# Average over every class in y_test, as the final test-set evaluation does; passing\n",
"# labels=np.unique(y_hat) would drop any class the model never predicts.\n",
"print(\"F1-score: \", f1_score(y_test, y_hat, average='weighted'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Logistic Regression"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameter (CV score=0.749):\n",
"Train set Accuracy: 0.7514450867052023\n",
"Jaccard similarity score: 0.7407407407407407\n",
"F1-score: 0.851063829787234\n",
"Log-loss: 0.5190440232613492\n"
]
}
],
"source": [
"# Parameters of pipelines can be set using ‘__’ separated parameter names:\n",
"param_grid = {\n",
" 'lr__C': np.linspace(0.01, 1, 10),\n",
" 'lr__tol': np.linspace(0.0001, 0.001, 10),\n",
"}\n",
"\n",
"lr_pipeline = Pipeline(steps=[\n",
" ('selector', DataFrameSelector(weekend_on=4)),\n",
" ('std_scaler', StandardScaler()),\n",
" ('lr', LogisticRegression(solver='lbfgs'))\n",
"])\n",
"\n",
"search = GridSearchCV(lr_pipeline, param_grid, cv=3, n_jobs=-1)\n",
"search.fit(df_train, y_train)\n",
"\n",
"lr_pipeline = estimator = search.best_estimator_\n",
"\n",
"print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
"# NOTE: accuracy_score and jaccard_score are different metrics. They only coincided\n",
"# (binary/multiclass case) for the deprecated jaccard_similarity_score function.\n",
"print(\"Train set Accuracy: \", accuracy_score(y_train, estimator.predict(df_train)))\n",
"\n",
"\n",
"y_hat = estimator.predict(df_test)\n",
"y_hat_prob = estimator.predict_proba(df_test)\n",
"print(\"Jaccard similarity score: \", jaccard_score(y_test, y_hat))\n",
"# Average over every class in y_test, as the final test-set evaluation does; passing\n",
"# labels=np.unique(y_hat) drops any class the model never predicts and inflates the score.\n",
"print(\"F1-score: \", f1_score(y_test, y_hat, average='weighted'))\n",
"print(\"Log-loss: \", log_loss(y_test, y_hat_prob))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model Evaluation using Test set"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df_test = read_csv('./loan_test.csv')\n",
"y_test = y_encoder.transform(df_test['loan_status'].values)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## KNN"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Jaccard score: 0.71\n",
"F1-score: 0.67\n"
]
}
],
"source": [
"y_hat = knn_pipeline.predict(df_test)\n",
"print(f\"Jaccard score: {jaccard_score(y_test, y_hat):.2f}\")\n",
"print(f\"F1-score: {f1_score(y_test, y_hat, average='weighted'):.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Decision Tree"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Jaccard score: 0.74\n",
"F1-score: 0.63\n"
]
}
],
"source": [
"y_hat = dt_pipeline.predict(df_test)\n",
"print(f\"Jaccard score: {jaccard_score(y_test, y_hat):.2f}\")\n",
"print(f\"F1-score: {f1_score(y_test, y_hat, average='weighted'):.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Jaccard score: 0.74\n",
"F1-score: 0.63\n",
"LogLoss: 0.52\n"
]
}
],
"source": [
"y_hat = lr_pipeline.predict(df_test)\n",
"print(f\"Jaccard score: {jaccard_score(y_test, y_hat):.2f}\")\n",
"print(f\"F1-score: {f1_score(y_test, y_hat, average='weighted'):.2f}\")\n",
"y_hat_prob = lr_pipeline.predict_proba(df_test)\n",
"print(f\"LogLoss: {log_loss(y_test, y_hat_prob):.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SVM"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Jaccard score: 0.72\n",
"F1-score: 0.62\n"
]
}
],
"source": [
"y_hat = svc_pipeline.predict(df_test)\n",
"print(f\"Jaccard score: {jaccard_score(y_test, y_hat):.2f}\")\n",
"print(f\"F1-score: {f1_score(y_test, y_hat, average='weighted'):.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Report"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Jaccard</th>\n",
" <th>F1-score</th>\n",
" <th>LogLoss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>KNN</th>\n",
" <td>0.71</td>\n",
" <td>0.67</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Decision Tree</th>\n",
" <td>0.74</td>\n",
" <td>0.63</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SVM</th>\n",
" <td>0.72</td>\n",
" <td>0.62</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Log. reg.</th>\n",
" <td>0.74</td>\n",
" <td>0.63</td>\n",
" <td>0.52</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Jaccard F1-score LogLoss\n",
"KNN 0.71 0.67 NaN\n",
"Decision Tree 0.74 0.63 NaN\n",
"SVM 0.72 0.62 NaN\n",
"Log. reg. 0.74 0.63 0.52"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fitted pipelines to compare, keyed by the row label used in the report.\n",
"pipelines = {\n",
"    'KNN': knn_pipeline,\n",
"    'Decision Tree': dt_pipeline,\n",
"    'SVM': svc_pipeline,\n",
"    'Log. reg.': lr_pipeline\n",
"}\n",
"# Only the keys are used below, as the report's column names.\n",
"metrics = {\n",
"    'Jaccard': jaccard_score,\n",
"    'F1-score': f1_score,\n",
"    'LogLoss': log_loss\n",
"}\n",
"# Empty frame: every cell starts out as NaN.\n",
"results = DataFrame(columns=list(metrics), index=list(pipelines))\n",
"\n",
"for _p, _pipeline in pipelines.items():\n",
"    y_hat = _pipeline.predict(df_test)\n",
"\n",
"    results.loc[_p, 'Jaccard'] = jaccard_score(y_test, y_hat)\n",
"    results.loc[_p, 'F1-score'] = f1_score(y_test, y_hat, average='weighted')\n",
"    # Log-loss is reported for logistic regression only; the other rows keep the\n",
"    # frame's initial NaN (this also avoids np.NaN, which NumPy 2.0 removed).\n",
"    if _p == 'Log. reg.':\n",
"        y_hat_prob = _pipeline.predict_proba(df_test)\n",
"        results.loc[_p, 'LogLoss'] = log_loss(y_test, y_hat_prob)\n",
"\n",
"results.astype(float).round(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Results are impacted by the small number of samples while using a GridSearchCV**"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Xarray",
"language": "python",
"name": "xarray"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment