Created
June 14, 2016 19:24
-
-
Save conradlee/6ba8a1ebbd4f86a5bef1ab1065eaa1f4 to your computer and use it in GitHub Desktop.
Which candidate would a Bayesian bandit write articles on?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(8067.9780233487645, 0.0)\n", | |
"Mu: 7.426\n", | |
"Sigma: 2.153\n", | |
"\n", | |
"Empirical mean 16450.254\n", | |
"Lognormal distro mean 17039.480\n", | |
"\n", | |
"Empirical median: 2576.000\n", | |
"Lognormal distro median 1679.491\n" | |
] | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import powerlaw\n", | |
"from matplotlib import pyplot\n", | |
"import numpy as np\n", | |
"\n", | |
"# Fit heavy-tailed distributions to the overall article view counts.\n", | |
"xmin = 200\n", | |
"bounded_df = pd.read_csv(\"overall_view_distribution.csv\")\n", | |
"views_distro = bounded_df['views'].values\n", | |
"\n", | |
"fit = powerlaw.Fit(\n", | |
"    views_distro,\n", | |
"    discrete=True,\n", | |
"    xmin=xmin,\n", | |
"    xmax=views_distro.max(),\n", | |
")\n", | |
"# Print the result of comparing the lognormal fit against the power-law fit\n", | |
"comparison = fit.distribution_compare('lognormal', \"power_law\")\n", | |
"print comparison\n", | |
"\n", | |
"# Plot the empirical PDF, then overlay both fitted PDFs on the same axes\n", | |
"fig = pyplot.gcf()\n", | |
"fig.clf()\n", | |
"fig_powerlaw = fit.plot_pdf(linewidth=2, label='data')\n", | |
"fitted_curves = (\n", | |
"    (fit.power_law, 'g', 'powerlaw'),\n", | |
"    (fit.lognormal, 'r', 'lognormal'),\n", | |
")\n", | |
"for fitted, curve_color, curve_label in fitted_curves:\n", | |
"    fitted.plot_pdf(ax=fig_powerlaw, color=curve_color, linestyle='--', label=curve_label)\n", | |
"\n", | |
"fig_powerlaw.legend()\n", | |
"fig_powerlaw.figure.savefig('powerlaw_all.svg')\n", | |
"\n", | |
"# Compare the fitted lognormal's mean and median with the empirical ones.\n", | |
"# `mu` stays a top-level name because the next cell passes it to draw_log_normal_means.\n", | |
"mu = fit.lognormal.mu\n", | |
"sigma = fit.lognormal.sigma\n", | |
"print \"Mu: %0.3f\" % mu\n", | |
"print \"Sigma: %0.3f\\n\" % sigma\n", | |
"print \"Empirical mean %0.3f\" % views_distro.mean()\n", | |
"print \"Lognormal distro mean %0.3f\\n\" % np.exp(mu + sigma**2/2)\n", | |
"print \"Empirical median: %0.3f\" % np.median(views_distro)\n", | |
"print \"Lognormal distro median %0.3f\" % np.exp(mu)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Candidate Cruz Ted selected 10 0.0100 pct of the time\n", | |
"Candidate Donald Trump selected 21374 21.3740 pct of the time\n", | |
"Candidate Hillary Clinton selected 53276 53.2760 pct of the time\n", | |
"Candidate John Kasich selected 0 0.0000 pct of the time\n", | |
"Candidate Sanders Bernie selected 25340 25.3400 pct of the time\n" | |
] | |
} | |
], | |
"source": [ | |
"import bayesian_tools # A module with the functions defined at http://engineering.richrelevance.com/bayesian-ab-testing-with-a-log-normal-model/\n", | |
"import seaborn as sns\n", | |
"\n", | |
"# Simulate the bandit over all data: for every candidate take n_draws samples from\n", | |
"# draw_log_normal_means, then count how often each candidate holds the largest draw.\n", | |
"v, c = [], []  # flattened draws and the matching short candidate labels (violin plot input)\n", | |
"n_draws = 100000\n", | |
"candidates_ordered = ['John Kasich', 'Cruz Ted', 'Sanders Bernie', 'Hillary Clinton', 'Donald Trump']\n", | |
"candidate_map = {\"Hillary Clinton\": \"Clinton\",\n", | |
"                 \"Donald Trump\": \"Trump\",\n", | |
"                 \"Sanders Bernie\": \"Sanders\",\n", | |
"                 \"Cruz Ted\": \"Cruz\",\n", | |
"                 \"John Kasich\": \"Kasich\"}\n", | |
"aligned_samples = []\n", | |
"\n", | |
"for candidate in candidates_ordered:\n", | |
"    cand_df = bounded_df[bounded_df['candidate'] == candidate]\n", | |
"    views_data = cand_df['views'].values\n", | |
"    # `mu` is the lognormal mu fitted in the previous cell; the meaning of the three\n", | |
"    # 1. arguments is defined by bayesian_tools -- TODO confirm against that module\n", | |
"    drawn_means = bayesian_tools.draw_log_normal_means(views_data, mu, 1., 1., 1., n_samples=n_draws)\n", | |
"    aligned_samples.append(drawn_means)\n", | |
"    v.extend(drawn_means)\n", | |
"    c.extend([candidate_map[candidate]] * len(drawn_means))\n", | |
"\n", | |
"# After the transpose, axis 1 indexes candidates in candidates_ordered order\n", | |
"aligned_samples = np.array(aligned_samples).transpose()\n", | |
"max_indices = aligned_samples.argmax(axis=1)\n", | |
"# Column i belongs to candidates_ordered[i], so the labels must come from that same\n", | |
"# list; the weekly cell's `candidates` is undefined here on a fresh kernel and is\n", | |
"# ordered differently when left over from an earlier run.\n", | |
"for cand_idx, cand in enumerate(candidates_ordered):\n", | |
"    times_cand_selected = (max_indices == cand_idx).sum()\n", | |
"    print \"Candidate %s selected %d %0.4f pct of the time\" % (cand, times_cand_selected, 100 * times_cand_selected / float(n_draws))\n", | |
"\n", | |
"# Violin plot of the drawn means, one violin per candidate\n", | |
"means_df = pd.DataFrame({'mean_views': v, 'candidate': c})\n", | |
"fig = pyplot.gcf()\n", | |
"fig.clf()\n", | |
"ax = fig.add_subplot(111)\n", | |
"order = [\"Trump\", \"Clinton\", \"Sanders\", \"Cruz\", \"Kasich\"]\n", | |
"sns.violinplot(x=\"candidate\", y=\"mean_views\", data=means_df, order=order, ax=ax)\n", | |
"fig.savefig(\"violinplot.svg\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2015-11-02T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 1.3470 pct of the time\n", | |
"\tCandidate Donald Trump selected 14.5350 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 34.2270 pct of the time\n", | |
"\tCandidate John Kasich selected 0.0190 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 49.8720 pct of the time\n", | |
"2015-11-09T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 1.3130 pct of the time\n", | |
"\tCandidate Donald Trump selected 56.5940 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 38.3740 pct of the time\n", | |
"\tCandidate John Kasich selected 2.5760 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 1.1430 pct of the time\n", | |
"2015-11-16T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 10.7650 pct of the time\n", | |
"\tCandidate Donald Trump selected 37.3900 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 20.9160 pct of the time\n", | |
"\tCandidate John Kasich selected 5.2950 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 25.6340 pct of the time\n", | |
"2015-11-23T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 1.2340 pct of the time\n", | |
"\tCandidate Donald Trump selected 23.8620 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 73.4970 pct of the time\n", | |
"\tCandidate John Kasich selected 0.1360 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 1.2710 pct of the time\n", | |
"2015-11-30T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 5.6930 pct of the time\n", | |
"\tCandidate Donald Trump selected 26.2710 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 33.1520 pct of the time\n", | |
"\tCandidate John Kasich selected 0.0260 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 34.8580 pct of the time\n", | |
"2015-12-07T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.1220 pct of the time\n", | |
"\tCandidate Donald Trump selected 36.8460 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 40.5650 pct of the time\n", | |
"\tCandidate John Kasich selected 1.9250 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 20.5420 pct of the time\n", | |
"2015-12-14T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 1.1000 pct of the time\n", | |
"\tCandidate Donald Trump selected 12.3590 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 51.7680 pct of the time\n", | |
"\tCandidate John Kasich selected 0.0990 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 34.6740 pct of the time\n", | |
"2015-12-21T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 6.9530 pct of the time\n", | |
"\tCandidate Donald Trump selected 0.8970 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 23.3730 pct of the time\n", | |
"\tCandidate John Kasich selected 21.4890 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 47.2880 pct of the time\n", | |
"2015-12-28T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.0020 pct of the time\n", | |
"\tCandidate Donald Trump selected 29.1590 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 48.3310 pct of the time\n", | |
"\tCandidate John Kasich selected 2.1420 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 20.3660 pct of the time\n", | |
"2016-01-04T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.3990 pct of the time\n", | |
"\tCandidate Donald Trump selected 5.0130 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 30.8540 pct of the time\n", | |
"\tCandidate John Kasich selected 0.3900 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 63.3440 pct of the time\n", | |
"2016-01-11T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 1.2060 pct of the time\n", | |
"\tCandidate Donald Trump selected 5.2580 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 8.8530 pct of the time\n", | |
"\tCandidate John Kasich selected 1.7180 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 82.9650 pct of the time\n", | |
"2016-01-18T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.0380 pct of the time\n", | |
"\tCandidate Donald Trump selected 1.0580 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 56.0100 pct of the time\n", | |
"\tCandidate John Kasich selected 4.4450 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 38.4490 pct of the time\n", | |
"2016-01-25T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.2950 pct of the time\n", | |
"\tCandidate Donald Trump selected 6.2070 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 82.1720 pct of the time\n", | |
"\tCandidate John Kasich selected 0.2980 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 11.0280 pct of the time\n", | |
"2016-02-01T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 81.4630 pct of the time\n", | |
"\tCandidate Donald Trump selected 5.9180 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 4.7670 pct of the time\n", | |
"\tCandidate John Kasich selected 0.0010 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 7.8510 pct of the time\n", | |
"2016-02-08T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 46.9220 pct of the time\n", | |
"\tCandidate Donald Trump selected 18.3530 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 32.6940 pct of the time\n", | |
"\tCandidate John Kasich selected 0.5300 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 1.5010 pct of the time\n", | |
"2016-02-15T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 10.8470 pct of the time\n", | |
"\tCandidate Donald Trump selected 66.5090 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 0.8090 pct of the time\n", | |
"\tCandidate John Kasich selected 3.7630 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 18.0720 pct of the time\n", | |
"2016-02-22T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.1840 pct of the time\n", | |
"\tCandidate Donald Trump selected 87.9000 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 11.6560 pct of the time\n", | |
"\tCandidate John Kasich selected 0.1990 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 0.0610 pct of the time\n", | |
"2016-02-29T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 2.9680 pct of the time\n", | |
"\tCandidate Donald Trump selected 94.8590 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 0.0020 pct of the time\n", | |
"\tCandidate John Kasich selected 0.0010 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 2.1700 pct of the time\n", | |
"2016-03-07T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.1470 pct of the time\n", | |
"\tCandidate Donald Trump selected 46.4080 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 52.4540 pct of the time\n", | |
"\tCandidate John Kasich selected 0.1510 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 0.8400 pct of the time\n", | |
"2016-03-14T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.0040 pct of the time\n", | |
"\tCandidate Donald Trump selected 46.4960 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 17.0660 pct of the time\n", | |
"\tCandidate John Kasich selected 2.9310 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 33.5030 pct of the time\n", | |
"2016-03-21T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 25.3470 pct of the time\n", | |
"\tCandidate Donald Trump selected 1.7090 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 21.7410 pct of the time\n", | |
"\tCandidate John Kasich selected 0.0310 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 51.1720 pct of the time\n", | |
"2016-03-28T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 0.0730 pct of the time\n", | |
"\tCandidate Donald Trump selected 0.0170 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 55.8390 pct of the time\n", | |
"\tCandidate John Kasich selected 0.4880 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 43.5830 pct of the time\n", | |
"2016-04-04T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 9.9260 pct of the time\n", | |
"\tCandidate Donald Trump selected 2.3760 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 59.4220 pct of the time\n", | |
"\tCandidate John Kasich selected 0.1140 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 28.1620 pct of the time\n", | |
"2016-04-11T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 62.1470 pct of the time\n", | |
"\tCandidate Donald Trump selected 0.0690 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 15.3050 pct of the time\n", | |
"\tCandidate John Kasich selected 0.3690 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 22.1100 pct of the time\n", | |
"2016-04-18T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 31.4130 pct of the time\n", | |
"\tCandidate Donald Trump selected 0.2970 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 12.1430 pct of the time\n", | |
"\tCandidate John Kasich selected 0.2220 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 55.9250 pct of the time\n", | |
"2016-04-25T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 36.4820 pct of the time\n", | |
"\tCandidate Donald Trump selected 2.7340 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 9.1380 pct of the time\n", | |
"\tCandidate John Kasich selected 15.3330 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 36.3130 pct of the time\n", | |
"2016-05-02T00:00:00.000000000\n", | |
"\tCandidate Cruz Ted selected 75.3000 pct of the time\n", | |
"\tCandidate Donald Trump selected 0.0800 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 1.1070 pct of the time\n", | |
"\tCandidate John Kasich selected 5.1960 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 18.3170 pct of the time\n", | |
"2016-05-09T00:00:00.000000000\n", | |
"\tCandidate Donald Trump selected 0.0160 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 96.9620 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 3.0220 pct of the time\n", | |
"2016-05-16T00:00:00.000000000\n", | |
"\tCandidate Donald Trump selected 2.6960 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 32.4040 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 64.9000 pct of the time\n", | |
"2016-05-23T00:00:00.000000000\n", | |
"\tCandidate Donald Trump selected 9.6970 pct of the time\n", | |
"\tCandidate Hillary Clinton selected 83.9350 pct of the time\n", | |
"\tCandidate Sanders Bernie selected 6.3680 pct of the time\n" | |
] | |
} | |
], | |
"source": [ | |
"from collections import defaultdict\n", | |
"\n", | |
"# Repeat the simulation separately on each week's worth of data\n", | |
"collapsed_weekly_df = pd.read_csv(\"weekly_view_distribution.csv\", parse_dates=[0])\n", | |
"\n", | |
"n_weekly_draws = 100000\n", | |
"weeks = sorted(collapsed_weekly_df['timeperiod'].unique())\n", | |
"# candidate name -> selection percentages, one entry per week the candidate appears in\n", | |
"plot_data = defaultdict(list)\n", | |
"for week in weeks:\n", | |
"    print week\n", | |
"    week_df = collapsed_weekly_df[collapsed_weekly_df['timeperiod'] == week]\n", | |
"    candidates = week_df['candidate'].unique()\n", | |
"    aligned_samples = []\n", | |
"    for candidate in candidates:\n", | |
"        views_data = week_df.loc[week_df['candidate'] == candidate, 'views'].values\n", | |
"        # NOTE(review): 7.4 is hard-coded here, while the all-data cell passes the fitted\n", | |
"        # `mu` in this position -- confirm the rounding is intentional\n", | |
"        drawn_means = bayesian_tools.draw_log_normal_means(views_data, 7.4, 1., 1., 1., n_samples=n_weekly_draws)\n", | |
"        aligned_samples.append(drawn_means)\n", | |
"    # After the transpose, axis 1 indexes this week's candidates\n", | |
"    aligned_samples = np.array(aligned_samples).T\n", | |
"    max_indices = aligned_samples.argmax(axis=1)\n", | |
"    for cand_idx, cand in enumerate(candidates):\n", | |
"        times_cand_selected = (max_indices == cand_idx).sum()\n", | |
"        pct_selected = 100 * times_cand_selected / float(n_weekly_draws)\n", | |
"        print \"\\tCandidate %s selected %0.4f pct of the time\" % (cand, pct_selected)\n", | |
"        plot_data[cand].append(pct_selected)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Do a bit of preprocessing so we can create our stacked bar charts\n", | |
"plot_series = {}\n", | |
"for candidate, data in plot_data.items():\n", | |
"    # Each list holds one value per week in which the candidate appeared, so it is\n", | |
"    # labelled with the first len(data) weeks (assumes nobody skips a week and\n", | |
"    # later returns -- TODO confirm)\n", | |
"    plot_series[candidate] = pd.Series(data, weeks[:len(data)])\n", | |
"plot_df = pd.DataFrame.from_records(dict(plot_series), index=weeks)\n", | |
"\n", | |
"# Pad every candidate out to the full list of weeks with 0.0, so all series have the\n", | |
"# same length for stacking, without hard-coding which candidates dropped out.\n", | |
"for candidate in list(plot_series):\n", | |
"    plot_series[candidate] = plot_series[candidate].reindex(weeks, fill_value=0.0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Plot week-by-week visualization\n", | |
"times = map(lambda d: d.to_datetime(), plot_series['Hillary Clinton'].index)\n", | |
"fig = pyplot.gcf()\n", | |
"fig.clf()\n", | |
"ax = fig.add_subplot(111)\n", | |
"candidate_colors = {'Kasich': 'purple', 'Cruz': 'yellow', 'Clinton': 'blue', 'Trump': 'red', 'Sanders': 'green'}\n", | |
"bottoms = np.zeros(len(times), dtype=np.float64)\n", | |
"for cand in candidates_ordered:\n", | |
" cand_label = candidate_map[cand]\n", | |
" ax.bar(times, plot_series[cand], bottom=bottoms, label=cand_label , color=candidate_colors[cand_label], width=4.5)\n", | |
" bottoms += plot_series[cand]\n", | |
"ax.legend()\n", | |
"pyplot.savefig('bandit_week_by_week_bar.svg')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment