Last active
November 1, 2021 08:54
-
-
Save dkapitan/81dbd97f3638aed5ff843261dba6d84e to your computer and use it in GitHub Desktop.
Monkey-patch for pd.DataFrame.describe() with robust statistics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def describe_robust(self, percentiles=None, include=None, exclude=None, trim=0.2):
    """
    Monkey-patch for pd.DataFrame.describe based on robust statistics.

    Calculates a trimmed mean and a winsorized standard deviation
    (default trim 0.2) instead of the ordinary mean/std.
    Uses scipy.stats.mstats (trimmed_mean, winsorize) and numpy.std.
    See e.g. http://www.uh.edu/~ttian/ES.pdf for methodical background.

    BSD 3-Clause License
    Copyright (c) 2018, Daniel Kapitan ([email protected])
    All rights reserved.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should
        fall between 0 and 1. The default is
        ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored
        for ``Series``. Here are the options:

        - 'all' : All columns of the input will be included in the output.
        - A list-like of dtypes : Limits the results to the
          provided data types.
          To limit the result to numeric types submit
          ``numpy.number``. To limit it instead to object columns submit
          the ``numpy.object`` data type. Strings
          can also be used in the style of
          ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
          select pandas categorical columns, use ``'category'``
        - None (default) : The result will include all numeric columns.
    exclude : list-like of dtypes or None (default), optional
        A black list of data types to omit from the result. Ignored
        for ``Series``. Here are the options:

        - A list-like of dtypes : Excludes the provided data types
          from the result. To exclude numeric types submit
          ``numpy.number``. To exclude object columns submit the data
          type ``numpy.object``. Strings can also be used in the style of
          ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
          exclude pandas categorical columns, use ``'category'``
        - None (default) : The result will exclude nothing.
    trim : float or (lower, upper) tuple, default 0.2
        Fraction of observations to trim on both sides.

    Returns
    -------
    summary : Series/DataFrame of robust summary statistics

    Raises
    ------
    NotImplementedError
        If called on an object with ndim >= 3 (Panel).
    ValueError
        For an empty DataFrame, out-of-range/duplicate percentiles,
        or ``exclude`` combined with ``include='all'``.
    """
    from pandas.io.formats.format import format_percentiles
    from pandas.core.dtypes.common import (
        is_bool_dtype,
        is_numeric_dtype,
        is_datetime64_dtype,
        is_timedelta64_dtype)  # was missing; used in describe_1d below
    from scipy.stats.mstats import trimmed_mean, winsorize
    import numpy as np
    import pandas as pd

    if self.ndim >= 3:
        msg = "describe is not implemented on Panel objects."
        raise NotImplementedError(msg)
    elif self.ndim == 2 and self.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)
        # get them all to be in [0, 1]; inlined because the private
        # NDFrame._check_percentile API was removed from pandas
        if any(p < 0 or p > 1 for p in percentiles):
            raise ValueError(
                "percentiles should all be in the interval [0, 1]")
        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    # scipy only auto-expands *float* scalars to (lo, hi); normalizing
    # here also makes integer trims (e.g. trim=0) work
    limits = trim if isinstance(trim, tuple) else (trim, trim)

    def describe_numeric_1d_robust(series):
        # count / trimmed mean / winsorized std / min / percentiles / max
        stat_index = (['count', 'trim_mean', 'trim_std', 'min'] +
                      formatted_percentiles + ['max'])
        clean = series.dropna()
        d = ([series.count(),
              trimmed_mean(clean, limits=limits),
              np.std(winsorize(clean, limits=limits)),
              series.min()] +
             [series.quantile(x) for x in percentiles] + [series.max()])
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        # count / unique, plus top & freq (and first/last for datetimes)
        names = ['count', 'unique']
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if is_datetime64_dtype(data):
                # pd.Timestamp replaces the long-removed pandas.tslib,
                # which was referenced here without ever being imported
                asint = data.dropna().values.view('i8')
                names += ['top', 'freq', 'first', 'last']
                result += [pd.Timestamp(top), freq,
                           pd.Timestamp(asint.min()),
                           pd.Timestamp(asint.max())]
            else:
                names += ['top', 'freq']
                result += [top, freq]
        return pd.Series(result, index=names, name=data.name)

    def describe_1d(data):
        # bools are categorical; numeric and timedelta get robust stats
        if is_bool_dtype(data):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d_robust(data)
        elif is_timedelta64_dtype(data):
            return describe_numeric_1d_robust(data)
        else:
            return describe_categorical_1d(data)

    if self.ndim == 1:
        return describe_1d(self)
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        data = self.select_dtypes(include=[np.number])
        if len(data.columns) == 0:
            data = self
    elif include == 'all':
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = self
    else:
        data = self.select_dtypes(include=include, exclude=exclude)

    # .items() replaces .iteritems(), which was removed in pandas 2.0
    ldesc = [describe_1d(s) for _, s in data.items()]

    # set a convenient order for rows
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    # pd.concat's join_axes argument was removed in pandas 1.0; reindexing
    # every column to the shared row order is the supported equivalent
    d = pd.concat([x.reindex(names) for x in ldesc], axis=1)
    d.columns = data.columns.copy()
    return d
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment