Created
March 24, 2023 23:45
-
-
Save Micky774/68798085b8fca7452d3975e484657330 to your computer and use it in GitHub Desktop.
csr_polynomial benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy import sparse as sp | |
def generate_data(
    n_samples,
    n_features,
    n_classes=2,
    X_density=1,
    y_sparse=False,
    dtype=np.float64,
    random_state=None,
):
    """Generate a random feature matrix and class-label vector for benchmarks.

    Parameters
    ----------
    n_samples : int
        Number of rows in ``X`` and number of labels in ``y``.
    n_features : int
        Number of columns in ``X``.
    n_classes : int, default=2
        Labels are drawn as integers in ``[0, n_classes)``.
    X_density : float, default=1
        When strictly below 1, ``X`` is a CSR matrix built by
        ``scipy.sparse.random`` with this density. Otherwise ``X`` is a dense
        array of uniform draws scaled by 50 and rounded.
    y_sparse : bool, default=False
        When true, ``y`` is returned as a sparse column instead of a 1-D array.
    dtype : data-type, default=np.float64
        Element type of ``X`` (dense or sparse) and of ``y``.
    random_state : int or None, default=None
        Seed handed to ``np.random.RandomState``.

    Returns
    -------
    X : CSR sparse matrix or ndarray, shape (n_samples, n_features)
    y : ndarray of shape (n_samples,), or sparse matrix of shape (n_samples, 1)
    """
    rng = np.random.RandomState(random_state)

    # X is drawn before y so a given seed always yields the same pair.
    if X_density < 1:
        # Forward ``dtype`` so the sparse branch honours the requested element
        # type just like the dense branch does (it used to be ignored here).
        X = sp.random(
            n_samples,
            n_features,
            format="csr",
            density=X_density,
            dtype=dtype,
            random_state=rng,
        )
    else:
        X = np.round(rng.rand(n_samples, n_features) * 50).astype(dtype)

    # randint already yields integers, so no rounding is needed before the cast.
    y = rng.randint(n_classes, size=(n_samples,)).astype(dtype)

    if y_sparse:
        y = sp.csr_matrix(y)
        # A 1-D input comes back as a single row; flip it into a column.
        if y.shape[0] == 1:
            y = y.T
    return X, y
from functools import partial | |
from time import perf_counter | |
from statistics import mean, stdev | |
from itertools import product | |
import csv | |
from pathlib import Path | |
from sklearn.preprocessing import PolynomialFeatures | |
# Directory that receives one timing CSV per branch; created on first run.
results_path = 'local_artifacts/benchmarks/csr_polynomial/'
Path(results_path).mkdir(parents=True, exist_ok=True)

# Label of the checkout being timed; it becomes the CSV file name.
branch = "main"

# Each entry: (estimator class, data factory, iterable of (n_features, density)).
benchmark_config = [
    (
        PolynomialFeatures,
        partial(generate_data, n_samples=2_000),
        product([50, 100, 175], [.1, .25, .5]),
    ),
]
N_REPEATS = 30

with open(f'{results_path}{branch}.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=["n_features", "density", "n_repeat", "duration"],
    )
    writer.writeheader()

    for Est, make_data, items in benchmark_config:
        for n_features, density in items:
            time_results = []
            for n_repeat in range(N_REPEATS):
                # The repeat index doubles as the seed, so every repeat sees
                # different data but the whole run is reproducible.
                X, _ = make_data(
                    X_density=density,
                    n_features=n_features,
                    random_state=n_repeat,
                )
                est = Est(degree=3)

                # Only fit_transform is timed; data generation and estimator
                # construction happen before the clock starts.
                tic = perf_counter()
                est.fit_transform(X)
                duration = perf_counter() - tic

                time_results.append(duration)
                writer.writerow(
                    dict(
                        n_features=n_features,
                        density=density,
                        n_repeat=n_repeat,
                        duration=duration,
                    )
                )

            # One summary line per (n_features, density) configuration.
            results_mean = mean(time_results)
            results_stdev = stdev(time_results)
            print(
                f" {n_features=} {density=}|"
                f" {results_mean:.3f} +/- {results_stdev:.3f}"
            )
import matplotlib.pyplot as plt | |
import pandas as pd | |
import seaborn as sns | |
plt.rc('font', size=12)
results_path = 'local_artifacts/benchmarks/csr_polynomial/'
_branches = ("main", "PR")
# Keep only durations strictly below this per-branch quantile; set to 1 to
# disable trimming.
percentile_trim = .95

# Load one timing CSV per branch and stack them, tagging each row with the
# branch it came from.
branches = {br: pd.read_csv(f'{results_path}{br}.csv') for br in _branches}
df = pd.concat([branches[br].assign(branch=br) for br in _branches])

group_by_attrs = ["n_features", "density"]
grouped = list(df.groupby(group_by_attrs))

# One panel per (n_features, density) group; zip stops at whichever of the
# groups or the nine axes runs out first.
fig, axis = plt.subplots(3, 3, figsize=(14, 9), constrained_layout=True)
fig.patch.set_facecolor('white')
for (grouped_attrs, subset), ax in zip(grouped, axis.reshape(-1)):
    # Optionally trim outlier data
    if percentile_trim < 1:
        # Build a new frame from the rows that survive the cut instead of
        # assigning through a boolean mask into the groupby slice: the mask
        # assignment left trimmed rows behind as all-NaN rows and wrote into
        # a slice of ``df``.
        subset = pd.concat(
            [
                part[part["duration"] < part["duration"].quantile(percentile_trim)]
                for part in (subset[subset["branch"] == br] for br in _branches)
            ]
        )
    sns.violinplot(data=subset, y="duration", x="branch", ax=ax)
    ax.set_title(
        "|".join(f"{k}={v}" for k, v in zip(group_by_attrs, grouped_attrs))
    )
    ax.set_xlabel("")

# Only the left-most column keeps its y-axis label.
for ax in axis[:, 1:].ravel():
    ax.set_ylabel("")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment