diff --git a/podlozhnyy_module/__init__.py b/podlozhnyy_module/__init__.py index faa9523..97889fa 100644 --- a/podlozhnyy_module/__init__.py +++ b/podlozhnyy_module/__init__.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import seaborn as sns - from matplotlib import pyplot as plt from podlozhnyy_module import ( @@ -13,10 +12,10 @@ permutation, regression, timeseries, - timetest + timetest, ) -print('Привет! Ты импортировал модуль созданный https://github.com/NPodlozhniy') -print('В нем собраны часто востребованные в работе аналитика методы') -print('Посмотреть полный cписок пакетов можно с помощью команды dir(<название библиотеки>)') -print('Приятного использования!') \ No newline at end of file +print("Привет! Ты импортировал модуль созданный https://github.com/NPodlozhniy") +print("В нем собраны часто востребованные в работе аналитика методы") +print("Посмотреть полный cписок пакетов можно с помощью dir(<название библиотеки>)") +print("Приятного использования!") diff --git a/podlozhnyy_module/charts.py b/podlozhnyy_module/charts.py index b4a0491..4df1281 100644 --- a/podlozhnyy_module/charts.py +++ b/podlozhnyy_module/charts.py @@ -1,8 +1,11 @@ -from podlozhnyy_module import np, pd, sns, plt +from podlozhnyy_module import np, pd, plt, sns -sns.set_style(rc = {'figure.facecolor': 'floralwhite'}) +sns.set_style(rc={"figure.facecolor": "floralwhite"}) -def plot_hist(df: pd.core.frame.DataFrame, feature: str, target: str, n: int = 10) -> None: + +def plot_hist( + df: pd.core.frame.DataFrame, feature: str, target: str, n: int = 10 +) -> None: """ Строит приятную гистограмму распределения признака от целевой переменной @@ -13,18 +16,15 @@ def plot_hist(df: pd.core.frame.DataFrame, feature: str, target: str, n: int = 1 target: Целевая переменная для разбиения признака n: Кол-во bin-ов, default=10 """ - df2 = pd.melt(df[[feature, target]], id_vars=target, - value_vars=[feature], value_name='target') - bins = np.linspace(df2['target'].min(), df2['target'].max(), n + 1) - + df2 = pd.melt( + df[[feature, target]], id_vars=target, value_vars=[feature], value_name="target" + ) + bins = np.linspace(df2["target"].min(), df2["target"].max(), n + 1) + g = sns.FacetGrid( - df2, - col='variable', - hue=target, - palette='rainbow', - col_wrap=2, - height=10) - g.map(plt.hist, 'target', alpha=0.5, density=True, bins=bins, ec="k") + df2, col="variable", hue=target, palette="rainbow", col_wrap=2, height=10 + ) + g.map(plt.hist, "target", alpha=0.5, density=True, bins=bins, ec="k") g.axes[-1].legend() plt.show() @@ -39,18 +39,16 @@ def plot_stacked_hist(df: pd.core.frame.DataFrame, feature: str, target: str) -> feature: Признак, распределение которго, требуется посмотреть target: Целевая переменная, будет на оси x графика """ - overview = pd.crosstab( - df[target], - df[feature]).sort_values( - target, - ascending=True) + overview = pd.crosstab(df[target], df[feature]).sort_values(target, ascending=True) sum_series = overview.sum(axis=1) for col in list(overview.columns): overview[col] = overview[col] / sum_series - overview.plot(kind='bar', stacked=True) + overview.plot(kind="bar", stacked=True) -def plot_dual_axis(data: pd.core.frame.DataFrame, col1: str, col2: str, title: str = None): +def plot_dual_axis( + data: pd.core.frame.DataFrame, col1: str, col2: str, title: str = None +): """ Построение графика с двумя осями ординат @@ -61,43 +59,18 @@ def plot_dual_axis(data: pd.core.frame.DataFrame, col1: str, col2: str, title: s col2: Название дополнительного признака (правая ось) title: Заголовок 
графика """ - fig, ax1 = plt.subplots( - figsize=(12, 6) - ) + fig, ax1 = plt.subplots(figsize=(12, 6)) ax2 = ax1.twinx() - ax2.bar( - data.index, - data[col2], - alpha=0.15, - fill=True, - edgecolor='b' - ) - ax1.plot( - data.index, - data[col1], - 'go--', - linewidth=2 - ) + ax2.bar(data.index, data[col2], alpha=0.15, fill=True, edgecolor="b") + ax1.plot(data.index, data[col1], "go--", linewidth=2) def naming(name): - return ' '.join( - [x[0].upper() + x[1:] - for x in name.split('_') - ] - ) + return " ".join([x[0].upper() + x[1:] for x in name.split("_")]) if data.index.name: - ax1.set_xlabel( - naming(data.index.name) - ) - ax1.set_ylabel( - naming(col1), - color='g' - ) - ax2.set_ylabel( - naming(col2), - color='b' - ) + ax1.set_xlabel(naming(data.index.name)) + ax1.set_ylabel(naming(col1), color="g") + ax2.set_ylabel(naming(col2), color="b") plt.title(title) plt.show() diff --git a/podlozhnyy_module/collocation.py b/podlozhnyy_module/collocation.py index 3f05bb8..7c34613 100644 --- a/podlozhnyy_module/collocation.py +++ b/podlozhnyy_module/collocation.py @@ -1,4 +1,5 @@ from itertools import combinations + from nltk.corpus import stopwords from podlozhnyy_module import pd diff --git a/podlozhnyy_module/correlation.py b/podlozhnyy_module/correlation.py index 72512db..c9e7ba8 100644 --- a/podlozhnyy_module/correlation.py +++ b/podlozhnyy_module/correlation.py @@ -1,6 +1,7 @@ from scipy.stats import t as student -from podlozhnyy_module import pd, sns, plt +from podlozhnyy_module import pd, plt, sns + def plot_corr_matrix( df: pd.core.frame.DataFrame, @@ -19,9 +20,9 @@ def plot_corr_matrix( if features is None: features = df.columns[df.dtypes != "object"] corr = df[features].corr(method=method) - plt.figure(figsize=(10, 10), facecolor='floralwhite') - sns.heatmap(corr, vmax=1, square=True, annot=True, cmap='cubehelix') - plt.title('Correlation between different features') + plt.figure(figsize=(10, 10), facecolor="floralwhite") + sns.heatmap(corr, vmax=1, square=True, annot=True, cmap="cubehelix") + plt.title("Correlation between different features") bottom, top = plt.ylim() plt.ylim([bottom + 0.05, top - 0.05]) plt.show() diff --git a/podlozhnyy_module/pareto.py b/podlozhnyy_module/pareto.py index 2ae2529..3a559f4 100644 --- a/podlozhnyy_module/pareto.py +++ b/podlozhnyy_module/pareto.py @@ -1,5 +1,5 @@ from scipy.optimize import minimize -from scipy.stats import pareto, bernoulli +from scipy.stats import bernoulli, pareto from podlozhnyy_module import np @@ -51,7 +51,6 @@ def theoretical(x): return {"alpha": result.x[0], "loc": result.x[1], "scale": result.x[2]} - class ParetoExtended: """ Распределение Парето дополненное значением слева принимаемым с заданной вероятностью. 
diff --git a/podlozhnyy_module/permutation.py b/podlozhnyy_module/permutation.py index ca19e1b..4137814 100644 --- a/podlozhnyy_module/permutation.py +++ b/podlozhnyy_module/permutation.py @@ -1,4 +1,4 @@ -from itertools import product, combinations +from itertools import combinations, product from podlozhnyy_module import np @@ -6,9 +6,11 @@ def permutation_t_stat(sample1, sample2): return np.mean(sample1) - np.mean(sample2) - + def get_random_permutations(n, max_permutations): - return set([tuple(x) for x in 2 * np.random.randint(2, size = (max_permutations, n)) - 1]) + return set( + [tuple(x) for x in 2 * np.random.randint(2, size=(max_permutations, n)) - 1] + ) def permutation_zero_dist_one_samp(sample, mean, max_permutations): @@ -16,7 +18,7 @@ def permutation_zero_dist_one_samp(sample, mean, max_permutations): if max_permutations: signs_array = get_random_permutations(len(sample), max_permutations) else: - signs_array = product([-1, 1], repeat = len(sample)) + signs_array = product([-1, 1], repeat=len(sample)) return [np.mean(centered_sample * signs) for signs in signs_array] @@ -36,12 +38,23 @@ def permutation_zero_dist_ind(sample1, sample2, max_combinations): if max_combinations: indices = get_random_combinations(n1, n2, max_combinations) else: - indices = [(list(index), filter(lambda i: i not in index, range(n))) - for index in combinations(range(n), n1)] - return [joined_sample[list(i[0])].mean() - joined_sample[list(i[1])].mean() for i in indices] - - -def permutation_test(test, control, kind: str = 'independent', max_permutations: int = None, alternative: str = 'two-sided'): + indices = [ + (list(index), filter(lambda i: i not in index, range(n))) + for index in combinations(range(n), n1) + ] + return [ + joined_sample[list(i[0])].mean() - joined_sample[list(i[1])].mean() + for i in indices + ] + + +def permutation_test( + test, + control, + kind: str = "independent", + max_permutations: int = None, + alternative: str = "two-sided", +): """ Проводит одно- или двух- выборочный статистический тест, используя семейство перестановочных критериев Возвращает значение p-value для заданного типа альтернативы @@ -59,37 +72,34 @@ def permutation_test(test, control, kind: str = 'independent', max_permutations: alternative: str Тип альтернативы: {'two-sided', 'less', 'greater'}, default='two-sided' """ - if alternative not in ('two-sided', 'less', 'greater'): - raise ValueError("alternative not recognized, should be 'two-sided', 'less' or 'greater'") - - if kind not in ('independent', 'related'): + if alternative not in ("two-sided", "less", "greater"): + raise ValueError( + "alternative not recognized, should be 'two-sided', 'less' or 'greater'" + ) + + if kind not in ("independent", "related"): raise ValueError("kind not recognized, should be 'related' or 'independent'") - + if isinstance(control, int) or isinstance(control, float): - zero_distr = permutation_zero_dist_one_samp( - test, control, max_permutations - ) - elif kind == 'related': + zero_distr = permutation_zero_dist_one_samp(test, control, max_permutations) + elif kind == "related": if len(test) != len(control): raise ValueError("related samples must have the same size") zero_distr = permutation_zero_dist_one_samp( np.array(test) - np.array(control), 0.0, max_permutations ) else: - zero_distr = permutation_zero_dist_ind( - test, control, max_permutations - ) + zero_distr = permutation_zero_dist_ind(test, control, max_permutations) t_stat = permutation_t_stat(test, control) - if alternative == 'two-sided': - return sum([1. 
if abs(x) >= abs(t_stat) - else 0. for x in zero_distr]) / len(zero_distr) + if alternative == "two-sided": + return sum([1.0 if abs(x) >= abs(t_stat) else 0.0 for x in zero_distr]) / len( + zero_distr + ) - if alternative == 'less': - return sum([1. if x <= t_stat else 0. for x in zero_distr] - ) / len(zero_distr) + if alternative == "less": + return sum([1.0 if x <= t_stat else 0.0 for x in zero_distr]) / len(zero_distr) - if alternative == 'greater': - return sum([1. if x >= t_stat else 0. for x in zero_distr] - ) / len(zero_distr) + if alternative == "greater": + return sum([1.0 if x >= t_stat else 0.0 for x in zero_distr]) / len(zero_distr) diff --git a/podlozhnyy_module/regression.py b/podlozhnyy_module/regression.py index 8c1ed0f..d8cb827 100644 --- a/podlozhnyy_module/regression.py +++ b/podlozhnyy_module/regression.py @@ -1,81 +1,94 @@ -import holoviews as hv - +from functools import wraps from itertools import product +from math import log -from sklearn.metrics import r2_score +import holoviews as hv +from scipy.stats import f as fisher from sklearn.linear_model import LinearRegression +from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split from statsmodels.stats.proportion import proportion_confint -from math import log -from functools import wraps -from scipy.stats import f as fisher - from podlozhnyy_module import np, pd, plt def _set_options(func): """Обертка для применения визуальных настроек""" + @wraps(func) def wrapper(*args, **kwargs): diagramm = func(*args, **kwargs) - for bnd, opts in [('matplotlib', matplotlib_opts), - ('bokeh', bokeh_opts)]: - if (bnd in hv.Store._options and bnd == hv.Store.current_backend): + for bnd, opts in [("matplotlib", matplotlib_opts), ("bokeh", bokeh_opts)]: + if bnd in hv.Store._options and bnd == hv.Store.current_backend: return diagramm.opts(opts) return diagramm + return wrapper -colors = hv.Cycle(['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', - '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']) +colors = hv.Cycle( + [ + "#1f77b4", + "#ff7f0e", + "#2ca02c", + "#d62728", + "#9467bd", + "#8c564b", + "#e377c2", + "#7f7f7f", + "#bcbd22", + "#17becf", + ] +) matplotlib_opts = { - 'Scatter.Weight_of_Evidence': { - 'plot': dict(show_grid=True, legend_position='right', width=450), - 'style': dict(color='r', size=5), + "Scatter.Weight_of_Evidence": { + "plot": dict(show_grid=True, legend_position="right", width=450), + "style": dict(color="r", size=5), }, - 'NdOverlay.Objects_rate': { - 'plot': dict(xrotation=45, legend_cols=1, legend_position='right'), + "NdOverlay.Objects_rate": { + "plot": dict(xrotation=45, legend_cols=1, legend_position="right"), }, - 'Spread.Objects_rate': { - 'plot': dict(show_legend=True, show_grid=True), - 'style': dict(facecolor=colors), + "Spread.Objects_rate": { + "plot": dict(show_legend=True, show_grid=True), + "style": dict(facecolor=colors), }, - 'Overlay.Woe_Stab': { - 'plot': dict(legend_position='right'), + "Overlay.Woe_Stab": { + "plot": dict(legend_position="right"), }, - 'Curve.Weight_of_Evidence': { - 'style': dict(color=colors), + "Curve.Weight_of_Evidence": { + "style": dict(color=colors), }, - 'Spread.Confident_Intervals': { - 'plot': dict(show_grid=True, xrotation=45), - 'style': dict(facecolor=colors, alpha=0.3), + "Spread.Confident_Intervals": { + "plot": dict(show_grid=True, xrotation=45), + "style": dict(facecolor=colors, alpha=0.3), }, } bokeh_opts = { - 'Scatter.Weight_of_Evidence': { - 'plot': dict(show_grid=True, tools=['hover'], 
legend_position='right', width=450), - 'style': dict(color='r', size=5), + "Scatter.Weight_of_Evidence": { + "plot": dict( + show_grid=True, tools=["hover"], legend_position="right", width=450 + ), + "style": dict(color="r", size=5), }, - 'NdOverlay.Objects_rate': { - 'plot': dict(xrotation=45, legend_position='right', width=450), + "NdOverlay.Objects_rate": { + "plot": dict(xrotation=45, legend_position="right", width=450), }, - 'Spread.Objects_rate': { - 'plot': dict(show_legend=True, show_grid=True, tools=['hover']), - 'style': dict(color=colors), + "Spread.Objects_rate": { + "plot": dict(show_legend=True, show_grid=True, tools=["hover"]), + "style": dict(color=colors), }, - 'Overlay.Woe_Stab': { - 'plot': dict(legend_position='right', width=450), + "Overlay.Woe_Stab": { + "plot": dict(legend_position="right", width=450), }, - 'Curve.Weight_of_Evidence': { - 'plot': dict(tools=['hover']), - 'style': dict(color=colors), + "Curve.Weight_of_Evidence": { + "plot": dict(tools=["hover"]), + "style": dict(color=colors), }, - 'Spread.Confident_Intervals': { - 'plot': dict(show_grid=True, xrotation=45), - 'style': dict(color=colors, alpha=0.3), + "Spread.Confident_Intervals": { + "plot": dict(show_grid=True, xrotation=45), + "style": dict(color=colors, alpha=0.3), }, } @@ -90,25 +103,23 @@ def make_bucket(df, feature, num_buck=10): feature: Название признака числового или категориального num_buck: Количество бакетов для группирровки """ - bucket = np.ceil( - df[feature].rank( - pct=True) * - num_buck).fillna( - num_buck + - 1) - agg = df[feature].groupby(bucket).agg(['min', 'max']) + bucket = np.ceil(df[feature].rank(pct=True) * num_buck).fillna(num_buck + 1) + agg = df[feature].groupby(bucket).agg(["min", "max"]) def _format_buck(row): - if row['bucket'] == num_buck + 1: - return 'missing' - elif row['min'] == row['max']: - return _format_(row['min']) + if row["bucket"] == num_buck + 1: + return "missing" + elif row["min"] == row["max"]: + return _format_(row["min"]) else: - return _format_(row['min']) + ' - ' + _format_(row['max']) + return _format_(row["min"]) + " - " + _format_(row["max"]) - bucket = df[[feature]].assign(bucket=bucket)\ - .join(agg, on='bucket')\ + bucket = ( + df[[feature]] + .assign(bucket=bucket) + .join(agg, on="bucket") .apply(_format_buck, axis=1) + ) return df.assign(bucket=bucket) @@ -126,25 +137,23 @@ def _format_(x, decimal=3): div, mod = x // 1, x % 1 if mod == 0: if div == 0: - return '%d' % x + return "%d" % x elif int(np.floor(np.log10(abs(div)))) < 3: - return '%d' % x + return "%d" % x if div == 0: power = int(np.floor(np.log10(abs(mod)))) digits = decimal - power - 1 - return '%s' % np.around(x, digits) + return "%s" % np.around(x, digits) else: power = int(np.floor(np.log10(abs(div)))) digits = decimal if power < 3: - return '%s' % np.around(x, digits) + return "%s" % np.around(x, digits) elif power < 10: - return '%se+0%s' % (np.around(x / - np.power(10, power), digits), power) + return "%se+0%s" % (np.around(x / np.power(10, power), digits), power) else: - return '%se+%s' % (np.around(x / - np.power(10, power), digits), power) - return '%s' % x + return "%se+%s" % (np.around(x / np.power(10, power), digits), power) + return "%s" % x @_set_options @@ -160,13 +169,20 @@ def check_linearity(df, feature, target, num_buck=10): target: Название целевой переменной num_buck: Количество бакетов, если признак числовой """ - return df.pipe(make_bucket, feature, num_buck) \ - .groupby('bucket').mean() \ - .pipe(lambda x: hv.Scatter(zip(np.array(x[feature]), 
np.array(x[target])), - kdims=f'{feature}', vdims=f'{target}', - label=f"Проверка линейности зависимости {target} от {feature}") - * simple_reg(np.array(x[feature]), - np.array(x[target]))) + return ( + df.pipe(make_bucket, feature, num_buck) + .groupby("bucket") + .mean() + .pipe( + lambda x: hv.Scatter( + zip(np.array(x[feature]), np.array(x[target])), + kdims=f"{feature}", + vdims=f"{target}", + label=f"Проверка линейности зависимости {target} от {feature}", + ) + * simple_reg(np.array(x[feature]), np.array(x[target])) + ) + ) @_set_options @@ -181,16 +197,18 @@ def check_homoscedacity(df, feature, target): target: Название целевой переменной """ simple_model = LinearRegression() - simple_model.fit( - np.array(df[feature]).reshape(-1, 1), np.array(df[target])) + simple_model.fit(np.array(df[feature]).reshape(-1, 1), np.array(df[target])) predicts = simple_model.predict(np.array(df[feature]).reshape(-1, 1)) def get_residuals(y, pred): return np.array(y) - np.array(pred) - return hv.Scatter(zip(predicts, get_residuals(df[target], predicts)), - kdims=['Estimated target'], vdims=['Residual'], - label=f"Проверка гомоскедастичности признака {feature}") + return hv.Scatter( + zip(predicts, get_residuals(df[target], predicts)), + kdims=["Estimated target"], + vdims=["Residual"], + label=f"Проверка гомоскедастичности признака {feature}", + ) def _logit(p): @@ -227,7 +245,7 @@ def _woe_confint(n, cnt, q): cnt: кол-во элементов в бакете q: вероятность просрочки на всем корпусе """ - p_low, p_high = proportion_confint(n, cnt, method='normal') + p_low, p_high = proportion_confint(n, cnt, method="normal") return _woe(p_low, q), _woe(p_high, q) @@ -245,20 +263,24 @@ def bad_rate(df, feature, target, num_buck=10): target: Название целевой переменной num_buck: Количество бакетов, если признак числовой """ - if df[feature].dtype == 'O': - return df.pipe(make_bucket, feature, num_buck)\ - .assign(obj_cnt=1)\ - .groupby('bucket')\ - .agg({target: 'sum', 'obj_cnt': 'sum'})\ - .rename(columns={target: 'target_sum'})\ + if df[feature].dtype == "O": + return ( + df.pipe(make_bucket, feature, num_buck) + .assign(obj_cnt=1) + .groupby("bucket") + .agg({target: "sum", "obj_cnt": "sum"}) + .rename(columns={target: "target_sum"}) .assign(bad_rate=lambda x: x.target_sum / x.obj_cnt) + ) else: - return df.pipe(make_bucket, feature, num_buck)\ - .assign(obj_cnt=1)\ - .groupby('bucket')\ - .agg({target: 'sum', 'obj_cnt': 'sum', feature: 'mean'})\ - .rename(columns={target: 'target_sum', feature: 'feature_avg'})\ + return ( + df.pipe(make_bucket, feature, num_buck) + .assign(obj_cnt=1) + .groupby("bucket") + .agg({target: "sum", "obj_cnt": "sum", feature: "mean"}) + .rename(columns={target: "target_sum", feature: "feature_avg"}) .assign(bad_rate=lambda x: x.target_sum / x.obj_cnt) + ) def woe(df, feature, target, num_buck=10): @@ -275,11 +297,13 @@ def woe(df, feature, target, num_buck=10): """ agg = bad_rate(df, feature, target, num_buck).reset_index() agg = agg[agg.target_sum != 0] - return agg.assign(nums=agg['obj_cnt'].sum(), bad_nums=agg['target_sum'].sum())\ - .assign(woe=lambda x: _woe(x.bad_rate, x.bad_nums / x.nums))\ - .drop(['bad_nums', 'nums'], axis=1)\ - .sort_values(by='woe', ascending=False)\ - .set_index('bucket') + return ( + agg.assign(nums=agg["obj_cnt"].sum(), bad_nums=agg["target_sum"].sum()) + .assign(woe=lambda x: _woe(x.bad_rate, x.bad_nums / x.nums)) + .drop(["bad_nums", "nums"], axis=1) + .sort_values(by="woe", ascending=False) + .set_index("bucket") + ) def IV(df, feature, target, 
num_buck=10): @@ -293,10 +317,17 @@ def IV(df, feature, target, num_buck=10): target: Название целевой переменной num_buck: Количество бакетов, если признак числовой """ - return woe(df, feature, target, num_buck)\ - .assign(iv=lambda x: (x.target_sum / x.target_sum.sum() - - (x.obj_cnt - x.target_sum) / (x.obj_cnt.sum() - x.target_sum.sum())) * x.woe)\ + return ( + woe(df, feature, target, num_buck) + .assign( + iv=lambda x: ( + x.target_sum / x.target_sum.sum() + - (x.obj_cnt - x.target_sum) / (x.obj_cnt.sum() - x.target_sum.sum()) + ) + * x.woe + ) .iv.sum() + ) def iv_report(df, features, target, num_buck=10): @@ -311,17 +342,18 @@ def iv_report(df, features, target, num_buck=10): target: Название целевой переменной num_buck: Количество бакетов для разбиения """ + def desc(x): if x > 0.5: - power = 'Suspicious' + power = "Suspicious" elif x > 0.3: - power = 'Strong' + power = "Strong" elif x > 0.1: - power = 'Medium' + power = "Medium" elif x > 0.02: - power = 'Weak' + power = "Weak" else: - power = 'Useless' + power = "Useless" return (x, power) ivs = {} @@ -330,10 +362,10 @@ def desc(x): ivs = list(ivs.items()) ivs.sort(key=lambda i: i[1], reverse=True) - print(' Name || Value || Interpretation') - print('--------------------------------------------------') + print(" Name || Value || Interpretation") + print("--------------------------------------------------") for feature in ivs: - print(f'{feature[0]:21} || {feature[1][0]:.3f} || {feature[1][1]}') + print(f"{feature[0]:21} || {feature[1][0]:.3f} || {feature[1][1]}") def iv_agg(df, features, target, num_bucks=[10, 10]): @@ -347,36 +379,51 @@ def iv_agg(df, features, target, num_bucks=[10, 10]): target: Название целевой переменной num_bucks: Список количества бакетов для каждой из переменных """ - index = make_bucket(df[[features[-1]]], features[-1], - num_buck=num_bucks[-1])['bucket'].values + index = make_bucket(df[[features[-1]]], features[-1], num_buck=num_bucks[-1])[ + "bucket" + ].values columns = [] for i, feature in enumerate(features[:-1]): - columns.append(make_bucket( - df[[feature]], feature, num_buck=num_bucks[i])['bucket'].values) - - obj_cnt = (pd.crosstab(index=index, - columns=columns, - margins=True)) - target_sum = (pd.crosstab(index=index, - columns=columns, - values=df[target].values, - aggfunc=np.sum, - margins=True)) - bad_rate = (pd.crosstab(index=index, - columns=columns, - values=df[target].values, - aggfunc=np.mean, - margins=True)) - agg = pd.DataFrame({'obj_cnt': obj_cnt.iloc[:-1, :-1].unstack().values, - 'target_sum': target_sum.iloc[:-1, :-1].unstack().values, - 'bad_rate': bad_rate.iloc[:-1, :-1].unstack().values}) + columns.append( + make_bucket(df[[feature]], feature, num_buck=num_bucks[i])["bucket"].values + ) + + obj_cnt = pd.crosstab(index=index, columns=columns, margins=True) + target_sum = pd.crosstab( + index=index, + columns=columns, + values=df[target].values, + aggfunc=np.sum, + margins=True, + ) + bad_rate = pd.crosstab( + index=index, + columns=columns, + values=df[target].values, + aggfunc=np.mean, + margins=True, + ) + agg = pd.DataFrame( + { + "obj_cnt": obj_cnt.iloc[:-1, :-1].unstack().values, + "target_sum": target_sum.iloc[:-1, :-1].unstack().values, + "bad_rate": bad_rate.iloc[:-1, :-1].unstack().values, + } + ) agg = agg[agg.target_sum != 0] - return agg.assign(nums=agg['obj_cnt'].sum(), bad_nums=agg['target_sum'].sum())\ - .assign(woe=lambda x: _woe(x.bad_rate, x.bad_nums / x.nums))\ - .assign(iv=lambda x: (x.target_sum / x.target_sum.sum() - - (x.obj_cnt - x.target_sum) / 
(x.obj_cnt.sum() - x.target_sum.sum())) * x.woe)\ - .iv.sum() + return ( + agg.assign(nums=agg["obj_cnt"].sum(), bad_nums=agg["target_sum"].sum()) + .assign(woe=lambda x: _woe(x.bad_rate, x.bad_nums / x.nums)) + .assign( + iv=lambda x: ( + x.target_sum / x.target_sum.sum() + - (x.obj_cnt - x.target_sum) / (x.obj_cnt.sum() - x.target_sum.sum()) + ) + * x.woe + ) + .iv.sum() + ) @_set_options @@ -392,8 +439,13 @@ def simple_reg(predictor, target): """ check = LinearRegression() check.fit(predictor.reshape(-1, 1), target) - return hv.Curve((np.array([min(predictor) - 1, max(predictor) + 1]), - check.coef_ * np.array([min(predictor) - 1, max(predictor) + 1]) + check.intercept_)) + return hv.Curve( + ( + np.array([min(predictor) - 1, max(predictor) + 1]), + check.coef_ * np.array([min(predictor) - 1, max(predictor) + 1]) + + check.intercept_, + ) + ) def r_2check(df, feature, target, num_buck=10): @@ -434,40 +486,48 @@ def plot_woe_curve(df, feature, target, num_buck=10): agg = bad_rate(df, feature, target, num_buck).reset_index() agg = agg[(agg.target_sum != 0) & (agg.feature_avg.notnull())] - agg = agg.assign(nums=agg['obj_cnt'].sum(), bad_nums=agg['target_sum'].sum())\ - .assign(woe=lambda x: _woe(x.bad_rate, x.bad_nums / x.nums), - woe_low=lambda x: _woe_confint(x.target_sum, - x.obj_cnt, - x.bad_nums / x.nums)[0], - woe_high=lambda x: _woe_confint(x.target_sum, - x.obj_cnt, - x.bad_nums / x.nums)[1])\ - .assign(woe_u=lambda x: x.woe_high - x.woe, - woe_b=lambda x: x.woe - x.woe_low) + agg = ( + agg.assign(nums=agg["obj_cnt"].sum(), bad_nums=agg["target_sum"].sum()) + .assign( + woe=lambda x: _woe(x.bad_rate, x.bad_nums / x.nums), + woe_low=lambda x: _woe_confint( + x.target_sum, x.obj_cnt, x.bad_nums / x.nums + )[0], + woe_high=lambda x: _woe_confint( + x.target_sum, x.obj_cnt, x.bad_nums / x.nums + )[1], + ) + .assign(woe_u=lambda x: x.woe_high - x.woe, woe_b=lambda x: x.woe - x.woe_low) + ) r2_woe = r_2check(df, feature, target, num_buck) - scatter = hv.Scatter(data=agg, - kdims=['feature_avg'], - vdims=['woe'], - group='Weight of Evidence', - label=f'r2_score = {r2_woe}') - errors = hv.ErrorBars(data=agg, - kdims=['feature_avg'], - vdims=['woe', 'woe_u', 'woe_b'], - group='Confident Intervals') + scatter = hv.Scatter( + data=agg, + kdims=["feature_avg"], + vdims=["woe"], + group="Weight of Evidence", + label=f"r2_score = {r2_woe}", + ) + errors = hv.ErrorBars( + data=agg, + kdims=["feature_avg"], + vdims=["woe", "woe_u", "woe_b"], + group="Confident Intervals", + ) reg = simple_reg(np.array(agg.feature_avg), np.array(agg.woe)) - return hv.Overlay(items=[scatter, errors, reg], - group='Woe Curve', - label=feature).redim.range(feature_avg=(agg.feature_avg.min() * 1.15, - agg.feature_avg.max() * 1.15), - woe=(agg.woe.min() * 1.15, - agg.woe.max() * 1.15)) + return hv.Overlay( + items=[scatter, errors, reg], group="Woe Curve", label=feature + ).redim.range( + feature_avg=(agg.feature_avg.min() * 1.15, agg.feature_avg.max() * 1.15), + woe=(agg.woe.min() * 1.15, agg.woe.max() * 1.15), + ) + # Динамика переменных и WoE @_set_options -def distribution(df, feature, date, num_buck=10, date_freq='Q'): +def distribution(df, feature, date, num_buck=10, date_freq="Q"): """ Строит график распределения признака во времени @@ -479,34 +539,43 @@ def distribution(df, feature, date, num_buck=10, date_freq='Q'): num_buck: Количество бакетов, если признак числовой date_freq: Частота агрегации времени """ - agg = df.pipe(make_bucket, feature, num_buck)\ - .assign(obj_cnt=1)\ - .groupby([pd.Grouper(key=date, 
freq=date_freq), 'bucket'])\ - .agg({'obj_cnt': sum})\ - .reset_index()\ - .assign(obj_total=lambda x: (x.groupby([pd.Grouper(key=date, - freq=date_freq)])['obj_cnt'].transform('sum')))\ - .assign(obj_rate=lambda x: x.obj_cnt / x.obj_total)\ - .reset_index()\ - .assign(objects_rate=lambda x: - x.groupby(date).apply( - lambda y: y.obj_rate.cumsum().to_frame()) - .reset_index(drop=True))\ - .assign(obj_rate_u=0, obj_rate_b=lambda x: x['obj_rate']) + agg = ( + df.pipe(make_bucket, feature, num_buck) + .assign(obj_cnt=1) + .groupby([pd.Grouper(key=date, freq=date_freq), "bucket"]) + .agg({"obj_cnt": sum}) + .reset_index() + .assign( + obj_total=lambda x: ( + x.groupby([pd.Grouper(key=date, freq=date_freq)])["obj_cnt"].transform( + "sum" + ) + ) + ) + .assign(obj_rate=lambda x: x.obj_cnt / x.obj_total) + .reset_index() + .assign( + objects_rate=lambda x: x.groupby(date) + .apply(lambda y: y.obj_rate.cumsum().to_frame()) + .reset_index(drop=True) + ) + .assign(obj_rate_u=0, obj_rate_b=lambda x: x["obj_rate"]) + ) data = hv.Dataset( - agg, kdims=[ - 'bucket', date], vdims=[ - 'objects_rate', 'obj_rate_b', 'obj_rate_u']) + agg, kdims=["bucket", date], vdims=["objects_rate", "obj_rate_b", "obj_rate_u"] + ) - return data.to.spread(kdims=[date], - vdims=['objects_rate', 'obj_rate_b', 'obj_rate_u'], - group='Objects rate', - label=feature).overlay('bucket') + return data.to.spread( + kdims=[date], + vdims=["objects_rate", "obj_rate_b", "obj_rate_u"], + group="Objects rate", + label=feature, + ).overlay("bucket") @_set_options -def woe_stab(df, feature, target, date, num_buck=3, date_freq='Q'): +def woe_stab(df, feature, target, date, num_buck=3, date_freq="Q"): """ Строит WoE признака во времени, позволяет оценить его устойчивость @@ -519,44 +588,46 @@ def woe_stab(df, feature, target, date, num_buck=3, date_freq='Q'): num_buck: Количество бакетов, если признак числовой date_freq: Частота агрегации времени """ - agg = df.pipe(make_bucket, feature, num_buck)\ - .assign(obj_cnt=1)\ - .groupby([pd.Grouper(key=date, freq=date_freq), 'bucket'])\ - .agg({target: 'sum', 'obj_cnt': sum})\ - .rename(columns={target: 'target_sum'})\ - .assign(bad_rate=lambda x: x.target_sum / x.obj_cnt) + agg = ( + df.pipe(make_bucket, feature, num_buck) + .assign(obj_cnt=1) + .groupby([pd.Grouper(key=date, freq=date_freq), "bucket"]) + .agg({target: "sum", "obj_cnt": sum}) + .rename(columns={target: "target_sum"}) + .assign(bad_rate=lambda x: x.target_sum / x.obj_cnt) + ) - agg = agg.assign(nums=agg.groupby([date])['obj_cnt'].transform('sum'), - bad_nums=agg.groupby([date])['target_sum'].transform('sum'))\ - .assign(woe=lambda x: _woe(x.bad_rate, x.bad_nums / x.nums), - woe_low=lambda x: _woe_confint(x.target_sum, - x.obj_cnt, - x.bad_nums / x.nums)[0], - woe_high=lambda x: _woe_confint(x.target_sum, - x.obj_cnt, - x.bad_nums / x.nums)[1])\ - .assign(woe_u=lambda x: x.woe_high - x.woe, - woe_b=lambda x: x.woe - x.woe_low)\ + agg = ( + agg.assign( + nums=agg.groupby([date])["obj_cnt"].transform("sum"), + bad_nums=agg.groupby([date])["target_sum"].transform("sum"), + ) + .assign( + woe=lambda x: _woe(x.bad_rate, x.bad_nums / x.nums), + woe_low=lambda x: _woe_confint( + x.target_sum, x.obj_cnt, x.bad_nums / x.nums + )[0], + woe_high=lambda x: _woe_confint( + x.target_sum, x.obj_cnt, x.bad_nums / x.nums + )[1], + ) + .assign(woe_u=lambda x: x.woe_high - x.woe, woe_b=lambda x: x.woe - x.woe_low) .reset_index() + ) agg = agg[agg.target_sum != 0] - data = hv.Dataset( - agg, kdims=[ - 'bucket', date], vdims=[ - 'woe', 'woe_b', 
'woe_u']) - - confident_intervals = (data.to.spread(kdims=[date], - vdims=['woe', 'woe_b', 'woe_u'], - group='Confident Intervals') - .overlay('bucket')) - woe_curves = (data.to.curve(kdims=[date], - vdims=['woe'], - group='Weight of Evidence') - .overlay('bucket')) - return hv.Overlay(items=[confident_intervals * woe_curves], - group='Woe Stab', - label=f'{feature}') + data = hv.Dataset(agg, kdims=["bucket", date], vdims=["woe", "woe_b", "woe_u"]) + + confident_intervals = data.to.spread( + kdims=[date], vdims=["woe", "woe_b", "woe_u"], group="Confident Intervals" + ).overlay("bucket") + woe_curves = data.to.curve( + kdims=[date], vdims=["woe"], group="Weight of Evidence" + ).overlay("bucket") + return hv.Overlay( + items=[confident_intervals * woe_curves], group="Woe Stab", label=f"{feature}" + ) def HL(target, predict, num_buck=10): @@ -569,18 +640,23 @@ def HL(target, predict, num_buck=10): predict - предсказания вероятности num_buck - количество бакетов """ - data = pd.DataFrame({'target': target, 'predict': predict}) + data = pd.DataFrame({"target": target, "predict": predict}) data = ( - data.pipe(make_bucket, 'predict', num_buck) - .assign(obj_cnt=1) - .groupby('bucket') - .agg({'target': 'sum', 'predict': 'mean', 'obj_cnt': 'sum'}) - .assign(bad_rate=lambda x: x.target / x.obj_cnt) - .reset_index() + data.pipe(make_bucket, "predict", num_buck) + .assign(obj_cnt=1) + .groupby("bucket") + .agg({"target": "sum", "predict": "mean", "obj_cnt": "sum"}) + .assign(bad_rate=lambda x: x.target / x.obj_cnt) + .reset_index() + ) + return int( + sum( + (data.predict - data.bad_rate) ** 2 + / (data.predict * (1 - data.predict)) + * data.obj_cnt + ) ) - return int(sum((data.predict - data.bad_rate) ** 2 / - (data.predict * (1 - data.predict)) * data.obj_cnt)) @_set_options @@ -596,25 +672,31 @@ def plot_gain_chart(target, predict, num_buck=10): predict - предсказания вероятности num_buck - количество бакетов """ - data = pd.DataFrame({'target': target, 'predict': predict}) + data = pd.DataFrame({"target": target, "predict": predict}) H = HL(target, predict, num_buck) data = ( - data.assign(bucket=np.ceil(data['predict'].rank(pct=True) * num_buck)) - .assign(obj_cnt=1) - .groupby('bucket') - .agg({'target': 'sum', 'predict': 'mean', 'obj_cnt': 'sum'}) - .assign(bad_rate=lambda x: x.target / x.obj_cnt) - .reset_index() + data.assign(bucket=np.ceil(data["predict"].rank(pct=True) * num_buck)) + .assign(obj_cnt=1) + .groupby("bucket") + .agg({"target": "sum", "predict": "mean", "obj_cnt": "sum"}) + .assign(bad_rate=lambda x: x.target / x.obj_cnt) + .reset_index() ) - bars_gain = hv.Bars(data, kdims=['bucket'], vdims=['bad_rate'], label='observed') \ - .opts(plot={'xrotation': 90, 'show_legend': True}, style={'color': 'yellow'}) + bars_gain = hv.Bars( + data, kdims=["bucket"], vdims=["bad_rate"], label="observed" + ).opts(plot={"xrotation": 90, "show_legend": True}, style={"color": "yellow"}) - curve_gain = hv.Curve(data, kdims=['bucket'], vdims=['predict'], label='predicted') \ - .opts(plot={'xrotation': 90, 'show_legend': True}, style={'color': 'black'}) + curve_gain = hv.Curve( + data, kdims=["bucket"], vdims=["predict"], label="predicted" + ).opts(plot={"xrotation": 90, "show_legend": True}, style={"color": "black"}) - return hv.Overlay([bars_gain, curve_gain]).redim.label(**{'target': 'Bad Rate'})\ - .relabel(f'HL_score = {H}').opts(plot={'legend_position': 'top_left'}) + return ( + hv.Overlay([bars_gain, curve_gain]) + .redim.label(**{"target": "Bad Rate"}) + .relabel(f"HL_score = {H}") + 
.opts(plot={"legend_position": "top_left"}) + ) def feature_importance(names, values, verbose=False, thr=0.05): @@ -637,17 +719,13 @@ def feature_importance(names, values, verbose=False, thr=0.05): coef_list.sort(key=lambda i: i[1], reverse=True) for i in coef_list: if i[1] >= thr: - print(i[0], ':', round(i[1], 5)) + print(i[0], ":", round(i[1], 5)) return val_dict @_set_options def plot_confusion_matrix( - cm, - classes, - normalize=False, - title='Confusion matrix', - cmap=plt.cm.Blues + cm, classes, normalize=False, title="Confusion matrix", cmap=plt.cm.Blues ) -> None: """ Печатает и отрисовывает матрицу ошибок для задачи классификации @@ -661,7 +739,7 @@ def plot_confusion_matrix( title: Название для графика cmap: Цветовая палитра, по умолчанию: plt.cm.Blues """ - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) @@ -671,27 +749,33 @@ def plot_confusion_matrix( plt.ylim([bottom, top]) if normalize: - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] print("Normalized confusion matrix") else: - print('Confusion matrix, without normalization') + print("Confusion matrix, without normalization") print(cm) - thresh = cm.max() / 2. + thresh = cm.max() / 2.0 for i, j in product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, cm[i, j], - horizontalalignment="center", - verticalalignment="center", - color="white" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + cm[i, j], + horizontalalignment="center", + verticalalignment="center", + color="white" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('Истинный класс') - plt.xlabel('Предсказанный класс') + plt.ylabel("Истинный класс") + plt.xlabel("Предсказанный класс") plt.show() -def regression_report(X: pd.core.frame.DataFrame, y: np.ndarray, model: object, **kwargs) -> dict: +def regression_report( + X: pd.core.frame.DataFrame, y: np.ndarray, model: object, **kwargs +) -> dict: """ Обучает заданную модель регрессии на предоставленных данных. В стандартный поток вывода публикуются метрики качества построенной модели и важность признаков. 
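
To illustrate the WoE/IV helpers whose formatting changed above, a small hedged sketch follows; the synthetic credit-style data, the column names, and the logistic link used to generate the target are invented for illustration only:

import numpy as np
import pandas as pd

from podlozhnyy_module import regression

# Synthetic credit-style sample: numeric features and a binary target (illustrative only)
rng = np.random.default_rng(42)
n = 1000
df = pd.DataFrame({
    "income": rng.lognormal(10, 0.6, size=n),
    "age": rng.normal(40, 12, size=n),
})
# Default probability decreases with age, so "age" should show a non-trivial IV
df["default_flg"] = (rng.random(n) < 1.0 / (1.0 + np.exp((df["age"] - 40) / 10))).astype(int)

# Information Value of a single bucketed feature
print(regression.IV(df, "income", "default_flg", num_buck=10))

# IV report over several features with a rough strength interpretation
regression.iv_report(df, ["income", "age"], "default_flg", num_buck=10)
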
diff --git a/podlozhnyy_module/timeseries.py b/podlozhnyy_module/timeseries.py index 695d8a3..97a9808 100644 --- a/podlozhnyy_module/timeseries.py +++ b/podlozhnyy_module/timeseries.py @@ -48,10 +48,10 @@ def double_exponential_smoothing(self): else: self.result.append(level + trend) val = self.series[i] - prev_level, level = level, self.alpha * \ - val + (1 - self.alpha) * (level + trend) - trend = self.beta * (level - prev_level) + \ - (1 - self.beta) * trend + prev_level, level = level, self.alpha * val + (1 - self.alpha) * ( + level + trend + ) + trend = self.beta * (level - prev_level) + (1 - self.beta) * trend self.Level.append(level) self.Trend.append(trend) @@ -85,8 +85,7 @@ class HoltWinters: """ - def __init__(self, series, slen, alpha, beta, - gamma, n_preds, scaling_factor=1.96): + def __init__(self, series, slen, alpha, beta, gamma, n_preds, scaling_factor=1.96): self.series = series self.slen = slen self.alpha = alpha @@ -98,8 +97,7 @@ def __init__(self, series, slen, alpha, beta, def initial_trend(self): sum = 0.0 for i in range(self.slen): - sum += float(self.series[i + self.slen] - - self.series[i]) / self.slen + sum += float(self.series[i + self.slen] - self.series[i]) / self.slen return sum / self.slen def initial_seasonal_components(self): @@ -109,13 +107,16 @@ def initial_seasonal_components(self): # вычисляем сезонные средние for j in range(n_seasons): season_averages.append( - sum(self.series[self.slen * j: self.slen * j + self.slen]) / float(self.slen)) + sum(self.series[self.slen * j : self.slen * j + self.slen]) + / float(self.slen) + ) # вычисляем начальные значения for i in range(self.slen): sum_of_vals_over_avg = 0.0 for j in range(n_seasons): - sum_of_vals_over_avg += self.series[self.slen * - j + i] - season_averages[j] + sum_of_vals_over_avg += ( + self.series[self.slen * j + i] - season_averages[j] + ) seasonals[i] = sum_of_vals_over_avg / n_seasons return seasonals @@ -142,18 +143,17 @@ def triple_exponential_smoothing(self): self.PredictedDeviation.append(deviations[i % self.slen]) - self.UpperBond.append(self.result[0] + - self.scaling_factor * - self.PredictedDeviation[0]) + self.UpperBond.append( + self.result[0] + self.scaling_factor * self.PredictedDeviation[0] + ) - self.LowerBond.append(self.result[0] - - self.scaling_factor * - self.PredictedDeviation[0]) + self.LowerBond.append( + self.result[0] - self.scaling_factor * self.PredictedDeviation[0] + ) continue if i >= len(self.series): # прогнозируем m = i - len(self.series) + 1 - self.result.append(smooth + m * trend + - seasonals[i % self.slen]) + self.result.append(smooth + m * trend + seasonals[i % self.slen]) # во время прогноза с каждым шагом увеличиваем неопределенность prev_deviation = deviations[i % self.slen] @@ -162,28 +162,30 @@ def triple_exponential_smoothing(self): else: self.result.append(smooth + trend + seasonals[i % self.slen]) val = self.series[i] - prev_smooth, smooth = smooth, self.alpha * \ - (val - seasonals[i % self.slen]) + \ - (1 - self.alpha) * (smooth + trend) - trend = self.beta * (smooth - prev_smooth) + \ - (1 - self.beta) * trend - seasonals[i % self.slen] = self.gamma * \ - (val - smooth) + (1 - self.gamma) * \ - seasonals[i % self.slen] + prev_smooth, smooth = smooth, self.alpha * ( + val - seasonals[i % self.slen] + ) + (1 - self.alpha) * (smooth + trend) + trend = self.beta * (smooth - prev_smooth) + (1 - self.beta) * trend + seasonals[i % self.slen] = ( + self.gamma * (val - smooth) + + (1 - self.gamma) * seasonals[i % self.slen] + ) # Отклонение рассчитывается 
в соответствии с алгоритмом # Брутлага prev_deviation = deviations[i % self.slen] - deviations[i % self.slen] = self.gamma * np.abs( - self.series[i] - self.result[i]) + (1 - self.gamma) * prev_deviation + deviations[i % self.slen] = ( + self.gamma * np.abs(self.series[i] - self.result[i]) + + (1 - self.gamma) * prev_deviation + ) - self.UpperBond.append(self.result[-1] + - self.scaling_factor * - prev_deviation) + self.UpperBond.append( + self.result[-1] + self.scaling_factor * prev_deviation + ) - self.LowerBond.append(self.result[-1] - - self.scaling_factor * - prev_deviation) + self.LowerBond.append( + self.result[-1] - self.scaling_factor * prev_deviation + ) self.Smooth.append(smooth) self.Trend.append(trend) @@ -191,7 +193,7 @@ def triple_exponential_smoothing(self): self.PredictedDeviation.append(deviations[i % self.slen]) -def timeseriesCVscore(x, data, r=0, method='HoltWinters', slen=7): +def timeseriesCVscore(x, data, r=0, method="HoltWinters", slen=7): """ Производит кросс-валидацию на временных рядах для модели линейного тренда Хольта или модели Хольта-Винтерса Максимальное значение n_splits, таково, что (n_splits + 1) * 2 * slen <= len(data) (для линейной модели Хольта slen=1) @@ -213,7 +215,8 @@ def timeseriesCVscore(x, data, r=0, method='HoltWinters', slen=7): def weighted_mse(actual, predictions, r): weights = [1 / np.power(1 + r, i) for i in range(len(actual), 0, -1)] return np.mean( - ((np.array(actual) - np.array(predictions)) ** 2) * np.array(weights)) + ((np.array(actual) - np.array(predictions)) ** 2) * np.array(weights) + ) # Вектор ошибок errors = [] @@ -228,29 +231,28 @@ def weighted_mse(actual, predictions, r): # выборке и считаем ошибку for train, test in tscv.split(values): - if method == 'HoltWinters': + if method == "HoltWinters": model = HoltWinters( series=values[train], slen=slen, alpha=x[0], beta=x[1], gamma=x[2], - n_preds=len(test)) + n_preds=len(test), + ) model.triple_exponential_smoothing() - if method == 'Holt': + if method == "Holt": model = HoltLinearTrend( - series=values[train], - alpha=x[0], - beta=x[1], - n_preds=len(test)) + series=values[train], alpha=x[0], beta=x[1], n_preds=len(test) + ) model.double_exponential_smoothing() - predictions = model.result[-len(test):] + predictions = model.result[-len(test) :] actual = values[test] - # Можно считать обыычный MSE или взвесить и дать больший вес свежим - # значением + # Можно считать обыычный MSE или взвесить и дать больший вес свежим + # значением error = weighted_mse(actual, predictions, r=r) errors.append(error) @@ -271,20 +273,20 @@ def plotHolt(model, dataset, target, predict_interval, xlim=None): xlim: Сколько последних точек надо отобразить на графике, по умолчанию - все """ if len(model.result) > len(dataset): - dataset = pd.concat([dataset, pd.DataFrame( - np.array([np.NaN] * predict_interval), columns=[target])]) + dataset = pd.concat( + [ + dataset, + pd.DataFrame(np.array([np.NaN] * predict_interval), columns=[target]), + ] + ) plt.figure(figsize=(25, 10)) plt.plot(model.result, "b", label="Model") plt.plot(dataset[target].values, "g", label="Actual") plt.axvspan( - len(dataset) - - predict_interval - - 1, - len(dataset), - alpha=0.5, - color='lightgrey') + len(dataset) - predict_interval - 1, len(dataset), alpha=0.5, color="lightgrey" + ) plt.grid(True) - plt.axis('tight') + plt.axis("tight") plt.legend(loc="best", fontsize=13) if xlim: plt.xlim(len(dataset) - xlim, len(dataset)) @@ -305,34 +307,37 @@ def plotHoltWinters(model, dataset, target, predict_interval, xlim=None): xlim: 
Сколько последних точек надо отобразить на графике, по умолчанию - все """ if len(model.result) > len(dataset): - dataset = pd.concat([dataset, pd.DataFrame( - np.array([np.NaN] * predict_interval), columns=[target])]) + dataset = pd.concat( + [ + dataset, + pd.DataFrame(np.array([np.NaN] * predict_interval), columns=[target]), + ] + ) Anomalies = np.array([np.NaN] * len(dataset[target])) - Anomalies[dataset[target].values < - model.LowerBond] = dataset[target].values[dataset[target].values < model.LowerBond] - Anomalies[dataset[target].values > - model.UpperBond] = dataset[target].values[dataset[target].values > model.UpperBond] + Anomalies[dataset[target].values < model.LowerBond] = dataset[target].values[ + dataset[target].values < model.LowerBond + ] + Anomalies[dataset[target].values > model.UpperBond] = dataset[target].values[ + dataset[target].values > model.UpperBond + ] plt.figure(figsize=(25, 10)) plt.plot(model.result, "b", label="Model") plt.plot(model.UpperBond, "k--", alpha=0.5, label="Up/Low confidence") plt.plot(model.LowerBond, "k--", alpha=0.5) - plt.fill_between(x=range(0, - len(model.result)), - y1=model.UpperBond, - y2=model.LowerBond, - alpha=0.5, - color="grey") + plt.fill_between( + x=range(0, len(model.result)), + y1=model.UpperBond, + y2=model.LowerBond, + alpha=0.5, + color="grey", + ) plt.plot(dataset[target].values, "g", label="Actual") plt.plot(Anomalies, "ro", markersize=7, label="Anomalies") plt.axvspan( - len(dataset) - - predict_interval - - 1, - len(dataset), - alpha=0.5, - color='lightgrey') + len(dataset) - predict_interval - 1, len(dataset), alpha=0.5, color="lightgrey" + ) plt.grid(True) - plt.axis('tight') + plt.axis("tight") plt.legend(loc="best", fontsize=13) if xlim: plt.xlim(len(dataset) - xlim, len(dataset)) diff --git a/podlozhnyy_module/timetest.py b/podlozhnyy_module/timetest.py index 2edf8c7..7cddee5 100644 --- a/podlozhnyy_module/timetest.py +++ b/podlozhnyy_module/timetest.py @@ -1,7 +1,14 @@ from podlozhnyy_module import np, pd -def stat_difference_by_flg(df: pd.core.frame.DataFrame, feature: str, target: str, name: str, flg: str, num_buck: int = 10): +def stat_difference_by_flg( + df: pd.core.frame.DataFrame, + feature: str, + target: str, + name: str, + flg: str, + num_buck: int = 10, +): """ Считает разность в значениях целевой переменной между подмножествами исходного df, разбитого по флагу Подсчет происходит по бакетам признака feature если он числовой и просто по его значениям, если категориальный @@ -15,49 +22,106 @@ def stat_difference_by_flg(df: pd.core.frame.DataFrame, feature: str, target: st flg: Название колонки-флага по которой исходный фрейм разбивается на две части. 
Вычитание из flg=0, как будто из прошлого num_buck: Количество бакетов, если признак числовой """ - if str(df[feature].dtype) in ('object', 'category'): - agg = df.assign(obj_cnt=1)\ - .rename(columns={feature: 'bucket'})\ - .groupby([flg, 'bucket'], as_index=False)\ - .agg({target: 'mean', 'obj_cnt': 'sum'})\ - .rename(columns={target: 'AR', 'obj_cnt': 'feature_cnt'})\ - + if str(df[feature].dtype) in ("object", "category"): + agg = ( + df.assign(obj_cnt=1) + .rename(columns={feature: "bucket"}) + .groupby([flg, "bucket"], as_index=False) + .agg({target: "mean", "obj_cnt": "sum"}) + .rename(columns={target: "AR", "obj_cnt": "feature_cnt"}) + ) after = agg[agg[flg] == 1].copy() before = agg[agg[flg] == 0].copy() - return before.join(after.set_index('bucket'), on='bucket', rsuffix='_after') \ - .assign(AR_decrease_mean=lambda x: x.AR - x.AR_after, - AR_decrease_std=lambda x: np.sqrt((x.AR * (1 - x.AR) / x.feature_cnt) + (x.AR_after * (1 - x.AR_after) / x.feature_cnt_after))) \ - .assign(AR_decrease_min=lambda x: x.AR_decrease_mean - 1.96 * x.AR_decrease_std, - AR_decrease_max=lambda x: x.AR_decrease_mean + 1.96 * x.AR_decrease_std) \ - .assign(AR_decrease_overall=lambda x: x.AR_decrease_mean * x.feature_cnt_after / after.feature_cnt.sum())[['bucket', 'AR_decrease_overall', 'AR_decrease_mean', 'AR_decrease_min', 'AR_decrease_max']] \ - .sort_values(by='AR_decrease_overall', ascending=False) \ - .rename(columns={'AR_decrease_overall': name + '_decrease_overall', - 'AR_decrease_mean': name + '_decrease_mean', - 'AR_decrease_min': name + '_decrease_min', - 'AR_decrease_max': name + '_decrease_max'})\ - .set_index('bucket') + return ( + before.join(after.set_index("bucket"), on="bucket", rsuffix="_after") + .assign( + AR_decrease_mean=lambda x: x.AR - x.AR_after, + AR_decrease_std=lambda x: np.sqrt( + (x.AR * (1 - x.AR) / x.feature_cnt) + + (x.AR_after * (1 - x.AR_after) / x.feature_cnt_after) + ), + ) + .assign( + AR_decrease_min=lambda x: x.AR_decrease_mean - 1.96 * x.AR_decrease_std, + AR_decrease_max=lambda x: x.AR_decrease_mean + 1.96 * x.AR_decrease_std, + ) + .assign( + AR_decrease_overall=lambda x: x.AR_decrease_mean + * x.feature_cnt_after + / after.feature_cnt.sum() + )[ + [ + "bucket", + "AR_decrease_overall", + "AR_decrease_mean", + "AR_decrease_min", + "AR_decrease_max", + ] + ] + .sort_values(by="AR_decrease_overall", ascending=False) + .rename( + columns={ + "AR_decrease_overall": name + "_decrease_overall", + "AR_decrease_mean": name + "_decrease_mean", + "AR_decrease_min": name + "_decrease_min", + "AR_decrease_max": name + "_decrease_max", + } + ) + .set_index("bucket") + ) else: # Бьем на бакеты, считаем AR до и после - agg = df[df[feature].notnull()]\ - .assign(bucket=np.ceil(df[feature].rank(pct=True) * num_buck), obj_cnt=1)\ - .groupby([flg, 'bucket'], as_index=False)\ - .agg({target: 'mean', 'obj_cnt': 'sum', feature: 'mean'})\ - .rename(columns={target: 'AR', 'obj_cnt': 'feature_cnt', feature: 'feature_avg'}) -# Разделяем фрейм на до и после + agg = ( + df[df[feature].notnull()] + .assign(bucket=np.ceil(df[feature].rank(pct=True) * num_buck), obj_cnt=1) + .groupby([flg, "bucket"], as_index=False) + .agg({target: "mean", "obj_cnt": "sum", feature: "mean"}) + .rename( + columns={target: "AR", "obj_cnt": "feature_cnt", feature: "feature_avg"} + ) + ) + # Разделяем фрейм на до и после after = agg[agg[flg] == 1].copy() before = agg[agg[flg] == 0].copy() -# Считаем среднюю разницу и доверительный интервал для этой средней - return before.join(after.set_index('bucket'), 
on='bucket', rsuffix='_after')\ - .assign(AR_decrease_mean=lambda x: x.AR - x.AR_after, - AR_decrease_std=lambda x: np.sqrt((x.AR * (1 - x.AR) / x.feature_cnt) + (x.AR_after * (1 - x.AR_after) / x.feature_cnt_after)))\ - .assign(AR_decrease_min=lambda x: x.AR_decrease_mean - 1.96 * x.AR_decrease_std, - AR_decrease_max=lambda x: x.AR_decrease_mean + 1.96 * x.AR_decrease_std)\ - .assign(AR_decrease_overall=lambda x: x.AR_decrease_mean * x.feature_cnt_after / after.feature_cnt.sum())[['bucket', 'feature_avg', 'feature_avg_after', 'AR_decrease_overall', 'AR_decrease_mean', 'AR_decrease_min', 'AR_decrease_max']]\ - .rename(columns={'feature_avg': 'feature_avg_before'})\ - .rename(columns={'AR_decrease_overall': name + '_decrease_overall', - 'AR_decrease_mean': name + '_decrease_mean', - 'AR_decrease_min': name + '_decrease_min', - 'AR_decrease_max': name + '_decrease_max'})\ - .set_index('bucket') + # Считаем среднюю разницу и доверительный интервал для этой средней + return ( + before.join(after.set_index("bucket"), on="bucket", rsuffix="_after") + .assign( + AR_decrease_mean=lambda x: x.AR - x.AR_after, + AR_decrease_std=lambda x: np.sqrt( + (x.AR * (1 - x.AR) / x.feature_cnt) + + (x.AR_after * (1 - x.AR_after) / x.feature_cnt_after) + ), + ) + .assign( + AR_decrease_min=lambda x: x.AR_decrease_mean - 1.96 * x.AR_decrease_std, + AR_decrease_max=lambda x: x.AR_decrease_mean + 1.96 * x.AR_decrease_std, + ) + .assign( + AR_decrease_overall=lambda x: x.AR_decrease_mean + * x.feature_cnt_after + / after.feature_cnt.sum() + )[ + [ + "bucket", + "feature_avg", + "feature_avg_after", + "AR_decrease_overall", + "AR_decrease_mean", + "AR_decrease_min", + "AR_decrease_max", + ] + ] + .rename(columns={"feature_avg": "feature_avg_before"}) + .rename( + columns={ + "AR_decrease_overall": name + "_decrease_overall", + "AR_decrease_mean": name + "_decrease_mean", + "AR_decrease_min": name + "_decrease_min", + "AR_decrease_max": name + "_decrease_max", + } + ) + .set_index("bucket") + ) diff --git a/setup.py b/setup.py index 0e6e059..87b647b 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="podlozhnyy_module", - version="2.0", + version="2.1", description="One place for the most useful methods for work", long_description=readme, long_description_content_type="text/markdown",