Skip to content

Commit

Permalink
Changed dist_select plotting
Browse files: browse the repository at this point in the history
  • Loading branch information
Alexander März committed Aug 28, 2023
1 parent bfd869b commit fe6a588
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 53 deletions.
38 changes: 13 additions & 25 deletions lightgbmlss/distributions/distribution_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,7 +9,8 @@
from tqdm import tqdm

from typing import Any, Dict, Optional, List, Tuple
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


Expand Down Expand Up @@ -598,7 +599,6 @@ def dist_select(self,
target: np.ndarray,
candidate_distributions: List,
max_iter: int = 100,
n_samples: int = 1000,
plot: bool = False,
figure_size: tuple = (10, 5),
) -> pd.DataFrame:
Expand All @@ -614,8 +614,6 @@ def dist_select(self,
List of candidate distributions.
max_iter: int
Maximum number of iterations for the optimization.
n_samples: int
Number of samples to draw from the fitted distribution.
plot: bool
If True, a density plot of the actual and fitted distribution is created.
figure_size: tuple
Expand Down Expand Up @@ -650,11 +648,11 @@ def dist_select(self,
}
)
dist_list.append(fit_df)
fit_df = pd.concat(dist_list).sort_values(by=self.loss_fn, ascending=True)
fit_df["rank"] = fit_df[self.loss_fn].rank().astype(int)
fit_df.set_index(fit_df["rank"], inplace=True)
pbar.update(1)
pbar.set_description(f"Fitting of candidate distributions completed")
fit_df = pd.concat(dist_list).sort_values(by=self.loss_fn, ascending=True)
fit_df["rank"] = fit_df[self.loss_fn].rank().astype(int)
fit_df.set_index(fit_df["rank"], inplace=True)

if plot:
# Select best distribution
Expand All @@ -675,29 +673,19 @@ def dist_select(self,
axis=1,
)
fitted_params = pd.DataFrame(fitted_params, columns=best_dist_sel.param_dict.keys())
fitted_params.columns = best_dist_sel.param_dict.keys()
n_samples = np.max([10000, target.shape[0]])
n_samples = np.where(n_samples > 500000, 100000, n_samples)
dist_samples = best_dist_sel.draw_samples(fitted_params,
n_samples=n_samples,
seed=123).values

# Plot actual and fitted distribution
plot_df_actual = pd.DataFrame({"y": target.reshape(-1,), "type": "Actual"})
plot_df_fitted = pd.DataFrame({"y": dist_samples.reshape(-1,),
"type": f"Best-Fit: {best_dist['distribution'].values[0]}"})
plot_df = pd.concat([plot_df_actual, plot_df_fitted])

print(
ggplot(plot_df,
aes(x="y",
color="type")) +
geom_density(alpha=0.5) +
theme_bw(base_size=15) +
theme(figure_size=figure_size,
legend_position="right",
legend_title=element_blank(),
plot_title=element_text(hjust=0.5)) +
labs(title=f"Actual vs. Fitted Density")
)
plt.figure(figsize=figure_size)
sns.kdeplot(target.reshape(-1, ), label="Actual")
sns.kdeplot(dist_samples.reshape(-1, ), label=f"Best-Fit: {best_dist['distribution'].values[0]}")
plt.legend()
plt.title("Actual vs. Best-Fit Density", fontweight="bold", fontsize=16)
plt.show()

fit_df.drop(columns=["rank", "params"], inplace=True)

Expand Down
42 changes: 14 additions & 28 deletions lightgbmlss/distributions/flow_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,7 +11,8 @@
from tqdm import tqdm

from typing import Any, Dict, Optional, List, Tuple
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


Expand Down Expand Up @@ -637,7 +638,6 @@ def flow_select(self,
target: np.ndarray,
candidate_flows: List,
max_iter: int = 100,
n_samples: int = 1000,
plot: bool = False,
figure_size: tuple = (10, 5),
) -> pd.DataFrame:
Expand All @@ -653,8 +653,6 @@ def flow_select(self,
List of candidate normalizing flow specifications.
max_iter: int
Maximum number of iterations for the optimization.
n_samples: int
Number of samples drawn from the fitted distribution.
plot: bool
If True, a density plot of the actual and fitted distribution is created.
figure_size: tuple
Expand Down Expand Up @@ -692,11 +690,11 @@ def flow_select(self,
}
)
flow_list.append(fit_df)
fit_df = pd.concat(flow_list).sort_values(by=flow_sel.loss_fn, ascending=True)
fit_df["rank"] = fit_df[flow_sel.loss_fn].rank().astype(int)
fit_df.set_index(fit_df["rank"], inplace=True)
pbar.update(1)
pbar.set_description(f"Fitting of candidate normalizing flows completed")
fit_df = pd.concat(flow_list).sort_values(by=flow_sel.loss_fn, ascending=True)
fit_df["rank"] = fit_df[flow_sel.loss_fn].rank().astype(int)
fit_df.set_index(fit_df["rank"], inplace=True)

if plot:
# Select normalizing flow with the lowest loss
Expand All @@ -713,29 +711,17 @@ def flow_select(self,
flow_params = torch.tensor(best_flow["params"][0]).reshape(1, -1)
flow_dist_sel = best_flow_sel.create_spline_flow(input_dim=1)
_, flow_dist_sel = best_flow_sel.replace_parameters(flow_params, flow_dist_sel)
flow_samples = pd.DataFrame(flow_dist_sel.sample((n_samples,)).squeeze().detach().numpy().T)
n_samples = np.max([10000, target.shape[0]])
n_samples = np.where(n_samples > 500000, 100000, n_samples)
flow_samples = pd.DataFrame(flow_dist_sel.sample((n_samples,)).squeeze().detach().numpy().T).values

# Plot actual and fitted distribution
flow_samples["type"] = f"Best-Fit: {best_flow['NormFlow'].values[0]}"

df_actual = pd.DataFrame(target)
df_actual["type"] = "Data"

plot_df = pd.concat([df_actual, flow_samples]).rename(columns={0: "variable"})

print(
ggplot(plot_df,
aes(x="variable",
color="type")) +
geom_density(size=1.1) +
theme_bw(base_size=15) +
theme(figure_size=figure_size,
legend_position="right",
legend_title=element_blank(),
plot_title=element_text(hjust=0.5)) +
labs(title=f"Actual vs. Fitted Density",
x="")
)
plt.figure(figsize=figure_size)
sns.kdeplot(target.reshape(-1, ), label="Actual")
sns.kdeplot(flow_samples.reshape(-1, ), label=f"Best-Fit: {best_flow['NormFlow'].values[0]}")
plt.legend()
plt.title("Actual vs. Best-Fit Density", fontweight="bold", fontsize=16)
plt.show()

fit_df.drop(columns=["rank", "params"], inplace=True)

Expand Down

0 comments on commit fe6a588

Please sign in to comment.