diff --git a/core/_machine_learning.py b/core/_machine_learning.py index 9e28b5c..f25d392 100644 --- a/core/_machine_learning.py +++ b/core/_machine_learning.py @@ -191,6 +191,9 @@ def fit(self, X, y=None, features=None, retrain=False): self.__init__(features) + # Set up an empty data frame for data to be scaled + scale_df = pd.DataFrame() + if self.ohe: # Get a subset of the data that requires one hot encoding ohe_df = X[self.ohe_meta.index.tolist()] @@ -305,6 +308,7 @@ def transform(self, X, y=None): """ X_transform = None + scale_df = pd.DataFrame() if self.ohe: # Get a subset of the data that requires one hot encoding @@ -561,7 +565,7 @@ def _print_log(self, step, **kwargs): f.write("Fit tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head())) try: - if len(self.scale_df) > 0: + if len(kwargs['scale_df']) > 0: sys.stdout.write("Fit scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head())) with open(self.log,'a', encoding='utf-8') as f: @@ -595,7 +599,7 @@ def _print_log(self, step, **kwargs): f.write("Transform tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head())) try: - if len(self.scale_df) > 0: + if len(kwargs['scale_df']) > 0: sys.stdout.write("Transform scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head())) with open(self.log,'a', encoding='utf-8') as f: diff --git a/core/_prophet.py b/core/_prophet.py index 235ef02..679b25e 100644 --- a/core/_prophet.py +++ b/core/_prophet.py @@ -252,7 +252,7 @@ def predict(self): if self.name is not None and len(self.add_seasonality_kwargs) > 0: self.model.add_seasonality(**self.add_seasonality_kwargs) - self.model.fit(self.input_df) + self.model.fit(self.input_df, **self.fit_kwargs) # Create a data frame for future values self.future_df = self.model.make_future_dataframe(**self.make_kwargs) @@ -320,6 +320,11 @@ def 
_set_params(self): self.mode = None self.seasonality_prior_scale = None self.holidays_prior_scale = None + self.mcmc_samples = None + self.seed = None + self.n_changepoints = None + self.changepoint_range = None + self.uncertainty_samples = None self.is_seasonality_request = False self.weekly_start = 6 # Defaulting to a Monday start for the week as used in Qlik self.yearly_start = 0 @@ -447,6 +452,34 @@ def _set_params(self): # Reducing this parameter dampens holiday effects. Default is 10, which provides very little regularization. if 'holidays_prior_scale' in self.kwargs: self.holidays_prior_scale = utils.atof(self.kwargs['holidays_prior_scale']) + + # Set the number of MCMC samples. + # If greater than 0, Prophet will do full Bayesian inference with the specified number of MCMC samples. + # If 0, Prophet will do MAP estimation. Default is 0. + if 'mcmc_samples' in self.kwargs: + self.mcmc_samples = utils.atoi(self.kwargs['mcmc_samples']) + + # Random seed that can be used to control stochasticity. + # Used for setting the numpy random seed used in predict and also for pystan when using mcmc_samples>0. + if 'random_seed' in self.kwargs: + self.seed = utils.atoi(self.kwargs['random_seed']) + + # Set the random seed for numpy + np.random.seed(self.seed) + + # Number of potential changepoints to include. Default value is 25. + # Potential changepoints are selected uniformly from the first `changepoint_range` proportion of the history. + if 'n_changepoints' in self.kwargs: + self.n_changepoints = utils.atoi(self.kwargs['n_changepoints']) + + # Proportion of history in which trend changepoints will be estimated. + # Defaults to 0.8 for the first 80%. + if 'changepoint_range' in self.kwargs: + self.changepoint_range = utils.atof(self.kwargs['changepoint_range']) + + # Number of simulated draws used to estimate uncertainty intervals. 
+ if 'uncertainty_samples' in self.kwargs: + self.uncertainty_samples = utils.atoi(self.kwargs['uncertainty_samples']) # Set the weekly start for 'weekly' seasonality requests # Default week start is 0 which represents Sunday. Add offset as required. @@ -468,16 +501,18 @@ def _set_params(self): if 'upper_window' in self.kwargs: self.upper_window = utils.atoi(self.kwargs['upper_window']) - # Create dictionary of arguments for the Prophet(), make_future_dataframe() and add_seasonality() functions + # Create dictionary of arguments for the Prophet(), make_future_dataframe(), add_seasonality() and fit() functions self.prophet_kwargs = {} self.make_kwargs = {} self.add_seasonality_kwargs = {} + self.fit_kwargs = {} # Populate the parameters in the corresponding dictionary: # Set up a list of possible key word arguments for the Prophet() function prophet_params = ['seasonality_mode', 'growth', 'changepoint_prior_scale', 'interval_width',\ - 'seasonality_prior_scale', 'holidays_prior_scale'] + 'seasonality_prior_scale', 'holidays_prior_scale', 'mcmc_samples', 'n_changepoints',\ + 'changepoint_range', 'uncertainty_samples'] # Create dictionary of key word arguments for the Prophet() function self.prophet_kwargs = self._populate_dict(prophet_params) @@ -493,6 +528,14 @@ def _set_params(self): # Create dictionary of key word arguments for the add_seasonality() function self.add_seasonality_kwargs = self._populate_dict(seasonality_params) + + # Pass the random seed to the fit method if MCMC is being used + if self.mcmc_samples is not None and self.mcmc_samples > 0: + # Set up a list of possible key word arguments for the fit() function + fit_params = ['seed'] + # Create dictionary of key word arguments for the fit() function + self.fit_kwargs = self._populate_dict(fit_params) + def _populate_dict(self, params): """ @@ -636,6 +679,7 @@ def _print_log(self, step): sys.stdout.write("Instance creation parameters: {0}\n\n".format(self.prophet_kwargs)) sys.stdout.write("Make future 
data frame parameters: {0}\n\n".format(self.make_kwargs)) sys.stdout.write("Add seasonality parameters: {0}\n\n".format(self.add_seasonality_kwargs)) + sys.stdout.write("Fit parameters: {0}\n\n".format(self.fit_kwargs)) sys.stdout.write("REQUEST DATA FRAME: {0} rows x cols\n\n".format(self.request_df.shape)) sys.stdout.write("{0} \n\n".format(self.request_df.to_string())) if len(self.NaT_df) > 0: @@ -653,6 +697,7 @@ def _print_log(self, step): f.write("Instance creation parameters: {0}\n\n".format(self.prophet_kwargs)) f.write("Make future data frame parameters: {0}\n\n".format(self.make_kwargs)) f.write("Add seasonality parameters: {0}\n\n".format(self.add_seasonality_kwargs)) + f.write("Fit parameters: {0}\n\n".format(self.fit_kwargs)) f.write("REQUEST DATA FRAME: {0} rows x cols\n\n".format(self.request_df.shape)) f.write("{0} \n\n".format(self.request_df.to_string())) if len(self.NaT_df) > 0: diff --git a/docs/Prophet.md b/docs/Prophet.md index c0190dd..b1d12e1 100644 --- a/docs/Prophet.md +++ b/docs/Prophet.md @@ -55,10 +55,15 @@ Any of these arguments can be included in the final string parameter for the Pro | debug | Flag to output additional information to the terminal and logs | `true`, `false` | Information will be printed to the terminal as well to a log file: `..\qlik-py-env\core\logs\Prophet Log .txt`. Particularly useful is looking at the Request Data Frame to see what you are sending to the algorithm and the Forecast Data Frame to see the possible result columns. | | load_script | Flag for calling the function from the Qlik load script. | `true`, `false` | Set to `true` if calling the Prophet function from the load script in the Qlik app. This will change the output to a table consisting of two fields; `ds` which is the datetime dimension passed to Prophet, and the specified return value (`yhat` by default). `ds` is returned as a string in the format `YYYY-MM-DD hh:mm:ss TT`.

This parameter only applies to the `Prophet` function. | | take_log | Take a logarithm of the values before forecasting | `true`, `false` | Default value is `false`. This can be applied when making the time series more stationary might improve forecast values. You can just try both options and compare the results. In either case the values are returned in the original scale. | +| random_seed | An integer to control some of the stochasticity in the model | An integer value e.g. `42`, `1000` | The random seed can be used to make uncertainty intervals for predictions deterministic and repeatable. If using `mcmc_samples` > 0 this also applies to MCMC sampling. However there may still be small variances in results from the model. More info [here](https://github.com/facebook/prophet/issues/849). | | cap | A saturating maximum for the forecast | A decimal or integer value e.g. `1000000` | You can apply a logistic growth trend model using this argument. For example when the maximum market size is known. More information [here](https://facebook.github.io/prophet/docs/saturating_forecasts.html). | | floor | A saturating minimum for the forecast | A decimal or integer value e.g. `0` | This argument must be used in combination with a cap. | | changepoint_prior_scale | A parameter to adjust the trend flexibility | A decimal value e.g. `0.05` | If the trend changes are being overfit (too much flexibility) or underfit (not enough flexibility), you can try adjusting this parameter. The default value is `0.05`. Increasing it will make the trend more flexible. Decreasing it will make the trend less flexible. More information [here](https://facebook.github.io/prophet/docs/trend_changepoints.html). | +| n_changepoints | Number of potential changepoints to include | An integer value e.g. `50` | This number of potential changepoints are selected uniformly from the first `changepoint_range` proportion of the history. The default value is `25`. 
| +| changepoint_range | Proportion of history in which trend changepoints will be estimated | A decimal value less than 1 e.g. `0.9` | Defaults to `0.8` for the first 80%. | | interval_width | The width of the uncertainty intervals | A decimal value e.g. `0.8` | The default value is `0.8` (80%). More information [here](https://facebook.github.io/prophet/docs/uncertainty_intervals.html). | +| uncertainty_samples | Number of simulated draws used to estimate uncertainty intervals | An integer value e.g. `1000` | The default value is `1000`. | +| mcmc_samples | Set the number of MCMC samples | An integer value e.g. `1000` | If greater than 0, Prophet will do full Bayesian inference with the specified number of MCMC samples. If 0, Prophet will do MAP estimation. The default value is `0`. | | seasonality_mode | Use additive or multiplicative model for seasonality. | `additive`, `multiplicative` | By default Prophet fits additive seasonalities, meaning the effect of the seasonality is added to the trend to get the forecast. If the seasonality is not a constant additive factor as assumed by Prophet, rather it grows with the trend you can set this parameter to `multiplicative`. More information [here](https://facebook.github.io/prophet/docs/multiplicative_seasonality.html). | | add_seasonality | Additional seasonality to be considered in the forecast. | A string value which represents the name of the seasonality e.g. `monthly` | Prophet will by default fit weekly and yearly seasonalities, if the time series is more than two cycles long. It will also fit daily seasonality for a sub-daily time series. You can add other seasonalities (monthly, quarterly, hourly) using this parameter. More information [here](https://facebook.github.io/prophet/docs/seasonality_and_holiday_effects.html). | | add_seasonality_mode | Use additive or multiplicative model for the additional seasonality. | `additive`, `multiplicative` | See the `seasonality_mode` parameter above. 
If the additional seasonality requires a different mode you can use this parameter. More information [here](https://facebook.github.io/prophet/docs/multiplicative_seasonality.html). |