diff --git a/html/.buildinfo b/html/.buildinfo
deleted file mode 100644
index 4d38845..0000000
--- a/html/.buildinfo
+++ /dev/null
@@ -1,4 +0,0 @@
-# Sphinx build info version 1
-# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 15909ba89e1fcdfb649a555995912958
-tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/html/_modules/index.html b/html/_modules/index.html
deleted file mode 100644
index 9da7a2e..0000000
--- a/html/_modules/index.html
+++ /dev/null
@@ -1,99 +0,0 @@
-import logging
-import os
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import yaml
-import joblib
-
-
-from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.svm import SVC
-from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix
-from sklearn.model_selection import train_test_split, GridSearchCV
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler, Normalizer
-
-from imblearn.over_sampling import SMOTE
-
-from yellowbrick import ROCAUC
-from yellowbrick.model_selection import FeatureImportances, ValidationCurve
-from yellowbrick.classifier import ClassificationReport, ConfusionMatrix, PrecisionRecallCurve, ClassPredictionError
-from yellowbrick.model_selection import RFECV
-
-from .preparation import Cleaner
-from .utils import get_path
-
-log = logging.getLogger(__name__)
-
-
-
-def predict_from_best_pipeline(X: pd.DataFrame, prob_flag=False, model_name='0.974_rfc_best_model.pkl',
- config_path=None):
- """
- Predict using the best model pipeline.
-
- Parameters
- ----------
- X : array-like
- Features to predict.
- prob_flag : bool, optional
- Whether to return probabilities, by default False.
- model_name : str, optional
- Name of the model to use, by default '0.974_rfc_best_model.pkl'
-    config_path : str, optional
-        Path to the configuration file. If None, the packaged default 'config/config.yaml' is used.
-
- Returns
- -------
- ndarray
- Predicted values or probabilities.
- """
-
-    if config_path is not None:
-        vtac_ml_pipe = VTACMLPipe(config_file=config_path)
-    else:
-        vtac_ml_pipe = VTACMLPipe()
-    vtac_ml_pipe.load_best_model(model_name=model_name)
-    log.debug(vtac_ml_pipe.best_model)
-    y = vtac_ml_pipe.predict(X, prob=prob_flag)
-    return y
-
-
-
-
-class VTACMLPipe:
- """
- A machine learning pipeline for training and evaluating an optimal model for optical identification of GRBs for the SVOM mission.
-
- Parameters
- ----------
-    config_file : str, optional
-        Path to the configuration file. Default 'config/config.yaml'.
-
- """
-
- def __init__(self, config_file='config/config.yaml'):
-
- """
- Initialize the VTACMLPipe.
-
- Parameters
- ----------
-        config_file : str, optional
-            Path to the configuration file. Default 'config/config.yaml'.
- """
-
- # initialize attributes
- self.config = None
- self.X = None
- self.y = None
- self.X_columns = None
- self.y_columns = None
- self.X_train = None
- self.y_train = None
- self.X_test = None
- self.y_test = None
- self.preprocessing = Pipeline(steps=[], verbose=True)
- self.full_pipeline = Pipeline(steps=[], verbose=True)
- self.models = {}
- self.best_model = None
- self.y_predict = None
- self.y_predict_prob = None
-
- # Load configs from config file
- self.load_config(config_file)
-
- # Defining Steps of the preprocessing pipeline
- cleaner = Cleaner(variables=self.X_columns)
- scaler = StandardScaler()
- normalizer = Normalizer()
- self.steps = [
- ('cleaner', cleaner),
- # ('ohe', ohe),
- ('scaler', scaler),
- ('normalizer', normalizer)
- ]
- self._create_pipe(self.steps)
-
-
- def load_config(self, config_file):
- """
- Load the configuration file and prepare the data.
-
- Parameters
- ----------
- config_file : str
- The path to the configuration file.
-
- """
- config_path = get_path(config_file)
-
- with open(config_path, 'r') as file:
- self.config = yaml.safe_load(file)
-
- # loading config file and prepping data
- data_file = self.config['Inputs']['file']
- data = self._get_data(data_file=data_file)
-
- self.X_columns = self.config['Inputs']['columns']
- self.X = data[self.X_columns]
- self.y_columns = self.config['Inputs']['target_column']
- self.y = data[self.y_columns]
- self._load_data(data, columns=self.X_columns, target=self.y_columns, test_size=0.2)
-
- # building models attribute
- for model in self.config['Models']:
- if model == 'rfc':
- self.models[model] = RandomForestClassifier()
- if model == 'svc':
- self.models[model] = SVC()
- if model == 'knn':
- self.models[model] = KNeighborsClassifier()
- if model == 'lr':
- self.models[model] = LogisticRegression()
- if model == 'dt':
- self.models[model] = DecisionTreeClassifier()
- if model == 'ada':
- self.models[model] = AdaBoostClassifier()
-
-
-
- def train(self, save_all_model=False, resample_flag=False, scoring='f1', cv=5):
- """
- Train the pipeline with the given data.
-
- Parameters
- ----------
- save_all_model : bool, optional
- Whether to save best model of each model type to output directory. Default is False.
- resample_flag : bool, optional
- Whether to resample the data. Default is False
- scoring : str, optional
- The scoring function to use. Default is 'f1'.
- cv : int, optional
- The cross-validation split to use. Default is 5.
-
- Returns
- -------
-        None
-            The best estimator found is stored in self.best_model.
- """
-
- models = self.models
-
- if resample_flag:
- self.X_train, self.y_train = self._resample(self.X_train, self.y_train)
-
-        if not self.preprocessing.steps:
-            log.warning("No preprocessing steps defined")
- model_path = None
- best_score = 0
- for name, model in models.items():
-
- param_grid = self.config['Models'][name]['param_grid']
-
- log.info("Model: {}".format(name))
-
- self.full_pipeline = Pipeline(steps=self.preprocessing.steps.copy(), verbose=True)
- self.full_pipeline.steps.append((name, model))
- log.info(self.full_pipeline.steps)
-
- # model fitting
- grid_search = GridSearchCV(self.full_pipeline, param_grid, scoring=scoring, verbose=2, cv=cv)
- grid_search.fit(self.X_train, self.y_train)
-
- model_filename = f'{round(grid_search.best_score_, 3)}_{name}_best_model.pkl'
- model_path = get_path(f'output/models/{model_filename}')
- if save_all_model:
- joblib.dump(
- grid_search.best_estimator_,
- model_path)
-
- if grid_search.best_score_ > best_score:
- best_score = grid_search.best_score_
- self.best_model = grid_search.best_estimator_
-
- log.info('*' * 50)
- log.info(f'Best {name} Pipeline:')
- log.info(grid_search.best_estimator_)
- log.info(f'Best Score: {grid_search.best_score_}')
- log.info('*' * 50)
- log.info(f'Overall best model: {self.best_model}')
-
-
- # self.save_best_model(model_path=model_path)
-
-
- def save_best_model(self, model_name='best_model', model_path=None):
- """
- Saves best model from training to the specified path in the config file. Optionally change name and/or path
- of the model.
-
- Parameters
- -------
- model_name : str, optional
- Name of the model to be saved. Default='best_model'.
- model_path : str, optional
- Path to the model to be saved. Default='model_path' in config file
-
-
- """
- if model_path is None:
-            model_path = get_path(f"{self.config['Outputs']['model_path']}/{model_name}")
-        else:
-            model_path = get_path(model_path)
-
- joblib.dump(self.best_model, model_path)
-        log.info(f'Saved model to {model_path}')
-
-
-
- def load_best_model(self, model_name):
- """
- Loads 'model_name' into current pipeline.
-
- Parameters
- -------
- model_name : str
- The name of the model from the Outputs/models/ directory to be loaded.
-
-
- """
-        model_path = get_path(f"{self.config['Outputs']['model_path']}/{model_name}")
- self.best_model = joblib.load(model_path)
-        log.info(f'Loaded {model_path}')
-
-
-
- def evaluate(self, name, plot=False, score=f1_score):
- """
- Evaluate the best model with various metrics and visualization.
-
- Parameters
- ----------
- name : str
- The name for the evaluation output.
- plot : bool, optional
- If True, generates and saves evaluation plots, by default False.
- score : callable, optional
- The scoring function to use for evaluation, by default f1_score.
-
- """
- viz_path = self.config['Outputs']['viz_path']
- output_path = get_path(f'{viz_path}/{name}')
-
- print(self.best_model.steps)
- if not os.path.exists(output_path):
- os.makedirs(output_path)
- print(f"Folder '{output_path}' created.")
- else:
- print(f"Folder '{output_path}' already exists.")
- # INCLUDE case in titles
- # model scoring
- #
- # if self.best_model.steps[-1][0] == 'knn' and plot:
- # print('plotting')
- # self.preprocessing.fit(self.X)
- # X = pd.DataFrame(self.preprocessing.transform(self.X))
- # self.plot_knn_neighbors(knn=self.best_model.steps[-1][1], X=X, y=self.y, features=self.X_columns)
- # print('done plotting')
- # else:
- print(self.best_model.steps[-1][1])
-
- # Evaluate model performance
- self._print_model_eval()
-
- if plot:
-
- _, ax_report = plt.subplots()
-
- report_viz = ClassificationReport(self.best_model, classes=["NOT_GRB", "IS_GRB"], support=True,
- ax=ax_report)
- report_viz.fit(self.X_train, self.y_train) # Fit the visualizer and the model
- report_viz.score(self.X_test, self.y_test) # Evaluate the model on the test data
- report_viz.show(outpath=output_path + '/classification_report.pdf')
-
- _, ax_cm_test = plt.subplots()
-
-            cm_test_viz = ConfusionMatrix(self.best_model, classes=["NOT_GRB", "IS_GRB"], percent=True, ax=ax_cm_test)
- cm_test_viz.fit(self.X_train, self.y_train)
- cm_test_viz.score(self.X_test, self.y_test)
- cm_test_viz.show(outpath=output_path + '/confusion_matrix_test.pdf')
-
- _, ax_cm_train = plt.subplots()
-
- cm_train_viz = ConfusionMatrix(self.best_model, classes=["NOT_GRB", "IS_GRB"], percent=True, ax=ax_cm_train)
- cm_train_viz.fit(self.X_train, self.y_train)
- cm_train_viz.score(self.X_train, self.y_train)
- cm_train_viz.show(outpath=output_path + '/confusion_matrix_train.pdf')
-
- _, ax_roc = plt.subplots()
-
- roc_viz = ROCAUC(self.best_model, classes=["NOT_GRB", "IS_GRB"], ax=ax_roc)
- roc_viz.fit(self.X_train, self.y_train) # Fit the training data to the visualizer
- roc_viz.score(self.X_test, self.y_test) # Evaluate the model on the test data
- roc_viz.show(outpath=output_path + '/ROC_AUC.pdf')
-
- _, ax_pr_curve = plt.subplots()
-
- pr_curve_viz = PrecisionRecallCurve(self.best_model, classes=["NOT_GRB", "IS_GRB"], ax=ax_pr_curve)
- pr_curve_viz.fit(self.X_train, self.y_train)
- pr_curve_viz.score(self.X_test, self.y_test)
- pr_curve_viz.show(outpath=output_path + '/PR_curve.pdf')
-
- _, ax_class_pred = plt.subplots()
- ax_class_pred.semilogy()
-
- class_pred_viz = ClassPredictionError(self.best_model, classes=["NOT_GRB", "IS_GRB"], ax=ax_class_pred)
- class_pred_viz.fit(self.X_train, self.y_train)
- class_pred_viz.score(self.X_test, self.y_test)
- class_pred_viz.show(outpath=output_path + '/class_predictions.pdf')
- #
-            if isinstance(self.best_model.steps[-1][1], RandomForestClassifier):
- _, ax_feature_imp = plt.subplots()
-
- feature_imp_viz = FeatureImportances(self.best_model.steps[-1][1], ax=ax_feature_imp)
- feature_imp_viz.fit(self.X, self.y)
- feature_imp_viz.show(outpath=output_path + '/feature_importances.pdf')
-
- # if plot_extra:
- # self.hyperparameter_valid_curve(outpath=output_path)
- # # self.recursive_feature_elimination_plot(outpath=output_path)
- #
-
-
- def predict(self, X, prob=False):
- """
- Predict using the best model.
-
- Parameters
- ----------
- X : DataFrame
- The input features for prediction.
- prob : bool, optional
- If True, returns the probability of the predictions, by default False.
-
- Returns
- -------
- ndarray
- The predicted values or probabilities.
- """
- X = X[self.X_columns]
- if prob is True:
- self.y_predict_prob = self.best_model.predict_proba(X)
- return self.y_predict_prob
- else:
- self.y_predict = self.best_model.predict(X)
- return self.y_predict
-
-
- @staticmethod
- def _get_data(data_file: str):
- """
- Load data from a parquet file.
-
- Parameters
- ----------
- data_file : str
- The name of the data file to load.
-
- Returns
- -------
- DataFrame
- The loaded data.
- """
- data_path = get_path(f'/data/{data_file}')
- print(data_path)
- data = pd.read_parquet(data_path, engine='fastparquet')
- return data
-
- def _load_data(self, data: pd.DataFrame, columns: list, target: str, test_size: float = 0.2):
- """
- Load the data from the source specified in the config.
-
- Parameters
- ----------
- data : pd.DataFrame
- The data to load.
- columns : list
- The columns to load.
- target: str
- The target column.
- test_size: float, optional
- The size of the test sample as a fraction of the total sample. Default is 0.2.
-
- Returns
- -------
-        None
- """
- X = data[columns]
- y = data[target]
- self._split_data(X, y, test_size)
-
- def _split_data(self, X, y, test_size):
- """
- Split data into training and testing sets.
-
- Parameters
- ----------
- X : DataFrame
- The input features.
- y : array-like
- The target values.
- test_size : float
- The proportion of the dataset to include in the test split.
-
- Returns
- -------
- None
- """
- (self.X_train,
- self.X_test,
- self.y_train,
- self.y_test) = train_test_split(X, y,
- test_size=test_size,
- random_state=123)
-
- # _, ax_class_balance = plt.subplots()
- # ax_class_balance.semilogy()
- # class_balance_visualizer = ClassBalance(labels=["NOT_GRB", "GRB"], ax=ax_class_balance,
- # kwargs={'verbose': 2})
- # class_balance_visualizer.fit(self.y_train, self.y_test) # Fit the data to the visualizer
- # class_balance_visualizer.show(outpath='/output/visualizations/class_balance.pdf')
-
- @staticmethod
- def _resample(X, y):
- """
- Resamples the input data
-
- Parameters
- -------
- X : pd.DataFrame
- input data
- y : pd.Series
- input label
-
- Returns
- -------
- X_ : pd.DataFrame
- resampled data
- y_ : pd.Series
- resampled label
- """
- sm = SMOTE(sampling_strategy='minority', random_state=42)
- X_, y_ = sm.fit_resample(X, y)
- return X_, y_
-
- def _create_pipe(self, steps):
- """
- Create the machine learning pipeline from the given steps.
-
- Parameters
- -------
- steps : list
- The steps to use for the machine learning preprocessing pipeline.
-
- Returns
- -------
-        None
- """
- for step in steps:
- self.preprocessing.steps.append(step)
-
- def _print_model_eval(self):
- """
-        Prints the evaluation of the model: mean absolute error (MAE), root mean squared error (RMSE),
-        F1 score and confusion matrices for the training and testing datasets.
- """
-
- train_pred = self.best_model.predict(self.X_train)
- test_pred = self.best_model.predict(self.X_test)
-
- train_conf_matrix = confusion_matrix(self.y_train, train_pred)
- test_conf_matrix = confusion_matrix(self.y_test, test_pred)
- print('*' * 50)
- print('Training score:')
- print(
- f'MAE: {round(mean_absolute_error(self.y_train, train_pred), 4)} '
- f'| RMSE: {round(mean_squared_error(self.y_train, train_pred, squared=False), 4)} '
- f'| F1: {round(f1_score(self.y_train, train_pred), 4)}'
- )
- print('Confusion Matrix:')
- print(train_conf_matrix)
- print('-' * 20)
- print('Validation score:')
- print(
- f'MAE: {round(mean_absolute_error(self.y_test, test_pred), 4)} '
- f'| RMSE: {round(mean_squared_error(self.y_test, test_pred, squared=False), 4)} '
- f'| F1: {round(f1_score(self.y_test, test_pred), 4)}'
- )
- print('Confusion Matrix:')
- print(test_conf_matrix)
-
-
-# def hyperparameter_valid_curve(self, outpath):
-# """
-# Validate hyperparameters and generate validation curves.
-#
-# Parameters
-# ----------
-# outpath : str
-# The output path where the validation curve plots will be saved.
-#
-# Returns
-# -------
-# None
-# """
-# best_model_name = self.best_model.steps[-1][0]
-# param_grid = self.config['Models'][best_model_name]['param_grid']
-# for param in param_grid:
-# # self.preprocessing.fit(self.X, self.y)
-# # processed_X = self.preprocessing.transform(self.X)
-# # processed_y = self.y
-# param_range = param_grid[param]
-# param_name = param.split('__')[1]
-# print(f'Validating {param_name} over range {param_range}')
-# _, ax_valid_curve = plt.subplots()
-#
-# valid_curve_viz = ValidationCurve(self.best_model,
-# param_name=param,
-# param_range=param_range,
-# cv=5,
-# scoring="f1",
-# ax=ax_valid_curve
-# )
-# valid_curve_viz.fit(self.X, self.y)
-# valid_curve_viz.show(outpath=f'{outpath}/{param_name}_valid_curve.pdf')
-#
-# def recursive_feature_elimination_plot(self, outpath):
-# """
-# Generate a recursive feature elimination plot.
-#
-# Parameters
-# ----------
-# outpath : str
-# The output path where the feature elimination plot will be saved.
-#
-# Returns
-# -------
-# None
-# """
-# _, ax_feature_elimination = plt.subplots()
-# visualizer = RFECV(self.best_model.steps[-1][1], cv=5, scoring='f1_weighted', ax=ax_feature_elimination)
-# visualizer.fit(self.X, self.y) # Fit the data to the visualizer
-# visualizer.show(outpath=outpath + 'feature_elimination.pdf')
-#
-# @staticmethod
-# def plot_knn_neighbors(knn, X, y, features):
-# """
-# Plot the KNN neighbors for a given dataset.
-#
-# Parameters
-# ----------
-# knn : KNeighborsClassifier
-# The KNN classifier.
-# X : DataFrame
-# The dataset containing the features.
-# y : array-like
-# The target values.
-# features : list
-# List of feature names to plot.
-#
-# Returns
-# -------
-# None
-# """
-#
-# # Select a random point
-# random_index = np.random.randint(0, len(X))
-# random_point = X.iloc[random_index]
-#
-# # Find the neighbors of the random point
-# neighbors = knn.kneighbors([random_point], return_distance=False)
-#
-# # Plot each pair of features
-# num_features = len(features)
-# for i in range(num_features):
-# for j in range(i + 1, num_features):
-# plt.figure(figsize=(8, 6))
-# plt.scatter(X.iloc[:, i], X.iloc[:, j], c=y, cmap='viridis', marker='o', edgecolor='k', s=50)
-# plt.scatter(random_point[i], random_point[j], c='red', marker='x', s=200, label='Random Point')
-# plt.scatter(X.iloc[neighbors[0], i], X.iloc[neighbors[0], j], c='red', marker='o', edgecolor='k', s=100,
-# facecolors='none', label='Neighbors')
-# plt.xlabel(features[i])
-# plt.ylabel(features[j])
-# plt.title(f'KNN Neighbors with {features[i]} vs {features[j]}')
-# plt.legend()
-# plt.savefig(
-# f'/output/visualizations/knn_plots/{features[i]}_vs_{features[j]}_knn_neighbors.pdf'
-# )
-
-
-
vtacML is a Python package designed for the real-time analysis of data from the Visible Telescope (VT) of the SVOM satellite. The package uses machine learning models to analyze features from a list of observed VT sources and identify potential gamma-ray burst (GRB) optical afterglow candidates. vtacML is integrated into the real-time SVOM VT VHF pipeline and flags each detected source with the probability that it is a GRB candidate. Burst Advocates (BAs) on shift then use this information to identify which source is the real GRB counterpart.
-The SVOM mission, a collaboration between the China National Space Administration (CNSA) and the French space agency CNES, aims to study gamma-ray bursts (GRBs), the most energetic explosions in the universe. The Visible Telescope (VT) on SVOM plays a critical role in observing these events in the optical wavelength range.
-vtacML leverages machine learning to analyze VT data, providing a probability score for each observation to indicate its likelihood of being a GRB candidate. The package includes tools for data preprocessing, model training, evaluation, and visualization.
To install vtacML, you can use pip:
pip install vtacML
Alternatively, you can clone the repository and install the package locally:
-git clone https://github.com/jerbeario/VTAC_ML.git
-cd vtacML
-pip install .
-
Here’s a quick example to get you started with vtacML:
-from vtacML.pipeline import VTACMLPipe
-
-# Initialize the pipeline
-pipeline = VTACMLPipe()
-
-# Load configuration
-pipeline.load_config('path/to/config.yaml')
-
-# Train the model
-pipeline.train()
-
-# Evaluate the model
-pipeline.evaluate('evaluation_name', plot=True)
-
-# Predict GRB candidates
-predictions = pipeline.predict(observation_dataframe, prob=True)
-print(predictions)
-
vtacML can perform grid search on a large array of models and parameters specified in the configuration file. Initialize the VTACMLPipe
class with a specified config file (or use the default) and train it. Then, you can save the best model for future use.
from vtacML.pipeline import VTACMLPipe
-
-# Initialize the pipeline with a configuration file
-pipeline = VTACMLPipe(config_file='path/to/config.yaml')
-
-# Train the model with grid search
-pipeline.train()
-
-# Save the best model
-pipeline.save_best_model('path/to/save/best_model.pkl')
-
After training and saving the best model, you can create a new instance of the VTACMLPipe
class and load the best model for further use.
from vtacML.pipeline import VTACMLPipe
-
-# Initialize a new pipeline instance
-pipeline = VTACMLPipe()
-
-# Load the best model
-pipeline.load_best_model('path/to/save/best_model.pkl')
-
-# Predict GRB candidates
-predictions = pipeline.predict(observation_dataframe, prob=True)
-print(predictions)
-
If you already have a trained model, you can use the quick wrapper function predict_from_best_pipeline
to predict data immediately. A pre-trained model is available by default.
from vtacML.pipeline import predict_from_best_pipeline
-
-# Predict GRB candidates using the pre-trained model
-predictions = predict_from_best_pipeline(observation_dataframe, model_name='path/to/pretrained_model.pkl')
-print(predictions)
-
The config file is used to configure the model searching process.
-# Default config file, used to search for best model using only first two sequences (X0, X1) from the VT pipeline
-Inputs:
- file: 'combined_qpo_vt_all_cases_with_GRB_with_flags.parquet' # Data file used for training. Located in /data/
-# path: 'combined_qpo_vt_with_GRB.parquet'
-# path: 'combined_qpo_vt_faint_case_with_GRB_with_flags.parquet'
- columns: [
- "MAGCAL_R0",
- "MAGCAL_B0",
- "MAGERR_R0",
- "MAGERR_B0",
- "MAGCAL_R1",
- "MAGCAL_B1",
- "MAGERR_R1",
- "MAGERR_B1",
- "MAGVAR_R1",
- "MAGVAR_B1",
- 'EFLAG_R0',
- 'EFLAG_R1',
- 'EFLAG_B0',
- 'EFLAG_B1',
- "NEW_SRC",
- "DMAG_CAT"
- ] # features used for training
- target_column: 'IS_GRB' # feature column that holds the class information to be predicted
-
-# Set of models and parameters to perform GridSearchCV over
-Models:
- rfc:
- class: RandomForestClassifier()
- param_grid:
- 'rfc__n_estimators': [100, 200, 300] # Number of trees in the forest
- 'rfc__max_depth': [4, 6, 8] # Maximum depth of the tree
- 'rfc__min_samples_split': [2, 5, 10] # Minimum number of samples required to split an internal node
- 'rfc__min_samples_leaf': [1, 2, 4] # Minimum number of samples required to be at a leaf node
- 'rfc__bootstrap': [True, False] # Whether bootstrap samples are used when building trees
- ada:
- class: AdaBoostClassifier()
- param_grid:
- 'ada__n_estimators': [50, 100, 200] # Number of weak learners
- 'ada__learning_rate': [0.01, 0.1, 1] # Learning rate
- 'ada__algorithm': ['SAMME'] # Algorithm for boosting
- svc:
- class: SVC()
- param_grid:
- 'svc__C': [0.1, 1, 10, 100] # Regularization parameter
- 'svc__kernel': ['poly', 'rbf', 'sigmoid'] # Kernel type to be used in the algorithm
- 'svc__gamma': ['scale', 'auto'] # Kernel coefficient
- 'svc__degree': [3, 4, 5] # Degree of the polynomial kernel function (if `kernel` is 'poly')
- knn:
- class: KNeighborsClassifier()
- param_grid:
- 'knn__n_neighbors': [3, 5, 7, 9] # Number of neighbors to use
- 'knn__weights': ['uniform', 'distance'] # Weight function used in prediction
- 'knn__algorithm': ['ball_tree', 'kd_tree', 'brute'] # Algorithm used to compute the nearest neighbors
- 'knn__p': [1, 2] # Power parameter for the Minkowski metric
- lr:
- class: LogisticRegression()
- param_grid:
- 'lr__penalty': ['l1', 'l2', 'elasticnet'] # Specify the norm of the penalty
- 'lr__C': [0.01, 0.1, 1, 10] # Inverse of regularization strength
- 'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] # Algorithm to use in the optimization problem
- 'lr__max_iter': [100, 200, 300] # Maximum number of iterations taken for the solvers to converge
- dt:
- class: DecisionTreeClassifier()
- param_grid:
- 'dt__criterion': ['gini', 'entropy'] # The function to measure the quality of a split
- 'dt__splitter': ['best', 'random'] # The strategy used to choose the split at each node
- 'dt__max_depth': [4, 6, 8, 10] # Maximum depth of the tree
- 'dt__min_samples_split': [2, 5, 10] # Minimum number of samples required to split an internal node
- 'dt__min_samples_leaf': [1, 2, 4] # Minimum number of samples required to be at a leaf node
-
-# Output directories
-Outputs:
- model_path: '/output/models'
- viz_path: '/output/visualizations/'
- plot_correlation:
- flag: True
- path: 'output/corr_plots/'
-
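To make the mapping from this file to the training loop concrete, below is a minimal sketch (not part of the package) of how one entry of the Models section drives a grid search; the config path and the choice of the 'rfc' entry are assumptions for illustration:

import yaml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# assumed location of a config file with the structure shown above
with open('config/config.yaml') as fh:
    config = yaml.safe_load(fh)

features = config['Inputs']['columns']
target = config['Inputs']['target_column']
param_grid = config['Models']['rfc']['param_grid']

# the final step name ('rfc') must match the prefix used in the param_grid keys
pipe = Pipeline([('scaler', StandardScaler()), ('rfc', RandomForestClassifier())])
search = GridSearchCV(pipe, param_grid, scoring='f1', cv=5)
# search.fit(data[features], data[target]) would then run one model family of the search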
-
Bases: object
A machine learning pipeline for training and evaluating an optimal model for optical identification of GRBs for the SVOM mission.
-config_path (str, optional) – Path to the configuration file. Default ‘config/config.yaml’
-Evaluate the best model with various metrics and visualization.
-name (str) – The name for the evaluation output.
plot (bool, optional) – If True, generates and saves evaluation plots, by default False.
score (callable, optional) – The scoring function to use for evaluation, by default f1_score.
Loads ‘model_name’ into current pipeline.
-model_name (str) – The name of the model from the Outputs/models/ directory to be loaded.
-Load the configuration file and prepare the data.
-config_file (str) – The path to the configuration file.
-Predict using the best model.
-X (DataFrame) – The input features for prediction.
prob (bool, optional) – If True, returns the probability of the predictions, by default False.
The predicted values or probabilities.
-ndarray
-Saves best model from training to the specified path in the config file. Optionally change name and/or path -of the model.
-model_name (str, optional) – Name of the model to be saved. Default=’best_model’.
model_path (str, optional) – Path to the model to be saved. Default=’model_path’ in config file
Train the pipeline with the given data.
-save_all_model (bool, optional) – Whether to save best model of each model type to output directory. Default is False.
resample_flag (bool, optional) – Whether to resample the data. Default is False
scoring (str, optional) – The scoring function to use. Default is ‘f1’.
cv (int, optional) – The cross-validation split to use. Default is 5.
Trained machine learning pipeline.
-Pipeline
-Predict using the best model pipeline.
-X (array-like) – Features to predict.
prob_flag (bool, optional) – Whether to return probabilities, by default False.
model_name (str, optional) – Name of the model to use, by default ‘0.974_rfc_best_model.pkl’
model_path (str, optional) – Path to the model to use for prediction, by default ‘None’
config_path (str, optional) – Path to the configuration file, by default ‘../config/config.yaml’.
Predicted values or probabilities.
-ndarray
-- Searching for multiple words only shows matches that contain - all words. -