diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle index f685706..d3537f2 100644 Binary files a/docs/build/doctrees/environment.pickle and b/docs/build/doctrees/environment.pickle differ diff --git a/docs/build/doctrees/index.doctree b/docs/build/doctrees/index.doctree index 212ccc4..0ce437a 100644 Binary files a/docs/build/doctrees/index.doctree and b/docs/build/doctrees/index.doctree differ diff --git a/docs/build/html/.buildinfo b/docs/build/html/.buildinfo new file mode 100644 index 0000000..4d38845 --- /dev/null +++ b/docs/build/html/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 15909ba89e1fcdfb649a555995912958 +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/build/html/_modules/index.html b/docs/build/html/_modules/index.html new file mode 100644 index 0000000..9da7a2e --- /dev/null +++ b/docs/build/html/_modules/index.html @@ -0,0 +1,99 @@ + + + +
+ + +
+import logging
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import yaml
+import joblib
+
+
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.svm import SVC
+from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler, Normalizer
+
+from imblearn.over_sampling import SMOTE
+
+from yellowbrick import ROCAUC
+from yellowbrick.model_selection import FeatureImportances, ValidationCurve
+from yellowbrick.classifier import ClassificationReport, ConfusionMatrix, PrecisionRecallCurve, ClassPredictionError
+from yellowbrick.model_selection import RFECV
+
+from .preparation import Cleaner
+from .utils import get_path
+
+log = logging.getLogger(__name__)
+
+
+
+[docs]
+def predict_from_best_pipeline(X: pd.DataFrame, prob_flag=False, model_name='0.974_rfc_best_model.pkl',
+ config_path=None):
+ """
+ Predict using the best model pipeline.
+
+ Parameters
+ ----------
+ X : array-like
+ Features to predict.
+ prob_flag : bool, optional
+ Whether to return probabilities, by default False.
+ model_name : str, optional
+ Name of the model to use, by default '0.974_rfc_best_model.pkl'
+ model_path : str, optional
+ Path to the model to use for prediction, by default 'None'
+ config_path : str, optional
+ Path to the configuration file, by default '../config/config.yaml'.
+
+ Returns
+ -------
+ ndarray
+ Predicted values or probabilities.
+ """
+
+ vtac_ml_pipe = VTACMLPipe(config_file=config_path)
+ print(model_name)
+ vtac_ml_pipe.load_best_model(model_name=model_name)
+ print(vtac_ml_pipe.best_model)
+ y = vtac_ml_pipe.predict(X, prob=prob_flag)
+ return y
+
+
+
+
+[docs]
+class VTACMLPipe:
+ """
+ A machine learning pipeline for training and evaluating an optimal model for optical identification of GRBs for the SVOM mission.
+
+ Parameters
+ ----------
+ config_path : str, optional
+ Path to the configuration file. Default 'config/config.yaml'
+
+ """
+
+ def __init__(self, config_file='config/config.yaml'):
+
+ """
+ Initialize the VTACMLPipe.
+
+ Parameters
+ ----------
+ config_path : str
+ Path to the configuration file.
+ """
+
+ # initialize attributes
+ self.config = None
+ self.X = None
+ self.y = None
+ self.X_columns = None
+ self.y_columns = None
+ self.X_train = None
+ self.y_train = None
+ self.X_test = None
+ self.y_test = None
+ self.preprocessing = Pipeline(steps=[], verbose=True)
+ self.full_pipeline = Pipeline(steps=[], verbose=True)
+ self.models = {}
+ self.best_model = None
+ self.y_predict = None
+ self.y_predict_prob = None
+
+ # Load configs from config file
+ self.load_config(config_file)
+
+ # Defining Steps of the preprocessing pipeline
+ cleaner = Cleaner(variables=self.X_columns)
+ scaler = StandardScaler()
+ normalizer = Normalizer()
+ self.steps = [
+ ('cleaner', cleaner),
+ # ('ohe', ohe),
+ ('scaler', scaler),
+ ('normalizer', normalizer)
+ ]
+ self._create_pipe(self.steps)
+
+
+[docs]
+ def load_config(self, config_file):
+ """
+ Load the configuration file and prepare the data.
+
+ Parameters
+ ----------
+ config_file : str
+ The path to the configuration file.
+
+ """
+ config_path = get_path(config_file)
+
+ with open(config_path, 'r') as file:
+ self.config = yaml.safe_load(file)
+
+ # loading config file and prepping data
+ data_file = self.config['Inputs']['file']
+ data = self._get_data(data_file=data_file)
+
+ self.X_columns = self.config['Inputs']['columns']
+ self.X = data[self.X_columns]
+ self.y_columns = self.config['Inputs']['target_column']
+ self.y = data[self.y_columns]
+ self._load_data(data, columns=self.X_columns, target=self.y_columns, test_size=0.2)
+
+ # building models attribute
+ for model in self.config['Models']:
+ if model == 'rfc':
+ self.models[model] = RandomForestClassifier()
+ if model == 'svc':
+ self.models[model] = SVC()
+ if model == 'knn':
+ self.models[model] = KNeighborsClassifier()
+ if model == 'lr':
+ self.models[model] = LogisticRegression()
+ if model == 'dt':
+ self.models[model] = DecisionTreeClassifier()
+ if model == 'ada':
+ self.models[model] = AdaBoostClassifier()
+
+
+
+[docs]
+ def train(self, save_all_model=False, resample_flag=False, scoring='f1', cv=5):
+ """
+ Train the pipeline with the given data.
+
+ Parameters
+ ----------
+ save_all_model : bool, optional
+ Whether to save best model of each model type to output directory. Default is False.
+ resample_flag : bool, optional
+ Whether to resample the data. Default is False
+ scoring : str, optional
+ The scoring function to use. Default is 'f1'.
+ cv : int, optional
+ The cross-validation split to use. Default is 5.
+
+ Returns
+ -------
+ Pipeline
+ Trained machine learning pipeline.
+ """
+
+ models = self.models
+
+ if resample_flag:
+ self.X_train, self.y_train = self._resample(self.X_train, self.y_train)
+
+ if self.preprocessing.steps is None:
+ print("No preprocessing steps")
+ model_path = None
+ best_score = 0
+ for name, model in models.items():
+
+ param_grid = self.config['Models'][name]['param_grid']
+
+ log.info("Model: {}".format(name))
+
+ self.full_pipeline = Pipeline(steps=self.preprocessing.steps.copy(), verbose=True)
+ self.full_pipeline.steps.append((name, model))
+ log.info(self.full_pipeline.steps)
+
+ # model fitting
+ grid_search = GridSearchCV(self.full_pipeline, param_grid, scoring=scoring, verbose=2, cv=cv)
+ grid_search.fit(self.X_train, self.y_train)
+
+ model_filename = f'{round(grid_search.best_score_, 3)}_{name}_best_model.pkl'
+ model_path = get_path(f'output/models/{model_filename}')
+ if save_all_model:
+ joblib.dump(
+ grid_search.best_estimator_,
+ model_path)
+
+ if grid_search.best_score_ > best_score:
+ best_score = grid_search.best_score_
+ self.best_model = grid_search.best_estimator_
+
+ log.info('*' * 50)
+ log.info(f'Best {name} Pipeline:')
+ log.info(grid_search.best_estimator_)
+ log.info(f'Best Score: {grid_search.best_score_}')
+ log.info('*' * 50)
+ log.info(f'Overall best model: {self.best_model}')
+
+
+ # self.save_best_model(model_path=model_path)
+
+
+[docs]
+ def save_best_model(self, model_name='best_model', model_path=None):
+ """
+ Saves best model from training to the specified path in the config file. Optionally change name and/or path
+ of the model.
+
+ Parameters
+ -------
+ model_name : str, optional
+ Name of the model to be saved. Default='best_model'.
+ model_path : str, optional
+ Path to the model to be saved. Default='model_path' in config file
+
+
+ """
+ if model_path is None:
+ model_path = get_path(f'{self.config['Outputs']['model_path']}/{model_name}')
+ else:
+ print(model_path)
+ model_path = get_path(model_path)
+ print(model_path)
+
+ joblib.dump(self.best_model, model_path)
+ logging.info(f'Saved model to {model_path}')
+
+
+
+[docs]
+ def load_best_model(self, model_name):
+ """
+ Loads 'model_name' into current pipeline.
+
+ Parameters
+ -------
+ model_name : str
+ The name of the model from the Outputs/models/ directory to be loaded.
+
+
+ """
+ model_path = get_path(f'{self.config['Outputs']['model_path']}/{model_name}')
+ self.best_model = joblib.load(model_path)
+ logging.info(f'Loaded {model_path}')
+
+
+
+[docs]
+ def evaluate(self, name, plot=False, score=f1_score):
+ """
+ Evaluate the best model with various metrics and visualization.
+
+ Parameters
+ ----------
+ name : str
+ The name for the evaluation output.
+ plot : bool, optional
+ If True, generates and saves evaluation plots, by default False.
+ score : callable, optional
+ The scoring function to use for evaluation, by default f1_score.
+
+ """
+ viz_path = self.config['Outputs']['viz_path']
+ output_path = get_path(f'{viz_path}/{name}')
+
+ print(self.best_model.steps)
+ if not os.path.exists(output_path):
+ os.makedirs(output_path)
+ print(f"Folder '{output_path}' created.")
+ else:
+ print(f"Folder '{output_path}' already exists.")
+ # INCLUDE case in titles
+ # model scoring
+ #
+ # if self.best_model.steps[-1][0] == 'knn' and plot:
+ # print('plotting')
+ # self.preprocessing.fit(self.X)
+ # X = pd.DataFrame(self.preprocessing.transform(self.X))
+ # self.plot_knn_neighbors(knn=self.best_model.steps[-1][1], X=X, y=self.y, features=self.X_columns)
+ # print('done plotting')
+ # else:
+ print(self.best_model.steps[-1][1])
+
+ # Evaluate model performance
+ self._print_model_eval()
+
+ if plot:
+
+ _, ax_report = plt.subplots()
+
+ report_viz = ClassificationReport(self.best_model, classes=["NOT_GRB", "IS_GRB"], support=True,
+ ax=ax_report)
+ report_viz.fit(self.X_train, self.y_train) # Fit the visualizer and the model
+ report_viz.score(self.X_test, self.y_test) # Evaluate the model on the test data
+ report_viz.show(outpath=output_path + '/classification_report.pdf')
+
+ _, ax_cm_test = plt.subplots()
+
+ cm_test_viz = ConfusionMatrix(self.best_model, classes=["NOT_GRB", "IS_GRB"], percent=True, axes=ax_cm_test)
+ cm_test_viz.fit(self.X_train, self.y_train)
+ cm_test_viz.score(self.X_test, self.y_test)
+ cm_test_viz.show(outpath=output_path + '/confusion_matrix_test.pdf')
+
+ _, ax_cm_train = plt.subplots()
+
+ cm_train_viz = ConfusionMatrix(self.best_model, classes=["NOT_GRB", "IS_GRB"], percent=True, ax=ax_cm_train)
+ cm_train_viz.fit(self.X_train, self.y_train)
+ cm_train_viz.score(self.X_train, self.y_train)
+ cm_train_viz.show(outpath=output_path + '/confusion_matrix_train.pdf')
+
+ _, ax_roc = plt.subplots()
+
+ roc_viz = ROCAUC(self.best_model, classes=["NOT_GRB", "IS_GRB"], ax=ax_roc)
+ roc_viz.fit(self.X_train, self.y_train) # Fit the training data to the visualizer
+ roc_viz.score(self.X_test, self.y_test) # Evaluate the model on the test data
+ roc_viz.show(outpath=output_path + '/ROC_AUC.pdf')
+
+ _, ax_pr_curve = plt.subplots()
+
+ pr_curve_viz = PrecisionRecallCurve(self.best_model, classes=["NOT_GRB", "IS_GRB"], ax=ax_pr_curve)
+ pr_curve_viz.fit(self.X_train, self.y_train)
+ pr_curve_viz.score(self.X_test, self.y_test)
+ pr_curve_viz.show(outpath=output_path + '/PR_curve.pdf')
+
+ _, ax_class_pred = plt.subplots()
+ ax_class_pred.semilogy()
+
+ class_pred_viz = ClassPredictionError(self.best_model, classes=["NOT_GRB", "IS_GRB"], ax=ax_class_pred)
+ class_pred_viz.fit(self.X_train, self.y_train)
+ class_pred_viz.score(self.X_test, self.y_test)
+ class_pred_viz.show(outpath=output_path + '/class_predictions.pdf')
+ #
+ if self.best_model.steps[-1][1] == RandomForestClassifier():
+ _, ax_feature_imp = plt.subplots()
+
+ feature_imp_viz = FeatureImportances(self.best_model.steps[-1][1], ax=ax_feature_imp)
+ feature_imp_viz.fit(self.X, self.y)
+ feature_imp_viz.show(outpath=output_path + '/feature_importances.pdf')
+
+ # if plot_extra:
+ # self.hyperparameter_valid_curve(outpath=output_path)
+ # # self.recursive_feature_elimination_plot(outpath=output_path)
+ #
+
+
+[docs]
+ def predict(self, X, prob=False):
+ """
+ Predict using the best model.
+
+ Parameters
+ ----------
+ X : DataFrame
+ The input features for prediction.
+ prob : bool, optional
+ If True, returns the probability of the predictions, by default False.
+
+ Returns
+ -------
+ ndarray
+ The predicted values or probabilities.
+ """
+ X = X[self.X_columns]
+ if prob is True:
+ self.y_predict_prob = self.best_model.predict_proba(X)
+ return self.y_predict_prob
+ else:
+ self.y_predict = self.best_model.predict(X)
+ return self.y_predict
+
+
+ @staticmethod
+ def _get_data(data_file: str):
+ """
+ Load data from a parquet file.
+
+ Parameters
+ ----------
+ data_file : str
+ The name of the data file to load.
+
+ Returns
+ -------
+ DataFrame
+ The loaded data.
+ """
+ data_path = get_path(f'/data/{data_file}')
+ print(data_path)
+ data = pd.read_parquet(data_path, engine='fastparquet')
+ return data
+
+ def _load_data(self, data: pd.DataFrame, columns: list, target: str, test_size: float = 0.2):
+ """
+ Load the data from the source specified in the config.
+
+ Parameters
+ ----------
+ data : pd.DataFrame
+ The data to load.
+ columns : list
+ The columns to load.
+ target: str
+ The target column.
+ test_size: float, optional
+ The size of the test sample as a fraction of the total sample. Default is 0.2.
+
+ Returns
+ -------
+ DataFrame
+ Loaded data.
+ """
+ X = data[columns]
+ y = data[target]
+ self._split_data(X, y, test_size)
+
+ def _split_data(self, X, y, test_size):
+ """
+ Split data into training and testing sets.
+
+ Parameters
+ ----------
+ X : DataFrame
+ The input features.
+ y : array-like
+ The target values.
+ test_size : float
+ The proportion of the dataset to include in the test split.
+
+ Returns
+ -------
+ None
+ """
+ (self.X_train,
+ self.X_test,
+ self.y_train,
+ self.y_test) = train_test_split(X, y,
+ test_size=test_size,
+ random_state=123)
+
+ # _, ax_class_balance = plt.subplots()
+ # ax_class_balance.semilogy()
+ # class_balance_visualizer = ClassBalance(labels=["NOT_GRB", "GRB"], ax=ax_class_balance,
+ # kwargs={'verbose': 2})
+ # class_balance_visualizer.fit(self.y_train, self.y_test) # Fit the data to the visualizer
+ # class_balance_visualizer.show(outpath='/output/visualizations/class_balance.pdf')
+
+ @staticmethod
+ def _resample(X, y):
+ """
+ Resamples the input data
+
+ Parameters
+ -------
+ X : pd.DataFrame
+ input data
+ y : pd.Series
+ input label
+
+ Returns
+ -------
+ X_ : pd.DataFrame
+ resampled data
+ y_ : pd.Series
+ resampled label
+ """
+ sm = SMOTE(sampling_strategy='minority', random_state=42)
+ X_, y_ = sm.fit_resample(X, y)
+ return X_, y_
+
+ def _create_pipe(self, steps):
+ """
+ Create the machine learning pipeline from the given steps.
+
+ Parameters
+ -------
+ steps : list
+ The steps to use for the machine learning preprocessing pipeline.
+
+ Returns
+ -------
+ Pipeline
+ The created machine learning pipeline.
+ """
+ for step in steps:
+ self.preprocessing.steps.append(step)
+
+ def _print_model_eval(self):
+ """
+ Prints the evaluation of the model, mean average error (MAE), root mean squared error (RMSE),
+ f1 score and confusion matrices for training and testing datasets.
+ """
+
+ train_pred = self.best_model.predict(self.X_train)
+ test_pred = self.best_model.predict(self.X_test)
+
+ train_conf_matrix = confusion_matrix(self.y_train, train_pred)
+ test_conf_matrix = confusion_matrix(self.y_test, test_pred)
+ print('*' * 50)
+ print('Training score:')
+ print(
+ f'MAE: {round(mean_absolute_error(self.y_train, train_pred), 4)} '
+ f'| RMSE: {round(mean_squared_error(self.y_train, train_pred, squared=False), 4)} '
+ f'| F1: {round(f1_score(self.y_train, train_pred), 4)}'
+ )
+ print('Confusion Matrix:')
+ print(train_conf_matrix)
+ print('-' * 20)
+ print('Validation score:')
+ print(
+ f'MAE: {round(mean_absolute_error(self.y_test, test_pred), 4)} '
+ f'| RMSE: {round(mean_squared_error(self.y_test, test_pred, squared=False), 4)} '
+ f'| F1: {round(f1_score(self.y_test, test_pred), 4)}'
+ )
+ print('Confusion Matrix:')
+ print(test_conf_matrix)
+
+
+# def hyperparameter_valid_curve(self, outpath):
+# """
+# Validate hyperparameters and generate validation curves.
+#
+# Parameters
+# ----------
+# outpath : str
+# The output path where the validation curve plots will be saved.
+#
+# Returns
+# -------
+# None
+# """
+# best_model_name = self.best_model.steps[-1][0]
+# param_grid = self.config['Models'][best_model_name]['param_grid']
+# for param in param_grid:
+# # self.preprocessing.fit(self.X, self.y)
+# # processed_X = self.preprocessing.transform(self.X)
+# # processed_y = self.y
+# param_range = param_grid[param]
+# param_name = param.split('__')[1]
+# print(f'Validating {param_name} over range {param_range}')
+# _, ax_valid_curve = plt.subplots()
+#
+# valid_curve_viz = ValidationCurve(self.best_model,
+# param_name=param,
+# param_range=param_range,
+# cv=5,
+# scoring="f1",
+# ax=ax_valid_curve
+# )
+# valid_curve_viz.fit(self.X, self.y)
+# valid_curve_viz.show(outpath=f'{outpath}/{param_name}_valid_curve.pdf')
+#
+# def recursive_feature_elimination_plot(self, outpath):
+# """
+# Generate a recursive feature elimination plot.
+#
+# Parameters
+# ----------
+# outpath : str
+# The output path where the feature elimination plot will be saved.
+#
+# Returns
+# -------
+# None
+# """
+# _, ax_feature_elimination = plt.subplots()
+# visualizer = RFECV(self.best_model.steps[-1][1], cv=5, scoring='f1_weighted', ax=ax_feature_elimination)
+# visualizer.fit(self.X, self.y) # Fit the data to the visualizer
+# visualizer.show(outpath=outpath + 'feature_elimination.pdf')
+#
+# @staticmethod
+# def plot_knn_neighbors(knn, X, y, features):
+# """
+# Plot the KNN neighbors for a given dataset.
+#
+# Parameters
+# ----------
+# knn : KNeighborsClassifier
+# The KNN classifier.
+# X : DataFrame
+# The dataset containing the features.
+# y : array-like
+# The target values.
+# features : list
+# List of feature names to plot.
+#
+# Returns
+# -------
+# None
+# """
+#
+# # Select a random point
+# random_index = np.random.randint(0, len(X))
+# random_point = X.iloc[random_index]
+#
+# # Find the neighbors of the random point
+# neighbors = knn.kneighbors([random_point], return_distance=False)
+#
+# # Plot each pair of features
+# num_features = len(features)
+# for i in range(num_features):
+# for j in range(i + 1, num_features):
+# plt.figure(figsize=(8, 6))
+# plt.scatter(X.iloc[:, i], X.iloc[:, j], c=y, cmap='viridis', marker='o', edgecolor='k', s=50)
+# plt.scatter(random_point[i], random_point[j], c='red', marker='x', s=200, label='Random Point')
+# plt.scatter(X.iloc[neighbors[0], i], X.iloc[neighbors[0], j], c='red', marker='o', edgecolor='k', s=100,
+# facecolors='none', label='Neighbors')
+# plt.xlabel(features[i])
+# plt.ylabel(features[j])
+# plt.title(f'KNN Neighbors with {features[i]} vs {features[j]}')
+# plt.legend()
+# plt.savefig(
+# f'/output/visualizations/knn_plots/{features[i]}_vs_{features[j]}_knn_neighbors.pdf'
+# )
+
+
+
' + + '' + + _("Hide Search Matches") + + "
" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query. + */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/docs/build/html/genindex.html b/docs/build/html/genindex.html new file mode 100644 index 0000000..505a10c --- /dev/null +++ b/docs/build/html/genindex.html @@ -0,0 +1,188 @@ + + + + + + ++ |
+ | + |
|
+
+ | + |
+ |
+ |
|
+ + |
vtacML is a Python package designed for the real-time analysis of data from the Visible Telescope (VT) of the SVOM satellite. This package uses machine learning models to analyze features from a list of observed VT sources and identify potential gamma-ray burst (GRB) optical afterglow candidates. vtacML is integrated into the real-time SVOM VT VHF pipeline and flags each source detected, indicating the probability that it is a GRB candidate. This information is then used by Burst Advocates (BAs) on shift to help them identify which source is the real GRB counterpart.
+The SVOM mission, a collaboration between the China National Space Administration (CNSA) and the French space agency CNES, aims to study gamma-ray bursts (GRBs), the most energetic explosions in the universe. The Visible Telescope (VT) on SVOM plays a critical role in observing these events in the optical wavelength range.
+vtacML leverages machine learning to analyze VT data, providing a probability score for each observation to indicate its likelihood of being a GRB candidate. The package includes tools for data preprocessing, model training, evaluation, and visualization.
+To install vtacML, you can use pip:
pip install vtacML
+
Alternatively, you can clone the repository and install the package locally:
+git clone https://github.com/jerbeario/VTAC_ML.git
+cd VTAC_ML
+pip install .
+
Here’s a quick example to get you started with vtacML:
+from vtacML.pipeline import VTACMLPipe
+
+# Initialize the pipeline
+pipeline = VTACMLPipe()
+
+# Load configuration
+pipeline.load_config('path/to/config.yaml')
+
+# Train the model
+pipeline.train()
+
+# Evaluate the model
+pipeline.evaluate('evaluation_name', plot=True)
+
+# Predict GRB candidates
+predictions = pipeline.predict(observation_dataframe, prob=True)
+print(predictions)
+
vtacML can perform grid search on a large array of models and parameters specified in the configuration file. Initialize the VTACMLPipe class with a specified config file (or use the default) and train it. Then, you can save the best model for future use.
from vtacML.pipeline import VTACMLPipe
+
+# Initialize the pipeline with a configuration file
+pipeline = VTACMLPipe(config_file='path/to/config.yaml')
+
+# Train the model with grid search
+pipeline.train()
+
+# Save the best model
+pipeline.save_best_model('path/to/save/best_model.pkl')
+
After training and saving the best model, you can create a new instance of the VTACMLPipe class and load the best model for further use.
from vtacML.pipeline import VTACMLPipe
+
+# Initialize a new pipeline instance
+pipeline = VTACMLPipe()
+
+# Load the best model
+pipeline.load_best_model('path/to/save/best_model.pkl')
+
+# Predict GRB candidates
+predictions = pipeline.predict(observation_dataframe, prob=True)
+print(predictions)
+
If you already have a trained model, you can use the quick wrapper function predict_from_best_pipeline to predict data immediately. A pre-trained model is available by default.
from vtacML.pipeline import predict_from_best_pipeline
+
+# Predict GRB candidates using the pre-trained model
+predictions = predict_from_best_pipeline(observation_dataframe, model_name='0.974_rfc_best_model.pkl')
+print(predictions)
+
The config file is used to configure the model searching process.
+# Default config file, used to search for best model using only first two sequences (X0, X1) from the VT pipeline
+Inputs:
+ file: 'combined_qpo_vt_all_cases_with_GRB_with_flags.parquet' # Data file used for training. Located in /data/
+# path: 'combined_qpo_vt_with_GRB.parquet'
+# path: 'combined_qpo_vt_faint_case_with_GRB_with_flags.parquet'
+ columns: [
+ "MAGCAL_R0",
+ "MAGCAL_B0",
+ "MAGERR_R0",
+ "MAGERR_B0",
+ "MAGCAL_R1",
+ "MAGCAL_B1",
+ "MAGERR_R1",
+ "MAGERR_B1",
+ "MAGVAR_R1",
+ "MAGVAR_B1",
+ 'EFLAG_R0',
+ 'EFLAG_R1',
+ 'EFLAG_B0',
+ 'EFLAG_B1',
+ "NEW_SRC",
+ "DMAG_CAT"
+ ] # features used for training
+ target_column: 'IS_GRB' # feature column that holds the class information to be predicted
+
+# Set of models and parameters to perform GridSearchCV over
+Models:
+ rfc:
+ class: RandomForestClassifier()
+ param_grid:
+ 'rfc__n_estimators': [100, 200, 300] # Number of trees in the forest
+ 'rfc__max_depth': [4, 6, 8] # Maximum depth of the tree
+ 'rfc__min_samples_split': [2, 5, 10] # Minimum number of samples required to split an internal node
+ 'rfc__min_samples_leaf': [1, 2, 4] # Minimum number of samples required to be at a leaf node
+ 'rfc__bootstrap': [True, False] # Whether bootstrap samples are used when building trees
+ ada:
+ class: AdaBoostClassifier()
+ param_grid:
+ 'ada__n_estimators': [50, 100, 200] # Number of weak learners
+ 'ada__learning_rate': [0.01, 0.1, 1] # Learning rate
+ 'ada__algorithm': ['SAMME'] # Algorithm for boosting
+ svc:
+ class: SVC()
+ param_grid:
+ 'svc__C': [0.1, 1, 10, 100] # Regularization parameter
+ 'svc__kernel': ['poly', 'rbf', 'sigmoid'] # Kernel type to be used in the algorithm
+ 'svc__gamma': ['scale', 'auto'] # Kernel coefficient
+ 'svc__degree': [3, 4, 5] # Degree of the polynomial kernel function (if `kernel` is 'poly')
+ knn:
+ class: KNeighborsClassifier()
+ param_grid:
+ 'knn__n_neighbors': [3, 5, 7, 9] # Number of neighbors to use
+ 'knn__weights': ['uniform', 'distance'] # Weight function used in prediction
+ 'knn__algorithm': ['ball_tree', 'kd_tree', 'brute'] # Algorithm used to compute the nearest neighbors
+ 'knn__p': [1, 2] # Power parameter for the Minkowski metric
+ lr:
+ class: LogisticRegression()
+ param_grid:
+ 'lr__penalty': ['l1', 'l2', 'elasticnet'] # Specify the norm of the penalty
+ 'lr__C': [0.01, 0.1, 1, 10] # Inverse of regularization strength
+ 'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] # Algorithm to use in the optimization problem
+ 'lr__max_iter': [100, 200, 300] # Maximum number of iterations taken for the solvers to converge
+ dt:
+ class: DecisionTreeClassifier()
+ param_grid:
+ 'dt__criterion': ['gini', 'entropy'] # The function to measure the quality of a split
+ 'dt__splitter': ['best', 'random'] # The strategy used to choose the split at each node
+ 'dt__max_depth': [4, 6, 8, 10] # Maximum depth of the tree
+ 'dt__min_samples_split': [2, 5, 10] # Minimum number of samples required to split an internal node
+ 'dt__min_samples_leaf': [1, 2, 4] # Minimum number of samples required to be at a leaf node
+
+# Output directories
+Outputs:
+ model_path: '/output/models'
+ viz_path: '/output/visualizations/'
+ plot_correlation:
+ flag: True
+ path: 'output/corr_plots/'
+
+
Bases: object
A machine learning pipeline for training and evaluating an optimal model for optical identification of GRBs for the SVOM mission.
+config_file (str, optional) – Path to the configuration file. Default ‘config/config.yaml’
+Evaluate the best model with various metrics and visualization.
+name (str) – The name for the evaluation output.
plot (bool, optional) – If True, generates and saves evaluation plots, by default False.
score (callable, optional) – The scoring function to use for evaluation, by default f1_score.
Loads ‘model_name’ into current pipeline.
+model_name (str) – The name of the model from the Outputs/models/ directory to be loaded.
+Load the configuration file and prepare the data.
+config_file (str) – The path to the configuration file.
+Predict using the best model.
+X (DataFrame) – The input features for prediction.
prob (bool, optional) – If True, returns the probability of the predictions, by default False.
The predicted values or probabilities.
+ndarray
+Saves best model from training to the specified path in the config file. Optionally change name and/or path +of the model.
+model_name (str, optional) – Name of the model to be saved. Default=’best_model’.
model_path (str, optional) – Path to the model to be saved. Default=’model_path’ in config file
Train the pipeline with the given data.
+save_all_model (bool, optional) – Whether to save best model of each model type to output directory. Default is False.
resample_flag (bool, optional) – Whether to resample the data. Default is False
scoring (str, optional) – The scoring function to use. Default is ‘f1’.
cv (int, optional) – The cross-validation split to use. Default is 5.
Trained machine learning pipeline.
+Pipeline
+Predict using the best model pipeline.
+X (array-like) – Features to predict.
prob_flag (bool, optional) – Whether to return probabilities, by default False.
model_name (str, optional) – Name of the model to use, by default ‘0.974_rfc_best_model.pkl’
model_path (str, optional) – Path to the model to use for prediction, by default ‘None’
config_path (str, optional) – Path to the configuration file, by default None.
Predicted values or probabilities.
+ndarray
++ Searching for multiple words only shows matches that contain + all words. +
+ + + + + + + + +