Group 25
| Name | Roll no. |
|---|---|
| Romit Mohane | 23110279 |
| Rudra Pratap Singh | 23110281 |

This assignment introduces us to CI/CD for machine learning using MLRun, a powerful orchestration and automation framework for deploying ML pipelines. We create a function, deploy it using Kubernetes and Helm, and set up a simple retraining pipeline.
Data_prep.py
This script fetches the data using `load_breast_cancer` from `sklearn.datasets`.
import mlrun
from sklearn.datasets import load_breast_cancer
import pandas as pd
@mlrun.handler(outputs=["dataset", "label_column"])
def breast_cancer_loader(context, format="csv"):
    """Load the scikit-learn breast cancer dataset and log it as an artifact.

    Args:
        context: MLRun execution context used for logging and for storing
            the dataset artifact.
        format: File format handed to ``context.log_dataset`` (for example
            ``"csv"``).

    Returns:
        A ``(dataset, label_column)`` tuple: the DataFrame including the
        ``target`` column, and the name of that column.
    """
    # With as_frame=True the ``frame`` attribute already contains the
    # feature columns plus ``target``, so the label column does not need to
    # be assigned separately.
    cancer_dataset = load_breast_cancer(as_frame=True).frame

    context.logger.info(
        f"saving breast cancer dataset to {context.artifact_path}"
    )
    context.log_dataset(
        "breast-cancer-dataset", df=cancer_dataset, format=format, index=False
    )
    return cancer_dataset, "target"
if __name__ == "__main__":
    # Script entry point: get (or create) an MLRun context and run the
    # loader with the "format" run parameter, defaulting to CSV.
    with mlrun.get_or_create_ctx(
        "breast-cancer-generator", upload_artifacts=True
    ) as context:
        breast_cancer_loader(context, context.get_param("format", "csv"))

# trainer.py
Split the data into training and test sets (10% test data). Train a random forest classifier on the training data. Wrap the model with `apply_mlrun` from `mlrun.frameworks.sklearn`.
import mlrun
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlrun.frameworks.sklearn import apply_mlrun
def train(
    dataset: mlrun.DataItem,
    label_column: str = "target",
    n_estimators: int = 100,
    max_depth: int = 3,
    model_name: str = "breast_cancer_classifier",
):
    """Train a random forest classifier on ``dataset`` with MLRun wrapping.

    Args:
        dataset: MLRun data item holding the table; it is read with
            ``as_df()``.
        label_column: Name of the column that holds the labels.
        n_estimators: ``n_estimators`` passed to ``RandomForestClassifier``.
        max_depth: ``max_depth`` passed to ``RandomForestClassifier``.
        model_name: Model name passed to ``apply_mlrun``.
    """
    # Load data
    df = dataset.as_df()
    X = df.drop(label_column, axis=1)
    y = df[label_column]

    # Split data (10% test); the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42
    )

    # Train Random Forest
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, random_state=42
    )

    # MLRun integration: wrap the model before fitting and hand it the
    # held-out split
    apply_mlrun(model=model, model_name=model_name, x_test=X_test, y_test=y_test)
    model.fit(X_train, y_train)

# serving.py
Create a model class that will inherit from mlrun.serving.V2ModelServer, enabling automatic support for model lifecycle methods like load() and predict().
from cloudpickle import load
import numpy as np
from typing import List
import mlrun
class ClassifierModel(mlrun.serving.V2ModelServer):
    """Model server that loads a pickled classifier and serves predictions."""

    def load(self):
        """Fetch the ``.pkl`` model file and deserialize it into ``self.model``."""
        model_file, _ = self.get_model(".pkl")
        # Use a context manager so the file handle is closed as soon as the
        # model has been deserialized.
        # NOTE(review): unpickling can execute arbitrary code; only load
        # model files that come from a trusted artifact store.
        with open(model_file, "rb") as model_fp:
            self.model = load(model_fp)

    def predict(self, body: dict) -> List:
        """Predict on ``body["inputs"]`` and return the results as a list."""
        feats = np.asarray(body["inputs"])
        results: np.ndarray = self.model.predict(feats)
        return results.tolist()

# workflow.py
Create a Python script that defines an MLRun pipeline using the @dsl.pipeline decorator. It includes the following:
- Data Ingestion
- Model Training
- Model Deployment
import mlrun
from kfp import dsl
@dsl.pipeline(name="breast-cancer-demo")
def pipeline(model_name="breast-cancer-classifier"):
    """Ingest the data, train with a hyperparameter search, and deploy the model.

    Args:
        model_name: Key under which the trained model is registered in the
            serving function; also passed as a parameter to the ingestion
            step.
    """
    # Data ingestion
    ingest = mlrun.run_function(
        "load-breast-cancer-data",
        name="load-breast-cancer-data",
        params={"format": "pq", "model_name": model_name},
        outputs=["dataset"],
    )

    # Model training: grid search over the parameters that the trainer's
    # train() handler accepts; the run with the highest accuracy is
    # selected. learning_rate is deliberately not searched: train() does
    # not take it and the random forest has no such parameter, so it only
    # multiplied the number of runs.
    train = mlrun.run_function(
        "trainer",
        inputs={"dataset": ingest.outputs["dataset"]},
        hyperparams={
            "n_estimators": [10, 100, 200],
            "max_depth": [2, 5, 10],
        },
        selector="max.accuracy",
        outputs=["model"],
    )

    # Model deployment: serve the selected model with ClassifierModel
    deploy = mlrun.deploy_function(
        "serving",
        models=[
            {
                "key": model_name,
                "model_path": train.outputs["model"],
                "class_name": "ClassifierModel",
            }
        ],
        mock=False,
    )

# Confusion Matrix Artifact
This shows the confusion matrix of the trained model evaluated on the held-out test data (`X_test`).




