diff --git a/.gitignore b/.gitignore index d43c902..3910090 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,17 @@ target/ .mypy_cache/ .ruff_cache/ mlruns/ + +# Hydra outputs +outputs/ +multirun/ + +# ClearML +clearml/.env +clearml/clearml.conf +~/clearml.conf +*.joblib + +# Keep ClearML examples and configs +!clearml/env.example +!clearml/clearml.conf.example diff --git a/Makefile b/Makefile index d7fe925..e365ac5 100644 --- a/Makefile +++ b/Makefile @@ -80,6 +80,149 @@ test_environment: # PROJECT RULES # ################################################################################# +################################################################################# +# HW4: ML Pipeline Automation # +################################################################################# + +## Prepare data using Hydra config +prepare: + $(PYTHON_INTERPRETER) -m src.pipelines.prepare_data + +## Train single model (usage: make train MODEL=random_forest) +train: + $(PYTHON_INTERPRETER) -m src.pipelines.train_pipeline model=$(MODEL) + +## Train Random Forest model +train_rf: + $(PYTHON_INTERPRETER) -m src.pipelines.train_pipeline model=random_forest + +## Train Gradient Boosting model +train_gb: + $(PYTHON_INTERPRETER) -m src.pipelines.train_pipeline model=gradient_boosting + +## Train all models sequentially +train_all: + $(PYTHON_INTERPRETER) -m src.pipelines.run_all_models + +## Evaluate and compare all models +evaluate: + $(PYTHON_INTERPRETER) -m src.pipelines.evaluate_models + +## Run full DVC pipeline (prepare + all models + evaluate) +pipeline: + dvc repro + +## Run DVC pipeline for specific stage +pipeline_stage: + dvc repro $(STAGE) + +## Show DVC pipeline DAG +dag: + dvc dag + +## Show DVC metrics +metrics: + dvc metrics show + +## Compare DVC metrics with previous runs +metrics_diff: + dvc metrics diff + +## Show DVC params +params: + dvc params diff + +## Clean output directories +clean_outputs: + rm -rf outputs/ + rm -rf multirun/ + +## Run full pipeline 
from scratch +run_full: clean_outputs pipeline evaluate + @echo "Full pipeline completed!" + + +################################################################################# +# HW5: ClearML MLOps # +################################################################################# + +## Start ClearML Server (Docker) +clearml_server_start: + cd clearml && docker-compose up -d + @echo "ClearML Server started!" + @echo "Web UI: http://localhost:8080" + @echo "API: http://localhost:8008" + @echo "Files: http://localhost:8081" + +## Stop ClearML Server +clearml_server_stop: + cd clearml && docker-compose down + @echo "ClearML Server stopped" + +## Show ClearML Server status +clearml_server_status: + cd clearml && docker-compose ps + +## Setup ClearML configuration +clearml_setup: + $(PYTHON_INTERPRETER) clearml/setup_clearml.py --status + +## Test ClearML connection +clearml_test: + $(PYTHON_INTERPRETER) clearml/setup_clearml.py --test + +## Create ClearML project structure +clearml_create_project: + $(PYTHON_INTERPRETER) clearml/setup_clearml.py --create-project + +## Run single experiment with ClearML (usage: make clearml_experiment MODEL=RandomForest) +clearml_experiment: + $(PYTHON_INTERPRETER) -m src.clearml_integration.run_experiments --model $(MODEL) + +## Run all experiments with ClearML tracking +clearml_experiments_all: + $(PYTHON_INTERPRETER) -m src.clearml_integration.run_experiments --all + +## Run experiments in offline mode (no server required) +clearml_experiments_offline: + $(PYTHON_INTERPRETER) -m src.clearml_integration.run_experiments --all --offline + +## Compare ClearML experiments +clearml_compare: + $(PYTHON_INTERPRETER) -m src.clearml_integration.run_experiments --compare + +## Compare registered models +clearml_compare_models: + $(PYTHON_INTERPRETER) -m src.clearml_integration.run_experiments --compare-models + +## Run ClearML pipeline for single model (usage: make clearml_pipeline MODEL=RandomForest) +clearml_pipeline: + 
$(PYTHON_INTERPRETER) -m src.clearml_integration.pipeline --model $(MODEL) + +## Run ClearML pipeline for all models +clearml_pipeline_all: + $(PYTHON_INTERPRETER) -m src.clearml_integration.pipeline --all + +## Generate ClearML dashboard report +clearml_dashboard: + $(PYTHON_INTERPRETER) -m src.clearml_integration.dashboard --summary + +## Generate full ClearML report +clearml_report: + $(PYTHON_INTERPRETER) -m src.clearml_integration.dashboard --report + +## Export ClearML metrics and reports +clearml_export: + $(PYTHON_INTERPRETER) -m src.clearml_integration.dashboard --all + +## Full ClearML workflow: experiments + comparison + report +clearml_full: clearml_experiments_all clearml_compare_models clearml_report + @echo "Full ClearML workflow completed!" + +## Clean ClearML outputs +clearml_clean: + rm -rf outputs/clearml/ + @echo "ClearML outputs cleaned" ################################################################################# diff --git a/REPORT.md b/REPORT.md index df47530..2c861db 100644 --- a/REPORT.md +++ b/REPORT.md @@ -1,142 +1,708 @@ -# Отчет по ДЗ 2: Версионирование данных и моделей - -## Инструменты - -- **Версионирование данных**: DVC (Data Version Control) -- **Версионирование моделей**: MLflow -- **Удаленное хранилище (Remote Storage)**: Local Storage (эмуляция remote) - -## Настройка DVC - -1. **Инициализация DVC**: - ```bash - dvc init - ``` - -2. **Настройка Remote Storage**: - Использована локальная директория `../dvc_remote` для имитации удаленного хранилища. - ```bash - mkdir -p ../dvc_remote - dvc remote add -d localremote ../dvc_remote - dvc config core.analytics false - ``` - -3. **Версионирование данных и пайплайн**: - - Датасет Wine Quality отслеживается (`data/raw/winequality-red.csv.dvc`). - - Настроен DVC пайплайн (`dvc.yaml`) с этапами `prepare` и `train`. 
- - ```bash - # Добавление данных - dvc add data/raw/winequality-red.csv - dvc push - - # Запуск пайплайна - dvc repro - ``` +# EPML ITMO Project - Wine Quality Classification -## Настройка MLflow +Data Science Project for EPML ITMO with MLOps integration using ClearML. -MLflow настроен для трекинга экспериментов и реестра моделей. +--- -1. **Запуск сервера (опционально) или локальный трекинг**: - В данном проекте используется локальный трекинг в директорию `mlruns`. +## 📋 Домашнее задание 5: ClearML для MLOps -2. **Обучение и логирование**: - Скрипт `src/models/train_model.py` обучает RandomForest и логирует параметры, метрики и модель. - - Пример запуска: - ```bash - poetry run python src/models/train_model.py data/processed - ``` +### Содержание +- [Описание проекта](#описание-проекта) +- [Настройка ClearML](#1-настройка-clearml-3-балла) +- [Трекинг экспериментов](#2-трекинг-экспериментов-3-балла) +- [Управление моделями](#3-управление-моделями-3-балла) +- [Пайплайны](#4-пайплайны-2-балла) +- [Быстрый старт](#быстрый-старт) +- [Структура проекта](#структура-проекта) - Пример запуска с другими гиперпараметрами (версия 2): - ```bash - poetry run python src/models/train_model.py data/processed --n_estimators 200 --max_depth 10 - ``` +--- -## Результаты +## Описание проекта -### Логи запуска (Screenshots emulation) +Проект демонстрирует полную интеграцию ClearML для MLOps workflow на примере задачи классификации качества вина. Реализованы: +- Автоматический трекинг экспериментов +- Версионирование моделей +- ML пайплайны +- Дашборды и сравнение экспериментов -**Запуск 1 (Default params):** -```text -2025-12-08 22:12:27,045 - __main__ - INFO - Training model... -2025/12/08 22:12:27 INFO mlflow.tracking.fluent: Experiment with name 'wine_quality_experiment' does not exist. Creating a new experiment. 
-2025-12-08 22:12:28,243 - __main__ - INFO - Accuracy: 0.659375 -2025-12-08 22:12:28,243 - __main__ - INFO - F1 Score: 0.6442498546491976 -Successfully registered model 'WineQualityRandomForest'. -Created version '1' of model 'WineQualityRandomForest'. +### Используемые модели +- Random Forest +- Gradient Boosting +- Logistic Regression +- SVM +- Decision Tree +- KNN + +--- + +## 1. Настройка ClearML (3 балла) + +### 1.1 Установка ClearML Server через Docker + +Проект включает готовый `docker-compose.yml` для развертывания ClearML Server: + +```bash +# Запуск ClearML Server +make clearml_server_start + +# Или напрямую +cd clearml && docker-compose up -d +``` + +**Компоненты:** +- **MongoDB** - основная база данных +- **Elasticsearch** - поиск и аналитика +- **Redis** - кэширование и сессии +- **ClearML API Server** - REST API (порт 8008) +- **ClearML Web Server** - веб-интерфейс (порт 8080) +- **ClearML File Server** - хранилище файлов (порт 8081) +- **ClearML Agent** - опционально для удаленного выполнения + +**Доступ к сервисам:** +- Web UI: http://localhost:8080 +- API: http://localhost:8008 +- Files: http://localhost:8081 + +### 1.2 Настройка аутентификации + +1. Откройте Web UI: http://localhost:8080 +2. Перейдите в Settings → Workspace → Create new credentials +3. 
Скопируйте credentials + +**Способы настройки:** + +**Вариант 1: Интерактивная настройка** +```bash +clearml-init +``` + +**Вариант 2: Переменные окружения** +```bash +export CLEARML_API_HOST=http://localhost:8008 +export CLEARML_WEB_HOST=http://localhost:8080 +export CLEARML_FILES_HOST=http://localhost:8081 +export CLEARML_API_ACCESS_KEY= +export CLEARML_API_SECRET_KEY= +``` + +**Вариант 3: Файл конфигурации** +```bash +cp clearml/clearml.conf.example ~/clearml.conf +# Отредактируйте файл, добавив credentials +``` + +### 1.3 Проверка настройки + +```bash +# Проверка статуса +make clearml_setup + +# Тестирование подключения +make clearml_test + +# Создание проекта +make clearml_create_project +``` + +### 1.4 Конфигурация Docker Compose + +```yaml:clearml/docker-compose.yml +version: "3.8" + +services: + mongo: + image: mongo:6.0 + volumes: + - clearml-mongo-data:/data/db + + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.12.0 + environment: + - discovery.type=single-node + - xpack.security.enabled=false + + redis: + image: redis:7 + volumes: + - clearml-redis-data:/data + + apiserver: + image: allegroai/clearml:latest + ports: + - "8008:8008" + + webserver: + image: allegroai/clearml:latest + ports: + - "8080:80" + + fileserver: + image: allegroai/clearml:latest + ports: + - "8081:8081" +``` + +--- + +## 2. 
Трекинг экспериментов (3 балла) + +### 2.1 Автоматическое логирование + +Модуль `src/clearml_integration/experiment_tracker.py` обеспечивает: + +```python +from src.clearml_integration import ClearMLExperiment + +# Контекстный менеджер для экспериментов +with ClearMLExperiment( + experiment_name="RandomForest_Experiment", + project_name="EPML-ITMO/Wine-Quality/Experiments", + tags=["RandomForest", "wine-quality"], +) as exp: + # Автоматическое логирование параметров + exp.log_parameters({"n_estimators": 100, "max_depth": 10}) + + # Обучение модели + model.fit(X_train, y_train) + + # Логирование метрик с визуализацией + exp.log_classification_report(y_test, y_pred) + + # Логирование модели + exp.log_model(model, "random_forest") +``` + +**Декоратор для функций:** +```python +from src.clearml_integration import clearml_experiment + +@clearml_experiment( + experiment_name="training_pipeline", + project_name="EPML-ITMO/Wine-Quality" +) +def train_model(clearml_experiment=None): + # Эксперимент автоматически создается и закрывается + clearml_experiment.log_metrics({"accuracy": 0.95}) +``` + +### 2.2 Система сравнения экспериментов + +```python +from src.clearml_integration.experiment_tracker import ExperimentComparison + +comparison = ExperimentComparison(project_name="EPML-ITMO/Wine-Quality/Experiments") + +# Получение всех экспериментов +experiments = comparison.get_experiments(tags=["classification"]) + +# Сравнение метрик +df = comparison.compare_metrics(metric_names=["accuracy", "f1_score"]) + +# Генерация отчета +comparison.generate_report("outputs/comparison_report.json") +``` + +### 2.3 Логирование метрик и параметров + +**Поддерживаемые типы логирования:** +- Скалярные метрики с итерациями +- Confusion Matrix +- Classification Report +- Произвольные графики +- Артефакты (DataFrame, файлы, словари) +- Датасеты + +```python +# Скалярные метрики +exp.log_metric("accuracy", 0.95, series="validation", iteration=epoch) + +# Множественные метрики +exp.log_metrics({ + 
"accuracy": 0.95, + "precision": 0.94, + "recall": 0.93, + "f1_score": 0.935 +}) + +# Confusion Matrix +exp.log_confusion_matrix(y_true, y_pred, labels=class_names) + +# Артефакты +exp.log_artifact("feature_importance", feature_df) +exp.log_dataset(train_df, test_df) +``` + +### 2.4 Дашборды для анализа + +```bash +# Запуск дашборда с саммари +make clearml_dashboard + +# Генерация полного отчета +make clearml_report + +# Экспорт всех метрик и отчетов +make clearml_export +``` + +**Модуль `dashboard.py`:** +```python +from src.clearml_integration.dashboard import ClearMLDashboard + +dashboard = ClearMLDashboard() + +# Печать саммари +dashboard.print_summary() + +# Генерация Markdown отчета +dashboard.generate_full_report() + +# Экспорт метрик в CSV +dashboard.export_metrics_csv() + +# Экспорт саммари в JSON +dashboard.export_summary_json() +``` + +--- + +## 3. Управление моделями (3 балла) + +### 3.1 Регистрация и версионирование моделей + +```python +from src.clearml_integration import ClearMLModelManager + +manager = ClearMLModelManager(project_name="EPML-ITMO/Wine-Quality/Models") + +# Регистрация модели с автоматическим версионированием +model_id = manager.register_model( + model=trained_model, + model_name="RandomForest", + metrics={"accuracy": 0.95, "f1_score": 0.93}, + parameters={"n_estimators": 100}, + tags=["production", "wine-quality"], + description="Best RandomForest model for wine quality" +) +``` + +### 3.2 Система метаданных + +Каждая модель сохраняется со следующими метаданными: +```json +{ + "model_id": "RandomForest_v1_20251227_120000", + "model_name": "RandomForest", + "version": 1, + "framework": "sklearn", + "created_at": "2025-12-27T12:00:00", + "model_path": "outputs/clearml/models/RandomForest/v1/...", + "metrics": {"accuracy": 0.95, "f1_score": 0.93}, + "parameters": {"n_estimators": 100}, + "tags": ["production", "wine-quality"], + "model_class": "RandomForestClassifier" +} +``` + +### 3.3 Автоматическое создание версий + +```python +# 
Версии создаются автоматически при каждой регистрации +manager.register_model(model_v1, "RandomForest", metrics_v1) # v1 +manager.register_model(model_v2, "RandomForest", metrics_v2) # v2 +manager.register_model(model_v3, "RandomForest", metrics_v3) # v3 + +# Получение всех версий +versions = manager.get_model_versions("RandomForest") + +# Загрузка конкретной версии +model, metadata = manager.load_model("RandomForest", version=2) + +# Загрузка последней версии +model, metadata = manager.load_model("RandomForest", version="latest") +``` + +### 3.4 Система сравнения моделей + +```python +# Сравнение всех моделей по метрике +comparison_df = manager.compare_models(metric="accuracy") + +# Получение лучшей модели +best_id, best_metadata = manager.get_best_model(metric="accuracy") + +# Генерация отчета +manager.generate_model_report("outputs/model_report.md") + +# Экспорт модели для деплоя +manager.export_model("RandomForest", version="latest", export_path="deployment/") ``` -**Запуск 2 (Tuned params):** -```text -2025-12-08 22:12:56,617 - __main__ - INFO - Training model... -2025-12-08 22:12:57,279 - __main__ - INFO - Accuracy: 0.646875 -2025-12-08 22:12:57,279 - __main__ - INFO - F1 Score: 0.6266469214465146 -Registered model 'WineQualityRandomForest' already exists. Creating a new version of this model... -Created version '2' of model 'WineQualityRandomForest'. +```bash +# CLI команды +make clearml_compare_models +``` + +--- + +## 4. 
Пайплайны (2 балла) + +### 4.1 ClearML пайплайны для ML workflow + +```python +from src.clearml_integration import ClearMLPipeline + +# Создание пайплайна +pipeline = ClearMLPipeline( + pipeline_name="Wine-Quality-RandomForest", + project_name="EPML-ITMO/Wine-Quality/Pipelines", + version="1.0.0" +) + +# Добавление шагов +pipeline.add_data_step( + train_path="data/processed/train.csv", + test_path="data/processed/test.csv" +) + +pipeline.add_training_step( + model_name="RandomForest", + model_params={"n_estimators": 100, "max_depth": 10} +) + +pipeline.add_evaluation_step( + metrics=["accuracy", "precision", "recall", "f1_score"] +) + +pipeline.add_model_registration_step() + +# Запуск пайплайна +results = pipeline.run(local_mode=True) ``` -## Воспроизводимость +### 4.2 Готовые функции для пайплайнов + +```python +from src.clearml_integration.pipeline import ( + create_wine_quality_pipeline, + run_all_models_pipeline +) + +# Создание пайплайна для одной модели +pipeline = create_wine_quality_pipeline( + model_name="GradientBoosting", + model_params={"n_estimators": 100} +) +results = pipeline.run() + +# Запуск всех моделей +all_results = run_all_models_pipeline() +``` + +### 4.3 Мониторинг выполнения + +Пайплайн автоматически логирует: +- Время выполнения каждого шага +- Успех/неудачу шагов +- Промежуточные результаты +- Финальные метрики + +```python +# Результаты пайплайна +{ + "pipeline_name": "Wine-Quality-RandomForest", + "version": "1.0.0", + "success": True, + "total_duration": 15.5, + "steps": { + "data_loading": {"success": True, "duration": 0.5}, + "train_randomforest": {"success": True, "duration": 10.0}, + "evaluation": {"success": True, "duration": 2.0}, + "model_registration": {"success": True, "duration": 3.0} + }, + "final_metrics": {"accuracy": 0.95, "f1_score": 0.93} +} +``` + +### 4.4 Уведомления + +Система мониторинга отправляет уведомления о: +- Успешном завершении пайплайна +- Ошибках выполнения +- Результатах метрик + +Уведомления сохраняются 
в `outputs/clearml/pipelines/` и логируются в ClearML. + +--- -Для обеспечения воспроизводимости используются: -1. **DVC** для данных (`dvc.lock` / `.dvc` файлы). -2. **Poetry** для зависимостей (`poetry.lock`). -3. **Git** для кода. +## Быстрый старт -### Инструкция по воспроизведению +### Установка -1. **Клонировать репозиторий и перейти в ветку**: - ```bash - git checkout HW2 - ``` +```bash +# Клонирование репозитория +git clone +cd epml_itmo + +# Установка зависимостей +poetry install + +# Активация окружения +poetry shell +``` + +### Запуск ClearML Server + +```bash +# Запуск Docker контейнеров +make clearml_server_start + +# Проверка статуса +make clearml_server_status -2. **Установить зависимости**: - ```bash - poetry install - ``` +# Настройка credentials (интерактивно) +clearml-init +``` -3. **Получить данные (DVC)**: - ```bash - poetry run dvc pull - ``` - *Примечание: Так как remote локальный (`../dvc_remote`), он должен существовать на машине. В реальном проекте это был бы S3 bucket.* +### Запуск экспериментов -4. **Запустить обучение (через DVC Pipeline)**: - Это автоматически запустит подготовку данных (`prepare`) и обучение (`train`). - ```bash - poetry run dvc repro - ``` +```bash +# Запуск одного эксперимента +make clearml_experiment MODEL=RandomForest - *Альтернативно (вручную)*: - ```bash - poetry run python src/data/make_dataset.py data/raw data/processed - poetry run python src/models/train_model.py data/processed - ``` +# Запуск всех экспериментов +make clearml_experiments_all -5. **Просмотр результатов MLflow**: - ```bash - poetry run mlflow ui - ``` +# Запуск в офлайн режиме (без сервера) +make clearml_experiments_offline +``` -## Docker +### Запуск пайплайнов -Docker образ собирается с помощью команды: ```bash -docker build -t epml-hw2 . 
+# Пайплайн для одной модели +make clearml_pipeline MODEL=RandomForest + +# Пайплайн для всех моделей +make clearml_pipeline_all ``` -Запуск контейнера: +### Анализ и отчеты -*Важно: Для работы с локальным DVC remote его необходимо примонтировать в контейнер.* -Предполагая, что локальный remote находится в `../dvc_remote` относительно корня проекта: +```bash +# Сравнение экспериментов +make clearml_compare + +# Сравнение моделей +make clearml_compare_models + +# Генерация отчетов +make clearml_report + +# Полный workflow +make clearml_full +``` + +### Воспроизведение результатов ```bash -docker run -it -v $(pwd)/../dvc_remote:/dvc_remote epml-hw2 bash +# Полная последовательность команд для воспроизведения +make clearml_server_start +clearml-init # Ввести credentials из Web UI +make clearml_create_project +make clearml_experiments_all +make clearml_compare_models +make clearml_report ``` -Внутри контейнера: +--- + +## Структура проекта + +``` +├── clearml/ +│ ├── docker-compose.yml # ClearML Server конфигурация +│ ├── env.example # Пример переменных окружения +│ ├── clearml.conf.example # Пример конфигурации клиента +│ └── setup_clearml.py # Скрипт настройки +│ +├── conf/ +│ ├── config.yaml # Основная конфигурация Hydra +│ └── clearml/ +│ └── default.yaml # Конфигурация ClearML +│ +├── src/ +│ └── clearml_integration/ +│ ├── __init__.py +│ ├── experiment_tracker.py # Трекинг экспериментов +│ ├── model_manager.py # Управление моделями +│ ├── pipeline.py # ML пайплайны +│ ├── dashboard.py # Дашборды и отчеты +│ └── run_experiments.py # CLI для экспериментов +│ +├── outputs/ +│ └── clearml/ +│ ├── models/ # Зарегистрированные модели +│ ├── pipelines/ # Результаты пайплайнов +│ └── dashboard/ # Отчеты и экспорты +│ +├── Makefile # Make команды +└── README.md # Этот файл +``` + +--- + +## Команды Makefile + +| Команда | Описание | +|---------|----------| +| `make clearml_server_start` | Запуск ClearML Server | +| `make clearml_server_stop` | Остановка ClearML Server | 
+| `make clearml_server_status` | Статус контейнеров | +| `make clearml_setup` | Проверка конфигурации | +| `make clearml_test` | Тест подключения | +| `make clearml_create_project` | Создание проекта | +| `make clearml_experiment MODEL=X` | Запуск одного эксперимента | +| `make clearml_experiments_all` | Запуск всех экспериментов | +| `make clearml_experiments_offline` | Офлайн режим | +| `make clearml_compare` | Сравнение экспериментов | +| `make clearml_compare_models` | Сравнение моделей | +| `make clearml_pipeline MODEL=X` | Запуск пайплайна | +| `make clearml_pipeline_all` | Все пайплайны | +| `make clearml_dashboard` | Дашборд саммари | +| `make clearml_report` | Полный отчет | +| `make clearml_export` | Экспорт данных | +| `make clearml_full` | Полный workflow | +| `make clearml_clean` | Очистка outputs | + +--- + +## Скриншоты + +### ClearML Web UI - Эксперименты +*После запуска `make clearml_experiments_all` в Web UI отображаются все эксперименты с метриками.* + +![ClearML Experiments](reports/figures/clearml_experiments.jpg) + +### ClearML Web UI - Сравнение +*Функция сравнения позволяет визуально сопоставить результаты разных моделей.* + +![ClearML Comparison](reports/figures/clearml_comparison.jpg) + +### ClearML Web UI - Модели +*Реестр моделей с версионированием и метаданными.* + +![ClearML Models](reports/figures/clearml_models.jpg) + + +--- + +## Требования + +- Python 3.12+ +- Poetry +- Docker & Docker Compose + +### Зависимости Python +``` +clearml>=2.1.0 +pandas +numpy +scikit-learn +hydra-core +omegaconf +pydantic +``` + +--- + +## Ссылки + +- [ClearML Documentation](https://clear.ml/docs/) +- [ClearML GitHub](https://github.com/allegroai/clearml) +- [ClearML Server Setup](https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server) + +--- + +## Project Organization (Original) + +``` +├── LICENSE +├── Makefile <- Makefile with commands like `make data` or `make train` +├── README.md <- The top-level README for developers using this 
project. +├── data +│ ├── external <- Data from third party sources. +│ ├── interim <- Intermediate data that has been transformed. +│ ├── processed <- The final, canonical data sets for modeling. +│ └── raw <- The original, immutable data dump. +│ +├── docs <- A default Sphinx project; see sphinx-doc.org for details +│ +├── models <- Trained and serialized models, model predictions, or model summaries +│ +├── notebooks <- Jupyter notebooks. +│ +├── pyproject.toml <- Project configuration and dependencies. +├── poetry.lock <- Locked dependency versions. +│ +├── references <- Data dictionaries, manuals, and all other explanatory materials. +│ +├── reports <- Generated analysis as HTML, PDF, LaTeX, etc. +│ └── figures <- Generated graphics and figures to be used in reporting +│ +├── src <- Source code for use in this project. +│ ├── __init__.py <- Makes src a Python module +│ ├── data <- Scripts to download or generate data +│ ├── features <- Scripts to turn raw data into features for modeling +│ ├── models <- Scripts to train models and make predictions +│ ├── pipelines <- ML pipeline orchestration +│ ├── clearml_integration <- ClearML MLOps integration +│ └── visualization <- Scripts to create visualizations +``` + +--- + +## Getting Started + +### Prerequisites + +- Python 3.12+ +- Poetry (for dependency management) +- Docker (for ClearML Server) + +### Installation + +1. Clone the repository +2. Install dependencies with Poetry: + +```bash +poetry install +``` + +3. Activate the virtual environment: + +```bash +poetry shell +``` + +### Code Quality + +This project uses `ruff`, `mypy`, and `bandit` for code quality. + +Run linters: + +```bash +poetry run ruff check . +poetry run mypy . +poetry run bandit -r src +``` + +Pre-commit hooks are configured to run automatically on commit. + +### Docker + +Build the docker image: + ```bash -dvc pull -dvc repro +docker build -t epml-itmo . 
``` diff --git a/REPORT_HW4.md b/REPORT_HW4.md new file mode 100644 index 0000000..05ad59d --- /dev/null +++ b/REPORT_HW4.md @@ -0,0 +1,436 @@ +# ДЗ 4: Автоматизация ML пайплайнов + +## Обзор + +В данном домашнем задании реализована полная автоматизация ML пайплайнов с использованием: +- **DVC Pipelines** — для оркестрации пайплайнов и версионирования данных +- **Hydra** — для управления конфигурациями + +### Выбор инструментов + +**DVC Pipelines** выбран как инструмент оркестрации по следующим причинам: +- Уже использовался в проекте для версионирования данных +- Отлично интегрируется с Git workflow +- Поддерживает кэширование и параллельное выполнение +- Позволяет отслеживать метрики и параметры экспериментов + +**Hydra** выбран для управления конфигурациями: +- Иерархическая композиция конфигураций +- Поддержка переопределения параметров из командной строки +- Интерполяция переменных между конфигурациями +- Автоматическое создание директорий для выходных данных + +--- + +## 1. Настройка DVC Pipelines (4 балла) + +### 1.1 Структура пайплайна + +Реализован многоэтапный ML пайплайн со следующими стадиями: + +``` +prepare → train_random_forest ─────┐ + → train_gradient_boosting ─┤ + → train_logistic_regression┼→ evaluate + → train_svm ───────────────┤ + → train_decision_tree ─────┤ + → train_knn ───────────────┘ +``` + +### 1.2 Конфигурация DVC (`dvc.yaml`) + +```yaml +stages: + prepare: + cmd: python -m src.pipelines.prepare_data + deps: + - src/pipelines/prepare_data.py + - conf/data/default.yaml + params: + - conf/config.yaml: + - seed + - conf/data/default.yaml: + - test_size + outs: + - data/processed: + cache: true + + train_random_forest: + cmd: python -m src.pipelines.train_pipeline model=random_forest + deps: + - data/processed + - src/pipelines/train_pipeline.py + - conf/model/random_forest.yaml + params: + - conf/model/random_forest.yaml: + - params + metrics: + - outputs/randomforest/metrics.json: + cache: false + # ... 
аналогично для остальных моделей + + evaluate: + cmd: python -m src.pipelines.evaluate_models + deps: + - outputs/randomforest/metrics.json + - outputs/gradientboosting/metrics.json + # ... остальные метрики + metrics: + - outputs/comparison/best_model.json + plots: + - outputs/comparison/metrics_comparison.csv +``` + +### 1.3 Зависимости между этапами + +DVC автоматически определяет зависимости через: +- **deps** — файлы, от которых зависит стадия +- **outs** — выходные файлы стадии +- **params** — параметры из конфигурационных файлов + +### 1.4 Кэширование и параллельное выполнение + +- **Кэширование**: DVC кэширует все выходные файлы (`cache: true`), что позволяет пропускать стадии при повторных запусках +- **Параллельное выполнение**: Стадии обучения моделей могут выполняться параллельно, т.к. зависят только от `prepare` + +Запуск с параллелизацией: +```bash +dvc repro --parallel +``` + +### 1.5 Визуализация DAG + +``` + +---------+ + | prepare | + +---------+ + | + ┌───────────┼───────────┬───────────┬───────────┬───────────┐ + ↓ ↓ ↓ ↓ ↓ ↓ +train_rf train_gb train_lr train_svm train_dt train_knn + │ │ │ │ │ │ + └───────────┴───────────┴─────┬─────┴───────────┴───────────┘ + ↓ + +----------+ + | evaluate | + +----------+ +``` + +--- + +## 2. 
Настройка Hydra (3 балла) + +### 2.1 Структура конфигураций + +``` +conf/ +├── config.yaml # Главный файл конфигурации +├── data/ +│ └── default.yaml # Конфигурация данных +├── model/ +│ ├── random_forest.yaml +│ ├── gradient_boosting.yaml +│ ├── logistic_regression.yaml +│ ├── svm.yaml +│ ├── decision_tree.yaml +│ └── knn.yaml +└── training/ + └── default.yaml # Конфигурация обучения +``` + +### 2.2 Главный файл конфигурации (`conf/config.yaml`) + +```yaml +defaults: + - model: random_forest + - data: default + - training: default + - _self_ + +mlflow: + tracking_uri: "file://${hydra:runtime.cwd}/mlruns" + experiment_name: "wine_quality_hydra" + +logging: + level: INFO + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +seed: 42 +output_dir: "outputs" +``` + +### 2.3 Композиция конфигураций + +Hydra автоматически объединяет конфигурации из разных файлов: + +```yaml +# conf/model/random_forest.yaml +name: "RandomForest" +_target_: "sklearn.ensemble.RandomForestClassifier" +params: + n_estimators: 100 + max_depth: 10 + random_state: ${seed} # Интерполяция из главного конфига +``` + +### 2.4 Валидация конфигураций + +Реализована валидация с использованием Pydantic (`src/config/schemas.py`): + +```python +class ModelConfig(BaseModel): + name: str = Field(..., description="Model name") + _target_: str = Field(..., description="Full path to model class") + params: dict[str, Any] = Field(default_factory=dict) + + @field_validator("name") + @classmethod + def validate_model_name(cls, v: str) -> str: + if not v or not v.strip(): + raise ValueError("Model name cannot be empty") + return v +``` + +### 2.5 Переопределение параметров + +```bash +# Изменение модели +python -m src.pipelines.train_pipeline model=gradient_boosting + +# Изменение гиперпараметров +python -m src.pipelines.train_pipeline model.params.n_estimators=200 + +# Multirun для нескольких моделей +python -m src.pipelines.train_pipeline --multirun model=random_forest,gradient_boosting +``` + +--- 
+ +## 3. Интеграция и тестирование (2 балла) + +### 3.1 Интеграция DVC + Hydra + +Пайплайн интегрирует оба инструмента: +1. **DVC** управляет порядком выполнения и кэшированием +2. **Hydra** управляет конфигурациями для каждого этапа + +### 3.2 Мониторинг выполнения + +Реализован модуль мониторинга (`src/pipelines/monitoring.py`): + +```python +class PipelineMonitor: + def start_pipeline(self) -> None + def end_pipeline(self, success: bool, error: str = None) -> dict + def start_stage(self, stage_name: str) -> None + def end_stage(self, stage_name: str, success: bool) -> None + def save_report(self) -> Path +``` + +Пример вывода мониторинга: +``` +============================================================ +PIPELINE COMPLETED +Status: SUCCESS +Total duration: 3.77s +---------------------------------------- + ✓ data_loading: 0.01s + ✓ model_creation: 0.57s + ✓ training: 0.61s + ✓ mlflow_logging: 2.58s + ✓ save_results: 0.00s +============================================================ +``` + +### 3.3 Уведомления о результатах + +Система уведомлений логирует результаты в файл: +- `outputs/{model}/notifications.log` — лог уведомлений +- `outputs/{model}/pipeline_report.json` — детальный отчет + +### 3.4 Тестирование воспроизводимости + +Воспроизводимость обеспечивается через: +1. **Фиксированный seed** (`seed: 42` в конфигурации) +2. **DVC версионирование** данных и конфигураций +3. **MLflow tracking** для логирования экспериментов + +Команда для воспроизведения: +```bash +# Клонирование репозитория +git clone +cd epml_itmo + +# Установка зависимостей +poetry install + +# Запуск полного пайплайна +dvc repro +``` + +--- + +## 4. 
Результаты + +### 4.1 Сравнение моделей + +| Model | Accuracy | Precision | Recall | F1 Score | +|--------------------|----------|-----------|---------|----------| +| GradientBoosting | 0.6500 | 0.6394 | 0.6500 | **0.6393** | +| RandomForest | 0.6438 | 0.6108 | 0.6438 | 0.6240 | +| DecisionTree | 0.5531 | 0.5320 | 0.5531 | 0.5409 | +| LogisticRegression | 0.5719 | 0.5245 | 0.5719 | 0.5382 | +| SVM | 0.5094 | 0.5645 | 0.5094 | 0.4618 | +| KNN | 0.4562 | 0.4223 | 0.4562 | 0.4299 | + +**Лучшая модель**: GradientBoosting с F1 Score = 0.6393 + +### 4.2 DVC Metrics + +```bash +$ dvc metrics show +``` + +Результаты метрик представлены в таблице выше (раздел 4.1). + +### 4.3 DVC DAG + +```bash +$ dvc dag +``` + +``` + +---------+ + **************| prepare |****************** + * +---------+ * + * | * + * ┌───────────────┼───────────────┐ * + * ↓ ↓ ↓ * ++--------+ +------------+ +-------------+ +-----+ +--------+ +-----+ +|train_rf| |train_gb | |train_lr | |svm | |train_dt| |knn | ++--------+ +------------+ +-------------+ +-----+ +--------+ +-----+ + * * * * * * + * * * * * * + **********+-------+-------+--------------+----------+******** + ↓ + +----------+ + | evaluate | + +----------+ +``` + +--- + +## 5. Команды для воспроизведения + +### Быстрый старт + +```bash +# 1. Установка зависимостей +poetry install + +# 2. Запуск полного пайплайна +make pipeline + +# 3. 
Просмотр метрик +make metrics +``` + +### Отдельные команды + +```bash +# Подготовка данных +make prepare + +# Обучение конкретной модели +make train MODEL=random_forest + +# Обучение всех моделей +make train_all + +# Оценка и сравнение +make evaluate + +# Просмотр DAG +make dag + +# Очистка и перезапуск +make run_full +``` + +### Параметры Hydra + +```bash +# Изменение числа деревьев в Random Forest +python -m src.pipelines.train_pipeline model=random_forest model.params.n_estimators=200 + +# Изменение глубины +python -m src.pipelines.train_pipeline model=random_forest model.params.max_depth=15 + +# Мультизапуск +python -m src.pipelines.train_pipeline --multirun model=random_forest,gradient_boosting +``` + +--- + +## 6. Структура проекта + +``` +epml_itmo/ +├── conf/ # Hydra конфигурации +│ ├── config.yaml # Главный конфиг +│ ├── data/default.yaml # Конфиг данных +│ ├── model/ # Конфиги моделей +│ │ ├── random_forest.yaml +│ │ ├── gradient_boosting.yaml +│ │ ├── logistic_regression.yaml +│ │ ├── svm.yaml +│ │ ├── decision_tree.yaml +│ │ └── knn.yaml +│ └── training/default.yaml # Конфиг обучения +├── src/ +│ ├── config/ # Pydantic схемы валидации +│ │ ├── __init__.py +│ │ └── schemas.py +│ └── pipelines/ # Скрипты пайплайнов +│ ├── __init__.py +│ ├── prepare_data.py # Подготовка данных +│ ├── train_pipeline.py # Обучение модели +│ ├── evaluate_models.py # Оценка моделей +│ ├── run_all_models.py # Запуск всех моделей +│ └── monitoring.py # Мониторинг +├── dvc.yaml # DVC pipeline конфигурация +├── dvc.lock # DVC lock file +├── Makefile # Команды Make +├── outputs/ # Выходные данные +│ ├── randomforest/ +│ ├── gradientboosting/ +│ ├── ... +│ └── comparison/ +└── mlruns/ # MLflow артефакты +``` + +--- + +## Заключение + +В рамках данного домашнего задания реализована полная автоматизация ML пайплайнов: + +1. 
**DVC Pipelines** обеспечивает: + - Автоматическое определение зависимостей + - Кэширование результатов + - Параллельное выполнение + - Отслеживание метрик и параметров + +2. **Hydra** обеспечивает: + - Иерархическую композицию конфигураций + - Валидацию через Pydantic + - Переопределение параметров из CLI + - Интерполяцию переменных + +3. **Интеграция**: + - Мониторинг выполнения с детальными отчетами + - Уведомления о результатах + - Полная воспроизводимость экспериментов + +Все результаты воспроизводимы через команду `dvc repro`. + diff --git a/clearml/clearml.conf.example b/clearml/clearml.conf.example new file mode 100644 index 0000000..708c1c1 --- /dev/null +++ b/clearml/clearml.conf.example @@ -0,0 +1,64 @@ +# ClearML Configuration File +# Copy to ~/clearml.conf or set CLEARML_CONFIG_FILE env variable +# +# Generate credentials in ClearML Web UI: +# 1. Go to Settings -> Workspace +# 2. Create new credentials +# 3. Copy access_key and secret_key + +api { + # ClearML API server + web_server: http://localhost:8080 + api_server: http://localhost:8008 + files_server: http://localhost:8081 + + # Credentials - get from ClearML Web UI + credentials { + "access_key" = "YOUR_ACCESS_KEY" + "secret_key" = "YOUR_SECRET_KEY" + } +} + +# SDK configuration +sdk { + # Default output URI for models and artifacts + default_output_uri: "file://outputs/clearml" + + # Development mode settings + development { + # Store models locally in development + store_development_task_output_uri: "file://outputs/clearml/dev" + + # Support async logging + support_async: true + } + + # AWS S3 configuration (if using cloud storage) + # aws { + # s3 { + # credentials { + # access_key: "" + # secret_key: "" + # } + # } + # } +} + +# Agent configuration (for remote execution) +agent { + # Default queue to pull tasks from + default_queue: "default" + + # Git configuration + git { + # Git user for cloning + user: "" + pass: "" + } + + # Docker default image + default_docker { + image: 
"python:3.12-slim" + } +} + diff --git a/clearml/docker-compose.yml b/clearml/docker-compose.yml new file mode 100644 index 0000000..1203043 --- /dev/null +++ b/clearml/docker-compose.yml @@ -0,0 +1,146 @@ +# ClearML Server Docker Compose Configuration +# For HW5: ClearML MLOps Setup +# +# Usage: +# cd clearml && docker-compose up -d +# +# Access: +# - Web UI: http://localhost:8080 +# - API: http://localhost:8008 +# - Files: http://localhost:8081 + +version: "3.8" + +services: + # MongoDB - Main database for ClearML + mongo: + image: mongo:6.0 + container_name: clearml-mongo + restart: unless-stopped + command: --setParameter internalQueryMaxBlockingSortMemoryUsageBytes=196100200 + volumes: + - clearml-mongo-data:/data/db + - clearml-mongo-config:/data/configdb + networks: + - clearml-network + + # Elasticsearch - Search and analytics + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.12.0 + container_name: clearml-elasticsearch + restart: unless-stopped + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + - cluster.routing.allocation.disk.threshold_enabled=false + volumes: + - clearml-elastic-data:/usr/share/elasticsearch/data + networks: + - clearml-network + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 + hard: 65536 + + # Redis - Caching and session management + redis: + image: redis:7 + container_name: clearml-redis + restart: unless-stopped + volumes: + - clearml-redis-data:/data + networks: + - clearml-network + + # ClearML API Server + apiserver: + image: allegroai/clearml:latest + container_name: clearml-apiserver + restart: unless-stopped + depends_on: + - mongo + - elasticsearch + - redis + environment: + - CLEARML_HOST_IP=${CLEARML_HOST_IP:-localhost} + - CLEARML__SECURE__CREDENTIALS__APISERVER__access_key=${CLEARML_ACCESS_KEY:-} + - CLEARML__SECURE__CREDENTIALS__APISERVER__secret_key=${CLEARML_SECRET_KEY:-} + - 
CLEARML__APISERVER__DEFAULT_COMPANY=epml-itmo + volumes: + - clearml-logs:/var/log/clearml + - clearml-config:/opt/clearml/config + - clearml-data-fileserver:/mnt/fileserver + ports: + - "8008:8008" + networks: + - clearml-network + + # ClearML Web Server (UI) + webserver: + image: allegroai/clearml:latest + container_name: clearml-webserver + restart: unless-stopped + depends_on: + - apiserver + environment: + - CLEARML_HOST_IP=${CLEARML_HOST_IP:-localhost} + ports: + - "8080:80" + networks: + - clearml-network + + # ClearML File Server + fileserver: + image: allegroai/clearml:latest + container_name: clearml-fileserver + restart: unless-stopped + depends_on: + - apiserver + volumes: + - clearml-data-fileserver:/mnt/fileserver + ports: + - "8081:8081" + networks: + - clearml-network + + # ClearML Agent (optional - for remote execution) + agent: + image: allegroai/clearml-agent:latest + container_name: clearml-agent + restart: unless-stopped + depends_on: + - apiserver + environment: + - CLEARML_API_HOST=http://apiserver:8008 + - CLEARML_WEB_HOST=http://webserver:80 + - CLEARML_FILES_HOST=http://fileserver:8081 + - CLEARML_API_ACCESS_KEY=${CLEARML_ACCESS_KEY:-} + - CLEARML_API_SECRET_KEY=${CLEARML_SECRET_KEY:-} + - CLEARML_AGENT_GIT_USER=${GIT_USER:-} + - CLEARML_AGENT_GIT_PASS=${GIT_PASS:-} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - clearml-agent-data:/root/.clearml + networks: + - clearml-network + profiles: + - agent # Only starts with --profile agent + +networks: + clearml-network: + driver: bridge + +volumes: + clearml-mongo-data: + clearml-mongo-config: + clearml-elastic-data: + clearml-redis-data: + clearml-logs: + clearml-config: + clearml-data-fileserver: + clearml-agent-data: + diff --git a/clearml/env.example b/clearml/env.example new file mode 100644 index 0000000..23c61b6 --- /dev/null +++ b/clearml/env.example @@ -0,0 +1,13 @@ +# ClearML Server Environment Variables +# Copy this file to .env and fill in the values + +# Host IP for 
#!/usr/bin/env python3
"""Helper utility for configuring ClearML in this project.

Supported actions (mutually exclusive; first matching flag wins):
  --init            print interactive configuration instructions
  --test            open a throwaway task to verify server connectivity
  --create-project  register the default project structure
  --status          report config file / env vars / package install state

Run as a script:
    python clearml/setup_clearml.py --status
"""

import argparse
import os
import sys
from pathlib import Path

# Make the repository root importable when the script is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

_BANNER_WIDTH = 60


def _print_header(title: str) -> None:
    """Print the standard section banner shared by every sub-command."""
    print("=" * _BANNER_WIDTH)
    print(title)
    print("=" * _BANNER_WIDTH)
    print()


def init_clearml_config() -> None:
    """Initialize ClearML configuration interactively."""
    _print_header("ClearML Configuration Setup")
    for guide_line in (
        "This will help you configure ClearML for the project.",
        "",
        "Prerequisites:",
        "1. ClearML Server running (docker-compose up -d)",
        "2. Access to ClearML Web UI (http://localhost:8080)",
        "",
        "Steps:",
        "1. Open ClearML Web UI",
        "2. Go to Settings -> Workspace -> Create new credentials",
        "3. Copy the credentials",
        "",
    ):
        print(guide_line)

    # Only the import can raise here, so keep the try-block minimal.
    try:
        import clearml  # noqa: F401
    except ImportError:
        print("ERROR: ClearML not installed. Run: poetry add clearml")
        sys.exit(1)

    print("\nTo configure ClearML, run:")
    print(" clearml-init")
    print()
    print("Or set environment variables:")
    print(" export CLEARML_API_HOST=http://localhost:8008")
    print(" export CLEARML_WEB_HOST=http://localhost:8080")
    print(" export CLEARML_FILES_HOST=http://localhost:8081")
    print(" export CLEARML_API_ACCESS_KEY=")
    print(" export CLEARML_API_SECRET_KEY=")


def test_clearml_connection() -> bool:
    """Probe the ClearML server by opening a throwaway task.

    Returns:
        True when a task could be created and a scalar logged;
        False on any failure (missing package, bad credentials, server down).
    """
    _print_header("Testing ClearML Connection")

    try:
        from typing import Any

        from clearml import Task

        # Creating a real (test-typed) task exercises the full API path.
        probe_task: Any = Task.init(
            project_name="EPML-ITMO/Test",
            task_name="Connection Test",
            task_type=Task.TaskTypes.testing,
            reuse_last_task_id=False,
        )

        print("✓ Successfully connected to ClearML!")
        print(f" Task ID: {probe_task.id}")
        print(f" Project: {probe_task.get_project_name()}")

        # A round-trip scalar proves the logging endpoint works as well.
        probe_task.get_logger().report_scalar(
            title="test", series="connection", value=1, iteration=0
        )

        probe_task.close()
        print("\n✓ Connection test passed!")
        return True
    except Exception as exc:  # broad on purpose: report any failure mode
        print(f"\n✗ Connection test failed: {exc}")
        print("\nTroubleshooting:")
        print("1. Check if ClearML server is running")
        print("2. Verify credentials in ~/clearml.conf")
        print("3. Check network connectivity")
        return False


def create_project() -> None:
    """Register the default ClearML project via a one-off setup task."""
    _print_header("Creating ClearML Project")

    try:
        from typing import Any

        from clearml import Task

        setup_task: Any = Task.init(
            project_name="EPML-ITMO/Wine-Quality",
            task_name="Project Setup",
            task_type=Task.TaskTypes.custom,
            reuse_last_task_id=False,
        )

        # Human-readable description shown in the ClearML web UI.
        setup_task.set_comment(
            """
Wine Quality Classification Project
=====================================

This project uses ClearML for MLOps workflow management.

Models:
- Random Forest
- Gradient Boosting
- Logistic Regression
- SVM
- Decision Tree
- KNN

Dataset:
- Wine Quality (Red Wine) from UCI ML Repository

Experiments Structure:
- EPML-ITMO/Wine-Quality/Experiments - Training experiments
- EPML-ITMO/Wine-Quality/Models - Model registry
- EPML-ITMO/Wine-Quality/Pipelines - ML Pipelines
    """
        )

        setup_task.add_tags(["setup", "project-init"])

        print("✓ Created project: EPML-ITMO/Wine-Quality")
        print(f" Task ID: {setup_task.id}")

        setup_task.close()
        print("\n✓ Project created successfully!")
    except Exception as exc:
        print(f"\n✗ Failed to create project: {exc}")
        sys.exit(1)


def show_status() -> None:
    """Report ClearML configuration state: config file, env vars, package."""
    _print_header("ClearML Configuration Status")

    # Locations where a ClearML config file may live.
    candidate_paths = [
        Path.home() / "clearml.conf",
        Path.home() / ".clearml" / "clearml.conf",
        Path("/etc/clearml.conf"),
    ]

    found_config = next((p for p in candidate_paths if p.exists()), None)
    if found_config is not None:
        print(f"✓ Config file found: {found_config}")
    else:
        print("✗ Config file not found")
        print(" Expected locations:")
        for candidate in candidate_paths:
            print(f" - {candidate}")

    print("\nEnvironment Variables:")
    for var_name in (
        "CLEARML_API_HOST",
        "CLEARML_WEB_HOST",
        "CLEARML_FILES_HOST",
        "CLEARML_API_ACCESS_KEY",
        "CLEARML_API_SECRET_KEY",
    ):
        raw_value = os.environ.get(var_name, "")
        if not raw_value:
            print(f" ✗ {var_name}: not set")
            continue
        if "KEY" in var_name:
            # Never echo secrets: show at most a 4-character prefix.
            shown = raw_value[:4] + "..." if len(raw_value) > 4 else "***"
        else:
            shown = raw_value
        print(f" ✓ {var_name}: {shown}")

    print("\nClearML Package:")
    try:
        import clearml
    except ImportError:
        print(" ✗ Not installed")
    else:
        print(f" ✓ Version: {getattr(clearml, '__version__', 'unknown')}")


def main() -> None:
    """Parse CLI flags and dispatch to the requested action."""
    parser = argparse.ArgumentParser(description="ClearML Setup Utility")
    parser.add_argument("--init", action="store_true", help="Initialize configuration")
    parser.add_argument("--test", action="store_true", help="Test connection")
    parser.add_argument(
        "--create-project", action="store_true", help="Create project structure"
    )
    parser.add_argument(
        "--status", action="store_true", help="Show configuration status"
    )

    args = parser.parse_args()

    # Guard clauses: the first flag present wins, matching the documented order.
    if args.init:
        init_clearml_config()
        return
    if args.test:
        sys.exit(0 if test_clearml_connection() else 1)
    if args.create_project:
        create_project()
        return
    if args.status:
        show_status()
        return

    # No flag given: default to a status report plus a usage reminder.
    show_status()
    print("\nUsage:")
    print(" python clearml/setup_clearml.py --init # Initialize")
    print(" python clearml/setup_clearml.py --test # Test connection")
    print(" python clearml/setup_clearml.py --create-project # Create project")
    print(" python clearml/setup_clearml.py --status # Show status")


if __name__ == "__main__":
    main()
Configuration for Hydra + +# ClearML Server settings +server: + api_host: "http://localhost:8008" + web_host: "http://localhost:8080" + files_host: "http://localhost:8081" + +# Project settings +project: + name: "EPML-ITMO/Wine-Quality" + experiments_subproject: "Experiments" + models_subproject: "Models" + pipelines_subproject: "Pipelines" + +# Experiment settings +experiment: + auto_connect_frameworks: true + auto_log_artifacts: true + offline_mode: false + +# Model settings +model: + output_uri: "outputs/clearml/models" + auto_version: true + register_after_training: true + +# Pipeline settings +pipeline: + default_queue: "default" + local_mode: true + save_results: true + +# Notification settings +notifications: + enabled: true + on_success: true + on_failure: true + diff --git a/conf/config.yaml b/conf/config.yaml new file mode 100644 index 0000000..142ae70 --- /dev/null +++ b/conf/config.yaml @@ -0,0 +1,32 @@ +# Main Hydra configuration file +# This file composes all sub-configurations + +defaults: + - model: random_forest + - data: default + - training: default + - clearml: default + - _self_ + +# MLflow configuration +mlflow: + tracking_uri: "file://${hydra:runtime.cwd}/mlruns" + experiment_name: "wine_quality_hydra" + +# ClearML configuration (overridable) +clearml: + enabled: true + project_name: "EPML-ITMO/Wine-Quality" + offline_mode: false + +# Logging configuration +logging: + level: INFO + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Random seed for reproducibility +seed: 42 + +# Output directory for results +output_dir: "outputs" + diff --git a/conf/data/default.yaml b/conf/data/default.yaml new file mode 100644 index 0000000..8fcc968 --- /dev/null +++ b/conf/data/default.yaml @@ -0,0 +1,13 @@ +# Data configuration +raw_path: "data/raw" +processed_path: "data/processed" +train_file: "train.csv" +test_file: "test.csv" +target_column: "quality" +test_size: 0.2 +random_state: ${seed} + +# Data preprocessing options 
+preprocessing: + normalize: false + handle_missing: "drop" # drop, mean, median diff --git a/conf/model/decision_tree.yaml b/conf/model/decision_tree.yaml new file mode 100644 index 0000000..6dbe3f2 --- /dev/null +++ b/conf/model/decision_tree.yaml @@ -0,0 +1,11 @@ +# Decision Tree model configuration +name: "DecisionTree" +_target_: "sklearn.tree.DecisionTreeClassifier" + +# Model hyperparameters +params: + max_depth: 10 + min_samples_split: 2 + min_samples_leaf: 1 + criterion: "gini" + random_state: ${seed} diff --git a/conf/model/gradient_boosting.yaml b/conf/model/gradient_boosting.yaml new file mode 100644 index 0000000..b7721ad --- /dev/null +++ b/conf/model/gradient_boosting.yaml @@ -0,0 +1,13 @@ +# Gradient Boosting model configuration +name: "GradientBoosting" +_target_: "sklearn.ensemble.GradientBoostingClassifier" + +# Model hyperparameters +params: + n_estimators: 100 + learning_rate: 0.1 + max_depth: 3 + min_samples_split: 2 + min_samples_leaf: 1 + subsample: 1.0 + random_state: ${seed} diff --git a/conf/model/knn.yaml b/conf/model/knn.yaml new file mode 100644 index 0000000..7fb6358 --- /dev/null +++ b/conf/model/knn.yaml @@ -0,0 +1,12 @@ +# K-Nearest Neighbors model configuration +name: "KNN" +_target_: "sklearn.neighbors.KNeighborsClassifier" + +# Model hyperparameters +params: + n_neighbors: 5 + weights: "uniform" + algorithm: "auto" + leaf_size: 30 + p: 2 # Euclidean distance + n_jobs: -1 diff --git a/conf/model/logistic_regression.yaml b/conf/model/logistic_regression.yaml new file mode 100644 index 0000000..4099a20 --- /dev/null +++ b/conf/model/logistic_regression.yaml @@ -0,0 +1,11 @@ +# Logistic Regression model configuration +name: "LogisticRegression" +_target_: "sklearn.linear_model.LogisticRegression" + +# Model hyperparameters +params: + C: 1.0 + penalty: "l2" + solver: "lbfgs" + max_iter: 1000 + random_state: ${seed} diff --git a/conf/model/random_forest.yaml b/conf/model/random_forest.yaml new file mode 100644 index 0000000..15479e5 
--- /dev/null +++ b/conf/model/random_forest.yaml @@ -0,0 +1,14 @@ +# Random Forest model configuration +name: "RandomForest" +_target_: "sklearn.ensemble.RandomForestClassifier" + +# Model hyperparameters +params: + n_estimators: 100 + max_depth: 10 + min_samples_split: 2 + min_samples_leaf: 1 + max_features: "sqrt" + bootstrap: true + random_state: ${seed} + n_jobs: -1 diff --git a/conf/model/svm.yaml b/conf/model/svm.yaml new file mode 100644 index 0000000..b0dd99e --- /dev/null +++ b/conf/model/svm.yaml @@ -0,0 +1,10 @@ +# SVM model configuration +name: "SVM" +_target_: "sklearn.svm.SVC" + +# Model hyperparameters +params: + C: 1.0 + kernel: "rbf" + gamma: "scale" + random_state: ${seed} diff --git a/conf/training/default.yaml b/conf/training/default.yaml new file mode 100644 index 0000000..4fa36f6 --- /dev/null +++ b/conf/training/default.yaml @@ -0,0 +1,20 @@ +# Training configuration +cv_folds: 5 +shuffle: true + +# Metrics to track +metrics: + - accuracy + - precision + - recall + - f1_score + +# Model registration +register_model: true +model_name: "${model.name}" + +# Early stopping (for applicable models) +early_stopping: + enabled: false + patience: 10 + min_delta: 0.001 diff --git a/dvc.lock b/dvc.lock index ea9196f..46084c8 100644 --- a/dvc.lock +++ b/dvc.lock @@ -1,16 +1,26 @@ schema: '2.0' stages: prepare: - cmd: python src/data/make_dataset.py data/raw data/processed + cmd: python -m src.pipelines.prepare_data deps: - - path: data/raw/winequality-red.csv + - path: conf/config.yaml hash: md5 - md5: 2daeecee174368f8a33b82c8cccae3a5 - size: 84199 - - path: src/data/make_dataset.py + md5: 0f693e7eafdf2dffbdecbe1887a39c38 + size: 500 + - path: conf/data/default.yaml hash: md5 - md5: c485f51def978f2d9c6e2f92c5049db4 - size: 1853 + md5: ff90d2a07156af29118ad21498160520 + size: 294 + - path: src/pipelines/prepare_data.py + hash: md5 + md5: 2010975563acf3f8875478837357313c + size: 5368 + params: + conf/config.yaml: + seed: 42 + conf/data/default.yaml: + 
random_state: ${seed} + test_size: 0.2 outs: - path: data/processed hash: md5 @@ -29,3 +39,294 @@ stages: hash: md5 md5: 715de07ddf949c59578ad0717ed3d400 size: 2887 + train_random_forest: + cmd: python -m src.pipelines.train_pipeline model=random_forest + deps: + - path: conf/config.yaml + hash: md5 + md5: 0f693e7eafdf2dffbdecbe1887a39c38 + size: 500 + - path: conf/model/random_forest.yaml + hash: md5 + md5: 009db803cc0475494bbc3ad4c7d2dbf0 + size: 301 + - path: data/processed + hash: md5 + md5: e32cf3ebecf024f29e7403c2720f9f65.dir + size: 92145 + nfiles: 2 + - path: src/pipelines/monitoring.py + hash: md5 + md5: d2e9c4ff1a4843f2a4530913d40f29c0 + size: 9957 + - path: src/pipelines/train_pipeline.py + hash: md5 + md5: 0a67fe036f5c540d1af6435a196c2be3 + size: 11725 + params: + conf/model/random_forest.yaml: + params: + n_estimators: 100 + max_depth: 10 + min_samples_split: 2 + min_samples_leaf: 1 + max_features: sqrt + bootstrap: true + random_state: ${seed} + n_jobs: -1 + outs: + - path: outputs/randomforest/metrics.json + hash: md5 + md5: 8740a394154bf4df9dae5e1678329223 + size: 351 + - path: outputs/randomforest/pipeline_report.json + hash: md5 + md5: c72edc1e12f6495755390b4991efb04d + size: 1612 + train_gradient_boosting: + cmd: python -m src.pipelines.train_pipeline model=gradient_boosting + deps: + - path: conf/config.yaml + hash: md5 + md5: 0f693e7eafdf2dffbdecbe1887a39c38 + size: 500 + - path: conf/model/gradient_boosting.yaml + hash: md5 + md5: b5246eb98037315726a98e29a134f19b + size: 296 + - path: data/processed + hash: md5 + md5: e32cf3ebecf024f29e7403c2720f9f65.dir + size: 92145 + nfiles: 2 + - path: src/pipelines/monitoring.py + hash: md5 + md5: d2e9c4ff1a4843f2a4530913d40f29c0 + size: 9957 + - path: src/pipelines/train_pipeline.py + hash: md5 + md5: 0a67fe036f5c540d1af6435a196c2be3 + size: 11725 + params: + conf/model/gradient_boosting.yaml: + params: + n_estimators: 100 + learning_rate: 0.1 + max_depth: 3 + min_samples_split: 2 + min_samples_leaf: 1 + 
subsample: 1.0 + random_state: ${seed} + outs: + - path: outputs/gradientboosting/metrics.json + hash: md5 + md5: b984009fc498829906a97c5997ffbd32 + size: 348 + - path: outputs/gradientboosting/pipeline_report.json + hash: md5 + md5: 0627eb1efaebdccacb5606ff9490a992 + size: 1602 + train_logistic_regression: + cmd: python -m src.pipelines.train_pipeline model=logistic_regression + deps: + - path: conf/config.yaml + hash: md5 + md5: 0f693e7eafdf2dffbdecbe1887a39c38 + size: 500 + - path: conf/model/logistic_regression.yaml + hash: md5 + md5: d3c0db59f2560b9f0377ee317368c1a1 + size: 238 + - path: data/processed + hash: md5 + md5: e32cf3ebecf024f29e7403c2720f9f65.dir + size: 92145 + nfiles: 2 + - path: src/pipelines/monitoring.py + hash: md5 + md5: d2e9c4ff1a4843f2a4530913d40f29c0 + size: 9957 + - path: src/pipelines/train_pipeline.py + hash: md5 + md5: 0a67fe036f5c540d1af6435a196c2be3 + size: 11725 + params: + conf/model/logistic_regression.yaml: + params: + C: 1.0 + penalty: l2 + solver: lbfgs + max_iter: 1000 + random_state: ${seed} + outs: + - path: outputs/logisticregression/metrics.json + hash: md5 + md5: 3e82c66b555205263cb78b6c40efff72 + size: 359 + - path: outputs/logisticregression/pipeline_report.json + hash: md5 + md5: b84e6951b9d38b73189d58615d9f6113 + size: 1528 + train_svm: + cmd: python -m src.pipelines.train_pipeline model=svm + deps: + - path: conf/config.yaml + hash: md5 + md5: 0f693e7eafdf2dffbdecbe1887a39c38 + size: 500 + - path: conf/model/svm.yaml + hash: md5 + md5: 8989392bf681dba9d9218f2b843d8814 + size: 165 + - path: data/processed + hash: md5 + md5: e32cf3ebecf024f29e7403c2720f9f65.dir + size: 92145 + nfiles: 2 + - path: src/pipelines/monitoring.py + hash: md5 + md5: d2e9c4ff1a4843f2a4530913d40f29c0 + size: 9957 + - path: src/pipelines/train_pipeline.py + hash: md5 + md5: 0a67fe036f5c540d1af6435a196c2be3 + size: 11725 + params: + conf/model/svm.yaml: + params: + C: 1.0 + kernel: rbf + gamma: scale + random_state: ${seed} + outs: + - path: 
outputs/svm/metrics.json + hash: md5 + md5: 1765a3dc33179a2b335b2bd8b3cb3dd6 + size: 343 + - path: outputs/svm/pipeline_report.json + hash: md5 + md5: d5fa3b762f64f06b9bacb57f2e78d8bb + size: 1476 + train_decision_tree: + cmd: python -m src.pipelines.train_pipeline model=decision_tree + deps: + - path: conf/config.yaml + hash: md5 + md5: 0f693e7eafdf2dffbdecbe1887a39c38 + size: 500 + - path: conf/model/decision_tree.yaml + hash: md5 + md5: 8bd548ac7ed28073f3d276ee1fca8fc1 + size: 243 + - path: data/processed + hash: md5 + md5: e32cf3ebecf024f29e7403c2720f9f65.dir + size: 92145 + nfiles: 2 + - path: src/pipelines/monitoring.py + hash: md5 + md5: d2e9c4ff1a4843f2a4530913d40f29c0 + size: 9957 + - path: src/pipelines/train_pipeline.py + hash: md5 + md5: 0a67fe036f5c540d1af6435a196c2be3 + size: 11725 + params: + conf/model/decision_tree.yaml: + params: + max_depth: 10 + min_samples_split: 2 + min_samples_leaf: 1 + criterion: gini + random_state: ${seed} + outs: + - path: outputs/decisiontree/metrics.json + hash: md5 + md5: f2d5834a94d8163089349ec24990022a + size: 344 + - path: outputs/decisiontree/pipeline_report.json + hash: md5 + md5: ffe0cfe4a8c6c1b732ab476d401660dd + size: 1539 + train_knn: + cmd: python -m src.pipelines.train_pipeline model=knn + deps: + - path: conf/config.yaml + hash: md5 + md5: 0f693e7eafdf2dffbdecbe1887a39c38 + size: 500 + - path: conf/model/knn.yaml + hash: md5 + md5: 48a5609baa64a6642e2836d4cd27cc5c + size: 254 + - path: data/processed + hash: md5 + md5: e32cf3ebecf024f29e7403c2720f9f65.dir + size: 92145 + nfiles: 2 + - path: src/pipelines/monitoring.py + hash: md5 + md5: d2e9c4ff1a4843f2a4530913d40f29c0 + size: 9957 + - path: src/pipelines/train_pipeline.py + hash: md5 + md5: 0a67fe036f5c540d1af6435a196c2be3 + size: 11725 + params: + conf/model/knn.yaml: + params: + n_neighbors: 5 + weights: uniform + algorithm: auto + leaf_size: 30 + p: 2 + n_jobs: -1 + outs: + - path: outputs/knn/metrics.json + hash: md5 + md5: 
17ec88c3113f44c1206c220074c26907 + size: 343 + - path: outputs/knn/pipeline_report.json + hash: md5 + md5: 8670988def549f6bdd474760db8e350e + size: 1519 + evaluate: + cmd: python -m src.pipelines.evaluate_models + deps: + - path: outputs/decisiontree/metrics.json + hash: md5 + md5: f2d5834a94d8163089349ec24990022a + size: 344 + - path: outputs/gradientboosting/metrics.json + hash: md5 + md5: b984009fc498829906a97c5997ffbd32 + size: 348 + - path: outputs/knn/metrics.json + hash: md5 + md5: 17ec88c3113f44c1206c220074c26907 + size: 343 + - path: outputs/logisticregression/metrics.json + hash: md5 + md5: 3e82c66b555205263cb78b6c40efff72 + size: 359 + - path: outputs/randomforest/metrics.json + hash: md5 + md5: 8740a394154bf4df9dae5e1678329223 + size: 351 + - path: outputs/svm/metrics.json + hash: md5 + md5: 1765a3dc33179a2b335b2bd8b3cb3dd6 + size: 343 + - path: src/pipelines/evaluate_models.py + hash: md5 + md5: d375924fefbc7c3697dbb28e8943e3e0 + size: 7035 + outs: + - path: outputs/comparison/best_model.json + hash: md5 + md5: df0009a067adc5ce9c34fa9627641f13 + size: 464 + - path: outputs/comparison/metrics_comparison.csv + hash: md5 + md5: c8dd433e3bccfd956775b116f57a4a37 + size: 1075 diff --git a/dvc.yaml b/dvc.yaml index b623072..67506b4 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,15 +1,151 @@ stages: + # Data preparation stage - downloads and splits data prepare: - cmd: python src/data/make_dataset.py data/raw data/processed + cmd: python -m src.pipelines.prepare_data deps: - - data/raw/winequality-red.csv - - src/data/make_dataset.py + - src/pipelines/prepare_data.py + - conf/data/default.yaml + - conf/config.yaml + params: + - conf/config.yaml: + - seed + - conf/data/default.yaml: + - test_size + - random_state outs: + - data/processed: + cache: true + + # Training stage with Random Forest + train_random_forest: + cmd: python -m src.pipelines.train_pipeline model=random_forest + deps: + - data/processed + - src/pipelines/train_pipeline.py + - 
src/pipelines/monitoring.py + - conf/model/random_forest.yaml + - conf/config.yaml + params: + - conf/model/random_forest.yaml: + - params + plots: + - outputs/randomforest/pipeline_report.json: + cache: false + metrics: + - outputs/randomforest/metrics.json: + cache: false + + # Training stage with Gradient Boosting + train_gradient_boosting: + cmd: python -m src.pipelines.train_pipeline model=gradient_boosting + deps: + - data/processed + - src/pipelines/train_pipeline.py + - src/pipelines/monitoring.py + - conf/model/gradient_boosting.yaml + - conf/config.yaml + params: + - conf/model/gradient_boosting.yaml: + - params + plots: + - outputs/gradientboosting/pipeline_report.json: + cache: false + metrics: + - outputs/gradientboosting/metrics.json: + cache: false + + # Training stage with Logistic Regression + train_logistic_regression: + cmd: python -m src.pipelines.train_pipeline model=logistic_regression + deps: - data/processed + - src/pipelines/train_pipeline.py + - src/pipelines/monitoring.py + - conf/model/logistic_regression.yaml + - conf/config.yaml + params: + - conf/model/logistic_regression.yaml: + - params + plots: + - outputs/logisticregression/pipeline_report.json: + cache: false + metrics: + - outputs/logisticregression/metrics.json: + cache: false - train: - cmd: python src/models/train_model.py data/processed + # Training stage with SVM + train_svm: + cmd: python -m src.pipelines.train_pipeline model=svm deps: - data/processed - - src/models/train_model.py + - src/pipelines/train_pipeline.py + - src/pipelines/monitoring.py + - conf/model/svm.yaml + - conf/config.yaml + params: + - conf/model/svm.yaml: + - params + plots: + - outputs/svm/pipeline_report.json: + cache: false + metrics: + - outputs/svm/metrics.json: + cache: false + # Training stage with Decision Tree + train_decision_tree: + cmd: python -m src.pipelines.train_pipeline model=decision_tree + deps: + - data/processed + - src/pipelines/train_pipeline.py + - src/pipelines/monitoring.py + 
- conf/model/decision_tree.yaml + - conf/config.yaml + params: + - conf/model/decision_tree.yaml: + - params + plots: + - outputs/decisiontree/pipeline_report.json: + cache: false + metrics: + - outputs/decisiontree/metrics.json: + cache: false + + # Training stage with KNN + train_knn: + cmd: python -m src.pipelines.train_pipeline model=knn + deps: + - data/processed + - src/pipelines/train_pipeline.py + - src/pipelines/monitoring.py + - conf/model/knn.yaml + - conf/config.yaml + params: + - conf/model/knn.yaml: + - params + plots: + - outputs/knn/pipeline_report.json: + cache: false + metrics: + - outputs/knn/metrics.json: + cache: false + + # Evaluate all models - compare results + evaluate: + cmd: python -m src.pipelines.evaluate_models + deps: + - outputs/randomforest/metrics.json + - outputs/gradientboosting/metrics.json + - outputs/logisticregression/metrics.json + - outputs/svm/metrics.json + - outputs/decisiontree/metrics.json + - outputs/knn/metrics.json + - src/pipelines/evaluate_models.py + metrics: + - outputs/comparison/best_model.json: + cache: false + plots: + - outputs/comparison/metrics_comparison.csv: + cache: false + x: model + y: accuracy diff --git a/poetry.lock b/poetry.lock index 1f32b22..083f5c6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -713,6 +713,39 @@ files = [ {file = "charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a"}, ] +[[package]] +name = "clearml" +version = "2.1.0" +description = "ClearML - Auto-Magical Experiment Manager, Version Control, and MLOps for AI" +optional = false +python-versions = "*" +files = [ + {file = "clearml-2.1.0-py2.py3-none-any.whl", hash = "sha256:877fa5d71806bd6cb38331e945cb475ac5b561acfcbd7468866c729231880704"}, +] + +[package.dependencies] +attrs = ">=18.0" +furl = ">=2.0.0" +jsonschema = ">=2.6.0" +numpy = ">=1.10" +pathlib2 = ">=2.3.0" +Pillow = {version = ">=10.3.0", markers = "python_version >= \"3.8\""} +psutil = ">=3.4.2" +pyjwt = 
">=2.4.0,<2.11.0" +pyparsing = ">=2.0.3" +python-dateutil = ">=2.6.1" +PyYAML = ">=3.12" +referencing = {version = "<0.40", markers = "python_version >= \"3.8\""} +requests = {version = ">=2.32.0", markers = "python_version >= \"3.8\""} +six = ">=1.16.0" +urllib3 = ">=1.21.1" + +[package.extras] +azure = ["azure-storage-blob (>=12.0.0)"] +gs = ["google-cloud-storage (>=1.13.2)"] +router = ["fastapi (>=0.115.2)", "httpx (>=0.27.2)", "uvicorn (>=0.31.1)"] +s3 = ["boto3 (>=1.9)"] + [[package]] name = "click" version = "8.3.1" @@ -1818,6 +1851,21 @@ files = [ {file = "funcy-2.0.tar.gz", hash = "sha256:3963315d59d41c6f30c04bc910e10ab50a3ac4a225868bfa96feed133df075cb"}, ] +[[package]] +name = "furl" +version = "2.1.4" +description = "URL manipulation made simple." +optional = false +python-versions = "*" +files = [ + {file = "furl-2.1.4-py2.py3-none-any.whl", hash = "sha256:da34d0b34e53ffe2d2e6851a7085a05d96922b5b578620a37377ff1dbeeb11c8"}, + {file = "furl-2.1.4.tar.gz", hash = "sha256:877657501266c929269739fb5f5980534a41abd6bbabcb367c136d1d3b2a6015"}, +] + +[package.dependencies] +orderedmultidict = ">=1.0.1" +six = ">=1.8.0" + [[package]] name = "gitdb" version = "4.0.12" @@ -2310,6 +2358,41 @@ files = [ {file = "joblib-1.5.2.tar.gz", hash = "sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55"}, ] +[[package]] +name = "jsonschema" +version = "4.25.1" +description = "An implementation of JSON Schema validation for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "jsonschema-4.25.1-py3-none-any.whl", hash = "sha256:3fba0169e345c7175110351d456342c364814cfcf3b964ba4587f22915230a63"}, + {file = "jsonschema-4.25.1.tar.gz", hash = "sha256:e4a9655ce0da0c0b67a085847e00a3a51449e1157f4f75e9fb5aa545e122eb85"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +jsonschema-specifications = ">=2023.03.6" +referencing = ">=0.28.4" +rpds-py = ">=0.7.1" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", 
"rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "rfc3987-syntax (>=1.1.0)", "uri-template", "webcolors (>=24.6.0)"] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +optional = false +python-versions = ">=3.9" +files = [ + {file = "jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe"}, + {file = "jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d"}, +] + +[package.dependencies] +referencing = ">=0.31.0" + [[package]] name = "jupyter-client" version = "8.6.3" @@ -3279,6 +3362,20 @@ files = [ opentelemetry-api = "1.39.0" typing-extensions = ">=4.5.0" +[[package]] +name = "orderedmultidict" +version = "1.0.2" +description = "Ordered Multivalue Dictionary" +optional = false +python-versions = "*" +files = [ + {file = "orderedmultidict-1.0.2-py2.py3-none-any.whl", hash = "sha256:ab5044c1dca4226ae4c28524cfc5cc4c939f0b49e978efa46a6ad6468049f79b"}, + {file = "orderedmultidict-1.0.2.tar.gz", hash = "sha256:16a7ae8432e02cc987d2d6d5af2df5938258f87c870675c73ee77a0920e6f4a6"}, +] + +[package.dependencies] +six = ">=1.8.0" + [[package]] name = "orjson" version = "3.11.5" @@ -3496,6 +3593,20 @@ files = [ qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] testing = ["docopt", "pytest"] +[[package]] +name = "pathlib2" +version = "2.3.7.post1" +description = "Object-oriented filesystem paths" +optional = false +python-versions = "*" +files = [ + {file = "pathlib2-2.3.7.post1-py2.py3-none-any.whl", hash = "sha256:5266a0fd000452f1b3467d782f079a4343c63aaa119221fbdc4e39577489ca5b"}, + {file = "pathlib2-2.3.7.post1.tar.gz", hash = 
"sha256:9fe0edad898b83c0c3e199c842b27ed216645d2e177757b2dd67384d4113c641"}, +] + +[package.dependencies] +six = "*" + [[package]] name = "pathspec" version = "0.12.1" @@ -4279,6 +4390,23 @@ files = [ {file = "pygtrie-2.5.0.tar.gz", hash = "sha256:203514ad826eb403dab1d2e2ddd034e0d1534bbe4dbe0213bb0593f66beba4e2"}, ] +[[package]] +name = "pyjwt" +version = "2.10.1" +description = "JSON Web Token implementation in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb"}, + {file = "pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953"}, +] + +[package.extras] +crypto = ["cryptography (>=3.4.0)"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] +docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] + [[package]] name = "pyparsing" version = "3.2.5" @@ -4547,6 +4675,22 @@ files = [ [package.dependencies] cffi = {version = "*", markers = "implementation_name == \"pypy\""} +[[package]] +name = "referencing" +version = "0.37.0" +description = "JSON Referencing + Python" +optional = false +python-versions = ">=3.10" +files = [ + {file = "referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231"}, + {file = "referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +rpds-py = ">=0.7.0" +typing-extensions = {version = ">=4.4.0", markers = "python_version < \"3.13\""} + [[package]] name = "requests" version = "2.32.5" @@ -4586,6 +4730,130 @@ pygments = ">=2.13.0,<3.0.0" [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] +[[package]] +name = "rpds-py" +version = "0.30.0" 
+description = "Python bindings to Rust's persistent data structures (rpds)" +optional = false +python-versions = ">=3.10" +files = [ + {file = "rpds_py-0.30.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:679ae98e00c0e8d68a7fda324e16b90fd5260945b45d3b824c892cec9eea3288"}, + {file = "rpds_py-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4cc2206b76b4f576934f0ed374b10d7ca5f457858b157ca52064bdfc26b9fc00"}, + {file = "rpds_py-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:389a2d49eded1896c3d48b0136ead37c48e221b391c052fba3f4055c367f60a6"}, + {file = "rpds_py-0.30.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:32c8528634e1bf7121f3de08fa85b138f4e0dc47657866630611b03967f041d7"}, + {file = "rpds_py-0.30.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f207f69853edd6f6700b86efb84999651baf3789e78a466431df1331608e5324"}, + {file = "rpds_py-0.30.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:67b02ec25ba7a9e8fa74c63b6ca44cf5707f2fbfadae3ee8e7494297d56aa9df"}, + {file = "rpds_py-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0e95f6819a19965ff420f65578bacb0b00f251fefe2c8b23347c37174271f3"}, + {file = "rpds_py-0.30.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:a452763cc5198f2f98898eb98f7569649fe5da666c2dc6b5ddb10fde5a574221"}, + {file = "rpds_py-0.30.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e0b65193a413ccc930671c55153a03ee57cecb49e6227204b04fae512eb657a7"}, + {file = "rpds_py-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:858738e9c32147f78b3ac24dc0edb6610000e56dc0f700fd5f651d0a0f0eb9ff"}, + {file = "rpds_py-0.30.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:da279aa314f00acbb803da1e76fa18666778e8a8f83484fba94526da5de2cba7"}, + {file = "rpds_py-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:7c64d38fb49b6cdeda16ab49e35fe0da2e1e9b34bc38bd78386530f218b37139"}, + {file = "rpds_py-0.30.0-cp310-cp310-win32.whl", hash = "sha256:6de2a32a1665b93233cde140ff8b3467bdb9e2af2b91079f0333a0974d12d464"}, + {file = "rpds_py-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:1726859cd0de969f88dc8673bdd954185b9104e05806be64bcd87badbe313169"}, + {file = "rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425"}, + {file = "rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d"}, + {file = "rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4"}, + {file = "rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f"}, + {file = "rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4"}, + {file = "rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97"}, + {file = "rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89"}, + {file = "rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d"}, + {file = "rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038"}, + {file = "rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7"}, + {file = 
"rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed"}, + {file = "rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85"}, + {file = "rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c"}, + {file = "rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825"}, + {file = "rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229"}, + {file = "rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad"}, + {file = "rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05"}, + {file = "rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28"}, + {file = "rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd"}, + {file = "rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f"}, + {file = "rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1"}, + {file = "rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23"}, + {file = "rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6"}, + {file = 
"rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51"}, + {file = "rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5"}, + {file = "rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e"}, + {file = "rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394"}, + {file = "rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf"}, + {file = "rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b"}, + {file = "rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e"}, + {file = "rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f83424d738204d9770830d35290ff3273fbb02b41f919870479fab14b9d303b2"}, + {file = "rpds_py-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e7536cd91353c5273434b4e003cbda89034d67e7710eab8761fd918ec6c69cf8"}, + {file = "rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2771c6c15973347f50fece41fc447c054b7ac2ae0502388ce3b6738cd366e3d4"}, + {file = "rpds_py-0.30.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0a59119fc6e3f460315fe9d08149f8102aa322299deaa5cab5b40092345c2136"}, + {file = "rpds_py-0.30.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76fec018282b4ead0364022e3c54b60bf368b9d926877957a8624b58419169b7"}, + {file = "rpds_py-0.30.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:692bef75a5525db97318e8cd061542b5a79812d711ea03dbc1f6f8dbb0c5f0d2"}, + {file = 
"rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9027da1ce107104c50c81383cae773ef5c24d296dd11c99e2629dbd7967a20c6"}, + {file = "rpds_py-0.30.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:9cf69cdda1f5968a30a359aba2f7f9aa648a9ce4b580d6826437f2b291cfc86e"}, + {file = "rpds_py-0.30.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4796a717bf12b9da9d3ad002519a86063dcac8988b030e405704ef7d74d2d9d"}, + {file = "rpds_py-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5d4c2aa7c50ad4728a094ebd5eb46c452e9cb7edbfdb18f9e1221f597a73e1e7"}, + {file = "rpds_py-0.30.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba81a9203d07805435eb06f536d95a266c21e5b2dfbf6517748ca40c98d19e31"}, + {file = "rpds_py-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:945dccface01af02675628334f7cf49c2af4c1c904748efc5cf7bbdf0b579f95"}, + {file = "rpds_py-0.30.0-cp313-cp313-win32.whl", hash = "sha256:b40fb160a2db369a194cb27943582b38f79fc4887291417685f3ad693c5a1d5d"}, + {file = "rpds_py-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:806f36b1b605e2d6a72716f321f20036b9489d29c51c91f4dd29a3e3afb73b15"}, + {file = "rpds_py-0.30.0-cp313-cp313-win_arm64.whl", hash = "sha256:d96c2086587c7c30d44f31f42eae4eac89b60dabbac18c7669be3700f13c3ce1"}, + {file = "rpds_py-0.30.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:eb0b93f2e5c2189ee831ee43f156ed34e2a89a78a66b98cadad955972548be5a"}, + {file = "rpds_py-0.30.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:922e10f31f303c7c920da8981051ff6d8c1a56207dbdf330d9047f6d30b70e5e"}, + {file = "rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdc62c8286ba9bf7f47befdcea13ea0e26bf294bda99758fd90535cbaf408000"}, + {file = "rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47f9a91efc418b54fb8190a6b4aa7813a23fb79c51f4bb84e418f5476c38b8db"}, + {file = 
"rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3587eb9b17f3789ad50824084fa6f81921bbf9a795826570bda82cb3ed91f2"}, + {file = "rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39c02563fc592411c2c61d26b6c5fe1e51eaa44a75aa2c8735ca88b0d9599daa"}, + {file = "rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a1234d8febafdfd33a42d97da7a43f5dcb120c1060e352a3fbc0c6d36e2083"}, + {file = "rpds_py-0.30.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:eb2c4071ab598733724c08221091e8d80e89064cd472819285a9ab0f24bcedb9"}, + {file = "rpds_py-0.30.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6bdfdb946967d816e6adf9a3d8201bfad269c67efe6cefd7093ef959683c8de0"}, + {file = "rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c77afbd5f5250bf27bf516c7c4a016813eb2d3e116139aed0096940c5982da94"}, + {file = "rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:61046904275472a76c8c90c9ccee9013d70a6d0f73eecefd38c1ae7c39045a08"}, + {file = "rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27"}, + {file = "rpds_py-0.30.0-cp313-cp313t-win32.whl", hash = "sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6"}, + {file = "rpds_py-0.30.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d"}, + {file = "rpds_py-0.30.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:68f19c879420aa08f61203801423f6cd5ac5f0ac4ac82a2368a9fcd6a9a075e0"}, + {file = "rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ec7c4490c672c1a0389d319b3a9cfcd098dcdc4783991553c332a15acf7249be"}, + {file = "rpds_py-0.30.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f251c812357a3fed308d684a5079ddfb9d933860fc6de89f2b7ab00da481e65f"}, 
+ {file = "rpds_py-0.30.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac98b175585ecf4c0348fd7b29c3864bda53b805c773cbf7bfdaffc8070c976f"}, + {file = "rpds_py-0.30.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3e62880792319dbeb7eb866547f2e35973289e7d5696c6e295476448f5b63c87"}, + {file = "rpds_py-0.30.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e7fc54e0900ab35d041b0601431b0a0eb495f0851a0639b6ef90f7741b39a18"}, + {file = "rpds_py-0.30.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47e77dc9822d3ad616c3d5759ea5631a75e5809d5a28707744ef79d7a1bcfcad"}, + {file = "rpds_py-0.30.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b4dc1a6ff022ff85ecafef7979a2c6eb423430e05f1165d6688234e62ba99a07"}, + {file = "rpds_py-0.30.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4559c972db3a360808309e06a74628b95eaccbf961c335c8fe0d590cf587456f"}, + {file = "rpds_py-0.30.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ed177ed9bded28f8deb6ab40c183cd1192aa0de40c12f38be4d59cd33cb5c65"}, + {file = "rpds_py-0.30.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ad1fa8db769b76ea911cb4e10f049d80bf518c104f15b3edb2371cc65375c46f"}, + {file = "rpds_py-0.30.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:46e83c697b1f1c72b50e5ee5adb4353eef7406fb3f2043d64c33f20ad1c2fc53"}, + {file = "rpds_py-0.30.0-cp314-cp314-win32.whl", hash = "sha256:ee454b2a007d57363c2dfd5b6ca4a5d7e2c518938f8ed3b706e37e5d470801ed"}, + {file = "rpds_py-0.30.0-cp314-cp314-win_amd64.whl", hash = "sha256:95f0802447ac2d10bcc69f6dc28fe95fdf17940367b21d34e34c737870758950"}, + {file = "rpds_py-0.30.0-cp314-cp314-win_arm64.whl", hash = "sha256:613aa4771c99f03346e54c3f038e4cc574ac09a3ddfb0e8878487335e96dead6"}, + {file = "rpds_py-0.30.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7e6ecfcb62edfd632e56983964e6884851786443739dbfe3582947e87274f7cb"}, + {file = 
"rpds_py-0.30.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a1d0bc22a7cdc173fedebb73ef81e07faef93692b8c1ad3733b67e31e1b6e1b8"}, + {file = "rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d08f00679177226c4cb8c5265012eea897c8ca3b93f429e546600c971bcbae7"}, + {file = "rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5965af57d5848192c13534f90f9dd16464f3c37aaf166cc1da1cae1fd5a34898"}, + {file = "rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a4e86e34e9ab6b667c27f3211ca48f73dba7cd3d90f8d5b11be56e5dbc3fb4e"}, + {file = "rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d3e6b26f2c785d65cc25ef1e5267ccbe1b069c5c21b8cc724efee290554419"}, + {file = "rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:626a7433c34566535b6e56a1b39a7b17ba961e97ce3b80ec62e6f1312c025551"}, + {file = "rpds_py-0.30.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:acd7eb3f4471577b9b5a41baf02a978e8bdeb08b4b355273994f8b87032000a8"}, + {file = "rpds_py-0.30.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe5fa731a1fa8a0a56b0977413f8cacac1768dad38d16b3a296712709476fbd5"}, + {file = "rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:74a3243a411126362712ee1524dfc90c650a503502f135d54d1b352bd01f2404"}, + {file = "rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3e8eeb0544f2eb0d2581774be4c3410356eba189529a6b3e36bbbf9696175856"}, + {file = "rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40"}, + {file = "rpds_py-0.30.0-cp314-cp314t-win32.whl", hash = "sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0"}, + {file = "rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = 
"sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4"}, + {file = "rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = 
"sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e"}, + {file = "rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84"}, +] + [[package]] name = "rsa" version = "4.2" @@ -5618,4 +5886,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.12" -content-hash = "a47fd0f19b7dfcb319201abbce2b854839d2af70db4987f9545adb47a4846c82" +content-hash = "b0f43e5aecb8b4273637415f4d1bc6990b8f5e048ff84d4e0d19d74fcab5e0bf" diff --git a/pyproject.toml b/pyproject.toml index 66577f9..2fd4664 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,10 @@ scikit-learn = "^1.7.2" ipykernel = "^7.1.0" dvc = "^3.64.2" mlflow = "^3.7.0" +hydra-core = "^1.3.2" +omegaconf = "^2.3.0" +pydantic = "^2.12.5" +clearml = "^2.1.0" [tool.poetry.group.dev.dependencies] ruff = "^0.14.6" @@ -38,5 +42,20 @@ python_version = "3.12" strict = true ignore_missing_imports = true +[[tool.mypy.overrides]] +module = [ + "clearml", + "clearml.*", + "setup_clearml", +] +ignore_missing_imports = true +ignore_errors = true + +[[tool.mypy.overrides]] +module = [ + "src.clearml_integration.*", +] +disable_error_code = ["attr-defined", "no-untyped-call"] + [tool.bandit] exclude_dirs = ["tests", ".venv"] diff --git a/reports/figures/clearml_comparison.jpg b/reports/figures/clearml_comparison.jpg new file mode 100644 index 0000000..c3a2489 Binary files /dev/null and b/reports/figures/clearml_comparison.jpg differ diff --git a/reports/figures/clearml_experiments.jpg b/reports/figures/clearml_experiments.jpg new file mode 100644 index 0000000..0d292c3 Binary files /dev/null and b/reports/figures/clearml_experiments.jpg differ diff --git a/reports/figures/clearml_models.jpg b/reports/figures/clearml_models.jpg new file mode 100644 index 0000000..8918faa Binary files /dev/null and b/reports/figures/clearml_models.jpg differ diff --git a/reports/figures/mlflow_exps_1.jpg b/reports/figures/mlflow_exps_1.jpg new file mode 
100644 index 0000000..8ceb263 Binary files /dev/null and b/reports/figures/mlflow_exps_1.jpg differ diff --git a/reports/figures/mlflow_exps_2.jpg b/reports/figures/mlflow_exps_2.jpg new file mode 100644 index 0000000..240f3a8 Binary files /dev/null and b/reports/figures/mlflow_exps_2.jpg differ diff --git a/src/clearml_integration/__init__.py b/src/clearml_integration/__init__.py new file mode 100644 index 0000000..d650cbc --- /dev/null +++ b/src/clearml_integration/__init__.py @@ -0,0 +1,31 @@ +""" +ClearML Integration Module for EPML-ITMO Project. + +This module provides comprehensive ClearML integration including: +- Experiment tracking with automatic logging +- Model management and versioning +- Pipeline orchestration +- Dashboard and comparison utilities + +Example usage: + from src.clearml_integration import ClearMLExperiment + + with ClearMLExperiment("my_experiment") as exp: + exp.log_parameters({"lr": 0.01}) + exp.log_metrics({"accuracy": 0.95}) + exp.log_model(model, "best_model") +""" + +from src.clearml_integration.experiment_tracker import ( + ClearMLExperiment, + clearml_experiment, +) +from src.clearml_integration.model_manager import ClearMLModelManager +from src.clearml_integration.pipeline import ClearMLPipeline + +__all__ = [ + "ClearMLExperiment", + "clearml_experiment", + "ClearMLModelManager", + "ClearMLPipeline", +] diff --git a/src/clearml_integration/dashboard.py b/src/clearml_integration/dashboard.py new file mode 100644 index 0000000..8dd41b6 --- /dev/null +++ b/src/clearml_integration/dashboard.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python3 +""" +ClearML Dashboard and Analysis Utilities. 
+ +Provides utilities for: +- Creating experiment dashboards +- Generating comparison reports +- Visualizing model performance +- Exporting analysis results + +Usage: + python -m src.clearml_integration.dashboard --report + python -m src.clearml_integration.dashboard --summary +""" + +from __future__ import annotations + +import argparse +import json +import logging +import sys +from datetime import datetime +from pathlib import Path +from typing import Any + +# Add project root to path +sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + +from src.clearml_integration.experiment_tracker import ( # noqa: E402 + ExperimentComparison, +) +from src.clearml_integration.model_manager import ClearMLModelManager # noqa: E402 + +logger = logging.getLogger(__name__) + + +class ClearMLDashboard: + """ + Dashboard utility for ClearML experiments and models. + + Features: + - Generate experiment summaries + - Create comparison reports + - Export analysis to various formats + """ + + def __init__( + self, + project_name: str = "EPML-ITMO/Wine-Quality", + output_dir: str = "outputs/clearml/dashboard", + ): + """ + Initialize dashboard. + + Args: + project_name: ClearML project name + output_dir: Output directory for reports + """ + self.project_name = project_name + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + + self.experiment_comparison = ExperimentComparison( + project_name=f"{project_name}/Experiments" + ) + self.model_manager = ClearMLModelManager(project_name=f"{project_name}/Models") + + def get_experiments_summary(self) -> dict[str, Any]: + """ + Get summary of all experiments. 
+ + Returns: + Summary dictionary + """ + experiments = self.experiment_comparison.get_experiments() + + summary: dict[str, Any] = { + "total_experiments": len(experiments), + "successful": sum(1 for e in experiments if e.get("status") == "completed"), + "failed": sum(1 for e in experiments if e.get("status") == "failed"), + "running": sum(1 for e in experiments if e.get("status") == "running"), + "experiments": experiments, + } + + # Get best experiment by accuracy + metrics_data = [e.get("metrics", {}) for e in experiments] + accuracies = [ + m.get("classification/accuracy", m.get("metrics/accuracy", 0)) + for m in metrics_data + ] + if accuracies and max(accuracies) > 0: + best_idx = accuracies.index(max(accuracies)) + summary["best_experiment"] = { + "name": experiments[best_idx].get("name"), + "accuracy": max(accuracies), + } + + return summary + + def get_models_summary(self) -> dict[str, Any]: + """ + Get summary of all registered models. + + Returns: + Summary dictionary + """ + all_models = self.model_manager.get_all_models() + + total_versions = sum(len(versions) for versions in all_models.values()) + + summary: dict[str, Any] = { + "total_models": len(all_models), + "total_versions": total_versions, + "models": {}, + } + + for model_name, versions in all_models.items(): + if versions: + latest = max(versions, key=lambda v: v.get("version", 0)) + summary["models"][model_name] = { + "versions": len(versions), + "latest_version": latest.get("version"), + "latest_metrics": latest.get("metrics", {}), + } + + # Get best model + best = self.model_manager.get_best_model(metric="accuracy") + if best: + summary["best_model"] = { + "id": best[0], + "accuracy": best[1].get("metrics", {}).get("accuracy", 0), + } + + return summary + + def generate_full_report(self) -> str: + """ + Generate a comprehensive Markdown report. 
+ + Returns: + Report content as string + """ + timestamp = datetime.now() + + report_lines = [ + "# ClearML Dashboard Report", + f"\n*Generated: {timestamp.strftime('%Y-%m-%d %H:%M:%S')}*\n", + f"*Project: {self.project_name}*\n", + "---\n", + ] + + # Experiments Section + report_lines.append("## 📊 Experiments Summary\n") + exp_summary = self.get_experiments_summary() + report_lines.extend( + [ + f"- **Total Experiments:** {exp_summary.get('total_experiments', 0)}", + f"- **Successful:** {exp_summary.get('successful', 0)}", + f"- **Failed:** {exp_summary.get('failed', 0)}", + f"- **Running:** {exp_summary.get('running', 0)}", + ] + ) + + if exp_summary.get("best_experiment"): + best_exp = exp_summary["best_experiment"] + report_lines.extend( + [ + f"\n**Best Experiment:** {best_exp.get('name')}", + f"- Accuracy: {best_exp.get('accuracy', 0):.4f}", + ] + ) + + report_lines.append("\n") + + # Models Section + report_lines.append("## 🤖 Models Summary\n") + models_summary = self.get_models_summary() + report_lines.extend( + [ + f"- **Total Models:** {models_summary.get('total_models', 0)}", + f"- **Total Versions:** {models_summary.get('total_versions', 0)}", + ] + ) + + if models_summary.get("best_model"): + best_model = models_summary["best_model"] + report_lines.extend( + [ + f"\n**Best Model:** {best_model.get('id')}", + f"- Accuracy: {best_model.get('accuracy', 0):.4f}", + ] + ) + + report_lines.append("\n") + + # Models Comparison Table + if models_summary.get("models"): + report_lines.append("### Model Versions\n") + report_lines.append("| Model | Versions | Latest | Accuracy |") + report_lines.append("|-------|----------|--------|----------|") + + for model_name, info in models_summary.get("models", {}).items(): + versions = info.get("versions", 0) + latest = info.get("latest_version", "-") + accuracy = info.get("latest_metrics", {}).get("accuracy", 0) + report_lines.append( + f"| {model_name} | {versions} | v{latest} | {accuracy:.4f} |" + ) + + 
report_lines.append("\n") + + # Experiment Details + report_lines.append("## 📋 Experiment Details\n") + experiments = exp_summary.get("experiments", []) + if experiments: + report_lines.append("| Name | Status | Created |") + report_lines.append("|------|--------|---------|") + for exp in experiments[:10]: # Limit to 10 + name = exp.get("name", "N/A") + status = exp.get("status", "N/A") + created = exp.get("created", "N/A")[:19] # Trim timestamp + report_lines.append(f"| {name} | {status} | {created} |") + else: + report_lines.append("*No experiments found.*") + + report_lines.append("\n---\n") + + # Footer + report_lines.extend( + [ + "## 🔗 Quick Links\n", + "- **ClearML Web UI:** http://localhost:8080", + "- **ClearML API:** http://localhost:8008", + "- **Project Docs:** README.md", + ] + ) + + report = "\n".join(report_lines) + + # Save report + report_file = self.output_dir / "dashboard_report.md" + with open(report_file, "w") as f: + f.write(report) + + logger.info(f"Report saved to: {report_file}") + return report + + def export_metrics_csv(self) -> Path: + """ + Export all metrics to CSV. + + Returns: + Path to CSV file + """ + # Get model comparison + df = self.model_manager.compare_models() + + if df.empty: + logger.warning("No metrics to export") + return Path() + + output_file = self.output_dir / "metrics_export.csv" + df.to_csv(output_file, index=False) + + logger.info(f"Metrics exported to: {output_file}") + return output_file + + def export_summary_json(self) -> Path: + """ + Export summary to JSON. 
+ + Returns: + Path to JSON file + """ + summary: dict[str, Any] = { + "generated_at": datetime.now().isoformat(), + "project": self.project_name, + "experiments": self.get_experiments_summary(), + "models": self.get_models_summary(), + } + + output_file = self.output_dir / "summary.json" + with open(output_file, "w") as f: + json.dump(summary, f, indent=2, default=str) + + logger.info(f"Summary exported to: {output_file}") + return output_file + + def print_summary(self) -> None: + """Print a quick summary to console.""" + print("\n" + "=" * 60) + print("ClearML Dashboard Summary") + print("=" * 60) + + # Experiments + exp_summary = self.get_experiments_summary() + print("\n📊 Experiments:") + print(f" Total: {exp_summary.get('total_experiments', 0)}") + print(f" Successful: {exp_summary.get('successful', 0)}") + print(f" Failed: {exp_summary.get('failed', 0)}") + + if exp_summary.get("best_experiment"): + best = exp_summary["best_experiment"] + print(f"\n 🏆 Best: {best.get('name')}") + print(f" Accuracy: {best.get('accuracy', 0):.4f}") + + # Models + models_summary = self.get_models_summary() + print("\n🤖 Models:") + print(f" Total: {models_summary.get('total_models', 0)}") + print(f" Versions: {models_summary.get('total_versions', 0)}") + + if models_summary.get("best_model"): + best = models_summary["best_model"] + print(f"\n 🏆 Best: {best.get('id')}") + print(f" Accuracy: {best.get('accuracy', 0):.4f}") + + # Model list + if models_summary.get("models"): + print("\n📋 Model Performance:") + for model_name, info in models_summary.get("models", {}).items(): + acc = info.get("latest_metrics", {}).get("accuracy", 0) + print(f" - {model_name}: {acc:.4f}") + + print("\n" + "=" * 60) + + +def main() -> None: + """Main entry point.""" + parser = argparse.ArgumentParser(description="ClearML Dashboard Utilities") + parser.add_argument("--report", action="store_true", help="Generate full report") + parser.add_argument("--summary", action="store_true", help="Print summary") 
+ parser.add_argument("--export-csv", action="store_true", help="Export metrics CSV") + parser.add_argument( + "--export-json", action="store_true", help="Export summary JSON" + ) + parser.add_argument("--all", action="store_true", help="Run all exports") + + args = parser.parse_args() + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + dashboard = ClearMLDashboard() + + if args.all: + dashboard.print_summary() + dashboard.generate_full_report() + dashboard.export_metrics_csv() + dashboard.export_summary_json() + elif args.report: + report = dashboard.generate_full_report() + print(report) + elif args.export_csv: + dashboard.export_metrics_csv() + elif args.export_json: + dashboard.export_summary_json() + else: + dashboard.print_summary() + + +if __name__ == "__main__": + main() diff --git a/src/clearml_integration/experiment_tracker.py b/src/clearml_integration/experiment_tracker.py new file mode 100644 index 0000000..2314991 --- /dev/null +++ b/src/clearml_integration/experiment_tracker.py @@ -0,0 +1,683 @@ +""" +ClearML Experiment Tracker Module. 
+ +Provides automatic experiment tracking with ClearML including: +- Parameter logging +- Metric logging with plots +- Artifact management +- Comparison dashboards +- Automatic scikit-learn integration +""" + +from __future__ import annotations + +import functools +import json +import logging +import pickle # nosec B403 +import time +from collections.abc import Callable +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator +from sklearn.metrics import ( + accuracy_score, + classification_report, + confusion_matrix, + f1_score, + precision_score, + recall_score, +) + +if TYPE_CHECKING: + from clearml import Task as ClearMLTask + from clearml.logger import Logger as ClearMLLogger + +logger = logging.getLogger(__name__) + + +class ClearMLExperiment: + """ + ClearML Experiment Tracker with automatic logging support. + + Features: + - Automatic parameter and metric logging + - Sklearn model auto-logging + - Artifact and model versioning + - Comparison dashboards + - Offline mode support + + Example: + with ClearMLExperiment("my_experiment") as exp: + exp.log_parameters({"n_estimators": 100}) + model.fit(X_train, y_train) + exp.log_model(model, "random_forest") + exp.log_metrics({"accuracy": 0.95}) + """ + + def __init__( + self, + experiment_name: str, + project_name: str = "EPML-ITMO/Wine-Quality", + task_type: str = "training", + tags: list[str] | None = None, + auto_connect_frameworks: bool = True, + offline_mode: bool = False, + ): + """ + Initialize ClearML experiment. + + Args: + experiment_name: Name of the experiment + project_name: ClearML project name + task_type: Type of task (training, testing, inference, etc.) 
+ tags: List of tags for the experiment + auto_connect_frameworks: Enable automatic framework logging + offline_mode: Run in offline mode (no server connection required) + """ + self.experiment_name = experiment_name + self.project_name = project_name + self.task_type = task_type + self.tags = tags or [] + self.auto_connect_frameworks = auto_connect_frameworks + self.offline_mode = offline_mode + self.task: ClearMLTask | None = None + self._logger: ClearMLLogger | None = None + self._start_time: float | None = None + + def __enter__(self) -> ClearMLExperiment: + """Start the experiment context.""" + self.start() + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + """End the experiment context.""" + success = exc_type is None + self.end(success=success) + + def start(self) -> None: + """Start the ClearML experiment.""" + try: + from clearml import Task + + # Set offline mode if specified + if self.offline_mode: + Task.set_offline(offline_mode=True) + + # Map task type string to Task.TaskTypes + task_types = { + "training": Task.TaskTypes.training, + "testing": Task.TaskTypes.testing, + "inference": Task.TaskTypes.inference, + "data_processing": Task.TaskTypes.data_processing, + "qc": Task.TaskTypes.qc, + "service": Task.TaskTypes.service, + "optimizer": Task.TaskTypes.optimizer, + "monitor": Task.TaskTypes.monitor, + "controller": Task.TaskTypes.controller, + "application": Task.TaskTypes.application, + "custom": Task.TaskTypes.custom, + } + + task_type_enum = task_types.get(self.task_type, Task.TaskTypes.training) + + # Create task + self.task = Task.init( + project_name=self.project_name, + task_name=self.experiment_name, + task_type=task_type_enum, + auto_connect_frameworks=self.auto_connect_frameworks, + reuse_last_task_id=False, + ) + + # Add tags + if self.tags and self.task: + self.task.add_tags(self.tags) + + if self.task: + self._logger = self.task.get_logger() + + self._start_time = time.time() + + 
logger.info(f"Started ClearML experiment: {self.experiment_name}") + if self.task: + logger.info(f"Task ID: {self.task.id}") + + except Exception as e: + logger.warning(f"Failed to start ClearML experiment: {e}") + logger.info("Running in local mode without ClearML tracking") + self.task = None + self._start_time = time.time() + + def end(self, success: bool = True) -> None: + """ + End the ClearML experiment. + + Args: + success: Whether the experiment completed successfully + """ + if self._start_time: + duration = time.time() - self._start_time + self.log_metric("total_duration_seconds", duration) + + if self.task: + try: + if not success: + self.task.mark_failed() + self.task.close() + logger.info(f"Closed ClearML experiment: {self.experiment_name}") + except Exception as e: + logger.warning(f"Error closing ClearML task: {e}") + + def log_parameters(self, params: dict[str, Any], prefix: str = "") -> None: + """ + Log parameters to the experiment. + + Args: + params: Dictionary of parameters + prefix: Optional prefix for parameter names + """ + if self.task: + try: + # Add prefix if specified + if prefix: + params = {f"{prefix}/{k}": v for k, v in params.items()} + self.task.connect(params) + logger.debug(f"Logged parameters: {list(params.keys())}") + except Exception as e: + logger.warning(f"Failed to log parameters: {e}") + + def log_metric( + self, name: str, value: float, series: str = "metrics", iteration: int = 0 + ) -> None: + """ + Log a single metric. 
+
+        Args:
+            name: Metric name
+            value: Metric value
+            series: Series name for grouping
+            iteration: Iteration number
+        """
+        if self._logger:
+            try:
+                self._logger.report_scalar(
+                    title=series, series=name, value=value, iteration=iteration
+                )
+                logger.debug(f"Logged metric {name}: {value}")
+            except Exception as e:
+                logger.warning(f"Failed to log metric {name}: {e}")
+
+    def log_metrics(
+        self,
+        metrics: dict[str, float],
+        series: str = "metrics",
+        iteration: int = 0,
+    ) -> None:
+        """
+        Log multiple metrics.
+
+        Args:
+            metrics: Dictionary of metrics
+            series: Series name for grouping
+            iteration: Iteration number
+        """
+        for name, value in metrics.items():
+            self.log_metric(name, value, series=series, iteration=iteration)
+
+    def log_plot(
+        self,
+        title: str,
+        series: str,
+        x: list[float] | np.ndarray,
+        y: list[float] | np.ndarray,
+        xlabel: str = "x",
+        ylabel: str = "y",
+    ) -> None:
+        """
+        Log a 2D plot.
+
+        Args:
+            title: Plot title
+            series: Series name
+            x: X values
+            y: Y values
+            xlabel: X-axis label
+            ylabel: Y-axis label
+        """
+        if self._logger:
+            try:
+                self._logger.report_scatter2d(
+                    title=title,
+                    series=series,
+                    iteration=0,
+                    scatter=np.column_stack((np.asarray(x), np.asarray(y))),
+                    xaxis=xlabel, yaxis=ylabel,
+                    mode="lines+markers",
+                )
+                logger.debug(f"Logged plot: {title}")
+            except Exception as e:
+                logger.warning(f"Failed to log plot: {e}")
+
+    def log_confusion_matrix(
+        self,
+        y_true: np.ndarray | list[Any],
+        y_pred: np.ndarray | list[Any],
+        labels: list[str] | None = None,
+        title: str = "Confusion Matrix",
+    ) -> None:
+        """
+        Log a confusion matrix. 
+ + Args: + y_true: True labels + y_pred: Predicted labels + labels: Optional class labels + title: Plot title + """ + if self._logger: + try: + cm = confusion_matrix(y_true, y_pred) + self._logger.report_confusion_matrix( + title=title, + series="Confusion Matrix", + matrix=cm, + xlabels=labels, + ylabels=labels, + iteration=0, + ) + logger.debug("Logged confusion matrix") + except Exception as e: + logger.warning(f"Failed to log confusion matrix: {e}") + + def log_classification_report( + self, + y_true: np.ndarray | list[Any], + y_pred: np.ndarray | list[Any], + target_names: list[str] | None = None, + ) -> dict[str, Any]: + """ + Log classification metrics and report. + + Args: + y_true: True labels + y_pred: Predicted labels + target_names: Optional class names + + Returns: + Dictionary with classification metrics + """ + # Calculate metrics + metrics = { + "accuracy": float(accuracy_score(y_true, y_pred)), + "precision_weighted": float( + precision_score(y_true, y_pred, average="weighted", zero_division=0) + ), + "recall_weighted": float( + recall_score(y_true, y_pred, average="weighted", zero_division=0) + ), + "f1_weighted": float( + f1_score(y_true, y_pred, average="weighted", zero_division=0) + ), + "precision_macro": float( + precision_score(y_true, y_pred, average="macro", zero_division=0) + ), + "recall_macro": float( + recall_score(y_true, y_pred, average="macro", zero_division=0) + ), + "f1_macro": float( + f1_score(y_true, y_pred, average="macro", zero_division=0) + ), + } + + # Log metrics + self.log_metrics(metrics, series="classification") + + # Log confusion matrix + self.log_confusion_matrix(y_true, y_pred, labels=target_names) + + # Log classification report as text + report_text = classification_report( + y_true, y_pred, target_names=target_names, zero_division=0 + ) + if self._logger: + self._logger.report_text(report_text) + + return metrics + + def log_model( + self, + model: BaseEstimator, + model_name: str, + framework: str = "sklearn", 
+ metadata: dict[str, Any] | None = None, + ) -> str | None: + """ + Log a trained model. + + Args: + model: Trained model object + model_name: Name for the model + framework: ML framework (sklearn, pytorch, etc.) + metadata: Optional metadata dictionary + + Returns: + Model ID if successful, None otherwise + """ + if self.task: + try: + import joblib + + from clearml import OutputModel + + # Create output model + output_model = OutputModel(task=self.task, framework=framework) + + # Save model to temporary file + output_dir = Path("outputs/clearml/models") + output_dir.mkdir(parents=True, exist_ok=True) + + model_path = output_dir / f"{model_name}_{datetime.now():%Y%m%d_%H%M%S}" + + # Use joblib for sklearn models + if framework == "sklearn": + model_file = str(model_path) + ".joblib" + joblib.dump(model, model_file) + else: + # Generic pickle + model_file = str(model_path) + ".pkl" + with open(model_file, "wb") as f: + pickle.dump(model, f) # nosec B301 + + # Update model with file + output_model.update_weights(model_file) + + # Add metadata as labels + if metadata: + output_model.update_labels(metadata) + + logger.info(f"Logged model: {model_name}") + return str(output_model.id) + + except Exception as e: + logger.warning(f"Failed to log model: {e}") + + return None + + def log_artifact( + self, + name: str, + artifact: Any, + artifact_type: str = "data", + ) -> None: + """ + Log an artifact (data, file, etc.). + + Args: + name: Artifact name + artifact: Artifact object (DataFrame, dict, path, etc.) 
+ artifact_type: Type hint for the artifact + """ + if self.task: + try: + if isinstance(artifact, pd.DataFrame): + self.task.upload_artifact(name=name, artifact_object=artifact) + elif isinstance(artifact, dict): + self.task.upload_artifact( + name=name, artifact_object=json.dumps(artifact, indent=2) + ) + elif isinstance(artifact, str | Path): + self.task.upload_artifact(name=name, artifact_object=str(artifact)) + else: + self.task.upload_artifact(name=name, artifact_object=artifact) + + logger.debug(f"Logged artifact: {name}") + except Exception as e: + logger.warning(f"Failed to log artifact {name}: {e}") + + def log_dataset( + self, + train_df: pd.DataFrame | None = None, + test_df: pd.DataFrame | None = None, + name: str = "dataset", + ) -> None: + """ + Log training and test datasets. + + Args: + train_df: Training DataFrame + test_df: Test DataFrame + name: Dataset name prefix + """ + if train_df is not None: + self.log_artifact(f"{name}_train", train_df) + self.log_parameters( + { + f"{name}_train_shape": str(train_df.shape), + f"{name}_train_columns": list(train_df.columns), + }, + prefix="data", + ) + + if test_df is not None: + self.log_artifact(f"{name}_test", test_df) + self.log_parameters( + { + f"{name}_test_shape": str(test_df.shape), + }, + prefix="data", + ) + + def set_comment(self, comment: str) -> None: + """ + Set experiment comment/description. + + Args: + comment: Comment text + """ + if self.task: + try: + self.task.set_comment(comment) + except Exception as e: + logger.warning(f"Failed to set comment: {e}") + + def get_task_id(self) -> str | None: + """Get the ClearML task ID.""" + return str(self.task.id) if self.task else None + + +def clearml_experiment( + experiment_name: str | None = None, + project_name: str = "EPML-ITMO/Wine-Quality", + task_type: str = "training", + tags: list[str] | None = None, +) -> Callable[..., Any]: + """ + Decorator to wrap a function in a ClearML experiment. 
+ + Args: + experiment_name: Name of the experiment (defaults to function name) + project_name: ClearML project name + task_type: Type of task + tags: List of tags + + Returns: + Decorated function + """ + + def decorator(func: Callable[..., Any]) -> Callable[..., Any]: + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + exp_name = experiment_name or func.__name__ + + with ClearMLExperiment( + experiment_name=exp_name, + project_name=project_name, + task_type=task_type, + tags=tags, + ) as exp: + # Inject experiment into kwargs if function accepts it + import inspect + + sig = inspect.signature(func) + if "clearml_experiment" in sig.parameters: + kwargs["clearml_experiment"] = exp + + return func(*args, **kwargs) + + return wrapper + + return decorator + + +class ExperimentComparison: + """ + Utility for comparing multiple ClearML experiments. + + Features: + - Compare metrics across experiments + - Generate comparison reports + - Create comparison plots + """ + + def __init__(self, project_name: str = "EPML-ITMO/Wine-Quality"): + """ + Initialize experiment comparison utility. + + Args: + project_name: ClearML project name to search + """ + self.project_name = project_name + + def get_experiments( + self, + tags: list[str] | None = None, + status: str | None = None, + limit: int = 100, + ) -> list[dict[str, Any]]: + """ + Get experiments from ClearML. 
+
+        Args:
+            tags: Filter by tags
+            status: Filter by status
+            limit: Maximum number of experiments
+
+        Returns:
+            List of experiment dictionaries
+        """
+        try:
+            from clearml import Task
+
+            # Get tasks from project
+            tasks: list[Any] = Task.get_tasks(
+                project_name=self.project_name,
+                tags=tags,
+                task_filter={"status": [status]} if status else None,
+            )
+
+            experiments: list[dict[str, Any]] = []
+            for task in tasks[:limit]:
+                exp_data: dict[str, Any] = {
+                    "id": task.id,
+                    "name": task.name,
+                    "status": task.status,
+                    "created": str(task.data.created),
+                    "tags": list(task.get_tags()),
+                    "parameters": task.get_parameters(),
+                    "metrics": {},
+                }
+
+                # get_last_scalar_metrics() nests values: {title: {series: {"last": ...}}}
+                try:
+                    for title, sd in task.get_last_scalar_metrics().items():
+                        for series, v in sd.items():
+                            last = v.get("last") if isinstance(v, dict) else v
+                            exp_data["metrics"][f"{title}/{series}"] = last
+                except Exception:  # nosec B110
+                    pass
+
+                experiments.append(exp_data)
+
+            return experiments
+
+        except Exception as e:
+            logger.warning(f"Failed to get experiments: {e}")
+            return []
+
+    def compare_metrics(
+        self,
+        experiment_ids: list[str] | None = None,
+        metric_names: list[str] | None = None,
+    ) -> pd.DataFrame:
+        """
+        Compare metrics across experiments. 
+
+        Args:
+            experiment_ids: List of experiment IDs to compare
+            metric_names: Specific metrics to compare
+
+        Returns:
+            DataFrame with comparison results
+        """
+        experiments = self.get_experiments()
+
+        if experiment_ids:
+            experiments = [e for e in experiments if e["id"] in experiment_ids]
+
+        # Build comparison DataFrame
+        rows = []
+        for exp in experiments:
+            row: dict[str, Any] = {
+                "experiment_id": exp["id"],
+                "experiment_name": exp["name"],
+                "status": exp["status"],
+            }
+            row.update(exp["metrics"])
+            rows.append(row)
+
+        df = pd.DataFrame(rows)
+
+        # Filter columns if specific metrics requested (empty df has no columns)
+        if metric_names and not df.empty:
+            cols = ["experiment_id", "experiment_name", "status"] + [
+                c for c in df.columns if any(m in c for m in metric_names)
+            ]
+            df = df[cols]
+
+        return df
+
+    def generate_report(
+        self,
+        output_path: str | Path = "outputs/clearml/comparison_report.json",
+    ) -> dict[str, Any]:
+        """
+        Generate a comparison report for all experiments.
+
+        Args:
+            output_path: Path to save the report
+
+        Returns:
+            Report dictionary
+        """
+        experiments = self.get_experiments()
+
+        report: dict[str, Any] = {
+            "generated_at": datetime.now().isoformat(),
+            "project": self.project_name,
+            "total_experiments": len(experiments),
+            "experiments": experiments,
+        }
+
+        # Save report
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, "w") as f:
+            json.dump(report, f, indent=2, default=str)
+
+        logger.info(f"Comparison report saved to: {output_path}")
+        return report
diff --git a/src/clearml_integration/model_manager.py b/src/clearml_integration/model_manager.py
new file mode 100644
index 0000000..5cc193f
--- /dev/null
+++ b/src/clearml_integration/model_manager.py
@@ -0,0 +1,560 @@
+"""
+ClearML Model Manager Module. 
+ +Provides comprehensive model management including: +- Model registration and versioning +- Metadata management +- Model comparison +- Automatic version control +- Model deployment support +""" + +from __future__ import annotations + +import json +import logging +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import joblib +import pandas as pd +from sklearn.base import BaseEstimator + +if TYPE_CHECKING: + pass + +logger = logging.getLogger(__name__) + + +class ClearMLModelManager: + """ + Model Manager for ClearML with versioning and metadata support. + + Features: + - Automatic model versioning + - Rich metadata support + - Model comparison utilities + - Deployment-ready model export + - Model lineage tracking + + Example: + manager = ClearMLModelManager("Wine-Quality-Models") + model_id = manager.register_model( + model=trained_model, + model_name="RandomForest", + metrics={"accuracy": 0.95}, + parameters={"n_estimators": 100} + ) + """ + + def __init__( + self, + project_name: str = "EPML-ITMO/Wine-Quality/Models", + output_uri: str = "outputs/clearml/models", + ): + """ + Initialize Model Manager. 
+ + Args: + project_name: ClearML project for models + output_uri: Local output directory for models + """ + self.project_name = project_name + self.output_uri = Path(output_uri) + self.output_uri.mkdir(parents=True, exist_ok=True) + self._models_registry: dict[str, list[dict[str, Any]]] = {} + self._registry_file = self.output_uri / "model_registry.json" + self._load_registry() + + def _load_registry(self) -> None: + """Load local model registry.""" + if self._registry_file.exists(): + with open(self._registry_file) as f: + self._models_registry = json.load(f) + else: + self._models_registry = {} + + def _save_registry(self) -> None: + """Save local model registry.""" + with open(self._registry_file, "w") as f: + json.dump(self._models_registry, f, indent=2, default=str) + + def register_model( + self, + model: BaseEstimator, + model_name: str, + metrics: dict[str, float] | None = None, + parameters: dict[str, Any] | None = None, + tags: list[str] | None = None, + description: str = "", + framework: str = "sklearn", + task_id: str | None = None, + ) -> str: + """ + Register a trained model with ClearML. + + Args: + model: Trained model object + model_name: Name of the model + metrics: Model performance metrics + parameters: Model hyperparameters + tags: Tags for categorization + description: Model description + framework: ML framework (sklearn, pytorch, etc.) 
+ task_id: Optional ClearML task ID to associate + + Returns: + Model ID + """ + timestamp = datetime.now() + version = self._get_next_version(model_name) + model_id = f"{model_name}_v{version}_{timestamp:%Y%m%d_%H%M%S}" + + # Save model locally + model_dir = self.output_uri / model_name / f"v{version}" + model_dir.mkdir(parents=True, exist_ok=True) + + model_path = model_dir / f"{model_id}.joblib" + joblib.dump(model, model_path) + + # Create metadata + metadata: dict[str, Any] = { + "model_id": model_id, + "model_name": model_name, + "version": version, + "framework": framework, + "created_at": timestamp.isoformat(), + "model_path": str(model_path), + "metrics": metrics or {}, + "parameters": parameters or {}, + "tags": tags or [], + "description": description, + "task_id": task_id, + "model_class": type(model).__name__, + } + + # Save metadata + metadata_path = model_dir / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2) + + # Update registry + if model_name not in self._models_registry: + self._models_registry[model_name] = [] + self._models_registry[model_name].append(metadata) + self._save_registry() + + # Register with ClearML if available + clearml_model_id = self._register_with_clearml( + model_path=model_path, + model_name=model_name, + version=version, + metadata=metadata, + framework=framework, + task_id=task_id, + ) + + if clearml_model_id: + metadata["clearml_model_id"] = clearml_model_id + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2) + + logger.info(f"Registered model: {model_id}") + return model_id + + def _get_next_version(self, model_name: str) -> int: + """Get next version number for a model.""" + if model_name not in self._models_registry: + return 1 + versions = [m["version"] for m in self._models_registry[model_name]] + return int(max(versions)) + 1 if versions else 1 + + def _register_with_clearml( + self, + model_path: Path, + model_name: str, + version: int, + metadata: 
dict[str, Any], + framework: str, + task_id: str | None = None, + ) -> str | None: + """ + Register model with ClearML server. + + Args: + model_path: Path to saved model + model_name: Model name + version: Version number + metadata: Model metadata + framework: ML framework + task_id: Optional task ID + + Returns: + ClearML model ID if successful + """ + try: + from clearml import OutputModel, Task + + # Create or get task + task: Any + if task_id: + task = Task.get_task(task_id=task_id) + else: + task = Task.init( + project_name=self.project_name, + task_name=f"Register {model_name} v{version}", + task_type=Task.TaskTypes.custom, + reuse_last_task_id=False, + ) + + # Create output model + output_model = OutputModel( + task=task, + name=f"{model_name}_v{version}", + framework=framework, + comment=metadata.get("description", ""), + ) + + # Upload model weights + output_model.update_weights(weights_filename=str(model_path)) + + # Add labels (metadata) + labels = { + "version": str(version), + "model_class": metadata.get("model_class", ""), + **{ + f"metric_{k}": str(v) + for k, v in metadata.get("metrics", {}).items() + }, + **{ + f"param_{k}": str(v) + for k, v in metadata.get("parameters", {}).items() + }, + } + output_model.update_labels(labels) + + # Add tags + for tag in metadata.get("tags", []): + task.add_tags([tag]) + + if not task_id: + task.close() + + logger.info(f"Registered model with ClearML: {output_model.id}") + return str(output_model.id) + + except Exception as e: + logger.warning(f"Failed to register with ClearML: {e}") + return None + + def load_model( + self, + model_name: str, + version: int | str = "latest", + ) -> tuple[BaseEstimator, dict[str, Any]]: + """ + Load a registered model. 
+ + Args: + model_name: Model name + version: Version number or "latest" + + Returns: + Tuple of (model, metadata) + """ + if model_name not in self._models_registry: + raise ValueError(f"Model '{model_name}' not found in registry") + + models = self._models_registry[model_name] + + if version == "latest": + model_meta = max(models, key=lambda m: m["version"]) + else: + model_meta_found = next( + (m for m in models if m["version"] == version), None + ) + if not model_meta_found: + raise ValueError( + f"Version {version} not found for model '{model_name}'" + ) + model_meta = model_meta_found + + model_path = Path(model_meta["model_path"]) + if not model_path.exists(): + raise FileNotFoundError(f"Model file not found: {model_path}") + + model: BaseEstimator = joblib.load(model_path) + logger.info(f"Loaded model: {model_name} v{model_meta['version']}") + + return model, model_meta + + def get_model_versions(self, model_name: str) -> list[dict[str, Any]]: + """ + Get all versions of a model. + + Args: + model_name: Model name + + Returns: + List of version metadata + """ + return self._models_registry.get(model_name, []) + + def get_all_models(self) -> dict[str, list[dict[str, Any]]]: + """ + Get all registered models. + + Returns: + Dictionary of model name -> versions + """ + return self._models_registry.copy() + + def compare_models( + self, + model_names: list[str] | None = None, + metric: str = "accuracy", + ) -> pd.DataFrame: + """ + Compare models by a specific metric. 
+ + Args: + model_names: List of model names to compare (None = all) + metric: Metric to compare + + Returns: + DataFrame with comparison results + """ + rows: list[dict[str, Any]] = [] + + models_to_compare = model_names or list(self._models_registry.keys()) + + for model_name in models_to_compare: + versions = self._models_registry.get(model_name, []) + for version_meta in versions: + row: dict[str, Any] = { + "model_name": model_name, + "version": version_meta["version"], + "created_at": version_meta["created_at"], + "model_class": version_meta.get("model_class", ""), + } + + # Add all metrics + for metric_name, metric_value in version_meta.get( + "metrics", {} + ).items(): + row[metric_name] = metric_value + + rows.append(row) + + df = pd.DataFrame(rows) + + # Sort by specified metric if available + if metric in df.columns: + df = df.sort_values(metric, ascending=False) + + return df + + def get_best_model( + self, + model_name: str | None = None, + metric: str = "accuracy", + higher_is_better: bool = True, + ) -> tuple[str, dict[str, Any]] | None: + """ + Get the best model by a specific metric. 
+ + Args: + model_name: Specific model name (None = search all) + metric: Metric to optimize + higher_is_better: Whether higher metric values are better + + Returns: + Tuple of (model_id, metadata) or None + """ + comparison = self.compare_models( + model_names=[model_name] if model_name else None, + metric=metric, + ) + + if comparison.empty or metric not in comparison.columns: + return None + + if higher_is_better: + best_idx = comparison[metric].idxmax() + else: + best_idx = comparison[metric].idxmin() + + best_row = comparison.loc[best_idx] + + best_model_name = str(best_row["model_name"]) + version = best_row["version"] + + versions = self._models_registry.get(best_model_name, []) + metadata = next((m for m in versions if m["version"] == version), None) + + if metadata: + return str(metadata["model_id"]), metadata + + return None + + def delete_model(self, model_name: str, version: int | None = None) -> bool: + """ + Delete a model version or all versions. + + Args: + model_name: Model name + version: Specific version to delete (None = all) + + Returns: + True if successful + """ + if model_name not in self._models_registry: + logger.warning(f"Model '{model_name}' not found") + return False + + if version is None: + # Delete all versions + for version_meta in self._models_registry[model_name]: + model_path = Path(version_meta["model_path"]) + if model_path.exists(): + model_path.unlink() + del self._models_registry[model_name] + else: + # Delete specific version + versions = self._models_registry[model_name] + for i, version_meta in enumerate(versions): + if version_meta["version"] == version: + model_path = Path(version_meta["model_path"]) + if model_path.exists(): + model_path.unlink() + versions.pop(i) + break + + self._save_registry() + logger.info(f"Deleted model: {model_name} v{version or 'all'}") + return True + + def export_model( + self, + model_name: str, + version: int | str = "latest", + export_path: str | Path | None = None, + include_metadata: 
bool = True, + ) -> Path: + """ + Export a model for deployment. + + Args: + model_name: Model name + version: Version to export + export_path: Export destination + include_metadata: Include metadata file + + Returns: + Path to exported model + """ + model, metadata = self.load_model(model_name, version) + + if export_path is None: + export_path = self.output_uri / "exports" / model_name + + export_path = Path(export_path) + export_path.mkdir(parents=True, exist_ok=True) + + # Export model + model_file = export_path / f"{model_name}_v{metadata['version']}.joblib" + joblib.dump(model, model_file) + + # Export metadata + if include_metadata: + metadata_file = export_path / "metadata.json" + with open(metadata_file, "w") as f: + json.dump(metadata, f, indent=2) + + logger.info(f"Exported model to: {export_path}") + return export_path + + def generate_model_report( + self, + output_path: str | Path = "outputs/clearml/model_report.md", + ) -> str: + """ + Generate a Markdown report of all models. 
+ + Args: + output_path: Path to save the report + + Returns: + Report content + """ + report_lines = [ + "# Model Registry Report", + f"\n*Generated: {datetime.now().isoformat()}*\n", + "## Summary\n", + f"- Total Models: {len(self._models_registry)}", + f"- Total Versions: {sum(len(v) for v in self._models_registry.values())}", + "\n## Models\n", + ] + + for model_name, versions in self._models_registry.items(): + report_lines.append(f"### {model_name}\n") + report_lines.append(f"- Versions: {len(versions)}") + + if versions: + latest = max(versions, key=lambda m: m["version"]) + report_lines.append(f"- Latest Version: v{latest['version']}") + report_lines.append(f"- Latest Created: {latest['created_at']}") + + if latest.get("metrics"): + report_lines.append("\n**Latest Metrics:**\n") + for metric, value in latest["metrics"].items(): + report_lines.append(f"- {metric}: {value:.4f}") + + report_lines.append("") + + # Add comparison table + comparison = self.compare_models() + if not comparison.empty: + report_lines.append("## Model Comparison\n") + report_lines.append(comparison.to_markdown(index=False)) + + report = "\n".join(report_lines) + + # Save report + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + f.write(report) + + logger.info(f"Model report saved to: {output_path}") + return report + + def sync_with_clearml(self) -> int: + """ + Sync local registry with ClearML server. 
+ + Returns: + Number of models synced + """ + synced = 0 + + try: + from clearml import Model + + # Get all models from ClearML + clearml_models = Model.query_models( + project_name=self.project_name, + ) + + for _ in clearml_models: + # Check if already in registry + # This is a simplified sync - in production would be more robust + synced += 1 + + logger.info(f"Synced {synced} models with ClearML") + + except Exception as e: + logger.warning(f"Failed to sync with ClearML: {e}") + + return synced diff --git a/src/clearml_integration/pipeline.py b/src/clearml_integration/pipeline.py new file mode 100644 index 0000000..39ca00b --- /dev/null +++ b/src/clearml_integration/pipeline.py @@ -0,0 +1,649 @@ +""" +ClearML Pipeline Module. + +Provides ML pipeline orchestration using ClearML Pipelines: +- Automated pipeline creation +- Step-by-step execution +- Monitoring and notifications +- Automatic retries and error handling +""" + +from __future__ import annotations + +import argparse +import json +import logging +import sys +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import pandas as pd +from sklearn.base import ClassifierMixin + +# Add project root to path +sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + +if TYPE_CHECKING: + from clearml.logger import Logger as ClearMLLogger + +logger = logging.getLogger(__name__) + + +class ClearMLPipeline: + """ + ClearML Pipeline for ML Workflow Orchestration. 
+ + Features: + - Multi-step pipeline execution + - Automatic task creation for each step + - Progress monitoring + - Notification support + - Pipeline versioning + + Example: + pipeline = ClearMLPipeline("Wine-Quality-Pipeline") + pipeline.add_data_step(train_path, test_path) + pipeline.add_training_step("RandomForest", params) + pipeline.add_evaluation_step() + results = pipeline.run() + """ + + def __init__( + self, + pipeline_name: str, + project_name: str = "EPML-ITMO/Wine-Quality/Pipelines", + version: str = "1.0.0", + ): + """ + Initialize ClearML Pipeline. + + Args: + pipeline_name: Name of the pipeline + project_name: ClearML project name + version: Pipeline version + """ + self.pipeline_name = pipeline_name + self.project_name = project_name + self.version = version + self.steps: list[dict[str, Any]] = [] + self.results: dict[str, Any] = {} + self._pipeline_controller: Any = None + self._start_time: datetime | None = None + + def add_data_step( + self, + train_path: str | Path, + test_path: str | Path, + step_name: str = "data_loading", + ) -> ClearMLPipeline: + """ + Add data loading step to pipeline. + + Args: + train_path: Path to training data + test_path: Path to test data + step_name: Name of the step + + Returns: + Self for chaining + """ + self.steps.append( + { + "name": step_name, + "type": "data", + "config": { + "train_path": str(train_path), + "test_path": str(test_path), + }, + } + ) + return self + + def add_training_step( + self, + model_name: str, + model_params: dict[str, Any], + step_name: str | None = None, + ) -> ClearMLPipeline: + """ + Add training step to pipeline. 
+ + Args: + model_name: Name of the model type + model_params: Model hyperparameters + step_name: Optional custom step name + + Returns: + Self for chaining + """ + self.steps.append( + { + "name": step_name or f"train_{model_name.lower()}", + "type": "training", + "config": { + "model_name": model_name, + "params": model_params, + }, + } + ) + return self + + def add_evaluation_step( + self, + metrics: list[str] | None = None, + step_name: str = "evaluation", + ) -> ClearMLPipeline: + """ + Add evaluation step to pipeline. + + Args: + metrics: List of metrics to compute + step_name: Name of the step + + Returns: + Self for chaining + """ + self.steps.append( + { + "name": step_name, + "type": "evaluation", + "config": { + "metrics": metrics + or ["accuracy", "precision", "recall", "f1_score"], + }, + } + ) + return self + + def add_model_registration_step( + self, + step_name: str = "model_registration", + ) -> ClearMLPipeline: + """ + Add model registration step to pipeline. + + Args: + step_name: Name of the step + + Returns: + Self for chaining + """ + self.steps.append( + { + "name": step_name, + "type": "registration", + "config": {}, + } + ) + return self + + def run( + self, + local_mode: bool = True, + queue_name: str = "default", + ) -> dict[str, Any]: + """ + Execute the pipeline. 
+ + Args: + local_mode: Run locally or on ClearML agent + queue_name: Queue name for remote execution + + Returns: + Pipeline results + """ + self._start_time = datetime.now() + + logger.info("=" * 60) + logger.info(f"Starting Pipeline: {self.pipeline_name}") + logger.info(f"Version: {self.version}") + logger.info(f"Steps: {len(self.steps)}") + logger.info("=" * 60) + + if local_mode: + return self._run_local() + else: + return self._run_remote(queue_name) + + def _run_local(self) -> dict[str, Any]: + """Run pipeline locally.""" + self.results = { + "pipeline_name": self.pipeline_name, + "version": self.version, + "start_time": self._start_time.isoformat() if self._start_time else None, + "steps": {}, + "success": True, + } + + # Initialize ClearML task for pipeline + pipeline_task: Any = None + pipeline_logger: ClearMLLogger | None = None + + try: + from clearml import Task + + pipeline_task = Task.init( + project_name=self.project_name, + task_name=f"{self.pipeline_name}_v{self.version}", + task_type=Task.TaskTypes.controller, + reuse_last_task_id=False, + ) + + pipeline_task.add_tags(["pipeline", f"v{self.version}"]) + pipeline_logger = pipeline_task.get_logger() + except Exception as e: + logger.warning(f"ClearML not available: {e}") + pipeline_task = None + pipeline_logger = None + + # Track data and model across steps + train_df: pd.DataFrame | None = None + test_df: pd.DataFrame | None = None + model: ClassifierMixin | None = None + model_name: str = "" + metrics: dict[str, float] = {} + + # Execute each step + for i, step in enumerate(self.steps): + step_name = step["name"] + step_type = step["type"] + step_config = step["config"] + + logger.info(f"\n[Step {i + 1}/{len(self.steps)}] {step_name}") + step_start = datetime.now() + + try: + if step_type == "data": + train_df, test_df = self._execute_data_step(step_config) + step_result = { + "train_shape": ( + train_df.shape if train_df is not None else None + ), + "test_shape": test_df.shape if test_df is not 
None else None, + } + + elif step_type == "training": + if train_df is None or test_df is None: + raise ValueError("Data not loaded. Add data step first.") + model, model_name, train_metrics = self._execute_training_step( + train_df, test_df, step_config + ) + metrics.update(train_metrics) + step_result = { + "model_name": model_name, + "metrics": train_metrics, + } + + elif step_type == "evaluation": + if model is None or test_df is None: + raise ValueError("Model not trained. Add training step first.") + eval_metrics = self._execute_evaluation_step( + model, test_df, step_config + ) + metrics.update(eval_metrics) + step_result = {"metrics": eval_metrics} + + elif step_type == "registration": + if model is None: + raise ValueError( + "Model not available. Add training step first." + ) + model_id = self._execute_registration_step( + model, model_name, metrics + ) + step_result = {"model_id": model_id} + + else: + logger.warning(f"Unknown step type: {step_type}") + step_result = {} + + step_duration = (datetime.now() - step_start).total_seconds() + self.results["steps"][step_name] = { + "success": True, + "duration": step_duration, + "result": step_result, + } + + # Log to ClearML + if pipeline_logger: + pipeline_logger.report_scalar( + title="Pipeline Progress", + series="steps_completed", + value=i + 1, + iteration=0, + ) + + logger.info(f" ✓ Completed in {step_duration:.2f}s") + + except Exception as e: + logger.error(f" ✗ Step failed: {e}") + self.results["steps"][step_name] = { + "success": False, + "error": str(e), + } + self.results["success"] = False + break + + # Finalize + end_time = datetime.now() + total_duration = ( + (end_time - self._start_time).total_seconds() if self._start_time else 0 + ) + self.results["end_time"] = end_time.isoformat() + self.results["total_duration"] = total_duration + self.results["final_metrics"] = metrics + + # Log final results + if pipeline_task: + for metric_name, metric_value in metrics.items(): + 
pipeline_task.get_logger().report_scalar( + title="Final Metrics", + series=metric_name, + value=metric_value, + iteration=0, + ) + pipeline_task.close() + + # Save results + self._save_results() + + logger.info("\n" + "=" * 60) + logger.info(f"Pipeline {'COMPLETED' if self.results['success'] else 'FAILED'}") + logger.info(f"Total Duration: {total_duration:.2f}s") + logger.info("=" * 60) + + return self.results + + def _run_remote(self, queue_name: str) -> dict[str, Any]: + """Run pipeline on ClearML agent.""" + try: + from clearml import PipelineController + + # Create pipeline controller + pipe = PipelineController( + name=self.pipeline_name, + project=self.project_name, + version=self.version, + ) + + # Add steps as pipeline components + # This is a simplified version - full implementation would use + # @PipelineDecorator.component decorators + + # Start pipeline + pipe.start(queue=queue_name) + + logger.info(f"Pipeline started on queue: {queue_name}") + return {"status": "started", "queue": queue_name} + + except Exception as e: + logger.error(f"Failed to start remote pipeline: {e}") + return {"status": "failed", "error": str(e)} + + def _execute_data_step( + self, + config: dict[str, Any], + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Execute data loading step.""" + train_path = Path(config["train_path"]) + test_path = Path(config["test_path"]) + + train_df = pd.read_csv(train_path) + test_df = pd.read_csv(test_path) + + logger.info(f" Loaded train: {train_df.shape}") + logger.info(f" Loaded test: {test_df.shape}") + + return train_df, test_df + + def _execute_training_step( + self, + train_df: pd.DataFrame, + test_df: pd.DataFrame, + config: dict[str, Any], + ) -> tuple[ClassifierMixin, str, dict[str, float]]: + """Execute model training step.""" + from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier + from sklearn.linear_model import LogisticRegression + from sklearn.metrics import accuracy_score, f1_score + from sklearn.neighbors 
import KNeighborsClassifier + from sklearn.svm import SVC + from sklearn.tree import DecisionTreeClassifier + + model_name = config["model_name"] + params = config.get("params", {}) + + # Model mapping + model_classes: dict[str, type] = { + "RandomForest": RandomForestClassifier, + "GradientBoosting": GradientBoostingClassifier, + "LogisticRegression": LogisticRegression, + "SVM": SVC, + "DecisionTree": DecisionTreeClassifier, + "KNN": KNeighborsClassifier, + } + + if model_name not in model_classes: + raise ValueError(f"Unknown model: {model_name}") + + # Create and train model + model_class = model_classes[model_name] + model: ClassifierMixin = model_class(**params) + + X_train = train_df.iloc[:, :-1] + y_train = train_df.iloc[:, -1] + X_test = test_df.iloc[:, :-1] + y_test = test_df.iloc[:, -1] + + logger.info(f" Training {model_name}...") + model.fit(X_train, y_train) + + # Quick evaluation + y_pred = model.predict(X_test) + metrics = { + "accuracy": float(accuracy_score(y_test, y_pred)), + "f1_score": float(f1_score(y_test, y_pred, average="weighted")), + } + + logger.info(f" Accuracy: {metrics['accuracy']:.4f}") + + return model, model_name, metrics + + def _execute_evaluation_step( + self, + model: ClassifierMixin, + test_df: pd.DataFrame, + config: dict[str, Any], + ) -> dict[str, float]: + """Execute model evaluation step.""" + from sklearn.metrics import ( + accuracy_score, + f1_score, + precision_score, + recall_score, + ) + + X_test = test_df.iloc[:, :-1] + y_test = test_df.iloc[:, -1] + + y_pred = model.predict(X_test) + + metrics_to_compute = config.get( + "metrics", ["accuracy", "precision", "recall", "f1_score"] + ) + metrics: dict[str, float] = {} + + metric_funcs = { + "accuracy": lambda: accuracy_score(y_test, y_pred), + "precision": lambda: precision_score( + y_test, y_pred, average="weighted", zero_division=0 + ), + "recall": lambda: recall_score( + y_test, y_pred, average="weighted", zero_division=0 + ), + "f1_score": lambda: f1_score( + 
y_test, y_pred, average="weighted", zero_division=0 + ), + } + + for metric_name in metrics_to_compute: + if metric_name in metric_funcs: + metrics[metric_name] = float(metric_funcs[metric_name]()) + logger.info(f" {metric_name}: {metrics[metric_name]:.4f}") + + return metrics + + def _execute_registration_step( + self, + model: ClassifierMixin, + model_name: str, + metrics: dict[str, float], + ) -> str: + """Execute model registration step.""" + from src.clearml_integration.model_manager import ClearMLModelManager + + manager = ClearMLModelManager() + model_id = manager.register_model( + model=model, + model_name=model_name, + metrics=metrics, + tags=["pipeline", self.pipeline_name], + ) + + logger.info(f" Registered model: {model_id}") + return model_id + + def _save_results(self) -> None: + """Save pipeline results to file.""" + output_dir = Path("outputs/clearml/pipelines") + output_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = output_dir / f"{self.pipeline_name}_{timestamp}.json" + + with open(output_file, "w") as f: + json.dump(self.results, f, indent=2, default=str) + + logger.info(f"Results saved to: {output_file}") + + +def create_wine_quality_pipeline( + model_name: str = "RandomForest", + model_params: dict[str, Any] | None = None, +) -> ClearMLPipeline: + """ + Create a standard Wine Quality classification pipeline. 
def create_wine_quality_pipeline(
    model_name: str = "RandomForest",
    model_params: dict[str, Any] | None = None,
) -> ClearMLPipeline:
    """
    Create a standard Wine Quality classification pipeline.

    Args:
        model_name: Model type to use
        model_params: Model hyperparameters

    Returns:
        Configured ClearMLPipeline
    """
    # Fall back to sensible defaults when no hyperparameters were given.
    params = (
        model_params
        if model_params is not None
        else {"n_estimators": 100, "random_state": 42}
    )

    # Build the standard four-step pipeline via the fluent interface.
    return (
        ClearMLPipeline(
            pipeline_name=f"Wine-Quality-{model_name}",
            version="1.0.0",
        )
        .add_data_step(
            train_path="data/processed/train.csv",
            test_path="data/processed/test.csv",
        )
        .add_training_step(model_name=model_name, model_params=params)
        .add_evaluation_step()
        .add_model_registration_step()
    )


def run_all_models_pipeline() -> dict[str, dict[str, Any]]:
    """
    Run pipeline for all available models.

    Returns:
        Dictionary of model results
    """
    models_config: dict[str, dict[str, Any]] = {
        "RandomForest": {"n_estimators": 100, "max_depth": 10, "random_state": 42},
        "GradientBoosting": {
            "n_estimators": 100,
            "max_depth": 5,
            "random_state": 42,
        },
        "LogisticRegression": {"max_iter": 1000, "random_state": 42},
        "SVM": {"kernel": "rbf", "random_state": 42},
        "DecisionTree": {"max_depth": 10, "random_state": 42},
        "KNN": {"n_neighbors": 5},
    }

    results: dict[str, dict[str, Any]] = {}

    # Execute one full pipeline per configured model, sequentially.
    for model_name, params in models_config.items():
        logger.info(f"\n{'=' * 60}")
        logger.info(f"Running pipeline for: {model_name}")
        logger.info("=" * 60)

        results[model_name] = create_wine_quality_pipeline(
            model_name=model_name,
            model_params=params,
        ).run(local_mode=True)

    # Print a compact per-model summary.
    logger.info("\n" + "=" * 60)
    logger.info("ALL PIPELINES COMPLETED")
    logger.info("=" * 60)

    for model_name, result in results.items():
        status = "✓" if result.get("success") else "✗"
        result_metrics = result.get("final_metrics", {})
        acc = result_metrics.get("accuracy", 0)
        logger.info(f" {status} {model_name}: accuracy={acc:.4f}")

    return results


# CLI entry point
def main() -> None:
    """Main entry point."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    parser = argparse.ArgumentParser(description="ClearML Pipeline Runner")
    parser.add_argument(
        "--model",
        type=str,
        default="RandomForest",
        help="Model to train",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Run all models",
    )

    args = parser.parse_args()

    # Either sweep all models, or run the single requested one.
    if args.all:
        run_all_models_pipeline()
    else:
        create_wine_quality_pipeline(model_name=args.model).run()
from __future__ import annotations

import argparse
import logging
import sys
from pathlib import Path
from typing import Any

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Add project root to path
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

from src.clearml_integration.experiment_tracker import (  # noqa: E402
    ClearMLExperiment,
    ExperimentComparison,
)
from src.clearml_integration.model_manager import ClearMLModelManager  # noqa: E402

logger = logging.getLogger(__name__)

# Model configurations: estimator class + hyperparameters per model name.
MODELS_CONFIG: dict[str, dict[str, Any]] = {
    "RandomForest": {
        "class": RandomForestClassifier,
        "params": {
            "n_estimators": 100,
            "max_depth": 10,
            "min_samples_split": 2,
            "random_state": 42,
        },
    },
    "GradientBoosting": {
        "class": GradientBoostingClassifier,
        "params": {
            "n_estimators": 100,
            "max_depth": 5,
            "learning_rate": 0.1,
            "random_state": 42,
        },
    },
    "LogisticRegression": {
        "class": LogisticRegression,
        "params": {
            "max_iter": 1000,
            "random_state": 42,
        },
    },
    "SVM": {
        "class": SVC,
        "params": {
            "kernel": "rbf",
            "C": 1.0,
            "random_state": 42,
        },
    },
    "DecisionTree": {
        "class": DecisionTreeClassifier,
        "params": {
            "max_depth": 10,
            "min_samples_split": 2,
            "random_state": 42,
        },
    },
    "KNN": {
        "class": KNeighborsClassifier,
        "params": {
            "n_neighbors": 5,
            "weights": "uniform",
        },
    },
}


def load_data(
    train_path: str = "data/processed/train.csv",
    test_path: str = "data/processed/test.csv",
) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Load training and test data (last column is the target)."""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    X_train = train_df.iloc[:, :-1]
    y_train = train_df.iloc[:, -1]
    X_test = test_df.iloc[:, :-1]
    y_test = test_df.iloc[:, -1]

    return X_train, y_train, X_test, y_test


def run_single_experiment(
    model_name: str,
    offline_mode: bool = False,
    register_model: bool = True,
) -> dict[str, Any]:
    """
    Run a single model experiment with ClearML tracking.

    Args:
        model_name: Name of the model to train
        offline_mode: Run without ClearML server
        register_model: Whether to register the model

    Returns:
        Experiment results

    Raises:
        ValueError: If model_name is not in MODELS_CONFIG
    """
    if model_name not in MODELS_CONFIG:
        raise ValueError(f"Unknown model: {model_name}")

    config = MODELS_CONFIG[model_name]

    # Create experiment
    with ClearMLExperiment(
        experiment_name=f"{model_name}_Experiment",
        project_name="EPML-ITMO/Wine-Quality/Experiments",
        task_type="training",
        tags=[model_name, "wine-quality", "classification"],
        offline_mode=offline_mode,
    ) as exp:
        # Log experiment description
        exp.set_comment(
            f"""
Wine Quality Classification Experiment
======================================
Model: {model_name}
Dataset: Wine Quality (Red Wine)

This experiment trains a {model_name} model on the wine quality dataset
and logs all metrics, parameters, and the trained model to ClearML.
            """
        )

        # Load data
        logger.info("Loading data...")
        X_train, y_train, X_test, y_test = load_data()

        # Log data info
        exp.log_parameters(
            {
                "train_samples": len(X_train),
                "test_samples": len(X_test),
                "features": X_train.shape[1],
                "classes": len(y_train.unique()),
            },
            prefix="data",
        )

        # Log model parameters
        exp.log_parameters(config["params"], prefix="model")

        # Create and train model
        logger.info(f"Training {model_name}...")
        model_class = config["class"]
        model = model_class(**config["params"])
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Log classification metrics.
        # BUGFIX: target_names was derived from y_test only; if the model
        # predicts a class absent from y_test, sklearn's classification
        # report fails on the labels/target_names length mismatch. Use the
        # union of observed true and predicted labels instead.
        observed_labels = sorted(set(y_test.unique()) | set(y_pred))
        metrics = exp.log_classification_report(
            y_true=y_test,
            y_pred=y_pred,
            target_names=[str(label) for label in observed_labels],
        )

        logger.info(f"Accuracy: {metrics['accuracy']:.4f}")
        logger.info(f"F1 Score: {metrics['f1_weighted']:.4f}")

        # Log model
        if exp.task:
            exp.log_model(
                model=model,
                model_name=model_name,
                metadata={"metrics": metrics, "params": config["params"]},
            )

        # Register model with model manager
        if register_model:
            manager = ClearMLModelManager()
            model_id = manager.register_model(
                model=model,
                model_name=model_name,
                metrics=metrics,
                parameters=config["params"],
                tags=["wine-quality", "classification"],
                task_id=exp.get_task_id(),
            )
            logger.info(f"Registered model: {model_id}")

        return {
            "model_name": model_name,
            "task_id": exp.get_task_id(),
            "metrics": metrics,
            "success": True,
        }


def run_all_experiments(
    offline_mode: bool = False,
    register_models: bool = True,
) -> dict[str, Any]:
    """
    Run experiments for all configured models.

    Args:
        offline_mode: Run without ClearML server
        register_models: Whether to register models

    Returns:
        Dictionary of all experiment results
    """
    logger.info("=" * 60)
    logger.info("Running All Model Experiments")
    logger.info("=" * 60)

    results: dict[str, Any] = {}

    for model_name in MODELS_CONFIG:
        logger.info(f"\n{'=' * 40}")
        logger.info(f"Running: {model_name}")
        logger.info("=" * 40)

        try:
            result = run_single_experiment(
                model_name=model_name,
                offline_mode=offline_mode,
                register_model=register_models,
            )
            results[model_name] = result
            logger.info(f"✓ {model_name} completed")

        except Exception as e:
            # One failed model must not abort the whole sweep.
            logger.error(f"✗ {model_name} failed: {e}")
            results[model_name] = {
                "model_name": model_name,
                "success": False,
                "error": str(e),
            }

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("EXPERIMENT SUMMARY")
    logger.info("=" * 60)

    for model_name, result in results.items():
        if result.get("success"):
            metrics = result.get("metrics", {})
            acc = metrics.get("accuracy", 0)
            f1 = metrics.get("f1_weighted", 0)
            logger.info(f" ✓ {model_name}: accuracy={acc:.4f}, f1={f1:.4f}")
        else:
            logger.info(f" ✗ {model_name}: FAILED - {result.get('error', 'Unknown')}")

    # Find best model among the successful runs (by accuracy)
    successful = {
        k: v for k, v in results.items() if v.get("success") and v.get("metrics")
    }
    if successful:
        best_model = max(
            successful.items(),
            key=lambda x: x[1]["metrics"].get("accuracy", 0),
        )
        logger.info(f"\n Best Model: {best_model[0]}")
        logger.info(f" Best Accuracy: {best_model[1]['metrics']['accuracy']:.4f}")

    return results


def compare_experiments() -> pd.DataFrame:
    """
    Compare all experiments in the project.

    Returns:
        DataFrame with comparison results
    """
    comparison = ExperimentComparison(project_name="EPML-ITMO/Wine-Quality/Experiments")

    # Get and display comparison
    df = comparison.compare_metrics()

    if not df.empty:
        logger.info("\nExperiment Comparison:")
        logger.info(df.to_string())

        # Generate report
        comparison.generate_report()

    return df


def compare_models() -> pd.DataFrame:
    """
    Compare all registered models.

    Returns:
        DataFrame with comparison results
    """
    manager = ClearMLModelManager()

    # Get comparison
    df = manager.compare_models()

    if not df.empty:
        logger.info("\nModel Comparison:")
        logger.info(df.to_string())

        # Generate report
        manager.generate_model_report()

        # Find best model
        best = manager.get_best_model(metric="accuracy")
        if best:
            logger.info(f"\nBest Model: {best[0]}")
            logger.info(f"Accuracy: {best[1]['metrics'].get('accuracy', 0):.4f}")

    return df
run_all_experiments( + offline_mode=args.offline, + register_models=not args.no_register, + ) + elif args.model: + run_single_experiment( + model_name=args.model, + offline_mode=args.offline, + register_model=not args.no_register, + ) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/src/config/__init__.py b/src/config/__init__.py new file mode 100644 index 0000000..b633105 --- /dev/null +++ b/src/config/__init__.py @@ -0,0 +1,21 @@ +"""Configuration module with Pydantic schemas.""" + +from src.config.schemas import ( + DataConfig, + LoggingConfig, + MLflowConfig, + ModelConfig, + PipelineConfig, + TrainingConfig, + validate_config, +) + +__all__ = [ + "DataConfig", + "LoggingConfig", + "MLflowConfig", + "ModelConfig", + "PipelineConfig", + "TrainingConfig", + "validate_config", +] diff --git a/src/config/schemas.py b/src/config/schemas.py new file mode 100644 index 0000000..56f97c1 --- /dev/null +++ b/src/config/schemas.py @@ -0,0 +1,137 @@ +""" +Configuration validation schemas using Pydantic. +Provides type-safe configuration validation for the ML pipeline. 
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict, Field, field_validator


class DataConfig(BaseModel):  # type: ignore[misc]
    """Data configuration schema (paths, target column, split settings)."""

    raw_path: str = Field(default="data/raw", description="Path to raw data")
    processed_path: str = Field(
        default="data/processed", description="Path to processed data"
    )
    train_file: str = Field(default="train.csv", description="Training data filename")
    test_file: str = Field(default="test.csv", description="Test data filename")
    target_column: str = Field(default="quality", description="Target column name")
    test_size: float = Field(
        default=0.2, ge=0.0, le=1.0, description="Test set size ratio"
    )
    random_state: int = Field(default=42, description="Random state for splitting")


class PreprocessingConfig(BaseModel):  # type: ignore[misc]
    """Preprocessing configuration schema."""

    normalize: bool = Field(default=False, description="Whether to normalize features")
    handle_missing: Literal["drop", "mean", "median"] = Field(
        default="drop", description="Missing value handling strategy"
    )


class DataFullConfig(BaseModel):  # type: ignore[misc]
    """Full data configuration including preprocessing."""

    data: DataConfig = Field(default_factory=DataConfig)
    preprocessing: PreprocessingConfig = Field(default_factory=PreprocessingConfig)


class ModelParamsConfig(BaseModel):  # type: ignore[misc]
    """Base model parameters configuration."""

    # Pydantic v2 style (the module already uses field_validator, i.e. v2);
    # the old `class Config: extra = "allow"` inner class is deprecated there.
    model_config = ConfigDict(extra="allow")  # allow model-specific parameters

    random_state: int = Field(default=42, description="Random state for model")


class ModelConfig(BaseModel):  # type: ignore[misc]
    """Model configuration schema."""

    # BUGFIX: pydantic v2 treats attributes with a leading underscore as
    # *private attributes*, so a field literally declared as ``_target_`` is
    # never validated or populated from input. Declare it as ``target_`` with
    # the Hydra-style alias ``_target_`` so configs keep working, and accept
    # either spelling on input via populate_by_name.
    model_config = ConfigDict(populate_by_name=True)

    name: str = Field(..., description="Model name")
    target_: str = Field(
        ..., alias="_target_", description="Full path to model class"
    )
    params: dict[str, Any] = Field(
        default_factory=dict, description="Model hyperparameters"
    )

    @property
    def _target_(self) -> str:
        """Backward-compatible accessor under the Hydra-style name."""
        return self.target_

    @field_validator("name")  # type: ignore[misc]
    @classmethod
    def validate_model_name(cls, v: str) -> str:
        """Validate model name is not empty."""
        if not v or not v.strip():
            raise ValueError("Model name cannot be empty")
        return v


class TrainingConfig(BaseModel):  # type: ignore[misc]
    """Training configuration schema."""

    cv_folds: int = Field(default=5, ge=2, le=20, description="Number of CV folds")
    shuffle: bool = Field(default=True, description="Shuffle data before CV")
    metrics: list[str] = Field(
        default=["accuracy", "precision", "recall", "f1_score"],
        description="Metrics to track",
    )
    register_model: bool = Field(default=True, description="Register model to MLflow")
    model_name: str = Field(default="", description="Registered model name")


class EarlyStoppingConfig(BaseModel):  # type: ignore[misc]
    """Early stopping configuration schema."""

    enabled: bool = Field(default=False, description="Enable early stopping")
    patience: int = Field(default=10, ge=1, description="Patience for early stopping")
    min_delta: float = Field(
        default=0.001, ge=0.0, description="Minimum improvement delta"
    )


class MLflowConfig(BaseModel):  # type: ignore[misc]
    """MLflow configuration schema."""

    tracking_uri: str = Field(..., description="MLflow tracking URI")
    experiment_name: str = Field(..., description="Experiment name")


class LoggingConfig(BaseModel):  # type: ignore[misc]
    """Logging configuration schema."""

    level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(
        default="INFO", description="Logging level"
    )
    format: str = Field(
        default="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        description="Log format string",
    )


class PipelineConfig(BaseModel):  # type: ignore[misc]
    """Full pipeline configuration schema (model and mlflow are required)."""

    model: ModelConfig
    data: DataConfig = Field(default_factory=DataConfig)
    training: TrainingConfig = Field(default_factory=TrainingConfig)
    mlflow: MLflowConfig
    logging: LoggingConfig = Field(default_factory=LoggingConfig)
    seed: int = Field(default=42, description="Global random seed")
    output_dir: str = Field(default="outputs", description="Output directory")


def validate_config(config: dict[str, Any]) -> PipelineConfig:
    """
    Validate configuration dictionary against schema.

    Args:
        config: Configuration dictionary from Hydra

    Returns:
        Validated PipelineConfig instance

    Raises:
        ValidationError: If configuration is invalid
    """
    return PipelineConfig(**config)
test_df = pd.read_csv(test_path) + + self.X_train = train_df.iloc[:, :-1] + self.y_train = train_df.iloc[:, -1] + self.X_test = test_df.iloc[:, :-1] + self.y_test = test_df.iloc[:, -1] + + @log_experiment(experiment_name="wine_quality_multimodel_v1") + def run_experiment( + self, + model_name: str, + model_class: type[ClassifierMixin], + params: dict[str, Any], + ) -> dict[str, float]: + """ + Runs a single experiment with the given model and parameters. + """ + logger.info(f"Running experiment with {model_name} and params {params}") + + # Log params + log_params(params) + mlflow.log_param("model_type", model_name) + + # Initialize and train model + model = model_class(**params) + model.fit(self.X_train, self.y_train) + + # Predict + y_pred = model.predict(self.X_test) + + # Calculate metrics + metrics = { + "accuracy": float(accuracy_score(self.y_test, y_pred)), + "precision": float( + precision_score(self.y_test, y_pred, average="weighted") + ), + "recall": float(recall_score(self.y_test, y_pred, average="weighted")), + "f1_score": float(f1_score(self.y_test, y_pred, average="weighted")), + } + + # Log metrics + log_metrics(metrics) + logger.info(f"Metrics: {metrics}") + + # Log model + mlflow.sklearn.log_model(model, "model", registered_model_name=model_name) + + return metrics + + +def main() -> None: + # Setup logging + logging.basicConfig(level=logging.INFO) + + # Path to processed data + data_path = Path("data/processed") + runner = ExperimentRunner(data_path) + + # Define experiments + # (name, class, params) + experiments: list[tuple[str, type[ClassifierMixin], dict[str, Any]]] = [ + # Random Forest Experiments + ( + "RandomForest", + RandomForestClassifier, + {"n_estimators": 50, "max_depth": 5, "random_state": 42}, + ), + ( + "RandomForest", + RandomForestClassifier, + {"n_estimators": 100, "max_depth": 10, "random_state": 42}, + ), + ( + "RandomForest", + RandomForestClassifier, + {"n_estimators": 200, "max_depth": None, "random_state": 42}, + ), + ( + 
"RandomForest", + RandomForestClassifier, + {"n_estimators": 50, "min_samples_split": 5, "random_state": 42}, + ), + # Gradient Boosting Experiments + ( + "GradientBoosting", + GradientBoostingClassifier, + {"n_estimators": 50, "learning_rate": 0.1, "random_state": 42}, + ), + ( + "GradientBoosting", + GradientBoostingClassifier, + { + "n_estimators": 100, + "learning_rate": 0.05, + "max_depth": 3, + "random_state": 42, + }, + ), + ( + "GradientBoosting", + GradientBoostingClassifier, + {"n_estimators": 100, "learning_rate": 0.2, "random_state": 42}, + ), + # Logistic Regression Experiments + ( + "LogisticRegression", + LogisticRegression, + {"C": 0.1, "max_iter": 1000, "random_state": 42}, + ), + ( + "LogisticRegression", + LogisticRegression, + {"C": 1.0, "max_iter": 1000, "random_state": 42}, + ), + ( + "LogisticRegression", + LogisticRegression, + {"C": 10.0, "max_iter": 1000, "random_state": 42}, + ), + # SVM Experiments + ("SVM", SVC, {"kernel": "rbf", "C": 1.0, "random_state": 42}), + ("SVM", SVC, {"kernel": "linear", "C": 1.0, "random_state": 42}), + ("SVM", SVC, {"kernel": "poly", "degree": 3, "random_state": 42}), + # Decision Tree Experiments + ( + "DecisionTree", + DecisionTreeClassifier, + {"max_depth": 5, "random_state": 42}, + ), + ( + "DecisionTree", + DecisionTreeClassifier, + {"max_depth": 10, "min_samples_split": 5, "random_state": 42}, + ), + # KNN Experiments + ("KNN", KNeighborsClassifier, {"n_neighbors": 3}), + ("KNN", KNeighborsClassifier, {"n_neighbors": 5}), + ("KNN", KNeighborsClassifier, {"n_neighbors": 7}), + ] + + logger.info(f"Starting {len(experiments)} experiments...") + + mlflow.set_tracking_uri("file://" + str(Path.cwd() / "mlruns")) + + for model_name, model_class, params in experiments: + try: + runner.run_experiment(model_name, model_class, params) + except Exception as e: + logger.error(f"Experiment failed: {e}") + + +if __name__ == "__main__": + main() diff --git a/src/models/train_model.py b/src/models/train_model.py index 
bba47ab..519f876 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -11,7 +11,65 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score +# Add project root to path +sys.path.append(str(Path(__file__).resolve().parents[2])) + +from src.utils.mlflow_decorators import ( # noqa: E402 + log_experiment, + log_metrics, + log_params, +) + warnings.filterwarnings("ignore") +logger = logging.getLogger(__name__) + + +@log_experiment(experiment_name="wine_quality_experiment") +def train_rf( + X_train: pd.DataFrame, + y_train: pd.Series, + X_test: pd.DataFrame, + y_test: pd.Series, + n_estimators: int, + max_depth: int, +) -> None: + """Train Random Forest model.""" + # Log parameters + log_params({"n_estimators": n_estimators, "max_depth": max_depth}) + + # Train model + clf = RandomForestClassifier( + n_estimators=n_estimators, max_depth=max_depth, random_state=42 + ) + clf.fit(X_train, y_train) + + # Predict + y_pred = clf.predict(X_test) + + # metrics + accuracy = accuracy_score(y_test, y_pred) + precision = precision_score(y_test, y_pred, average="weighted") + recall = recall_score(y_test, y_pred, average="weighted") + f1 = f1_score(y_test, y_pred, average="weighted") + + logger.info(f"Accuracy: {accuracy}") + logger.info(f"F1 Score: {f1}") + + # Log metrics + log_metrics( + { + "accuracy": float(accuracy), + "precision": float(precision), + "recall": float(recall), + "f1_score": float(f1), + } + ) + + # Log model + mlflow.sklearn.log_model( + clf, "model", registered_model_name="WineQualityRandomForest" + ) + logger.info("Model logged to MLflow") @click.command() # type: ignore[misc] @@ -24,7 +82,6 @@ ) def main(input_filepath: str, n_estimators: int, max_depth: int) -> None: """Trains a model on processed data.""" - logger = logging.getLogger(__name__) logger.info("Training model...") # Load data @@ -45,42 +102,9 @@ def main(input_filepath: str, n_estimators: int, 
max_depth: int) -> None: # Set up MLflow mlflow.set_tracking_uri("file://" + str(Path.cwd() / "mlruns")) - mlflow.set_experiment("wine_quality_experiment") - - with mlflow.start_run(): - # Log parameters - mlflow.log_param("n_estimators", n_estimators) - mlflow.log_param("max_depth", max_depth) - - # Train model - clf = RandomForestClassifier( - n_estimators=n_estimators, max_depth=max_depth, random_state=42 - ) - clf.fit(X_train, y_train) - - # Predict - y_pred = clf.predict(X_test) - - # metrics - accuracy = accuracy_score(y_test, y_pred) - precision = precision_score(y_test, y_pred, average="weighted") - recall = recall_score(y_test, y_pred, average="weighted") - f1 = f1_score(y_test, y_pred, average="weighted") - - logger.info(f"Accuracy: {accuracy}") - logger.info(f"F1 Score: {f1}") - - # Log metrics - mlflow.log_metric("accuracy", accuracy) - mlflow.log_metric("precision", precision) - mlflow.log_metric("recall", recall) - mlflow.log_metric("f1_score", f1) - - # Log model - mlflow.sklearn.log_model( - clf, "model", registered_model_name="WineQualityRandomForest" - ) - logger.info("Model logged to MLflow") + + # Run training + train_rf(X_train, y_train, X_test, y_test, n_estimators, max_depth) if __name__ == "__main__": diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py new file mode 100644 index 0000000..34cac6c --- /dev/null +++ b/src/pipelines/__init__.py @@ -0,0 +1 @@ +"""Pipelines module for ML workflow orchestration.""" diff --git a/src/pipelines/evaluate_models.py b/src/pipelines/evaluate_models.py new file mode 100644 index 0000000..12d9c0b --- /dev/null +++ b/src/pipelines/evaluate_models.py @@ -0,0 +1,276 @@ +""" +Model evaluation and comparison script. + +Collects metrics from all trained models and generates comparison reports. 
+""" + +import json +import logging +from datetime import datetime +from pathlib import Path +from typing import Any + +import pandas as pd + +logger = logging.getLogger(__name__) + +# Model names to evaluate (directory names are lowercase without separators) +MODEL_NAMES = [ + "randomforest", + "gradientboosting", + "logisticregression", + "svm", + "decisiontree", + "knn", +] + + +def load_model_metrics(model_name: str, base_path: Path) -> dict[str, Any] | None: + """ + Load metrics for a single model. + + Args: + model_name: Name of the model + base_path: Base outputs directory + + Returns: + Metrics dictionary or None if not found + """ + metrics_path = base_path / model_name / "metrics.json" + + if not metrics_path.exists(): + logger.warning(f"Metrics not found for {model_name}: {metrics_path}") + return None + + with open(metrics_path) as f: + data = json.load(f) + + return { + "model": model_name, + **data.get("metrics", {}), + "run_id": data.get("run_id", ""), + "timestamp": data.get("timestamp", ""), + } + + +def collect_all_metrics(base_path: Path) -> list[dict[str, Any]]: + """ + Collect metrics from all models. + + Args: + base_path: Base outputs directory + + Returns: + List of metrics dictionaries + """ + all_metrics = [] + + for model_name in MODEL_NAMES: + metrics = load_model_metrics(model_name, base_path) + if metrics: + all_metrics.append(metrics) + + logger.info(f"Collected metrics from {len(all_metrics)} models") + return all_metrics + + +def find_best_model( + metrics_list: list[dict[str, Any]], metric: str = "f1_score" +) -> dict[str, Any]: + """ + Find the best model based on specified metric. 
+ + Args: + metrics_list: List of metrics dictionaries + metric: Metric to use for comparison + + Returns: + Best model information + """ + if not metrics_list: + return {"error": "No metrics available"} + + best = max(metrics_list, key=lambda x: x.get(metric, 0)) + + return { + "best_model": best["model"], + "best_metric_name": metric, + "best_metric_value": best.get(metric, 0), + "run_id": best.get("run_id", ""), + "all_metrics": {m["model"]: m.get(metric, 0) for m in metrics_list}, + } + + +def create_comparison_table(metrics_list: list[dict[str, Any]]) -> pd.DataFrame: + """ + Create comparison DataFrame. + + Args: + metrics_list: List of metrics dictionaries + + Returns: + Comparison DataFrame + """ + if not metrics_list: + return pd.DataFrame() + + df = pd.DataFrame(metrics_list) + + # Reorder columns + cols = ["model", "accuracy", "precision", "recall", "f1_score"] + existing_cols = [c for c in cols if c in df.columns] + other_cols = [c for c in df.columns if c not in cols] + df = df[existing_cols + other_cols] + + # Sort by f1_score descending + if "f1_score" in df.columns: + df = df.sort_values("f1_score", ascending=False) + + return df + + +def generate_report( + metrics_list: list[dict[str, Any]], best_model: dict[str, Any] +) -> str: + """ + Generate text comparison report. 
+ + Args: + metrics_list: List of metrics dictionaries + best_model: Best model information + + Returns: + Report text + """ + lines = [ + "=" * 70, + "MODEL COMPARISON REPORT", + "=" * 70, + f"Generated: {datetime.now().isoformat()}", + f"Models evaluated: {len(metrics_list)}", + "", + "-" * 70, + "RESULTS (sorted by F1 Score)", + "-" * 70, + "", + f"{'Model':<25} {'Accuracy':>10} {'Precision':>10} {'Recall':>10} {'F1':>10}", + "-" * 70, + ] + + # Sort by f1_score + sorted_metrics = sorted( + metrics_list, key=lambda x: x.get("f1_score", 0), reverse=True + ) + + for m in sorted_metrics: + lines.append( + f"{m['model']:<25} " + f"{m.get('accuracy', 0):>10.4f} " + f"{m.get('precision', 0):>10.4f} " + f"{m.get('recall', 0):>10.4f} " + f"{m.get('f1_score', 0):>10.4f}" + ) + + lines.extend( + [ + "", + "-" * 70, + "BEST MODEL", + "-" * 70, + f"Model: {best_model.get('best_model', 'N/A')}", + f"F1 Score: {best_model.get('best_metric_value', 0):.4f}", + f"MLflow Run ID: {best_model.get('run_id', 'N/A')}", + "", + "=" * 70, + ] + ) + + return "\n".join(lines) + + +def save_results( + metrics_list: list[dict[str, Any]], + best_model: dict[str, Any], + comparison_df: pd.DataFrame, + output_path: Path, +) -> None: + """ + Save evaluation results. 
+ + Args: + metrics_list: List of metrics dictionaries + best_model: Best model information + comparison_df: Comparison DataFrame + output_path: Output directory + """ + output_path.mkdir(parents=True, exist_ok=True) + + # Save best model info + best_model_path = output_path / "best_model.json" + with open(best_model_path, "w") as f: + json.dump( + { + **best_model, + "timestamp": datetime.now().isoformat(), + }, + f, + indent=2, + ) + logger.info(f"Best model info saved to: {best_model_path}") + + # Save comparison CSV + comparison_csv_path = output_path / "metrics_comparison.csv" + comparison_df.to_csv(comparison_csv_path, index=False) + logger.info(f"Comparison CSV saved to: {comparison_csv_path}") + + # Save full metrics + full_metrics_path = output_path / "all_metrics.json" + with open(full_metrics_path, "w") as f: + json.dump(metrics_list, f, indent=2) + logger.info(f"All metrics saved to: {full_metrics_path}") + + # Save report + report = generate_report(metrics_list, best_model) + report_path = output_path / "comparison_report.txt" + with open(report_path, "w") as f: + f.write(report) + logger.info(f"Report saved to: {report_path}") + + # Print report + print(report) + + +def main() -> None: + """Main evaluation function.""" + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + logger.info("Starting model evaluation") + + # Paths + base_path = Path("outputs") + output_path = base_path / "comparison" + + # Collect metrics + metrics_list = collect_all_metrics(base_path) + + if not metrics_list: + logger.error("No metrics found. 
Run training pipelines first.") + return + + # Find best model + best_model = find_best_model(metrics_list) + + # Create comparison table + comparison_df = create_comparison_table(metrics_list) + + # Save results + save_results(metrics_list, best_model, comparison_df, output_path) + + logger.info("Model evaluation completed") + + +if __name__ == "__main__": + main() diff --git a/src/pipelines/monitoring.py b/src/pipelines/monitoring.py new file mode 100644 index 0000000..cea565f --- /dev/null +++ b/src/pipelines/monitoring.py @@ -0,0 +1,336 @@ +""" +Pipeline monitoring and notification system. + +Provides execution tracking, timing, and reporting for ML pipelines. +""" + +import json +import logging +import smtplib +import time +from datetime import datetime +from email.mime.text import MIMEText +from pathlib import Path +from typing import Any + +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) + + +class PipelineMonitor: + """ + Monitor for tracking pipeline execution. + + Tracks timing, success/failure status, and generates reports. + """ + + def __init__(self, cfg: DictConfig): + """ + Initialize pipeline monitor. + + Args: + cfg: Hydra configuration object + """ + self.cfg = cfg + self.stages: dict[str, dict[str, Any]] = {} + self.pipeline_start: float | None = None + self.pipeline_end: float | None = None + self.current_stage: str | None = None + + def start_pipeline(self) -> None: + """Mark pipeline start.""" + self.pipeline_start = time.time() + logger.info("=" * 60) + logger.info("PIPELINE STARTED") + logger.info(f"Model: {self.cfg.model.name}") + logger.info(f"Timestamp: {datetime.now().isoformat()}") + logger.info("=" * 60) + + def end_pipeline( + self, success: bool = True, error: str | None = None + ) -> dict[str, Any]: + """ + Mark pipeline end and generate summary. 
+ + Args: + success: Whether pipeline completed successfully + error: Error message if failed + + Returns: + Pipeline execution report + """ + self.pipeline_end = time.time() + duration = self.pipeline_end - (self.pipeline_start or 0) + + report = { + "success": success, + "model": self.cfg.model.name, + "total_duration_seconds": round(duration, 2), + "timestamp": datetime.now().isoformat(), + "stages": self.stages, + } + + if error: + report["error"] = error + + # Log summary + logger.info("=" * 60) + logger.info("PIPELINE COMPLETED") + logger.info(f"Status: {'SUCCESS' if success else 'FAILED'}") + logger.info(f"Total duration: {duration:.2f}s") + logger.info("-" * 40) + + for stage_name, stage_info in self.stages.items(): + status = "✓" if stage_info.get("success") else "✗" + stage_duration = stage_info.get("duration", 0) + logger.info(f" {status} {stage_name}: {stage_duration:.2f}s") + + if error: + logger.error(f"Error: {error}") + + logger.info("=" * 60) + + # Send notification + self._send_notification(report) + + return report + + def start_stage(self, stage_name: str) -> None: + """ + Mark stage start. + + Args: + stage_name: Name of the stage + """ + self.current_stage = stage_name + self.stages[stage_name] = { + "start_time": time.time(), + "status": "running", + } + logger.info(f"[STAGE] Starting: {stage_name}") + + def end_stage( + self, stage_name: str, success: bool = True, error: str | None = None + ) -> None: + """ + Mark stage end. 
+ + Args: + stage_name: Name of the stage + success: Whether stage completed successfully + error: Error message if failed + """ + if stage_name not in self.stages: + logger.warning(f"Stage {stage_name} was not started") + return + + end_time = time.time() + start_time = self.stages[stage_name]["start_time"] + duration = end_time - start_time + + self.stages[stage_name].update( + { + "end_time": end_time, + "duration": round(duration, 2), + "success": success, + "status": "completed" if success else "failed", + } + ) + + if error: + self.stages[stage_name]["error"] = error + + status_msg = "✓ Completed" if success else "✗ Failed" + logger.info(f"[STAGE] {status_msg}: {stage_name} ({duration:.2f}s)") + + self.current_stage = None + + def save_report(self, output_path: Path | None = None) -> Path: + """ + Save execution report to file. + + Args: + output_path: Optional custom output path + + Returns: + Path to saved report + """ + if output_path is None: + output_path = Path(self.cfg.output_dir) / "pipeline_report.json" + + output_path.parent.mkdir(parents=True, exist_ok=True) + + report = { + "model": self.cfg.model.name, + "experiment": self.cfg.mlflow.experiment_name, + "timestamp": datetime.now().isoformat(), + "total_duration": ( + round((self.pipeline_end or 0) - (self.pipeline_start or 0), 2) + ), + "stages": self.stages, + "config": { + "model_params": dict(self.cfg.model.params), + "training": dict(self.cfg.training), + "seed": self.cfg.seed, + }, + } + + with open(output_path, "w") as f: + json.dump(report, f, indent=2, default=str) + + logger.info(f"Pipeline report saved to: {output_path}") + return output_path + + def _send_notification(self, report: dict[str, Any]) -> None: + """ + Send notification about pipeline completion. + + Currently logs to file. Can be extended for email/Slack notifications. 
+ + Args: + report: Pipeline execution report + """ + # Log notification to file + notification_file = Path(self.cfg.output_dir) / "notifications.log" + notification_file.parent.mkdir(parents=True, exist_ok=True) + + status = "SUCCESS" if report["success"] else "FAILED" + message = f""" +================================================================================ +PIPELINE NOTIFICATION - {status} +================================================================================ +Model: {report['model']} +Timestamp: {report['timestamp']} +Duration: {report['total_duration_seconds']}s + +Stages: +""" + for stage_name, stage_info in report.get("stages", {}).items(): + stage_status = "✓" if stage_info.get("success") else "✗" + message += ( + f" {stage_status} {stage_name}: {stage_info.get('duration', 0):.2f}s\n" + ) + + if not report["success"]: + message += f"\nError: {report.get('error', 'Unknown error')}\n" + + message += "=" * 80 + "\n" + + with open(notification_file, "a") as f: + f.write(message) + + logger.info(f"Notification logged to: {notification_file}") + + # Console notification + if report["success"]: + logger.info("🎉 Pipeline completed successfully!") + else: + logger.error("❌ Pipeline failed! Check logs for details.") + + +class EmailNotifier: + """ + Email notification handler for pipeline events. + + Note: Requires SMTP configuration. Disabled by default. + """ + + def __init__( + self, + smtp_server: str = "localhost", + smtp_port: int = 587, + username: str | None = None, + password: str | None = None, + ): + """ + Initialize email notifier. + + Args: + smtp_server: SMTP server address + smtp_port: SMTP server port + username: SMTP username + password: SMTP password + """ + self.smtp_server = smtp_server + self.smtp_port = smtp_port + self.username = username + self.password = password + + def send( + self, + to_email: str, + subject: str, + body: str, + from_email: str | None = None, + ) -> bool: + """ + Send email notification. 
+ + Args: + to_email: Recipient email address + subject: Email subject + body: Email body + from_email: Sender email address + + Returns: + True if sent successfully, False otherwise + """ + try: + msg = MIMEText(body) + msg["Subject"] = subject + msg["From"] = from_email or self.username or "noreply@localhost" + msg["To"] = to_email + + with smtplib.SMTP(self.smtp_server, self.smtp_port) as server: + if self.username and self.password: + server.starttls() + server.login(self.username, self.password) + server.send_message(msg) + + logger.info(f"Email notification sent to {to_email}") + return True + + except Exception as e: + logger.warning(f"Failed to send email notification: {e}") + return False + + +def format_pipeline_report(report: dict[str, Any]) -> str: + """ + Format pipeline report as readable text. + + Args: + report: Pipeline execution report + + Returns: + Formatted report string + """ + lines = [ + "=" * 60, + "ML PIPELINE EXECUTION REPORT", + "=" * 60, + f"Model: {report.get('model', 'N/A')}", + f"Status: {'SUCCESS' if report.get('success') else 'FAILED'}", + f"Timestamp: {report.get('timestamp', 'N/A')}", + f"Total Duration: {report.get('total_duration_seconds', 0):.2f}s", + "", + "-" * 40, + "STAGES:", + "-" * 40, + ] + + for stage_name, stage_info in report.get("stages", {}).items(): + status = "✓" if stage_info.get("success") else "✗" + duration = stage_info.get("duration", 0) + lines.append(f" {status} {stage_name}: {duration:.2f}s") + + if not stage_info.get("success") and stage_info.get("error"): + lines.append(f" Error: {stage_info['error']}") + + if not report.get("success") and report.get("error"): + lines.extend(["", f"Pipeline Error: {report['error']}"]) + + lines.append("=" * 60) + + return "\n".join(lines) diff --git a/src/pipelines/prepare_data.py b/src/pipelines/prepare_data.py new file mode 100644 index 0000000..79b80a7 --- /dev/null +++ b/src/pipelines/prepare_data.py @@ -0,0 +1,205 @@ +""" +Data preparation pipeline with Hydra 
configuration. + +Handles data downloading, preprocessing, and splitting. +""" + +import logging +import sys +import urllib.request +from pathlib import Path + +import hydra +import pandas as pd +from omegaconf import DictConfig, OmegaConf +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler + +# Add project root to path +sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + +logger = logging.getLogger(__name__) + +# Wine Quality dataset URL +DATASET_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" + + +def download_data(raw_path: Path, force: bool = False) -> Path: + """ + Download Wine Quality dataset. + + Args: + raw_path: Directory to save raw data + force: Force re-download even if file exists + + Returns: + Path to downloaded file + """ + raw_path.mkdir(parents=True, exist_ok=True) + file_path = raw_path / "winequality-red.csv" + + if file_path.exists() and not force: + logger.info(f"File already exists: {file_path}") + return file_path + + logger.info(f"Downloading data from {DATASET_URL}") + urllib.request.urlretrieve(DATASET_URL, file_path) # nosec + logger.info(f"Downloaded to: {file_path}") + + return file_path + + +def preprocess_data( + df: pd.DataFrame, + normalize: bool = False, + handle_missing: str = "drop", +) -> pd.DataFrame: + """ + Preprocess data according to configuration. 
+ + Args: + df: Input DataFrame + normalize: Whether to normalize features + handle_missing: Missing value handling strategy + + Returns: + Preprocessed DataFrame + """ + # Handle missing values + if handle_missing == "drop": + df = df.dropna() + elif handle_missing == "mean": + df = df.fillna(df.mean(numeric_only=True)) + elif handle_missing == "median": + df = df.fillna(df.median(numeric_only=True)) + + logger.info(f"After handling missing values: {df.shape}") + + # Normalize features (except target) + if normalize: + feature_cols = df.columns[:-1] + scaler = StandardScaler() + df[feature_cols] = scaler.fit_transform(df[feature_cols]) + logger.info("Features normalized") + + return df + + +def split_data( + df: pd.DataFrame, + test_size: float = 0.2, + random_state: int = 42, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Split data into train and test sets. + + Args: + df: Input DataFrame + test_size: Test set proportion + random_state: Random seed for reproducibility + + Returns: + Tuple of (train_df, test_df) + """ + train_df, test_df = train_test_split( + df, + test_size=test_size, + random_state=random_state, + ) + + logger.info(f"Train set: {train_df.shape}") + logger.info(f"Test set: {test_df.shape}") + + return train_df, test_df + + +def save_data( + train_df: pd.DataFrame, + test_df: pd.DataFrame, + output_path: Path, + train_file: str = "train.csv", + test_file: str = "test.csv", +) -> None: + """ + Save processed data to files. 
+ + Args: + train_df: Training DataFrame + test_df: Test DataFrame + output_path: Output directory + train_file: Training file name + test_file: Test file name + """ + output_path.mkdir(parents=True, exist_ok=True) + + train_path = output_path / train_file + test_path = output_path / test_file + + train_df.to_csv(train_path, index=False) + test_df.to_csv(test_path, index=False) + + logger.info(f"Saved train data to: {train_path}") + logger.info(f"Saved test data to: {test_path}") + + +@hydra.main(config_path="../../conf", config_name="config", version_base=None) # type: ignore[misc] +def main(cfg: DictConfig) -> None: + """ + Main data preparation pipeline. + + Args: + cfg: Hydra configuration + """ + # Setup logging + log_level = getattr(logging, cfg.logging.level, logging.INFO) + logging.basicConfig(level=log_level, format=cfg.logging.format) + + logger.info("Starting data preparation pipeline") + logger.info(f"Configuration:\n{OmegaConf.to_yaml(cfg.data)}") + + # Get original working directory + original_cwd = hydra.utils.get_original_cwd() + + # Paths - access directly from cfg.data (Hydra merges defaults) + raw_path = Path(original_cwd) / cfg.data.raw_path + processed_path = Path(original_cwd) / cfg.data.processed_path + + # Download data + data_file = download_data(raw_path) + + # Load data + df = pd.read_csv(data_file, sep=";") + logger.info(f"Loaded dataset: {df.shape}") + + # Get preprocessing config if available + normalize = False + handle_missing = "drop" + + if "preprocessing" in cfg.data: + normalize = cfg.data.preprocessing.get("normalize", False) + handle_missing = cfg.data.preprocessing.get("handle_missing", "drop") + + # Preprocess data + df = preprocess_data(df, normalize=normalize, handle_missing=handle_missing) + + # Split data + train_df, test_df = split_data( + df, + test_size=cfg.data.test_size, + random_state=cfg.data.random_state, + ) + + # Save data + save_data( + train_df, + test_df, + processed_path, + cfg.data.train_file, + 
cfg.data.test_file, + ) + + logger.info("Data preparation completed successfully") + + +if __name__ == "__main__": + main() diff --git a/src/pipelines/run_all_models.py b/src/pipelines/run_all_models.py new file mode 100644 index 0000000..c4bd74b --- /dev/null +++ b/src/pipelines/run_all_models.py @@ -0,0 +1,162 @@ +""" +Script to run training pipeline for all configured models. + +Uses Hydra's multirun feature for parallel execution. +""" + +import logging +import subprocess # nosec B404 +import sys +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Available model configurations +MODELS = [ + "random_forest", + "gradient_boosting", + "logistic_regression", + "svm", + "decision_tree", + "knn", +] + + +def run_single_model(model: str) -> tuple[str, bool, str]: + """ + Run training pipeline for a single model. + + Args: + model: Model configuration name + + Returns: + Tuple of (model_name, success, output) + """ + logger.info(f"Running training for model: {model}") + + cmd = [ + sys.executable, + "-m", + "src.pipelines.train_pipeline", + f"model={model}", + ] + + try: + result = subprocess.run( # nosec B603 + cmd, + capture_output=True, + text=True, + cwd=Path(__file__).resolve().parents[2], + timeout=300, # 5 minute timeout + ) + + success = result.returncode == 0 + output = result.stdout if success else result.stderr + + if success: + logger.info(f"✓ {model} completed successfully") + else: + logger.error(f"✗ {model} failed: {result.stderr}") + + return model, success, output + + except subprocess.TimeoutExpired: + logger.error(f"✗ {model} timed out") + return model, False, "Execution timed out" + except Exception as e: + logger.error(f"✗ {model} error: {e}") + return model, False, str(e) + + +def run_all_models(models: list[str] | None = None) -> dict[str, bool]: + """ + Run training pipeline for all specified models. + + Args: + models: List of model names to run. Defaults to all models. 
+ + Returns: + Dictionary mapping model names to success status + """ + if models is None: + models = MODELS + + logger.info(f"Running {len(models)} model(s): {models}") + logger.info("=" * 60) + + results = {} + + for model in models: + model_name, success, output = run_single_model(model) + results[model_name] = success + logger.info("-" * 40) + + # Summary + logger.info("=" * 60) + logger.info("SUMMARY") + logger.info("=" * 60) + + successful = sum(1 for v in results.values() if v) + failed = len(results) - successful + + for model, success in results.items(): + status = "✓ SUCCESS" if success else "✗ FAILED" + logger.info(f" {model}: {status}") + + logger.info("-" * 40) + logger.info(f"Total: {successful} succeeded, {failed} failed") + + return results + + +def run_multirun() -> None: + """ + Run all models using Hydra's multirun feature. + + This enables parallel execution when configured. + """ + models_str = ",".join(MODELS) + + cmd = [ + sys.executable, + "-m", + "src.pipelines.train_pipeline", + "--multirun", + f"model={models_str}", + ] + + logger.info(f"Running multirun with models: {models_str}") + + result = subprocess.run( # nosec B603 + cmd, + cwd=Path(__file__).resolve().parents[2], + ) + + if result.returncode == 0: + logger.info("Multirun completed successfully") + else: + logger.error("Multirun failed") + + +def main() -> None: + """Main entry point.""" + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + # Parse arguments + if len(sys.argv) > 1: + if sys.argv[1] == "--multirun": + run_multirun() + else: + # Run specific models + models = sys.argv[1:] + run_all_models(models) + else: + # Run all models sequentially + run_all_models() + + +if __name__ == "__main__": + main() diff --git a/src/pipelines/train_pipeline.py b/src/pipelines/train_pipeline.py new file mode 100644 index 0000000..83f8ffd --- /dev/null +++ b/src/pipelines/train_pipeline.py @@ -0,0 +1,364 @@ +""" +Main training 
pipeline with Hydra configuration management.

This module provides an automated ML training pipeline that integrates:
- Hydra for configuration management
- MLflow for experiment tracking
- DVC for data versioning and pipeline orchestration
"""

import importlib
import json
import logging
import sys
from datetime import datetime
from pathlib import Path
from typing import Any

import hydra
import mlflow
import mlflow.sklearn
import pandas as pd
from omegaconf import DictConfig, OmegaConf
from sklearn.base import ClassifierMixin
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

# Add project root to path
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

from src.pipelines.monitoring import PipelineMonitor  # noqa: E402

logger = logging.getLogger(__name__)


class TrainingPipeline:
    """
    ML Training Pipeline with Hydra configuration support.

    Handles data loading, model training, evaluation, and MLflow logging.
    """

    def __init__(self, cfg: DictConfig):
        """
        Initialize training pipeline.

        Args:
            cfg: Hydra configuration object
        """
        self.cfg = cfg
        self.monitor = PipelineMonitor(cfg)
        # Set by create_model(); None until then.
        self.model: ClassifierMixin | None = None
        # Test-set metrics; populated by train().
        self.metrics: dict[str, float] = {}

        # Setup logging
        log_level = getattr(logging, cfg.logging.level, logging.INFO)
        logging.basicConfig(level=log_level, format=cfg.logging.format)

        logger.info("Training pipeline initialized")
        logger.info(f"Configuration:\n{OmegaConf.to_yaml(cfg)}")

    def load_data(self) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
        """
        Load training and test data.

        Returns:
            Tuple of (X_train, y_train, X_test, y_test)

        Raises:
            FileNotFoundError: if the prepared train/test CSVs are missing.
        """
        self.monitor.start_stage("data_loading")

        try:
            # Get original working directory (before Hydra changes it)
            original_cwd = hydra.utils.get_original_cwd()
            processed_path = self.cfg.data.get("processed_path", "data/processed")
            data_path = Path(original_cwd) / processed_path

            train_file = self.cfg.data.get("train_file", "train.csv")
            test_file = self.cfg.data.get("test_file", "test.csv")
            train_path = data_path / train_file
            test_path = data_path / test_file

            if not train_path.exists() or not test_path.exists():
                raise FileNotFoundError(
                    f"Data not found. Run 'dvc repro prepare' first. "
                    f"Looking in: {data_path}"
                )

            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            # NOTE(review): assumes the target is the LAST column of both
            # CSVs — confirm against the prepare_data stage.
            X_train = train_df.iloc[:, :-1]
            y_train = train_df.iloc[:, -1]
            X_test = test_df.iloc[:, :-1]
            y_test = test_df.iloc[:, -1]

            logger.info(f"Loaded training data: {X_train.shape}")
            logger.info(f"Loaded test data: {X_test.shape}")

            self.monitor.end_stage("data_loading", success=True)
            return X_train, y_train, X_test, y_test

        except Exception as e:
            # Record the failure in the pipeline report, then propagate.
            self.monitor.end_stage("data_loading", success=False, error=str(e))
            raise

    def create_model(self) -> ClassifierMixin:
        """
        Create model instance from configuration.

        Returns:
            Initialized model instance
        """
        self.monitor.start_stage("model_creation")

        try:
            # Parse model class path (e.g. "sklearn.ensemble.RandomForestClassifier")
            target = self.cfg.model._target_
            module_path, class_name = target.rsplit(".", 1)

            # Import and instantiate model
            module = importlib.import_module(module_path)
            model_class = getattr(module, class_name)

            # Get model parameters (resolve interpolations to plain Python types)
            params = OmegaConf.to_container(self.cfg.model.params, resolve=True)

            # Create model instance
            self.model = model_class(**params)

            logger.info(f"Created model: {self.cfg.model.name}")
            logger.info(f"Parameters: {params}")

            self.monitor.end_stage("model_creation", success=True)
            return self.model

        except Exception as e:
            self.monitor.end_stage("model_creation", success=False, error=str(e))
            raise

    def train(
        self,
        X_train: pd.DataFrame,
        y_train: pd.Series,
        X_test: pd.DataFrame,
        y_test: pd.Series,
    ) -> dict[str, float]:
        """
        Train model and evaluate metrics.

        Args:
            X_train: Training features
            y_train: Training labels
            X_test: Test features
            y_test: Test labels

        Returns:
            Dictionary of evaluation metrics

        Raises:
            ValueError: if create_model() has not been called first.
        """
        self.monitor.start_stage("training")

        try:
            if self.model is None:
                raise ValueError("Model not created. Call create_model() first.")

            # Train model
            logger.info("Training model...")
            self.model.fit(X_train, y_train)

            # Cross-validation (skipped entirely when cv_folds <= 1)
            if self.cfg.training.cv_folds > 1:
                cv_scores = cross_val_score(
                    self.model,
                    X_train,
                    y_train,
                    cv=self.cfg.training.cv_folds,
                    scoring="accuracy",
                )
                cv_mean = cv_scores.mean()
                # ±2 standard deviations, used for logging only; the raw std
                # is what gets stored in self.metrics below.
                cv_std = cv_scores.std() * 2
                logger.info(f"CV Accuracy: {cv_mean:.4f} (+/- {cv_std:.4f})")

            # Predict on test set
            y_pred = self.model.predict(X_test)

            # Calculate metrics; zero_division=0 avoids warnings/NaNs when a
            # class is never predicted.
            self.metrics = {
                "accuracy": float(accuracy_score(y_test, y_pred)),
                "precision": float(
                    precision_score(y_test, y_pred, average="weighted", zero_division=0)
                ),
                "recall": float(
                    recall_score(y_test, y_pred, average="weighted", zero_division=0)
                ),
                "f1_score": float(
                    f1_score(y_test, y_pred, average="weighted", zero_division=0)
                ),
            }

            # cv_scores only exists under the same cv_folds > 1 condition above.
            if self.cfg.training.cv_folds > 1:
                self.metrics["cv_accuracy_mean"] = float(cv_scores.mean())
                self.metrics["cv_accuracy_std"] = float(cv_scores.std())

            logger.info(f"Test metrics: {self.metrics}")

            self.monitor.end_stage("training", success=True)
            return self.metrics

        except Exception as e:
            self.monitor.end_stage("training", success=False, error=str(e))
            raise

    def log_to_mlflow(self) -> str:
        """
        Log experiment to MLflow.

        Returns:
            MLflow run ID
        """
        self.monitor.start_stage("mlflow_logging")

        try:
            # Get original working directory (Hydra changes cwd per run)
            original_cwd = hydra.utils.get_original_cwd()
            # NOTE(review): "file://{path}" assumes a POSIX-style absolute
            # path; verify behaviour on Windows if that is a target platform.
            tracking_uri = f"file://{original_cwd}/mlruns"

            mlflow.set_tracking_uri(tracking_uri)
            mlflow.set_experiment(self.cfg.mlflow.experiment_name)

            with mlflow.start_run(
                run_name=f"{self.cfg.model.name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
            ) as run:
                # Log parameters
                params = OmegaConf.to_container(self.cfg.model.params, resolve=True)
                mlflow.log_params(params)
                mlflow.log_param("model_type", self.cfg.model.name)
                mlflow.log_param("cv_folds", self.cfg.training.cv_folds)
                mlflow.log_param("seed", self.cfg.seed)

                # Log metrics
                mlflow.log_metrics(self.metrics)

                # Log configuration as artifact; relative path, so this lands
                # in the current Hydra run directory — presumably intentional.
                config_path = Path("config.yaml")
                with open(config_path, "w") as f:
                    OmegaConf.save(self.cfg, f)
                mlflow.log_artifact(str(config_path))

                # Log model (and register it under the configured model name)
                if self.cfg.training.register_model and self.model is not None:
                    mlflow.sklearn.log_model(
                        self.model,
                        "model",
                        registered_model_name=self.cfg.model.name,
                    )

                run_id: str = run.info.run_id
                logger.info(f"Logged to MLflow. Run ID: {run_id}")

            self.monitor.end_stage("mlflow_logging", success=True)
            return run_id

        except Exception as e:
            self.monitor.end_stage("mlflow_logging", success=False, error=str(e))
            raise

    def save_results(self, run_id: str) -> None:
        """
        Save pipeline results to output directory.

        Args:
            run_id: MLflow run ID
        """
        self.monitor.start_stage("save_results")

        try:
            # Get original working directory
            original_cwd = hydra.utils.get_original_cwd()

            # Create model-specific output directory
            model_name = self.cfg.model.name.lower().replace(" ", "_")
            output_dir = Path(original_cwd) / "outputs" / model_name
            output_dir.mkdir(parents=True, exist_ok=True)

            # Save metrics
            metrics_file = output_dir / "metrics.json"
            with open(metrics_file, "w") as f:
                json.dump(
                    {
                        "metrics": self.metrics,
                        "model": self.cfg.model.name,
                        "run_id": run_id,
                        "timestamp": datetime.now().isoformat(),
                    },
                    f,
                    indent=2,
                )

            logger.info(f"Results saved to {output_dir}")
            self.monitor.end_stage("save_results", success=True)

            # Update config output_dir for monitor
            # NOTE(review): mutates the Hydra config in place — requires
            # struct mode to allow it; confirm `output_dir` exists in conf.
            self.cfg.output_dir = str(output_dir)

        except Exception as e:
            self.monitor.end_stage("save_results", success=False, error=str(e))
            raise

    def run(self) -> dict[str, Any]:
        """
        Execute full training pipeline.
+ + Returns: + Dictionary with pipeline results + """ + self.monitor.start_pipeline() + + try: + # Execute pipeline stages + X_train, y_train, X_test, y_test = self.load_data() + self.create_model() + metrics = self.train(X_train, y_train, X_test, y_test) + run_id = self.log_to_mlflow() + self.save_results(run_id) + + # Generate final report + report = self.monitor.end_pipeline(success=True) + self.monitor.save_report() + + return { + "success": True, + "metrics": metrics, + "run_id": run_id, + "model": self.cfg.model.name, + "report": report, + } + + except Exception as e: + logger.error(f"Pipeline failed: {e}") + report = self.monitor.end_pipeline(success=False, error=str(e)) + self.monitor.save_report() + + return { + "success": False, + "error": str(e), + "report": report, + } + + +@hydra.main(config_path="../../conf", config_name="config", version_base=None) # type: ignore[misc] +def main(cfg: DictConfig) -> dict[str, Any]: + """ + Main entry point for training pipeline. + + Args: + cfg: Hydra configuration + + Returns: + Pipeline results + """ + pipeline = TrainingPipeline(cfg) + return pipeline.run() + + +if __name__ == "__main__": + main() diff --git a/src/utils/get_best_run.py b/src/utils/get_best_run.py new file mode 100644 index 0000000..5de17b1 --- /dev/null +++ b/src/utils/get_best_run.py @@ -0,0 +1,44 @@ +import sys +from pathlib import Path + +import mlflow + +# Add project root to path +sys.path.append(str(Path(__file__).resolve().parents[2])) + + +def get_experiment_results(experiment_name: str = "wine_quality_multimodel_v1") -> None: + mlflow.set_tracking_uri("file://" + str(Path.cwd() / "mlruns")) + experiment = mlflow.get_experiment_by_name(experiment_name) + + if experiment is None: + print(f"Experiment '{experiment_name}' not found.") + return + + runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id]) + + # Sort by f1_score descending + runs = runs.sort_values("metrics.f1_score", ascending=False) + + # Select interesting columns 
    cols = [
        "tags.mlflow.runName",
        "params.model_type",
        "metrics.accuracy",
        "metrics.f1_score",
        "metrics.precision",
        "metrics.recall",
    ]

    print("\nTop 5 Runs:")
    print(runs[cols].head(5).to_markdown(index=False))

    print("\nBest Run Details:")
    # runs is already sorted by metrics.f1_score descending, so iloc[0]
    # is the best run; dump every param/metric column for it.
    best_run = runs.iloc[0]
    for col in runs.columns:
        if col.startswith("params.") or col.startswith("metrics."):
            print(f"{col}: {best_run[col]}")


if __name__ == "__main__":
    get_experiment_results()
diff --git a/src/utils/mlflow_decorators.py b/src/utils/mlflow_decorators.py
new file mode 100644
index 0000000..f5a88b2
--- /dev/null
+++ b/src/utils/mlflow_decorators.py
@@ -0,0 +1,60 @@
import functools
import logging
import time
from collections.abc import Callable
from typing import Any

import mlflow

logger = logging.getLogger(__name__)


def log_experiment(experiment_name: str = "default_experiment") -> Callable[..., Any]:
    """
    Decorator to wrap a function execution in an MLflow run.

    The wrapped call runs inside `mlflow.start_run` (named after the
    function), logs its wall-clock duration as the `execution_time`
    metric, and on exception tags the run as failed and re-raises.
    """

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            mlflow.set_experiment(experiment_name)

            # Start run
            with mlflow.start_run(run_name=func.__name__) as run:
                logger.info(f"Started MLflow run: {run.info.run_id}")

                # Log execution time
                start_time = time.time()

                try:
                    # Execute function
                    result = func(*args, **kwargs)

                    # Log duration
                    # NOTE(review): execution_time is only logged on the
                    # success path; failed runs carry no duration metric.
                    duration = time.time() - start_time
                    mlflow.log_metric("execution_time", duration)

                    return result

                except Exception as e:
                    # Log error, then re-raise so callers still see it.
                    logger.error(f"Error in {func.__name__}: {str(e)}")
                    mlflow.set_tag("status", "failed")
                    mlflow.log_param("error", str(e))
                    raise e

        return wrapper

    return decorator


def log_metrics(metrics: dict[str, float]) -> None:
    """Helper to log multiple metrics."""
    for name, value in metrics.items():
        mlflow.log_metric(name, value)


def log_params(params: dict[str, Any]) -> None:
    """Helper to log multiple parameters."""
    for name, value in params.items():
        mlflow.log_param(name, value)