-
-
Notifications
You must be signed in to change notification settings - Fork 0
Data Science Team
Complete data science workflow with model training, experiment tracking, and deployment automation for ML engineering teams.
This workflow is designed for data science teams working with:
- Python-based ML workflows with Jupyter notebooks
- Experiment tracking and management with MLflow
- Model training and validation pipelines
- Data processing and analysis tools
- Model deployment and serving to production
- 2-3 Data Scientists
- 1 ML Engineer
- 1 Data Engineer
- 1 DevOps Engineer (shared)
Repository: company/datascience-tools
# Create the collection repository and wire it to the shared remote.
mkdir datascience-tools && cd datascience-tools
git init
git remote add origin git@github.com:company/datascience-tools.git

# Lay out one bin/ and docs/ subdirectory per script category,
# plus the configuration tree.
for category in environment data training analysis deployment; do
  mkdir -p "bin/$category" "docs/$category"
done
mkdir -p config/{environments,models,data-sources}
mkdir -p notebooks/templates

Collection Metadata (dotrun.collection.yml):
# Collection manifest consumed by the `dr` tool.
name: "datascience-tools"
description: "Data science and ML engineering workflow tools"
author: "Data Science Team"
version: "1.5.0"
# Tools the scripts invoke directly (conda/pip, git, docker builds).
dependencies:
- python
- conda
- git
- docker
# Used only by specific scripts (notebooks, MLflow tracking, k8s deploys).
optional_dependencies:
- jupyter
- mlflow
- docker-compose
- kubectl
# Script categories; each has a matching bin/<category>/ directory.
categories:
- environment
- data
- training
- analysis
- deployment

bin/environment/setup-ml-env.sh:
#!/usr/bin/env bash
### DOC
# Set up complete ML development environment
# Creates conda environment with all necessary packages
### DOC
set -euo pipefail
# main [env-name] [python-version]
#   env-name        conda env / venv directory name (default: mlenv)
#   python-version  interpreter version for conda create (default: 3.9)
main() {
local env_name="${1:-mlenv}"
local python_version="${2:-3.9}"
echo "🧪 Setting up ML development environment: $env_name"
# Create conda environment
# Prefer conda when available; otherwise fall back to a plain venv below.
if command -v conda >/dev/null; then
echo "🐍 Creating conda environment..."
conda create -n "$env_name" python="$python_version" -y
# Activate environment
# `conda activate` only works in a script after loading the shell hook.
eval "$(conda shell.bash hook)"
conda activate "$env_name"
# Install ML packages
echo "📦 Installing ML packages..."
conda install -c conda-forge \
jupyter \
jupyterlab \
pandas \
numpy \
scipy \
scikit-learn \
matplotlib \
seaborn \
plotly \
tensorflow \
pytorch \
mlflow \
dvc \
-y
# Install additional packages via pip
# These are installed via pip rather than conda-forge.
pip install \
optuna \
shap \
lime \
streamlit \
fastapi \
gradio
else
echo "🐍 Using pip with virtual environment..."
# Fallback: plain venv plus a pinned requirements file.
# NOTE(review): config/requirements-ml.txt is resolved relative to the
# current working directory — presumably the collection root; verify.
python -m venv "$env_name"
source "$env_name/bin/activate"
pip install -r config/requirements-ml.txt
fi
# Set up Jupyter extensions
# NOTE(review): `jupyter labextension install` needs Node and is
# deprecated for JupyterLab 3+ (prebuilt pip extensions) — confirm the
# target JupyterLab version.
echo "📓 Installing Jupyter extensions..."
jupyter labextension install @jupyter-widgets/jupyterlab-manager
jupyter lab build
# Set up DVC for data versioning
echo "📊 Initializing DVC..."
# --no-scm lets DVC run outside a git repo.
# NOTE(review): ANY dvc failure is reported as "already initialized" here.
dvc init --no-scm 2>/dev/null || echo "DVC already initialized"
# Create directory structure
mkdir -p {data,models,notebooks,reports,src}
echo "✅ ML environment ready: $env_name"
echo "🚀 Start with: conda activate $env_name && jupyter lab"
}
main "$@"

bin/environment/validate-environment.sh:
#!/usr/bin/env bash
### DOC
# Validate ML environment setup and dependencies
### DOC
set -euo pipefail
# check_package <display-name> [import-name]
# Prints ✅/❌ for the package and returns non-zero when the import fails.
# The import name defaults to the display name; they differ for packages
# such as scikit-learn (imported as sklearn).
check_package() {
  local pkg="$1"
  local module="${2:-$1}"
  if ! python -c "import $module" 2>/dev/null; then
    echo "❌ $pkg"
    return 1
  fi
  echo "✅ $pkg"
}
# Check every expected package and report a summary.
# Returns 0 when everything imports, 1 otherwise.
main() {
  echo "🔍 Validating ML environment..."
  local failed=0

  # BUG FIX: the original used `check_package X || ((failed++))`.
  # `((failed++))` evaluates to 0 on the first failure, so it returns a
  # non-zero status and, under `set -e`, aborted the whole validation at
  # the first missing package. Plain assignment always succeeds.

  # Check core packages
  echo "📦 Core packages:"
  check_package "pandas" || failed=$((failed + 1))
  check_package "numpy" || failed=$((failed + 1))
  check_package "scipy" || failed=$((failed + 1))
  # Distribution name differs from the importable module name.
  check_package "scikit-learn" "sklearn" || failed=$((failed + 1))
  echo ""
  echo "📊 Visualization packages:"
  check_package "matplotlib" || failed=$((failed + 1))
  check_package "seaborn" || failed=$((failed + 1))
  check_package "plotly" || failed=$((failed + 1))
  echo ""
  echo "🤖 ML frameworks:"
  check_package "tensorflow" || failed=$((failed + 1))
  check_package "torch" || failed=$((failed + 1))
  check_package "mlflow" || failed=$((failed + 1))
  echo ""
  echo "🔧 Development tools:"
  check_package "jupyter" || failed=$((failed + 1))
  check_package "dvc" || failed=$((failed + 1))

  if [[ $failed -eq 0 ]]; then
    echo ""
    echo "🎉 All packages installed correctly!"
    return 0
  else
    echo ""
    echo "⚠️ $failed packages missing or broken"
    echo "💡 Run: dr ds/environment/setup-ml-env"
    return 1
  fi
}
main "$@"

bin/data/fetch-dataset.sh:
#!/usr/bin/env bash
### DOC
# Fetch and prepare datasets for analysis
# Supports various data sources and automatic preprocessing
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg python
# Pull a dataset through the Kaggle CLI and unzip it into the target
# directory. Exits the script if the CLI is not installed.
download_kaggle_dataset() {
  local dataset_slug="$1"
  local dest_dir="$2"
  command -v kaggle >/dev/null || {
    echo "❌ Kaggle CLI not installed"
    echo "💡 Install with: pip install kaggle"
    exit 1
  }
  echo "📥 Downloading Kaggle dataset: $dataset_slug"
  kaggle datasets download -d "$dataset_slug" -p "$dest_dir" --unzip
}
# Fetch a single file over HTTP(S), following redirects, into the
# given destination path.
download_from_url() {
  local src_url="$1"
  local dest_file="$2"
  echo "📥 Downloading from URL: $src_url"
  curl -L -o "$dest_file" "$src_url"
}
# main <kaggle|url|sample> <identifier> [target-dir]
# Downloads or generates a dataset into target-dir (default: data/raw).
main() {
  # FIX: empty defaults so a missing argument falls through to the usage
  # text instead of aborting with a `set -u` unbound-variable error.
  local source_type="${1:-}"
  local source_identifier="${2:-}"
  local target_dir="${3:-data/raw}"

  # Every source type requires an identifier; route such calls to usage.
  if [[ -z "$source_identifier" ]]; then
    source_type=""
  fi

  mkdir -p "$target_dir"

  case "$source_type" in
    kaggle)
      download_kaggle_dataset "$source_identifier" "$target_dir"
      ;;
    url)
      # Local filename comes from the URL's last path component.
      local filename
      filename=$(basename "$source_identifier")
      download_from_url "$source_identifier" "$target_dir/$filename"
      ;;
    sample)
      echo "📊 Generating sample dataset: $source_identifier"
      python scripts/generate_sample_data.py --type "$source_identifier" --output "$target_dir"
      ;;
    *)
      echo "Usage: dr data/fetch-dataset <kaggle|url|sample> <identifier> [target-dir]"
      echo "Examples:"
      echo " dr data/fetch-dataset kaggle titanic data/raw"
      echo " dr data/fetch-dataset url https://example.com/data.csv data/raw"
      echo " dr data/fetch-dataset sample classification data/raw"
      exit 1
      ;;
  esac

  echo "✅ Dataset downloaded to: $target_dir"
  echo "📁 Files:"
  ls -la "$target_dir"
}
main "$@"

bin/data/preprocess-data.sh:
#!/usr/bin/env bash
### DOC
# Preprocess raw data for ML training
# Handles cleaning, feature engineering, and train/test splits
### DOC
set -euo pipefail
# main <input-file> [output-dir] [config-file]
main() {
  # FIX: empty default so a missing argument prints usage instead of
  # tripping `set -u` with an unbound-variable error.
  local input_file="${1:-}"
  local output_dir="${2:-data/processed}"
  local config_file="${3:-config/preprocessing.yml}"

  if [[ -z "$input_file" ]]; then
    echo "Usage: dr data/preprocess-data <input-file> [output-dir] [config-file]"
    exit 1
  fi
  if [[ ! -f "$input_file" ]]; then
    echo "❌ Input file not found: $input_file"
    exit 1
  fi

  mkdir -p "$output_dir"
  echo "🔄 Preprocessing data: $input_file"
  echo "📋 Using config: $config_file"

  # Run preprocessing pipeline
  python scripts/preprocess.py \
    --input "$input_file" \
    --output "$output_dir" \
    --config "$config_file"

  # Generate data quality report
  echo "📊 Generating data quality report..."
  python scripts/data_quality_report.py \
    --data "$output_dir" \
    --output "$output_dir/quality_report.html"

  echo "✅ Preprocessing complete!"
  echo "📂 Processed data: $output_dir"
  echo "📊 Quality report: $output_dir/quality_report.html"
}
main "$@"

bin/training/train-model.sh:
#!/usr/bin/env bash
### DOC
# Train ML model with experiment tracking
# Supports multiple algorithms and hyperparameter optimization
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg python
# main [model-type] [dataset] [experiment-name]
# Trains a model with MLflow tracking and collects run artifacts under
# a timestamped experiments/ directory.
main() {
  local model_type="${1:-random_forest}"
  local dataset="${2:-data/processed/train.csv}"
  local experiment_name="${3:-default}"

  echo "🤖 Training $model_type model on $dataset..."

  # Timestamped directory keeps every run's artifacts separate.
  # Declaration split from assignment so a command failure is not masked.
  local experiment_id
  experiment_id="exp_$(date +%Y%m%d_%H%M%S)"
  local exp_dir="experiments/$experiment_id"
  mkdir -p "$exp_dir"
  echo "📁 Experiment directory: $exp_dir"
  echo "🔬 Experiment name: $experiment_name"

  # Set MLflow tracking — local file store inside the repo.
  export MLFLOW_EXPERIMENT_NAME="$experiment_name"
  export MLFLOW_TRACKING_URI="file://$(pwd)/mlruns"

  # Run training script with MLflow tracking
  python scripts/train.py \
    --model-type "$model_type" \
    --data "$dataset" \
    --experiment-dir "$exp_dir" \
    --experiment-name "$experiment_name" \
    --track-metrics

  # Save model artifacts produced by the training script, if present.
  echo "💾 Saving model artifacts..."
  if [[ -f "models/latest_model.pkl" ]]; then
    cp models/latest_model.pkl "$exp_dir/"
  fi
  if [[ -f "logs/training.log" ]]; then
    cp logs/training.log "$exp_dir/"
  fi

  # Generate model report
  echo "📊 Generating model report..."
  python scripts/generate_model_report.py --experiment-dir "$exp_dir"

  # FIX: only validate when the model artifact actually landed in the
  # experiment directory; the original ran validation unconditionally and
  # failed on runs that produced no models/latest_model.pkl.
  if [[ -f "$exp_dir/latest_model.pkl" ]]; then
    echo "🔍 Running model validation..."
    python scripts/validate_model.py --model "$exp_dir/latest_model.pkl"
  else
    echo "⚠️ No model artifact found; skipping validation"
  fi

  echo "✅ Training complete!"
  echo "📂 Results in: $exp_dir"
  echo "🔍 View with: dr ds/analysis/view-experiment $experiment_id"
  echo "📊 MLflow UI: mlflow ui --backend-store-uri $(pwd)/mlruns"
}
main "$@"

bin/training/optimize-hyperparameters.sh:
#!/usr/bin/env bash
### DOC
# Hyperparameter optimization using Optuna
# Automated search for best model parameters
### DOC
set -euo pipefail
# main <model-type> <dataset> [n-trials] [study-name]
main() {
  # FIX: empty defaults so missing required args print usage instead of
  # aborting with a `set -u` unbound-variable error.
  local model_type="${1:-}"
  local dataset="${2:-}"
  local n_trials="${3:-100}"
  local study_name="${4:-$(date +%Y%m%d_%H%M%S)}"

  if [[ -z "$model_type" || -z "$dataset" ]]; then
    echo "Usage: dr training/optimize-hyperparameters <model-type> <dataset> [n-trials] [study-name]"
    exit 1
  fi

  echo "🎯 Optimizing hyperparameters for $model_type"
  echo "📊 Running $n_trials trials"
  echo "🔬 Study name: $study_name"

  # Create study directory — one per study for trials and reports.
  local study_dir="studies/$study_name"
  mkdir -p "$study_dir"

  # Run optimization
  python scripts/optimize.py \
    --model-type "$model_type" \
    --data "$dataset" \
    --n-trials "$n_trials" \
    --study-name "$study_name" \
    --study-dir "$study_dir"

  # Generate optimization report
  echo "📊 Generating optimization report..."
  python scripts/optimization_report.py \
    --study-dir "$study_dir" \
    --output "$study_dir/optimization_report.html"

  echo "✅ Hyperparameter optimization complete!"
  echo "📂 Results: $study_dir"
  echo "📊 Report: $study_dir/optimization_report.html"
  echo "🏆 Best parameters saved to: $study_dir/best_params.json"
}
main "$@"

bin/analysis/compare-models.sh:
#!/usr/bin/env bash
### DOC
# Compare multiple trained models
# Generates comprehensive comparison report
### DOC
set -euo pipefail
# main <exp-dir1> <exp-dir2> [exp-dir3] ...
# Compares two or more experiment directories and renders an HTML report.
main() {
  # All positional arguments are experiment directories to compare.
  local runs=("$@")

  # At least two runs are needed for a comparison.
  if (( ${#runs[@]} < 2 )); then
    echo "Usage: dr analysis/compare-models <exp-dir1> <exp-dir2> [exp-dir3] ..."
    echo "Example: dr analysis/compare-models experiments/exp_20240115_* "
    exit 1
  fi

  echo "📊 Comparing ${#runs[@]} models..."

  # Timestamped directory so repeated comparisons never collide.
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  local comparison_dir="comparisons/comparison_$stamp"
  mkdir -p "$comparison_dir"

  # Produce the raw comparison data, then render it as HTML.
  python scripts/compare_models.py \
    --experiments "${runs[@]}" \
    --output "$comparison_dir"

  echo "📊 Generating comparison report..."
  python scripts/comparison_report.py \
    --comparison-dir "$comparison_dir" \
    --output "$comparison_dir/comparison_report.html"

  echo "✅ Model comparison complete!"
  echo "📂 Results: $comparison_dir"
  echo "📊 Report: $comparison_dir/comparison_report.html"
}
main "$@"

bin/analysis/explain-model.sh:
#!/usr/bin/env bash
### DOC
# Generate model explanations using SHAP and LIME
# Creates interpretability reports for model decisions
### DOC
set -euo pipefail
# main <model.pkl> <test-data> [shap|lime|both]
main() {
  # FIX: empty defaults keep `set -u` from aborting with an
  # unbound-variable error when arguments are missing; print usage instead.
  local model_path="${1:-}"
  local test_data="${2:-}"
  local explanation_type="${3:-shap}"

  if [[ -z "$model_path" || -z "$test_data" ]]; then
    echo "Usage: dr analysis/explain-model <model-path> <test-data> [shap|lime|both]"
    exit 1
  fi
  if [[ ! -f "$model_path" ]]; then
    echo "❌ Model file not found: $model_path"
    exit 1
  fi
  if [[ ! -f "$test_data" ]]; then
    echo "❌ Test data not found: $test_data"
    exit 1
  fi

  echo "🔍 Generating model explanations..."
  echo "🤖 Model: $model_path"
  echo "📊 Data: $test_data"
  echo "🔬 Method: $explanation_type"

  # One timestamped output directory per model per run.
  # Declaration split from assignment so command failures are not masked.
  local model_name
  model_name=$(basename "$model_path" .pkl)
  local explanations_dir
  explanations_dir="explanations/${model_name}_$(date +%Y%m%d_%H%M%S)"
  mkdir -p "$explanations_dir"

  case "$explanation_type" in
    shap)
      echo "🎯 Generating SHAP explanations..."
      python scripts/shap_explanations.py \
        --model "$model_path" \
        --data "$test_data" \
        --output "$explanations_dir"
      ;;
    lime)
      echo "🍋 Generating LIME explanations..."
      python scripts/lime_explanations.py \
        --model "$model_path" \
        --data "$test_data" \
        --output "$explanations_dir"
      ;;
    both)
      # Run both explainers into separate subdirectories.
      echo "🎯🍋 Generating both SHAP and LIME explanations..."
      python scripts/shap_explanations.py \
        --model "$model_path" \
        --data "$test_data" \
        --output "$explanations_dir/shap"
      python scripts/lime_explanations.py \
        --model "$model_path" \
        --data "$test_data" \
        --output "$explanations_dir/lime"
      ;;
    *)
      echo "❌ Unknown explanation type: $explanation_type"
      echo "Available types: shap, lime, both"
      exit 1
      ;;
  esac

  echo "✅ Model explanations generated!"
  echo "📂 Results: $explanations_dir"
}
main "$@"

bin/deployment/serve-model.sh:
#!/usr/bin/env bash
### DOC
# Deploy model as REST API service
# Supports local serving and containerized deployment
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg python
# main <model.pkl> [local|docker|kubernetes] [port]
# Serves the model locally, in Docker, or via the k8s deploy script.
main() {
  # FIX: empty default so a missing argument prints usage instead of
  # aborting with a `set -u` unbound-variable error.
  local model_path="${1:-}"
  local deployment_type="${2:-local}"
  local port="${3:-8000}"

  if [[ -z "$model_path" ]]; then
    echo "Usage: dr deployment/serve-model <model-path> [local|docker|kubernetes] [port]"
    exit 1
  fi
  if [[ ! -f "$model_path" ]]; then
    echo "❌ Model file not found: $model_path"
    exit 1
  fi

  echo "🚀 Deploying model: $model_path"
  echo "📡 Deployment type: $deployment_type"
  echo "🔌 Port: $port"

  case "$deployment_type" in
    local)
      # serve_model.py reads MODEL_PATH / PORT from the environment.
      echo "🏠 Starting local FastAPI server..."
      export MODEL_PATH="$model_path"
      export PORT="$port"
      python scripts/serve_model.py
      ;;
    docker)
      echo "🐳 Building Docker image..."
      docker build -t ml-model-server:latest \
        --build-arg MODEL_PATH="$model_path" \
        -f docker/Dockerfile.serve .
      echo "🚀 Starting Docker container..."
      docker run -p "$port:$port" \
        -e MODEL_PATH="/app/$model_path" \
        ml-model-server:latest
      ;;
    kubernetes)
      echo "☸️ Deploying to Kubernetes..."
      dr deployment/deploy-to-k8s "$model_path" "$port"
      ;;
    *)
      echo "❌ Unknown deployment type: $deployment_type"
      echo "Available types: local, docker, kubernetes"
      exit 1
      ;;
  esac
}
main "$@"

bin/deployment/monitor-model.sh:
#!/usr/bin/env bash
### DOC
# Monitor deployed model performance
# Tracks predictions, drift, and model health
### DOC
set -euo pipefail
# main <endpoint-url> [duration-seconds]
main() {
  # FIX: empty default so a missing argument prints usage instead of
  # aborting with a `set -u` unbound-variable error.
  local model_endpoint="${1:-}"
  local monitoring_duration="${2:-3600}" # 1 hour default

  if [[ -z "$model_endpoint" ]]; then
    echo "Usage: dr deployment/monitor-model <endpoint-url> [duration-seconds]"
    exit 1
  fi

  echo "📊 Monitoring model at: $model_endpoint"
  echo "⏰ Duration: ${monitoring_duration}s"

  # Create monitoring directory — one timestamped directory per run.
  local monitoring_id
  monitoring_id="monitoring_$(date +%Y%m%d_%H%M%S)"
  local monitoring_dir="monitoring/$monitoring_id"
  mkdir -p "$monitoring_dir"

  # Start monitoring
  python scripts/monitor_model.py \
    --endpoint "$model_endpoint" \
    --duration "$monitoring_duration" \
    --output "$monitoring_dir"

  # Generate monitoring report
  echo "📊 Generating monitoring report..."
  python scripts/monitoring_report.py \
    --monitoring-dir "$monitoring_dir" \
    --output "$monitoring_dir/monitoring_report.html"

  echo "✅ Monitoring complete!"
  echo "📂 Results: $monitoring_dir"
  echo "📊 Report: $monitoring_dir/monitoring_report.html"
}
main "$@"

# Team member imports collection
dr import git@github.com:company/datascience-tools.git ds
# Set up environment
dr ds/environment/setup-ml-env myproject-env
# Fetch and prepare data
dr ds/data/fetch-dataset kaggle titanic data/raw
dr ds/data/preprocess-data data/raw/titanic.csv
# Train and experiment
dr ds/training/train-model random_forest data/processed/train.csv titanic-experiment
dr ds/training/optimize-hyperparameters random_forest data/processed/train.csv 50
# Analyze results
dr ds/analysis/compare-models experiments/exp_*
dr ds/analysis/explain-model models/best_model.pkl data/processed/test.csv

# Model validation and deployment
dr ds/analysis/compare-models experiments/candidate_models/*
dr ds/deployment/serve-model models/production_model.pkl docker
dr ds/deployment/monitor-model http://localhost:8000/predict
# Model lifecycle management
dr ds/deployment/deploy-to-k8s models/v2.0.pkl
dr ds/monitoring/setup-drift-detection production-model

# Code review and validation
dr ds/environment/validate-environment
dr ds/testing/run-model-tests
dr ds/quality/check-code-quality
# Experiment sharing
dr ds/experiments/export-experiment exp_20240115_142030
dr ds/experiments/import-experiment team-experiments/best-models.zip

# Start MLflow server
dr ds/mlflow/start-server
dr ds/mlflow/create-experiment new-project
# Track experiments
dr ds/training/train-model --track-mlflow random_forest data/train.csv
dr ds/analysis/compare-experiments --mlflow-ui

# Data versioning
dr ds/data/setup-dvc-pipeline
dr ds/data/version-dataset data/processed/train.csv
dr ds/data/pull-latest-data

This data science workflow provides comprehensive automation for ML teams while maintaining reproducibility, collaboration, and production readiness across the entire ML lifecycle.