This repository was archived by the owner on Sep 3, 2025. It is now read-only.
Merged
Changes from all commits
Commits
38 commits
12480ee
Add test for Feature and Data Integrity
cactusutcac May 24, 2025
7168e6a
Add tests for Model Development
cactusutcac May 24, 2025
6103954
Add tests for ML Infrastructure and Monitoring
cactusutcac May 24, 2025
cbdbf76
Order the tests
cactusutcac May 24, 2025
bf595d1
Structure the mutamorphic tests
cactusutcac May 24, 2025
2858165
Configure workflow
cactusutcac May 24, 2025
6557fca
Update workflow
cactusutcac May 24, 2025
fe61228
Update workflow
cactusutcac May 24, 2025
ba16a66
Update README with lint, coverage, and ML test score
github-actions[bot] May 24, 2025
316b0ee
Typos in `README.md`
cactusutcac May 24, 2025
2d096f0
Update README with lint, coverage, and ML test score
github-actions[bot] May 24, 2025
bace0b2
Fix typos
cactusutcac May 24, 2025
a71b922
Merge branch 'feat/a4-ml-testing' of https://github.com/remla25-team2…
cactusutcac May 24, 2025
b671540
Update README with lint, coverage, and ML test score
github-actions[bot] May 24, 2025
52330c7
Update test score table
cactusutcac May 24, 2025
a581252
Update README with lint, coverage, and ML test score
github-actions[bot] May 24, 2025
43bcb59
Add tests for non-functional requirements
cactusutcac May 25, 2025
db2205a
Update README with lint, coverage, and ML test score
github-actions[bot] May 25, 2025
b93df7f
Update ML test score calculation logic
cactusutcac May 25, 2025
033ad6c
Update README with lint, coverage, and ML test score
github-actions[bot] May 25, 2025
1837c4a
Remove redundancy
cactusutcac May 25, 2025
942779d
Update README with lint, coverage, and ML test score
github-actions[bot] May 25, 2025
1ba2305
Issues with codecov badge
cactusutcac May 25, 2025
bdb0edc
Update README with lint, coverage, and ML test score
github-actions[bot] May 25, 2025
347c039
Issues with badge
cactusutcac May 25, 2025
fab1065
Update README with lint, coverage, and ML test score
github-actions[bot] May 25, 2025
1205565
Try to use the default branch
cactusutcac May 25, 2025
366d982
Update README with lint, coverage, and ML test score
github-actions[bot] May 25, 2025
eb1e671
Configure workflow
cactusutcac May 25, 2025
1e36aa1
Merge branch 'feat/a4-ml-testing' of https://github.com/remla25-team2…
cactusutcac May 25, 2025
88a2512
Issues with README update
cactusutcac May 25, 2025
65af541
Fix workflow
cactusutcac May 25, 2025
ff6ed3d
Configure workflow
cactusutcac May 25, 2025
68ba8ce
Issues with README update
cactusutcac May 25, 2025
3e373f2
Restore previous workflow setup to fix badge update logic
cactusutcac May 25, 2025
69a9468
Update README with lint, coverage, and ML test score
github-actions[bot] May 25, 2025
cc526a9
Add instructions on generating coverage report
cactusutcac May 25, 2025
3966142
Update README with lint, coverage, and ML test score
github-actions[bot] May 25, 2025
55 changes: 51 additions & 4 deletions .github/workflows/code_quality.yml
@@ -19,6 +19,7 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - id: pylint
        run: |
          pylint_output=$(PYTHONPATH=. pylint src/ --fail-under=8)
@@ -30,29 +31,75 @@
            score="unknown"
          fi
          echo "pylint_score=$score" >> $GITHUB_OUTPUT

      - name: Update Pylint Badge in README
        run: |
          score=${{ steps.pylint.outputs.pylint_score }}
          badge="![Pylint Score](https://img.shields.io/badge/pylint-${score//./%2E}%2F10-brightgreen)"
          sed -i "/<!-- PYLINT_BADGE_START -->/,/<!-- PYLINT_BADGE_END -->/c\\<!-- PYLINT_BADGE_START -->\n$badge\n<!-- PYLINT_BADGE_END -->" README.md

      - name: Run flake8
        run: flake8 src/

      - name: Run bandit
        run: bandit -r src/
        continue-on-error: true

      - name: Install DVC
        run: pip install dvc[gdrive]

      - name: Set up GDrive credentials for DVC
        run: |
          echo "${{ secrets.GDRIVE_JSON_BASE64 }}" | base64 --decode > gdrive-creds.json
          dvc remote modify storage --local gdrive_use_service_account true
          dvc remote modify storage --local gdrive_service_account_json_file_path gdrive-creds.json

      - name: Pull data and models from DVC
        run: dvc pull
Comment on lines +48 to +58
Nice fix! I forgot to add this back after changing the authentication method to service account


      - name: Run tests and collect coverage
        run: |
          coverage run -m pytest
          coverage xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          files: coverage.xml
          token: ${{ secrets.CODECOV_TOKEN }}

      - name: Update Coverage Badge in README
        run: |
          coverage_badge="![Coverage](https://codecov.io/github/remla25-team21/model-training/branch/feat%2Fa4-ml-testing/graph/badge.svg?token=L9ICV9K86O)"
          sed -i "/<!-- COVERAGE_BADGE_START -->/,/<!-- COVERAGE_BADGE_END -->/c\\<!-- COVERAGE_BADGE_START -->\n$coverage_badge\n<!-- COVERAGE_BADGE_END -->" README.md

      - name: Calculate ML Test Score
        run: python ml_test_score.py

      - name: Update ML Test Score Table in README
        run: |
          awk '/<!-- ML_TEST_SCORE_START -->/{print;flag=1;next}/<!-- ML_TEST_SCORE_END -->/{flag=0;print;next}!flag' README.md > tmp_README.md
          cat ml_test_score.md >> tmp_README.md
          mv tmp_README.md README.md

      - name: Update ML Test Score Badge
        run: |
          if [ -f ml_test_score_badge.txt ]; then
            badge_url=$(cat ml_test_score_badge.txt)
            badge_md="![ML Test Score]($badge_url)"
            sed -i "/<!-- ML_SCORE_BADGE_START -->/,/<!-- ML_SCORE_BADGE_END -->/c\\<!-- ML_SCORE_BADGE_START -->\n$badge_md\n<!-- ML_SCORE_BADGE_END -->" README.md
          fi

      - name: Commit README update
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          if ! git diff --quiet; then
            git add README.md
            git commit -m "Update README with lint, coverage, and ML test score"
            git push
          else
            echo "No changes to commit."
          fi
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
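
Every README-editing step above uses the same marker-delimited replace: a pair of HTML comments fences each badge, and sed's `c\` (change) command rewrites the fenced block wholesale, which keeps repeated workflow runs idempotent. A minimal standalone sketch of the pattern, with an illustrative marker pair and badge value rather than this repository's real ones (GNU sed assumed, as on the Ubuntu runners):

```bash
# Rewrite everything between a START/END marker pair (inclusive) with a
# fresh badge line. Marker names and badge value here are illustrative.
badge="![Demo Score](https://img.shields.io/badge/demo-9%2E50%2F10-brightgreen)"
sed -i "/<!-- DEMO_BADGE_START -->/,/<!-- DEMO_BADGE_END -->/c\\<!-- DEMO_BADGE_START -->\n$badge\n<!-- DEMO_BADGE_END -->" README.md
```

The score-table step is the one variation: its awk program keeps both markers but drops the block body, and the regenerated `ml_test_score.md` is then appended to the end of the file.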
3 changes: 3 additions & 0 deletions .gitignore
@@ -30,3 +30,6 @@ __pycache__
*.tar.gz
*.tar
*.tgz

.coverage
coverage.xml
72 changes: 57 additions & 15 deletions README.md
@@ -4,7 +4,15 @@
![Pylint Score](https://img.shields.io/badge/pylint-10%2E00%2F10-brightgreen)
<!-- PYLINT_BADGE_END -->

<!-- COVERAGE_BADGE_START -->
![Coverage](https://codecov.io/github/remla25-team21/model-training/branch/feat%2Fa4-ml-testing/graph/badge.svg?token=L9ICV9K86O)
<!-- COVERAGE_BADGE_END -->

<!-- ML_SCORE_BADGE_START -->
![ML Test Score](https://img.shields.io/badge/ML%20Test%20Score-12%2F12-brightgreen)
<!-- ML_SCORE_BADGE_END -->

This repository contains the training pipeline for the sentiment analysis model used in our REMLA project.

- It uses the [lib-ml](https://github.com/remla25-team21/lib-ml) library for data preprocessing and saves the trained model (`sentiment_model_*.pkl`) as a release artifact.
- The training dataset can be found in `data/raw/a1_RestaurantReviews_HistoricDump.tsv`.
@@ -29,7 +37,7 @@ This repository contains the training pipeline for the sentiment analysis model
>
> ```bash
> dvc remote modify storage --local gdrive_use_service_account true
> dvc remote modify storage --local gdrive_service_account_json_file_path <path/to/file.json> # Replace with your Google Drive service account JSON file path
> ```
>
> 4. Pull the data from remote storage or download it directly (see [Troubleshooting](#troubleshooting) section if facing issues)
@@ -43,11 +51,20 @@ This repository contains the training pipeline for the sentiment analysis model
> ```bash
> dvc repro
> ```
>
> 6. Run the tests
>
> ```bash
> pytest
> ```
>
> 7. Generate the coverage report
>
> ```bash
> coverage run -m pytest
> coverage report # Prints summary in terminal
> coverage xml # Generates coverage.xml file in the root directory
> ```

## Dependencies

@@ -111,16 +128,19 @@ For more details on collaborating with DVC, refer to [./docs/dvc-ref.md](./docs/
If you encounter "This app is blocked" error during Google authentication when using DVC with Google Drive, you can download the dataset directly using one of these methods:

#### Linux/macOS

```bash
wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1mrWUgJlRCf_n_TbxPuuthJ9YsTBwGuRh' -O ./data/raw/a1_RestaurantReviews_HistoricDump.tsv
```

#### Windows (PowerShell)

```powershell
Invoke-WebRequest -Uri "https://drive.google.com/uc?export=download&id=1mrWUgJlRCf_n_TbxPuuthJ9YsTBwGuRh" -OutFile "./data/raw/a1_RestaurantReviews_HistoricDump.tsv"
```

After downloading the dataset directly, you can proceed with the pipeline by running:

```bash
dvc repro
```
@@ -144,39 +164,61 @@ python src/evaluate.py

The pipeline produces the following artifacts:

* `preprocessed_data_*.pkl`: Preprocessed data (features and labels)
* `c1_BoW_Sentiment_Model_*.pkl`: Text vectorizer model
* `trained_model_*.pkl`: Trained ML model before evaluation
* `sentiment_model_*.pkl`: Final ML model after evaluation
* `metrics_*.json`: Model performance metrics

# Linters

Linters help improve code quality by identifying errors, enforcing style rules, and spotting security issues without running the code.

## Linters Used

* **Pylint**: Checks for coding errors and enforces standards.
* **Flake8**: Checks code style and complexity.
* **Bandit**: Scans for security vulnerabilities in Python code.

## How to Run

To run all linters and generate reports:

### For Mac/Linux

```bash
bash lint.sh
```

### For Windows

Use Git Bash as your terminal:

```bash
chmod +x lint.sh
./lint.sh
```

## ML Test Score

<!-- ML_TEST_SCORE_START -->
| Category | Test Count | Automated? |
|-----------------------|------------|------------|
| Feature & Data | ✅ 5 | ✅ |
| Model Development | ✅ 5 | ✅ |
| ML Infrastructure | ✅ 2 | ✅ |
| Monitoring | ✅ 2 | ✅ |
| Mutamorphic Testing | ✅ 3 | ✅ |
| Preprocessing Module | ✅ 2 | ✅ |
| Training Module | ✅ 5 | ✅ |
| Evaluation Module | ✅ 4 | ✅ |

**Final Score:** 12/12
<!-- ML_TEST_SCORE_END -->
67 changes: 67 additions & 0 deletions ml_test_score.py
@@ -0,0 +1,67 @@
import os
import re

TEST_DIR = "tests"

official_categories = {
    "Feature & Data": "test_01_data_integrity.py",
    "Model Development": "test_02_model_development.py",
    "ML Infrastructure": "test_03_ml_infrastructure.py",
    "Monitoring": "test_04_monitoring.py",
    "Mutamorphic Testing": "test_05_mutamorphic.py",
}

extra_modules = {
    "Preprocessing Module": "test_preprocess.py",
    "Training Module": "test_train.py",
    "Evaluation Module": "test_evaluate.py",
}

def count_tests(file_path):
    if not os.path.exists(file_path):
        return 0
    with open(file_path, "r", encoding="utf-8") as f:
        return len(re.findall(r"def test_", f.read()))

def generate_table(category_map, count_towards_score=True):
    lines = []
    score = 0
    for category, filename in category_map.items():
        path = os.path.join(TEST_DIR, filename)
        test_count = count_tests(path)
        if test_count > 0:
            lines.append(f"| {category:<22} | ✅ {test_count:<8} | ✅ |")
            if count_towards_score:
                score += 2
        else:
            lines.append(f"| {category:<22} | ❌ 0 | ❌ |")
    return lines, score

def main():
    all_lines = []
    all_lines.append("<!-- ML_TEST_SCORE_START -->")
    all_lines.append("| Category | Test Count | Automated? |")
    all_lines.append("|-----------------------|------------|------------|")

    # Official categories
    official_lines, official_score = generate_table(official_categories)

    # Extra module tests
    extra_lines, extra_score = generate_table(extra_modules, count_towards_score=True)

    all_lines.extend(official_lines)
    all_lines.extend(extra_lines)
    all_lines.append(f"\n**Final Score:** {min(official_score + extra_score, 12)}/12")
    all_lines.append("<!-- ML_TEST_SCORE_END -->")

    with open("ml_test_score.md", "w", encoding="utf-8") as f:
        f.write("\n".join(all_lines))

    total_score = min(official_score + extra_score, 12)
    badge_color = "brightgreen" if total_score >= 10 else "yellow" if total_score >= 6 else "red"
    badge_url = f"https://img.shields.io/badge/ML%20Test%20Score-{total_score}%2F12-{badge_color}"
    with open("ml_test_score_badge.txt", "w", encoding="utf-8") as f:
        f.write(badge_url)

if __name__ == "__main__":
    main()
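
A quick way to sanity-check this script locally, assuming it is run from the repository root so that `tests/` resolves (hypothetical shell session):

```bash
# Regenerates both output files from whatever test files are present.
python ml_test_score.py
cat ml_test_score.md          # the score table spliced into README.md
cat ml_test_score_badge.txt   # the shields.io URL used by the badge step
```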
3 changes: 2 additions & 1 deletion pytest.ini
@@ -1,3 +1,4 @@
# telling pytest to add src to PYTHONPATH automatically
[pytest]
pythonpath = src
testpaths = tests
6 changes: 5 additions & 1 deletion requirements.txt
@@ -8,4 +8,8 @@ pyyaml
pylint
flake8
bandit
astroid
pytest
coverage
pytest-cov
codecov
38 changes: 38 additions & 0 deletions tests/test_01_data_integrity.py
@@ -0,0 +1,38 @@
import os
import pytest
import pandas as pd

RAW_DATA_PATH = "data/raw/a1_RestaurantReviews_HistoricDump.tsv"

@pytest.fixture(scope="module")
def raw_data():
    assert os.path.exists(RAW_DATA_PATH), f"Data file not found at {RAW_DATA_PATH}"
    df = pd.read_csv(RAW_DATA_PATH, sep='\t')
    df.columns = df.columns.str.strip()
    return df

def test_column_schema(raw_data):
    """Check that expected columns exist"""
    expected = {'Review', 'Liked'}
    actual = set(raw_data.columns)
    missing = expected - actual
    assert not missing, f"Missing expected columns: {missing}"

def test_no_missing_values(raw_data):
    """Ensure no nulls in important columns"""
    for col in ['Review', 'Liked']:
        assert raw_data[col].isnull().sum() == 0, f"Missing values found in {col}"

def test_liked_label_values(raw_data):
    """Ensure 'Liked' is binary (0 or 1)"""
    assert raw_data['Liked'].isin([0, 1]).all(), "'Liked' column contains non-binary values"

def test_review_length(raw_data):
    """Check that Review has sufficient length"""
    assert raw_data['Review'].str.len().gt(10).all(), "Some reviews are too short"

def test_exact_duplicate_rows(raw_data):
    """Check for fully duplicated rows with same Review and Liked"""
    duplicates = raw_data.duplicated().sum()
    assert duplicates <= 10, f"Unusual number of exact duplicate rows: {duplicates}"
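
With the `pytest.ini` change above adding `src` to `PYTHONPATH` and pointing `testpaths` at `tests/`, this module can also be run in isolation during development; a plausible invocation from the repository root:

```bash
# Run only the data-integrity tests, with verbose per-test output.
pytest tests/test_01_data_integrity.py -v
```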