diff --git a/.github/workflows/code_quality.yml b/.github/workflows/code_quality.yml
index aef9a2d..9bae90d 100644
--- a/.github/workflows/code_quality.yml
+++ b/.github/workflows/code_quality.yml
@@ -19,6 +19,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements.txt
+
     - id: pylint
       run: |
         pylint_output=$(PYTHONPATH=. pylint src/ --fail-under=8)
@@ -30,11 +31,13 @@ jobs:
          score="unknown"
         fi
         echo "pylint_score=$score" >> $GITHUB_OUTPUT
 
-    - name: Update README badge
+
+    - name: Update Pylint Badge in README
       run: |
         score=${{ steps.pylint.outputs.pylint_score }}
         badge="![Pylint Score](https://img.shields.io/badge/pylint-${score//./%2E}%2F10-brightgreen)"
         sed -i "//,//c\\\n$badge\n" README.md
+
     - name: Run flake8
       run: flake8 src/
@@ -42,17 +45,61 @@ jobs:
       run: bandit -r src/
       continue-on-error: true
 
+    - name: Install DVC
+      run: pip install dvc[gdrive]
+
+    - name: Set up GDrive credentials for DVC
+      run: |
+        echo "${{ secrets.GDRIVE_JSON_BASE64 }}" | base64 --decode > gdrive-creds.json
+        dvc remote modify storage --local gdrive_use_service_account true
+        dvc remote modify storage --local gdrive_service_account_json_file_path gdrive-creds.json
+
+    - name: Pull data and models from DVC
+      run: dvc pull
+
+    - name: Run tests and collect coverage
+      run: |
+        coverage run -m pytest
+        coverage xml
+
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v3
+      with:
+        files: coverage.xml
+        token: ${{ secrets.CODECOV_TOKEN }}
+
+    - name: Update Coverage Badge in README
+      run: |
+        coverage_badge="![Coverage](https://codecov.io/github/remla25-team21/model-training/branch/feat%2Fa4-ml-testing/graph/badge.svg?token=L9ICV9K86O)"
+        sed -i "//,//c\\\n$coverage_badge\n" README.md
+
+    - name: Calculate ML Test Score
+      run: python ml_test_score.py
+
+    - name: Update ML Test Score Table in README
+      run: |
+        awk '//{print;flag=1;next}//{flag=0;print;next}!flag' README.md > tmp_README.md
+        cat ml_test_score.md >> tmp_README.md
+        mv tmp_README.md README.md
+
+    - name: Update ML Test Score Badge
+      run: |
+        if [ -f ml_test_score_badge.txt ]; then
+          badge_url=$(cat ml_test_score_badge.txt)
+          badge_md="![ML Test Score]($badge_url)"
+          sed -i "//,//c\\\n$badge_md\n" README.md
+        fi
+
     - name: Commit README update
       run: |
         git config user.name "github-actions[bot]"
         git config user.email "github-actions[bot]@users.noreply.github.com"
-
         if ! git diff --quiet; then
           git add README.md
-          git commit -m "Update pylint score badge to ${{ steps.pylint.outputs.pylint_score }}"
+          git commit -m "Update README with lint, coverage, and ML test score"
           git push
         else
           echo "No changes to commit."
         fi
       env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 69a7565..d83c084 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,6 @@ __pycache__
 *.tar.gz
 *.tar
 *.tgz
+
+.coverage
+coverage.xml
\ No newline at end of file
diff --git a/README.md b/README.md
index 8358a84..126a4e3 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,15 @@
 ![Pylint Score](https://img.shields.io/badge/pylint-10%2E00%2F10-brightgreen)
 
-This repository contains the training pipeline for the sentiment analysis model used in our REMLA project.
+
+![Coverage](https://codecov.io/github/remla25-team21/model-training/branch/feat%2Fa4-ml-testing/graph/badge.svg?token=L9ICV9K86O)
+
+
+
+![ML Test Score](https://img.shields.io/badge/ML%20Test%20Score-12%2F12-brightgreen)
+
+
+This repository contains the training pipeline for the sentiment analysis model used in our REMLA project.
 
 - It uses the [lib-ml](https://github.com/remla25-team21/lib-ml) library for data preprocessing and saves the trained model (`sentiment_model_*.pkl`) as a release artifact.
 - The training dataset can be found in `data/raw/a1_RestaurantReviews_HistoricDump.tsv`.
@@ -29,7 +37,7 @@ This repository contains the training pipeline for the sentiment analysis model
 >
 > ```bash
 > dvc remote modify storage --local gdrive_use_service_account true
-> dvc remote modify storage --local gdrive_service_account_json_file_path # Replace with your Google Drive service account JSON file path
+> dvc remote modify storage --local gdrive_service_account_json_file_path # Replace with your Google Drive service account JSON file path
 > ```
 >
 > 4. Pull the data from remote storage or download it directly (see [Troubleshooting](#troubleshooting) section if facing issues)
@@ -43,11 +51,20 @@ This repository contains the training pipeline for the sentiment analysis model
 > ```bash
 > dvc repro
 > ```
+>
 > 6. Run the test
 >
 > ```bash
 > pytest
 > ```
+>
+> 7. Generate the coverage report
+>
+> ```bash
+> coverage run -m pytest
+> coverage report # Prints summary in terminal
+> coverage xml # Generates coverage.xml file in the root directory
+> ```
 
 ## Dependencies
 
@@ -111,16 +128,19 @@ For more details on collaborating with DVC, refer to [./docs/dvc-ref.md](./docs/
 If you encounter "This app is blocked" error during Google authentication when using DVC with Google Drive, you can download the dataset directly using one of these methods:
 
 #### Linux/macOS
+
 ```bash
 wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1mrWUgJlRCf_n_TbxPuuthJ9YsTBwGuRh' -O ./data/raw/a1_RestaurantReviews_HistoricDump.tsv
 ```
 
 #### Windows (PowerShell)
+
 ```powershell
 Invoke-WebRequest -Uri "https://drive.google.com/uc?export=download&id=1mrWUgJlRCf_n_TbxPuuthJ9YsTBwGuRh" -OutFile "./data/raw/a1_RestaurantReviews_HistoricDump.tsv"
 ```
 
 After downloading the dataset directly, you can proceed with the pipeline by running:
+
 ```bash
 dvc repro
 ```
@@ -144,39 +164,61 @@ python src/evaluate.py
 
 The pipeline produces the following artifacts:
 
-- `preprocessed_data_*.pkl`: Preprocessed data (features and labels)
-- `c1_BoW_Sentiment_Model_*.pkl`: Text vectorizer model
-- `trained_model_*.pkl`: Trained ML model before evaluation
-- `sentiment_model_*.pkl`: Final ML model after evaluation
-- `metrics_*.json`: Model performance metrics
+* `preprocessed_data_*.pkl`: Preprocessed data (features and labels)
+* `c1_BoW_Sentiment_Model_*.pkl`: Text vectorizer model
+* `trained_model_*.pkl`: Trained ML model before evaluation
+* `sentiment_model_*.pkl`: Final ML model after evaluation
+* `metrics_*.json`: Model performance metrics
 
-# 🧹 Linters
+# Linters
 
 Linters help improve code quality by identifying errors, enforcing style rules, and spotting security issues without running the code.
 
 ## Linters Used
 
-- **Pylint**: Checks for coding errors and enforces standards.
-- **Flake8**: Checks code style and complexity.
-- **Bandit**: Scans for security vulnerabilities in Python code.
+* **Pylint**: Checks for coding errors and enforces standards.
+* **Flake8**: Checks code style and complexity.
+* **Bandit**: Scans for security vulnerabilities in Python code.
 
 ## How to Run
 
 To run all linters and generate reports:
 
 ### For Mac/Linux
+
 ```bash
 bash lint.sh
 ```
 
 ### For Windows
 
-Use Git Bash as your terminal
-
+Use Git Bash as your terminal:
 ```bash
 1. chmod +x lint.sh
 ```
+
 ```bash
-2 ./lint.sh
-```
\ No newline at end of file
+2. ./lint.sh
+```
+
+## ML Test Score
+
+
+
+
+
+
+| Category | Test Count | Automated? |
+|-----------------------|------------|------------|
+| Feature & Data         | ✅ 5        | ✅ |
+| Model Development      | ✅ 5        | ✅ |
+| ML Infrastructure      | ✅ 2        | ✅ |
+| Monitoring             | ✅ 2        | ✅ |
+| Mutamorphic Testing    | ✅ 3        | ✅ |
+| Preprocessing Module   | ✅ 2        | ✅ |
+| Training Module        | ✅ 5        | ✅ |
+| Evaluation Module      | ✅ 4        | ✅ |
+
+**Final Score:** 12/12
+
\ No newline at end of file
diff --git a/ml_test_score.py b/ml_test_score.py
new file mode 100644
index 0000000..7ae688a
--- /dev/null
+++ b/ml_test_score.py
@@ -0,0 +1,67 @@
+import os
+import re
+
+TEST_DIR = "tests"
+
+official_categories = {
+    "Feature & Data": "test_01_data_integrity.py",
+    "Model Development": "test_02_model_development.py",
+    "ML Infrastructure": "test_03_ml_infrastructure.py",
+    "Monitoring": "test_04_monitoring.py",
+    "Mutamorphic Testing": "test_05_mutamorphic.py",
+}
+
+extra_modules = {
+    "Preprocessing Module": "test_preprocess.py",
+    "Training Module": "test_train.py",
+    "Evaluation Module": "test_evaluate.py",
+}
+
+def count_tests(file_path):
+    if not os.path.exists(file_path):
+        return 0
+    with open(file_path, "r", encoding="utf-8") as f:
+        return len(re.findall(r"def test_", f.read()))
+
+def generate_table(category_map, count_towards_score=True):
+    lines = []
+    score = 0
+    for category, filename in category_map.items():
+        path = os.path.join(TEST_DIR, filename)
+        test_count = count_tests(path)
+        if test_count > 0:
+            lines.append(f"| {category:<22} | ✅ {test_count:<8} | ✅ |")
+            if count_towards_score:
+                score += 2
+        else:
+            lines.append(f"| {category:<22} | ❌ 0 | ❌ |")
+    return lines, score
+
+def main():
+    all_lines = []
+    all_lines.append("")
+    all_lines.append("| Category | Test Count | Automated? |")
+    all_lines.append("|-----------------------|------------|------------|")
+
+    # Official categories
+    official_lines, official_score = generate_table(official_categories)
+
+    # Extra module tests
+    extra_lines, extra_score = generate_table(extra_modules, count_towards_score=True)
+
+    all_lines.extend(official_lines)
+    all_lines.extend(extra_lines)
+    all_lines.append(f"\n**Final Score:** {min(official_score + extra_score, 12)}/12")
+    all_lines.append("")
+
+    with open("ml_test_score.md", "w") as f:
+        f.write("\n".join(all_lines))
+
+    total_score = min(official_score + extra_score, 12)
+    badge_color = "brightgreen" if total_score >= 10 else "yellow" if total_score >= 6 else "red"
+    badge_url = f"https://img.shields.io/badge/ML%20Test%20Score-{total_score}%2F12-{badge_color}"
+    with open("ml_test_score_badge.txt", "w") as f:
+        f.write(badge_url)
+
+if __name__ == "__main__":
+    main()
diff --git a/pytest.ini b/pytest.ini
index 6bac73c..fa56afd 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,4 @@
 # telling pytest to add src to PYTHONPATH automatically
 [pytest]
-pythonpath = src
\ No newline at end of file
+pythonpath = src
+testpaths = tests
diff --git a/requirements.txt b/requirements.txt
index a8d84f0..3c57a65 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,8 @@ pyyaml
 pylint
 flake8
 bandit
-astroid
\ No newline at end of file
+astroid
+pytest
+coverage
+pytest-cov
+codecov
\ No newline at end of file
diff --git a/tests/test_01_data_integrity.py b/tests/test_01_data_integrity.py
new file mode 100644
index 0000000..e87e24f
--- /dev/null
+++ b/tests/test_01_data_integrity.py
@@ -0,0 +1,38 @@
+import os
+import pytest
+import joblib
+import pandas as pd
+
+RAW_DATA_PATH = "data/raw/a1_RestaurantReviews_HistoricDump.tsv"
+
+@pytest.fixture(scope="module")
+def raw_data():
+    assert os.path.exists(RAW_DATA_PATH), f"Data file not found at {RAW_DATA_PATH}"
+    df = pd.read_csv(RAW_DATA_PATH, sep='\t')
+    df.columns = df.columns.str.strip()
+    return df
+
+def test_column_schema(raw_data):
+    """Check that expected columns exist"""
+    expected = {'Review', 'Liked'}
+    actual = set(raw_data.columns)
+    missing = expected - actual
+    assert not missing, f"Missing expected columns: {missing}"
+
+def test_no_missing_values(raw_data):
+    """Ensure no nulls in important columns"""
+    for col in ['Review', 'Liked']:
+        assert raw_data[col].isnull().sum() == 0, f"Missing values found in {col}"
+
+def test_liked_label_values(raw_data):
+    """Ensure 'Liked' is binary (0 or 1)"""
+    assert raw_data['Liked'].isin([0, 1]).all(), "'Liked' column contains non-binary values"
+
+def test_review_length(raw_data):
+    """Check that Review has sufficient length"""
+    assert raw_data['Review'].str.len().gt(10).all(), "Some reviews are too short"
+
+def test_exact_duplicate_rows(raw_data):
+    """Check for fully duplicated rows with same Review and Liked"""
+    duplicates = raw_data.duplicated().sum()
+    assert duplicates <= 10, f"Unusual number of exact duplicate rows: {duplicates}"
diff --git a/tests/test_02_model_development.py b/tests/test_02_model_development.py
new file mode 100644
index 0000000..d236401
--- /dev/null
+++ b/tests/test_02_model_development.py
@@ -0,0 +1,109 @@
+import pytest
+import pickle
+import joblib
+import json
+import time
+import os
+import tracemalloc
+from preprocess import preprocess_data
+from train import train_model
+from evaluate import evaluate_model
+from sklearn.metrics import accuracy_score
+from sklearn.dummy import DummyClassifier
+
+DATA_PATH = "data/raw/a1_RestaurantReviews_HistoricDump.tsv"
+MODEL_PATH = "../artifacts/trained_model.pkl"
+MODEL_PATH = os.path.abspath("artifacts/trained_model.pkl")
+VECTORIZER_PATH = "artifacts/c1_BoW_Sentiment_Model.pkl"
+
+@pytest.fixture(scope="module")
+def preprocessed():
+    return preprocess_data(DATA_PATH)
+
+@pytest.fixture(scope="module")
+def train_test_data(preprocessed):
+    with open(preprocessed, "rb") as f:
+        data = pickle.load(f)
+    return data["X_train"], data["X_test"], data["y_train"], data["y_test"]
+
+def test_nondeterminism_robustness(preprocessed, train_test_data):
+    accs = []
+    for seed in [1, 42, 123]:
+        model_path = train_model(preprocessed, random_state=seed)
+        metrics_path = evaluate_model(model_path, preprocessed)
+
+        with open(metrics_path, "r") as f:
+            metrics = json.load(f)
+        acc = metrics.get("accuracy")
+        assert acc is not None, "Accuracy not found in metrics.json"
+        accs.append(acc)
+
+    variability = max(accs) - min(accs)
+    assert variability <= 0.05, f"Accuracy variance too high: {accs}"
+
+def test_data_slice_performance(preprocessed, train_test_data):
+    _, X_test, _, y_test = train_test_data
+    model_path = train_model(preprocessed, random_state=0)
+    model = joblib.load(model_path)
+
+    short_idx = [i for i, x in enumerate(X_test) if x.sum() <= 5]
+    long_idx = [i for i, x in enumerate(X_test) if x.sum() >= 15]
+
+    if not short_idx or not long_idx:
+        pytest.skip("Insufficient short/long samples for slice test")
+
+    short_X = X_test[short_idx]
+    short_y = [y_test[i] for i in short_idx]
+    long_X = X_test[long_idx]
+    long_y = [y_test[i] for i in long_idx]
+
+    short_preds = model.predict(short_X)
+    long_preds = model.predict(long_X)
+
+    acc_short = accuracy_score(short_y, short_preds)
+    acc_long = accuracy_score(long_y, long_preds)
+
+    diff = abs(acc_short - acc_long)
+    assert diff <= 0.25, f"Accuracy gap on slices too large: short={acc_short:.2f}, long={acc_long:.2f}"
+
+def test_baseline_comparison(train_test_data, preprocessed):
+    X_train, X_test, y_train, y_test = train_test_data
+
+    dummy = DummyClassifier(strategy="most_frequent", random_state=0)
+    dummy.fit(X_train, y_train)
+    baseline_preds = dummy.predict(X_test)
+    baseline_acc = accuracy_score(y_test, baseline_preds)
+
+    model_path = train_model(preprocessed, random_state=0)
+    model = joblib.load(model_path)
+    model_preds = model.predict(X_test)
+    model_acc = accuracy_score(y_test, model_preds)
+
+    assert model_acc > baseline_acc, (
+        f"Trained model does not outperform baseline: model={model_acc:.2f}, baseline={baseline_acc:.2f}"
+    )
+
+def test_prediction_latency():
+    model = joblib.load(MODEL_PATH)
+    vectorizer = joblib.load(VECTORIZER_PATH)
+    texts = ["The food was absolutely amazing!"]
+
+    start = time.time()
+    X = vectorizer.transform(texts)
+    _ = model.predict(X)
+    elapsed = time.time() - start
+
+    assert elapsed < 0.5, f"Prediction took too long: {elapsed:.3f}s"
+
+def test_prediction_memory():
+    model = joblib.load(MODEL_PATH)
+    vectorizer = joblib.load(VECTORIZER_PATH)
+    texts = ["The food was absolutely amazing!"]
+
+    tracemalloc.start()
+    X = vectorizer.transform(texts)
+    _ = model.predict(X)
+    _, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+
+    assert peak < 50 * 1024 * 1024, f"Peak memory usage too high: {peak / 1024**2:.2f} MB"
diff --git a/tests/test_03_ml_infrastructure.py b/tests/test_03_ml_infrastructure.py
new file mode 100644
index 0000000..5e2bbe5
--- /dev/null
+++ b/tests/test_03_ml_infrastructure.py
@@ -0,0 +1,47 @@
+import os
+import pytest
+import pickle
+import joblib
+import json
+from preprocess import preprocess_data
+from train import train_model
+from evaluate import evaluate_model
+from sklearn.metrics import accuracy_score
+
+DATA_PATH = "data/raw/a1_RestaurantReviews_HistoricDump.tsv"
+
+@pytest.fixture(scope="module")
+def preprocessed():
+    return preprocess_data(DATA_PATH)
+
+@pytest.fixture(scope="module")
+def train_test_data(preprocessed):
+    with open(preprocessed, "rb") as f:
+        data = pickle.load(f)
+    return data["X_train"], data["X_test"], data["y_train"], data["y_test"]
+
+def test_integration_pipeline(preprocessed):
+    """Run the full training and evaluation pipeline"""
+    model_path = train_model(preprocessed, random_state=0)
+    assert os.path.exists(model_path), "Trained model file not created"
+
+    metrics_path = evaluate_model(model_path, preprocessed)
+    assert os.path.exists(metrics_path), "Metrics file not created"
+
+    with open(metrics_path, "r") as f:
+        metrics = json.load(f)
+
+    assert "accuracy" in metrics, "Accuracy not found in metrics file"
+    assert 0.7 <= metrics["accuracy"] <= 1.0, f"Unrealistic accuracy: {metrics['accuracy']}"
+
+def test_model_rollback(train_test_data, preprocessed):
+    """Test loading a saved model and re-evaluating"""
+    X_train, X_test, y_train, y_test = train_test_data
+
+    model_path = train_model(preprocessed, random_state=0)
+    model = joblib.load(model_path)
+
+    preds = model.predict(X_test)
+    acc = accuracy_score(y_test, preds)
+
+    assert 0.7 <= acc <= 1.0, f"Reloaded model accuracy out of range: {acc:.2f}"
diff --git a/tests/test_04_monitoring.py b/tests/test_04_monitoring.py
new file mode 100644
index 0000000..2192fa9
--- /dev/null
+++ b/tests/test_04_monitoring.py
@@ -0,0 +1,50 @@
+import pytest
+import pickle
+from preprocess import preprocess_data
+from train import train_model
+import numpy as np
+from scipy.stats import ks_2samp
+
+DATA_PATH = "data/raw/a1_RestaurantReviews_HistoricDump.tsv"
+
+@pytest.fixture(scope="module")
+def preprocessed():
+    return preprocess_data(DATA_PATH)
+
+@pytest.fixture(scope="module")
+def train_test_data(preprocessed):
+    with open(preprocessed, "rb") as f:
+        data = pickle.load(f)
+    return data["X_train"], data["X_test"], data["y_train"], data["y_test"]
+
+def test_feature_distribution_drift(train_test_data):
+    """Compare feature distributions in train and test via Kolmogorov–Smirnov test"""
+    X_train, X_test, _, _ = train_test_data
+
+    drift_scores = []
+    for i in range(X_train.shape[1]):
+        train_feat = X_train[:, i].ravel()
+        test_feat = X_test[:, i].ravel()
+        stat, pval = ks_2samp(train_feat, test_feat)
+        drift_scores.append(pval)
+
+    # If many p-values are very low, feature drift exists
+    drift_detected = np.sum(np.array(drift_scores) < 0.01)
+    ratio = drift_detected / len(drift_scores)
+    assert ratio < 0.1, f"Feature drift detected in {ratio:.2%} of features"
+
+def test_prediction_distribution_stability(train_test_data, preprocessed):
+    """Check for dramatic changes in predicted label distribution"""
+    _, X_test, _, _ = train_test_data
+    model_path = train_model(preprocessed, random_state=0)
+
+    import joblib
+    model = joblib.load(model_path)
+    preds = model.predict(X_test)
+
+    # Count proportion of each predicted label
+    unique, counts = np.unique(preds, return_counts=True)
+    ratios = dict(zip(unique, counts / len(preds)))
+
+    for label, ratio in ratios.items():
+        assert 0.1 <= ratio <= 0.9, f"Prediction ratio for class {label} is unrealistic: {ratio:.2f}"
diff --git a/tests/test_05_mutamorphic.py b/tests/test_05_mutamorphic.py
new file mode 100644
index 0000000..ce703e7
--- /dev/null
+++ b/tests/test_05_mutamorphic.py
@@ -0,0 +1,93 @@
+import os
+import pickle
+import pytest
+import numpy as np
+
+@pytest.fixture(scope="module")
+def trained_sentiment_model():
+    model_path = "../artifacts/trained_model.pkl"
+    model_path = os.path.abspath("artifacts/trained_model.pkl")
+
+    if not os.path.exists(model_path):
+        pytest.fail(
+            f"ERROR: Model file not found at {model_path}. "
+            f"Ensure the DVC 'train' stage has been run."
+        )
+    with open(model_path, "rb") as f:
+        model = pickle.load(f)
+    return model
+
+@pytest.fixture(scope="module")
+def sentiment_vectorizer():
+    vectorizer_path = "artifacts/c1_BoW_Sentiment_Model.pkl"
+    if not os.path.exists(vectorizer_path):
+        pytest.fail(
+            f"ERROR: Vectorizer file not found at {vectorizer_path}. "
+            f"Ensure the DVC 'preprocess' stage has been run."
+        )
+    with open(vectorizer_path, "rb") as f:
+        vectorizer = pickle.load(f)
+    return vectorizer
+
+def replace_with_synonym(text, original, synonym):
+    return text.replace(original, synonym)
+
+@pytest.mark.parametrize("original_review, replacements", [
+    ("The food was great and service excellent.", [("great", "good"), ("excellent", "fine")]),
+    ("The food was terrible and the service awful.", [("terrible", "horrible"), ("awful", "dreadful")])
+])
+
+def test_mutamorphic_synonym_consistency(trained_sentiment_model, sentiment_vectorizer, original_review, replacements):
+    model = trained_sentiment_model
+    vectorizer = sentiment_vectorizer
+    original_vec = vectorizer.transform([original_review])
+    original_pred = model.predict(original_vec)[0]
+
+    for original, synonym in replacements:
+        mutated = replace_with_synonym(original_review, original, synonym)
+        mutated_vec = vectorizer.transform([mutated])
+        mutated_pred = model.predict(mutated_vec)[0]
+        assert mutated_pred == original_pred, (
+            f"Prediction inconsistency:\n"
+            f"Original: {original_review} -> {original_pred}\n"
+            f"Mutated: {mutated} -> {mutated_pred}"
+        )
+
+def test_mutamorphic_add_neutral_phrase(trained_sentiment_model, sentiment_vectorizer):
+    model = trained_sentiment_model
+    vectorizer = sentiment_vectorizer
+    review = "The experience was terrible."
+    neutralized = "To be honest, " + review
+
+    vec_orig = vectorizer.transform([review])
+    vec_neutral = vectorizer.transform([neutralized])
+    pred_orig = model.predict(vec_orig)[0]
+    pred_neutral = model.predict(vec_neutral)[0]
+
+    assert pred_orig == pred_neutral, (
+        f"Prediction changed after neutral phrase: '{pred_orig}' -> '{pred_neutral}'"
+    )
+
+def test_mutamorphic_repair_placeholder(trained_sentiment_model, sentiment_vectorizer):
+    """
+    Placeholder test to suggest the idea of automatic inconsistency repair.
+    Currently does not perform real repair, just simulates detection.
+    """
+    model = trained_sentiment_model
+    vectorizer = sentiment_vectorizer
+    sentence = "The dessert was delightful."
+    mutated = replace_with_synonym(sentence, "delightful", "amazing")
+
+    orig_vec = vectorizer.transform([sentence])
+    mutated_vec = vectorizer.transform([mutated])
+    pred_orig = model.predict(orig_vec)[0]
+    pred_mutated = model.predict(mutated_vec)[0]
+
+    if pred_orig != pred_mutated:
+        # placeholder "repair": fallback to original
+        repaired = sentence
+        repaired_vec = vectorizer.transform([repaired])
+        repaired_pred = model.predict(repaired_vec)[0]
+        assert repaired_pred == pred_orig, (
+            f"Repair step failed: original='{pred_orig}', mutated='{pred_mutated}', repaired='{repaired_pred}'"
+        )
diff --git a/tests/test_metamorphic.py b/tests/test_metamorphic.py
deleted file mode 100644
index a5bc5d2..0000000
--- a/tests/test_metamorphic.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import os
-import pickle
-import pytest
-import numpy as np
-
-
-@pytest.fixture(scope="module")
-def trained_sentiment_model():
-    model_path = "../artifacts/trained_model.pkl"
-    model_path = os.path.abspath("artifacts/trained_model.pkl")
-
-    if not os.path.exists(model_path):
-        pytest.fail(
-            f"ERROR: Model file not found at {model_path}. "
-            f"Ensure the DVC 'train' stage has been run."
-        )
-    with open(model_path, "rb") as f:
-        model = pickle.load(f)
-    return model
-
-
-@pytest.fixture(scope="module")
-def sentiment_vectorizer():
-    vectorizer_path = "artifacts/c1_BoW_Sentiment_Model.pkl"
-    if not os.path.exists(vectorizer_path):
-        pytest.fail(
-            f"ERROR: Vectorizer file not found at {vectorizer_path}. "
-            f"Ensure the DVC 'preprocess' stage has been run."
-        )
-    with open(vectorizer_path, "rb") as f:
-        vectorizer = pickle.load(f)
-    return vectorizer
-
-
-def replace_with_synonym(text, original_word, synonym):
-    return text.replace(original_word, synonym)
-
-
-# Metamorphic tests for sentiment analysis model
-def test_metamorphic_synonym_positive_review(
-    trained_sentiment_model, sentiment_vectorizer
-):
-    model = trained_sentiment_model
-
-    original_review = "The food was great and service excellent."
-    original_review_vectorized = sentiment_vectorizer.transform([original_review])
-    original_prediction = model.predict(original_review_vectorized)[0]
-
-    # context similar alternative 1
-    transformed_review_1_text = replace_with_synonym(original_review, "great", "good")
-    transformed_review_1_vectorized = sentiment_vectorizer.transform(
-        [transformed_review_1_text]
-    )
-    transformed_prediction_1 = model.predict(transformed_review_1_vectorized)[0]
-
-    assert (
-        transformed_prediction_1 == original_prediction
-    ), f"Sentiment changed from '{original_prediction}' to '{transformed_prediction_1}' after synonym replacement (great -> good)."
-
-    # context similar alternative 2
-    transformed_review_2_text = replace_with_synonym(
-        original_review, "excellent", "fine"
-    )
-    transformed_review_2_vectorized = sentiment_vectorizer.transform(
-        [transformed_review_2_text]
-    )
-    transformed_prediction_2 = model.predict(transformed_review_2_vectorized)[0]
-    assert (
-        transformed_prediction_2 == original_prediction
-    ), f"Sentiment changed from '{original_prediction}' to '{transformed_prediction_2}' after synonym replacement (excellent -> fine)."
-
-
-# Metamorphic tests for sentiment analysis model
-def test_metamorphic_synonym_negative_review(
-    trained_sentiment_model, sentiment_vectorizer
-):
-    model = trained_sentiment_model
-    original_review = "The food was terrible and the service awful."
-    original_review_vectorized = sentiment_vectorizer.transform([original_review])
-    original_prediction = model.predict(original_review_vectorized)[0]
-
-    # context similar alternative 1
-    transformed_review_1_text = replace_with_synonym(
-        original_review, "terrible", "horrible"
-    )
-    transformed_review_1_vectorized = sentiment_vectorizer.transform(
-        [transformed_review_1_text]
-    )
-    transformed_prediction_1 = model.predict(transformed_review_1_vectorized)[0]
-
-    assert (
-        transformed_prediction_1 == original_prediction
-    ), f"Sentiment changed from '{original_prediction}' to '{transformed_prediction_1}' after synonym replacement (terrible -> horrible)."
-
-    # context similar alternative 2
-    transformed_review_2_text = replace_with_synonym(
-        original_review, "awful", "dreadful"
-    )
-    transformed_review_2_vectorized = sentiment_vectorizer.transform(
-        [transformed_review_2_text]
-    )
-    transformed_prediction_2 = model.predict(transformed_review_2_vectorized)[0]
-
-    assert (
-        transformed_prediction_2 == original_prediction
-    ), f"Sentiment changed from '{original_prediction}' to '{transformed_prediction_2}' after synonym replacement (awful -> dreadful)."
-
-
-def test_metamorphic_add_neutral_phrase_negative_review(
-    trained_sentiment_model, sentiment_vectorizer
-):
-    model = trained_sentiment_model
-    original_review = "The experience was terrible."
-    original_review_vectorized = sentiment_vectorizer.transform([original_review])
-    original_prediction = model.predict(original_review_vectorized)[0]
-
-    transformed_review_text = "To be honest, " + original_review
-    transformed_review_vectorized = sentiment_vectorizer.transform(
-        [transformed_review_text]
-    )
-    transformed_prediction = model.predict(transformed_review_vectorized)[0]
-
-    # assert original_prediction == "Negative" # Base assumption
-    assert (
-        transformed_prediction == original_prediction
-    ), f"Sentiment changed from '{original_prediction}' to '{transformed_prediction}' after adding a neutral phrase."