Gopher-Industries · Tanvir-ctrl1 · Sep 18, 2025
diff --git a/AI Guardian/Emotional_baseline/emotional_baseline.py b/AI Guardian/Emotional_baseline/emotional_baseline.py
diff --git a/AI Guardian/Emotional_baseline/out_final/confusion_matrix.png b/AI Guardian/Emotional_baseline/out_final/confusion_matrix.png
diff --git a/AI Guardian/Emotional_baseline/out_final/feature_preview.txt b/AI Guardian/Emotional_baseline/out_final/feature_preview.txt
@@ -0,0 +1,85 @@
+Sample nonzero features for first test doc
+
+--- BoW (1-gram) ---
+patient: 3.0000
+oxygen: 2.0000
+20mg: 1.0000
+2l: 1.0000
+85: 1.0000
+92: 1.0000
+administered: 1.0000
+alert: 1.0000
+bpm: 1.0000
+breathing: 1.0000
+cannula: 1.0000
+currently: 1.0000
+drink: 1.0000
+encouraged: 1.0000
+furosemide: 1.0000
+heart: 1.0000
+nasal: 1.0000
+rate: 1.0000
+remains: 1.0000
+responsive: 1.0000
+reveal: 1.0000
+saturation: 1.0000
+signs: 1.0000
+supplemental: 1.0000
+tolerated: 1.0000
+Vocab size: 242
+
+--- TF-IDF (1–2) ---
+currently breathing: 0.2249
+responsive encouraged: 0.2249
+2l nasal: 0.2119
+85 bpm: 0.2119
+administered patient: 0.2119
+rate 85: 0.2119
+breathing: 0.2019
+saturation 92: 0.2019
+85: 0.1936
+92: 0.1807
+alert responsive: 0.1807
+responsive: 0.1807
+furosemide 20mg: 0.1706
+oxygen 2l: 0.1623
+20mg administered: 0.1554
+cannula: 0.1554
+drink water: 0.1554
+encouraged drink: 0.1554
+nasal: 0.1554
+nasal cannula: 0.1554
+supplemental: 0.1554
+supplemental oxygen: 0.1554
+2l: 0.1523
+drink: 0.1466
+encouraged: 0.1371
+Vocab size: 745
+
+--- TF-IDF (1–3) ---
+alert responsive encouraged: 0.1876
+currently breathing: 0.1876
+responsive encouraged: 0.1876
+2l nasal: 0.1767
+2l nasal cannula: 0.1767
+85 bpm: 0.1767
+administered patient: 0.1767
+heart rate 85: 0.1767
+oxygen 2l nasal: 0.1767
+patient alert responsive: 0.1767
+rate 85: 0.1767
+rate 85 bpm: 0.1767
+breathing: 0.1683
+oxygen saturation 92: 0.1683
+saturation 92: 0.1683
+85: 0.1615
+92: 0.1506
+alert responsive: 0.1506
+furosemide 20mg administered: 0.1506
+responsive: 0.1506
+furosemide 20mg: 0.1422
+oxygen 2l: 0.1354
+supplemental oxygen 2l: 0.1354
+20mg administered: 0.1296
+cannula: 0.1296
+Vocab size: 1331
diff --git a/AI Guardian/Emotional_baseline/out_final/lexicon_preview.csv b/AI Guardian/Emotional_baseline/out_final/lexicon_preview.csv
@@ -0,0 +1,6 @@
+pos_count,neg_count,pos_rate,neg_rate,anger_count,joy_count,sadness_count,fear_count,surprise_count,disgust_count,anger_rate,joy_rate,sadness_rate,fear_rate,surprise_rate,disgust_rate
+0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+1.0,0.0,0.02564102564102564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+1.0,1.0,0.025,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+1.0,1.0,0.025,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
diff --git a/AI Guardian/Emotional_baseline/out_final/settings.json b/AI Guardian/Emotional_baseline/out_final/settings.json
@@ -0,0 +1,23 @@
+{
+  "csv": "New AI spreadsheet - Sheet1.csv",
+  "text_col": "nursingNote",
+  "label_col": "state",
+  "group_col": null,
+  "test_size": 0.2,
+  "min_df": 2,
+  "n_splits": 5,
+  "with_lexicon": true,
+  "extra_label_words": "",
+  "stripped_tokens_effective": [
+    "comfort",
+    "comfortable",
+    "normal",
+    "normality",
+    "normally",
+    "sick",
+    "sickness",
+    "uncomfort",
+    "uncomfortable"
+  ],
+  "seed": 42
+}
diff --git a/AI Guardian/Emotional_baseline/out_final/sprint2_cv_results.csv b/AI Guardian/Emotional_baseline/out_final/sprint2_cv_results.csv
@@ -0,0 +1,21 @@
+Fold,Features,Model,Accuracy,F1_macro
+1,TFIDF(1–2) + lex,LogReg,0.975,0.9463203463203463
+1,TFIDF(1–2) + lex,LinearSVM,0.975,0.9463203463203463
+2,TFIDF(1–2) + lex,LogReg,0.975,0.9274853801169591
+2,TFIDF(1–2) + lex,LinearSVM,0.975,0.9274853801169591
+3,TFIDF(1–2) + lex,LogReg,0.975,0.9270440251572327
+3,TFIDF(1–2) + lex,LinearSVM,0.975,0.9270440251572327
+4,TFIDF(1–2) + lex,LogReg,0.95,0.9538461538461539
+4,TFIDF(1–2) + lex,LinearSVM,0.975,0.9775910364145659
+5,TFIDF(1–2) + lex,LogReg,1.0,1.0
+5,TFIDF(1–2) + lex,LinearSVM,1.0,1.0
+1,TFIDF(1–3) + lex,LogReg,0.975,0.9463203463203463
+1,TFIDF(1–3) + lex,LinearSVM,0.975,0.9463203463203463
+2,TFIDF(1–3) + lex,LogReg,0.975,0.9274853801169591
+2,TFIDF(1–3) + lex,LinearSVM,0.975,0.9274853801169591
+3,TFIDF(1–3) + lex,LogReg,0.975,0.9270440251572327
+3,TFIDF(1–3) + lex,LinearSVM,0.975,0.9270440251572327
+4,TFIDF(1–3) + lex,LogReg,0.95,0.9538461538461539
+4,TFIDF(1–3) + lex,LinearSVM,0.975,0.9775910364145659
+5,TFIDF(1–3) + lex,LogReg,1.0,1.0
+5,TFIDF(1–3) + lex,LinearSVM,1.0,1.0
diff --git a/AI Guardian/Emotional_baseline/out_final/sprint2_cv_summary.csv b/AI Guardian/Emotional_baseline/out_final/sprint2_cv_summary.csv
@@ -0,0 +1,5 @@
+Features,Model,Accuracy_mean,Accuracy_std,F1_mean,F1_std
+TFIDF(1–2) + lex,LinearSVM,0.9800000000000001,0.011180339887498959,0.9556881576018208,0.03219826993246655
+TFIDF(1–3) + lex,LinearSVM,0.9800000000000001,0.011180339887498959,0.9556881576018208,0.03219826993246655
+TFIDF(1–2) + lex,LogReg,0.975,0.017677669529663705,0.9509391810881385,0.029823679075897343
+TFIDF(1–3) + lex,LogReg,0.975,0.017677669529663705,0.9509391810881385,0.029823679075897343
diff --git a/AI Guardian/Emotional_baseline/out_final/sprint2_report.md b/AI Guardian/Emotional_baseline/out_final/sprint2_report.md
@@ -0,0 +1,64 @@
+# Sprint 2: Basic NLP Features & Baselines (Leakage-Safe)
+
+- **CSV:** New AI spreadsheet - Sheet1.csv
+
+- **Text column:** `nursingNote`   |   **Label column:** `state`
+
+- **Groups:** samples grouped by normalized note text (identical normalized notes share a group)
+
+- **Dedup:** exact duplicates removed after scrubbing
+
+- **Leakage guard:** stripped tokens ['comfort', 'comfortable', 'normal', 'normality', 'normally', 'sick', 'sickness', 'uncomfort', 'uncomfortable']
+
+
+## Feature extraction
+
+- BoW (1-gram), min_df=2
+
+- TF-IDF with n-gram ranges 1–2 (unigrams + bigrams) and 1–3 (unigrams + bigrams + trigrams); the longer n-grams capture phrases like *not happy*, *very tired*.
+
+- Lexicon counts and rates: pos/neg + six emotions (anger, joy, sadness, fear, surprise, disgust).
+
+
+## Baseline models
+- Logistic Regression (balanced)
+- Linear SVM (balanced)
+
+
+## Single-split results (sorted by Macro-F1)
+
+
+        Features     Model  Accuracy  F1_macro
+    BoW(1) + lex    LogReg     1.000  1.000000
+    BoW(1) + lex LinearSVM     1.000  1.000000
+TFIDF(1–2) + lex LinearSVM     0.950  0.906020
+TFIDF(1–3) + lex LinearSVM     0.950  0.906020
+TFIDF(1–3) + lex    LogReg     0.925  0.884162
+TFIDF(1–2) + lex    LogReg     0.900  0.822402
+
+
+
+**Best (single split):** BoW(1) + lex + LogReg  
+
+Accuracy: **1.000**   |   Macro-F1: **1.000**
+
+
+**Confusion matrix** saved to `confusion_matrix.png`.
+
+
+## 5-fold CV (GroupKFold; mean ± std)
+
+
+        Features     Model      Accuracy      F1_macro
+TFIDF(1–2) + lex LinearSVM 0.980 ± 0.011 0.956 ± 0.032
+TFIDF(1–3) + lex LinearSVM 0.980 ± 0.011 0.956 ± 0.032
+TFIDF(1–2) + lex    LogReg 0.975 ± 0.018 0.951 ± 0.030
+TFIDF(1–3) + lex    LogReg 0.975 ± 0.018 0.951 ± 0.030
+
+
+
+## Environment
+
+- python: `/opt/anaconda3/envs/sprint2nlp/bin/python`
+
+- numpy: 2.3.3 | pandas: 2.3.2 | sklearn: 1.7.2
diff --git a/AI Guardian/Emotional_baseline/out_final/sprint2_results.csv b/AI Guardian/Emotional_baseline/out_final/sprint2_results.csv
@@ -0,0 +1,7 @@
+Features,Model,Accuracy,F1_macro
+BoW(1) + lex,LogReg,1.0,1.0
+BoW(1) + lex,LinearSVM,1.0,1.0
+TFIDF(1–2) + lex,LinearSVM,0.95,0.906020066889632
+TFIDF(1–3) + lex,LinearSVM,0.95,0.906020066889632
+TFIDF(1–3) + lex,LogReg,0.925,0.8841623785020012
+TFIDF(1–2) + lex,LogReg,0.9,0.8224019167415394