Skip to content

Commit 56c3d1a

Browse files
authored
Merge pull request #98 from Dev-Jeff28/main
Solved Issue: Add Model Evaluation Dashboard (Streamlit) #92
2 parents e88c3e6 + 56e3697 commit 56c3d1a

File tree

5 files changed

+192
-0
lines changed

5 files changed

+192
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
## 🧠 Interactive Streamlit ML App

A small Streamlit web application that allows users to **upload a CSV file**, **choose a machine learning model**, and **view model performance metrics interactively**.

## 🚀 Features

- 📂 Upload any CSV dataset
- 🎯 Select your **target (label) column**
- ⚙️ Choose from built-in ML models (Logistic Regression, Random Forest, Baseline Dummy)
- 📊 View metrics such as Accuracy, Precision, Recall, F1-Score, and Confusion Matrix
- 🧩 Adjustable train-test split and random seed
- 🧰 Detects **classification** vs. **regression** targets (regression training is not yet implemented)

---

## 🧰 Installation & Requirements

    pip install -r requirements.txt

## ▶️ Running the App

    streamlit run streamlit_app.py
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
streamlit>=1.20
pandas
scikit-learn
matplotlib
seaborn
numpy
plotly
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
# data_science/streamlit_app.py
"""Small Streamlit demo app.

Upload a CSV, pick the target column and feature columns, choose a
classifier, and view evaluation metrics (accuracy/precision/recall/F1,
classification report, confusion matrix, ROC curve for binary targets,
and feature importances when the model exposes them).
"""
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns

st.set_page_config(page_title="Data Science Demo", layout="wide")

st.title("Small Streamlit Data Science App")
st.markdown("Upload a CSV, pick the target column, choose a model, and view metrics.")

uploaded = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded is None:
    st.info("Upload a CSV to get started. Example: a classification dataset with a target column.")
    st.stop()

# Read the CSV; surface a clear error instead of a raw traceback on a bad file.
try:
    df = pd.read_csv(uploaded)
except Exception:
    st.error("Could not parse the uploaded file as CSV. Please check the file and try again.")
    st.stop()
st.write("### Preview of uploaded data", df.head())

# Choose target column.
all_columns = df.columns.tolist()
target = st.selectbox("Select target column (label)", options=all_columns)

# Feature selection: default to numeric columns, excluding the target.
st.write("Select feature columns (default: numeric columns excluding target)")
numeric = df.select_dtypes(include=[np.number]).columns.tolist()
default_features = [c for c in numeric if c != target]
features = st.multiselect("Features", options=all_columns, default=default_features)

if len(features) == 0:
    st.error("Please select at least one feature column.")
    st.stop()

# The mean-imputer + scaler pipeline below only supports numeric columns;
# fail early with a clear message rather than crashing inside sklearn.
non_numeric = [c for c in features if c not in numeric]
if non_numeric:
    st.error(f"These feature columns are not numeric and cannot be used: {non_numeric}")
    st.stop()

# Task-type detection (very naive): few unique labels -> classification.
unique_vals = df[target].nunique()
task_type = "classification" if unique_vals <= 20 else "regression (not implemented)"
st.write(f"Detected: **{task_type}** (unique labels: {unique_vals})")

if task_type != "classification":
    st.warning("This demo only supports classification. Choose a categorical/binary target.")
    st.stop()

# Train/test split parameters (sidebar controls).
test_size = st.sidebar.slider("Test size (%)", min_value=10, max_value=50, value=25) / 100.0
random_state = st.sidebar.number_input("Random state", min_value=0, max_value=9999, value=42)

# Model selection.
model_name = st.selectbox("Choose model", ["Logistic Regression", "Random Forest", "Baseline Dummy"])
if model_name == "Logistic Regression":
    model = LogisticRegression(max_iter=1000)
elif model_name == "Random Forest":
    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
else:
    model = DummyClassifier(strategy="most_frequent")

# Prepare data.
X = df[features].copy()
y = df[target].copy()

# Basic imputing + scaling pipeline wrapping the chosen classifier.
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("clf", model),
])

# Stratified splitting requires >= 2 samples per class; fall back to an
# unstratified split instead of raising on datasets with rare classes.
stratify = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=stratify
)

with st.spinner("Training model..."):
    pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
# Macro averaging treats all classes equally; zero_division=0 avoids
# warnings/errors when a class gets no predicted samples.
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "precision_macro": precision_score(y_test, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_test, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0),
}

st.subheader("Metrics")
col1, col2, col3, col4 = st.columns(4)
col1.metric("Accuracy", f"{metrics['accuracy']:.4f}")
col2.metric("Precision (macro)", f"{metrics['precision_macro']:.4f}")
col3.metric("Recall (macro)", f"{metrics['recall_macro']:.4f}")
col4.metric("F1 (macro)", f"{metrics['f1_macro']:.4f}")

st.subheader("Classification report")
st.text(classification_report(y_test, y_pred, zero_division=0))

st.subheader("Confusion matrix")
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
st.pyplot(fig)

# ROC AUC / ROC curve for binary problems; requires probability estimates.
# Pipeline only exposes predict_proba when the final estimator supports it,
# so an explicit hasattr check replaces the previous broad try/except.
if len(np.unique(y_test)) == 2:
    if hasattr(pipeline, "predict_proba"):
        y_score = pipeline.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_score)
        st.write(f"ROC AUC: **{auc:.4f}**")
        fpr, tpr, _ = roc_curve(y_test, y_score)
        fig2, ax2 = plt.subplots()
        ax2.plot(fpr, tpr)
        ax2.plot([0, 1], [0, 1], "--")
        ax2.set_xlabel("FPR")
        ax2.set_ylabel("TPR")
        ax2.set_title("ROC curve")
        st.pyplot(fig2)
    else:
        st.info("Model does not provide probability predictions to compute ROC AUC.")

# Feature importance (if the fitted model supports it).
st.subheader("Feature importances (if available)")
base_model = pipeline.named_steps["clf"]
if hasattr(base_model, "feature_importances_"):
    fi = pd.Series(base_model.feature_importances_, index=features).sort_values(ascending=False)
    st.bar_chart(fi)
elif hasattr(base_model, "coef_"):
    # BUG FIX: for multiclass models coef_ has shape (n_classes, n_features),
    # so ravel() produced n_classes * n_features values and broke alignment
    # with the feature names. Average |coef| across classes instead.
    coefs = np.abs(np.atleast_2d(base_model.coef_)).mean(axis=0)
    fi = pd.Series(coefs, index=features).sort_values(ascending=False)
    st.bar_chart(fi)
else:
    st.info("Selected model has no feature_importances_ or coef_.")
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
age,income,loan_approved
25,40000,0
35,60000,1
45,80000,1
30,50000,0
50,90000,1
28,42000,0
42,75000,1
39,65000,1
33,48000,0
55,100000,1
26,41000,0
31,53000,0
48,85000,1
29,46000,0
41,70000,1
38,62000,1
52,95000,1
34,56000,0
47,88000,1
40,72000,1

0 commit comments

Comments
 (0)