|
| 1 | +# data_science/streamlit_app.py |
| 2 | +import streamlit as st |
| 3 | +import pandas as pd |
| 4 | +import numpy as np |
| 5 | +from sklearn.model_selection import train_test_split |
| 6 | +from sklearn.preprocessing import StandardScaler |
| 7 | +from sklearn.impute import SimpleImputer |
| 8 | +from sklearn.pipeline import Pipeline |
| 9 | +from sklearn.linear_model import LogisticRegression |
| 10 | +from sklearn.ensemble import RandomForestClassifier |
| 11 | +from sklearn.dummy import DummyClassifier |
| 12 | +from sklearn.metrics import ( |
| 13 | + accuracy_score, precision_score, recall_score, f1_score, |
| 14 | + confusion_matrix, classification_report, roc_auc_score, roc_curve |
| 15 | +) |
| 16 | +import matplotlib.pyplot as plt |
| 17 | +import seaborn as sns |
| 18 | + |
# ---- Page configuration and upload gate ----
st.set_page_config(page_title="Data Science Demo", layout="wide")

st.title("Small Streamlit Data Science App")
st.markdown("Upload a CSV, pick the target column, choose a model, and view metrics.")

# Nothing below makes sense without data, so halt the script run here
# until the user provides a file (Streamlit re-runs top-to-bottom on input).
uploaded = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded is None:
    st.info("Upload a CSV to get started. Example: a classification dataset with a target column.")
    st.stop()
| 28 | + |
| 29 | +# read csv |
| 30 | +df = pd.read_csv(uploaded) |
| 31 | +st.write("### Preview of uploaded data", df.head()) |
| 32 | + |
| 33 | +# choose target |
| 34 | +all_columns = df.columns.tolist() |
| 35 | +target = st.selectbox("Select target column (label)", options=all_columns) |
| 36 | + |
| 37 | +# simple features selection: drop non-numeric by default but allow user to choose |
| 38 | +st.write("Select feature columns (default: numeric columns excluding target)") |
| 39 | +numeric = df.select_dtypes(include=[np.number]).columns.tolist() |
| 40 | +default_features = [c for c in numeric if c != target] |
| 41 | +features = st.multiselect("Features", options=all_columns, default=default_features) |
| 42 | + |
| 43 | +if len(features) == 0: |
| 44 | + st.error("Please select at least one feature column.") |
| 45 | + st.stop() |
| 46 | + |
| 47 | +# task type detection (very naive) |
| 48 | +unique_vals = df[target].nunique() |
| 49 | +task_type = "classification" if unique_vals <= 20 else "regression (not implemented)" |
| 50 | +st.write(f"Detected: **{task_type}** (unique labels: {unique_vals})") |
| 51 | + |
| 52 | +if task_type != "classification": |
| 53 | + st.warning("This demo only supports classification. Choose a categorical/binary target.") |
| 54 | + st.stop() |
| 55 | + |
| 56 | +# train/test split params |
| 57 | +test_size = st.sidebar.slider("Test size (%)", min_value=10, max_value=50, value=25) / 100.0 |
| 58 | +random_state = st.sidebar.number_input("Random state", min_value=0, max_value=9999, value=42) |
| 59 | + |
| 60 | +# model selection |
| 61 | +model_name = st.selectbox("Choose model", ["Logistic Regression", "Random Forest", "Baseline Dummy"]) |
| 62 | +if model_name == "Logistic Regression": |
| 63 | + model = LogisticRegression(max_iter=1000) |
| 64 | +elif model_name == "Random Forest": |
| 65 | + model = RandomForestClassifier(n_estimators=100, random_state=random_state) |
| 66 | +else: |
| 67 | + model = DummyClassifier(strategy="most_frequent") |
| 68 | + |
| 69 | +# prepare data |
| 70 | +X = df[features].copy() |
| 71 | +y = df[target].copy() |
| 72 | + |
| 73 | +# basic imputing and scaling pipeline |
| 74 | +pipeline = Pipeline([ |
| 75 | + ("imputer", SimpleImputer(strategy="mean")), |
| 76 | + ("scaler", StandardScaler()), |
| 77 | + ("clf", model) |
| 78 | +]) |
| 79 | + |
| 80 | +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y) |
| 81 | + |
| 82 | +with st.spinner("Training model..."): |
| 83 | + pipeline.fit(X_train, y_train) |
| 84 | + |
# ---- Evaluate on the held-out test set ----
y_pred = pipeline.predict(X_test)

# Macro averaging weights every class equally; zero_division=0 silences
# warnings for classes the model never predicts.
metrics = {}
metrics["accuracy"] = accuracy_score(y_test, y_pred)
metrics["precision_macro"] = precision_score(y_test, y_pred, average="macro", zero_division=0)
metrics["recall_macro"] = recall_score(y_test, y_pred, average="macro", zero_division=0)
metrics["f1_macro"] = f1_score(y_test, y_pred, average="macro", zero_division=0)

st.subheader("Metrics")
metric_labels = ["Accuracy", "Precision (macro)", "Recall (macro)", "F1 (macro)"]
metric_keys = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]
for column, label, key in zip(st.columns(4), metric_labels, metric_keys):
    column.metric(label, f"{metrics[key]:.4f}")

st.subheader("Classification report")
st.text(classification_report(y_test, y_pred, zero_division=0))
| 102 | + |
| 103 | +st.subheader("Confusion matrix") |
| 104 | +cm = confusion_matrix(y_test, y_pred) |
| 105 | +fig, ax = plt.subplots() |
| 106 | +sns.heatmap(cm, annot=True, fmt="d", ax=ax) |
| 107 | +ax.set_xlabel("Predicted") |
| 108 | +ax.set_ylabel("Actual") |
| 109 | +st.pyplot(fig) |
| 110 | + |
| 111 | +# ROC AUC for binary problems |
| 112 | +if len(np.unique(y_test)) == 2: |
| 113 | + try: |
| 114 | + y_score = pipeline.predict_proba(X_test)[:, 1] |
| 115 | + auc = roc_auc_score(y_test, y_score) |
| 116 | + st.write(f"ROC AUC: **{auc:.4f}**") |
| 117 | + fpr, tpr, _ = roc_curve(y_test, y_score) |
| 118 | + fig2, ax2 = plt.subplots() |
| 119 | + ax2.plot(fpr, tpr) |
| 120 | + ax2.plot([0,1],[0,1],"--") |
| 121 | + ax2.set_xlabel("FPR") |
| 122 | + ax2.set_ylabel("TPR") |
| 123 | + ax2.set_title("ROC curve") |
| 124 | + st.pyplot(fig2) |
| 125 | + except Exception as e: |
| 126 | + st.info("Model does not provide probability predictions to compute ROC AUC.") |
| 127 | + |
| 128 | +# feature importance (if model supports it) |
| 129 | +st.subheader("Feature importances (if available)") |
| 130 | +base_model = pipeline.named_steps["clf"] |
| 131 | +if hasattr(base_model, "feature_importances_"): |
| 132 | + importances = base_model.feature_importances_ |
| 133 | + fi = pd.Series(importances, index=features).sort_values(ascending=False) |
| 134 | + st.bar_chart(fi) |
| 135 | +elif hasattr(base_model, "coef_"): |
| 136 | + coefs = np.abs(base_model.coef_).ravel() |
| 137 | + fi = pd.Series(coefs, index=features).sort_values(ascending=False) |
| 138 | + st.bar_chart(fi) |
| 139 | +else: |
| 140 | + st.info("Selected model has no feature_importances_ or coef_.") |
0 commit comments