# telco_customer_churn_prediction.py

# -*- coding: utf-8 -*-
"""Telco Customer Churn Prediction.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1xd1khcb44pI9eOZoJOR8IsN0x2IIZ5W0

# Title: Telco Customer Churn Prediction
"""

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, f1_score, roc_auc_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

"""# --- FUNCTIONS ---"""

def load_and_preprocess_data(file_path):
    """Load the Telco churn CSV and return one-hot-encoded train/test splits.

    The file is read with ``customerID`` as the index, ``TotalCharges`` is
    coerced to numeric (entries that cannot be parsed become NaN),
    ``SeniorCitizen`` is cast to ``object`` so it is encoded as a category,
    and all non-numeric feature columns are one-hot encoded.

    Missing ``TotalCharges`` values are imputed *after* the split, using the
    mean of the training rows only, so that no statistic computed from the
    test rows leaks into the data the model is trained on.

    Args:
        file_path (str): Path to the CSV file. It must contain the
            ``customerID``, ``TotalCharges``, ``SeniorCitizen`` and ``Churn``
            columns.

    Returns:
        tuple: X_train, X_test, y_train, y_test (80/20 split with
        ``random_state=42``). ``y_*`` hold the raw ``Churn`` labels.
    """
    df = pd.read_csv(file_path)
    df = df.set_index("customerID")

    # Non-numeric strings in TotalCharges are turned into NaN here and
    # imputed further down, once the training rows are known.
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

    # Feature engineering: cast to object so get_dummies one-hot encodes it.
    df["SeniorCitizen"] = df["SeniorCitizen"].astype("object")
    X_dummies = pd.get_dummies(df.drop(columns=["Churn"]))
    y_target = df["Churn"]

    # Data splitting
    X_train, X_test, y_train, y_test = train_test_split(
        X_dummies, y_target, test_size=0.2, random_state=42
    )

    # Impute with the training mean only; fillna returns new frames, so the
    # slices produced by train_test_split are not modified in place.
    train_mean = X_train["TotalCharges"].mean()
    X_train = X_train.fillna({"TotalCharges": train_mean})
    X_test = X_test.fillna({"TotalCharges": train_mean})
    return X_train, X_test, y_train, y_test

def train_and_evaluate_model(
    X_train, X_test, y_train, y_test, model, model_name="Model", pos_label="Yes"
):
    """Fit a classifier, print its test metrics and plot its ROC curve.

    The model is fitted on the training data (mutating ``model`` in place),
    then precision, recall, F1, accuracy and AUC-ROC are computed on the test
    data and printed. The ROC curve is saved to
    ``'<model_name>_roc_curve.png'`` in the current working directory and
    shown.

    Args:
        X_train, X_test, y_train, y_test: Training and testing data.
        model: Estimator exposing ``fit``, ``predict``, ``predict_proba`` and
            ``classes_`` (e.g. ``LogisticRegression``).
        model_name (str): Name of the model, used in the printed header, the
            plot title and the saved file name.
        pos_label: Label of the positive class. Defaults to ``"Yes"``.

    Raises:
        ValueError: If ``pos_label`` is not among ``model.classes_``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Locate the positive-class column through classes_ instead of assuming
    # it is column 1; predict_proba columns follow the order of classes_.
    pos_index = list(model.classes_).index(pos_label)
    y_pred_proba = model.predict_proba(X_test)[:, pos_index]

    # Evaluation Metrics
    precision = precision_score(y_test, y_pred, pos_label=pos_label)
    recall = recall_score(y_test, y_pred, pos_label=pos_label)
    f1 = f1_score(y_test, y_pred, pos_label=pos_label)
    accuracy = accuracy_score(y_test, y_pred)
    # roc_auc_score has no pos_label argument, so hand it an explicit
    # "is positive" vector rather than relying on label sort order.
    is_positive = pd.Series(y_test).eq(pos_label).to_numpy()
    roc_auc = roc_auc_score(is_positive, y_pred_proba)

    print(f"\n--- {model_name} Evaluation ---")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"AUC-ROC: {roc_auc:.2f}")

    # ROC Curve (Matplotlib)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba, pos_label=pos_label)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f"{model_name} ROC Curve")
    plt.legend(loc="lower right")
    plt.savefig(f'{model_name}_roc_curve.png')  # Save before show()
    plt.show()
    plt.close()  # Release the figure so repeated calls do not accumulate

def hyperparameter_tuning(X_train, y_train, model):
    """Run a 5-fold grid search over regularisation settings for ``model``.

    The grid covers the inverse regularisation strength ``C``, the penalty
    type, the ``liblinear`` solver and the class weighting; candidates are
    ranked by macro-averaged F1. The winning parameters and score are
    printed.

    Args:
        X_train, y_train: Training data.
        model: The estimator to tune.

    Returns:
        The best estimator found during grid search.
    """
    search_space = {
        'C': [1e-5, 1e-4, 1e-3, 0.01, 0.1],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
        'class_weight': ['balanced', None]
    }

    # fit() returns the fitted search object, so construction and fitting
    # can be chained.
    searcher = GridSearchCV(
        estimator=model,
        param_grid=search_space,
        scoring='f1_macro',
        cv=5,
    ).fit(X_train, y_train)

    winning_params = searcher.best_params_
    winning_score = searcher.best_score_

    print("\n--- Hyperparameter Tuning Results ---")
    print("Best Parameters:", winning_params)
    print("Best Score:", winning_score)

    return searcher.best_estimator_

def plot_feature_importance(model, feature_names):
    """Plot the model's coefficients as a horizontal bar chart.

    The first row of ``model.coef_`` is paired with ``feature_names`` and
    sorted from the largest to the smallest (signed) coefficient. The chart
    is saved to ``'feature_importance.png'`` in the current working
    directory and shown.

    Args:
        model: Fitted estimator exposing ``coef_`` (e.g. a linear model).
        feature_names: Names matching the columns the model was fitted on,
            in the same order.
    """
    # NOTE(review): these are raw signed coefficients; their magnitudes are
    # only comparable across features if the inputs share a scale — confirm.
    importance = model.coef_[0]
    feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.show()
    plt.close()  # Release the figure, matching the ROC plotting routine

"""# --- MAIN EXECUTION ---

"""

# Load and preprocess data
X_train, X_test, y_train, y_test = load_and_preprocess_data("/content/WA_Fn-UseC_-Telco-Customer-Churn.csv")  # Update with the file path

# Initial model training and evaluation.
# The features are not scaled, so the default cap of 100 iterations is
# raised to give the solver room to converge instead of stopping early.
initial_model = LogisticRegression(max_iter=1000)
train_and_evaluate_model(X_train, X_test, y_train, y_test, initial_model, "Initial Model")

# Hyperparameter tuning (the grid search works on clones of the estimator
# passed in and returns the best refitted one)
best_model = hyperparameter_tuning(X_train, y_train, initial_model)

# Retraining and final evaluation with best parameters
train_and_evaluate_model(X_train, X_test, y_train, y_test, best_model, "Best Model")

# Plot the coefficients of the tuned model against the encoded column names
plot_feature_importance(best_model, X_train.columns)