-
Notifications
You must be signed in to change notification settings - Fork 0
/
test2.py
162 lines (136 loc) · 4.86 KB
/
test2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#step 1
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
#step 2
# Load the data
data = pd.read_csv('data.csv')
#step 3
# Initial Data Exploration: first rows plus a dtype / non-null summary
print("******\n")
print(data.head())
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print("******\n\n")
#step 4
# Clean the data: drop rows with missing values, then exact duplicate rows
data = data.dropna()
data = data.drop_duplicates()
#step 5
# Remove unnecessary columns
columns_to_remove = ['customer_id']  # Specify columns to remove
# errors='ignore' makes this a no-op for any listed column that is absent
data = data.drop(columns=columns_to_remove, errors='ignore')
print("******\n")
# info() prints the summary itself and returns None, so do not print() it
data.info()
print("******\n\n")
# Clean specific columns
# Strip non-digit characters from 'age' and 'monthly_minutes', then convert
# them to numbers; values that still cannot be parsed become NaN.
for col in ['age', 'monthly_minutes']:
    # The .str accessor raises AttributeError on a numeric column, which is
    # what read_csv produces when every value is already clean. Only strip
    # text when the column was not parsed as numeric.
    if not pd.api.types.is_numeric_dtype(data[col]):
        # NOTE(review): r'\D' removes every non-digit, including '.' and '-',
        # so "12.5" becomes 125. Confirm these columns hold whole numbers.
        data[col] = data[col].str.replace(r'\D', '', regex=True)
    data[col] = pd.to_numeric(data[col], errors='coerce')
# Ensure all relevant columns are numeric
relevant_columns = ['age', 'income', 'monthly_minutes', 'monthly_data_gb', 'monthly_bill', 'outstanding_balance', 'support_tickets', 'churn']
data[relevant_columns] = data[relevant_columns].apply(pd.to_numeric, errors='coerce')
data.info()  # Print info to ensure correct data types
print("******\n\n")
#step 6
# Flag outliers based on z-score, visualize them, then remove them.
# The numeric coercion above uses errors='coerce', which can leave NaNs behind.
# stats.zscore propagates a single NaN to the whole column, which would make
# every row fail the threshold test, so unusable rows are dropped first.
data = data.dropna(subset=relevant_columns)
# The binary target must not take part in outlier detection: with a churn rate
# p, the z-score of a churned row is sqrt((1 - p) / p), which reaches 3 once
# p <= 10%, so every churned customer would be discarded as an "outlier".
outlier_columns = [col for col in relevant_columns if col != 'churn']
numeric_features = data[outlier_columns].select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_features.to_numpy(dtype=float)))
# Keep the flag as a Series aligned to data's index rather than as a column of
# data, so it cannot leak into the model features later. A NaN z-score (from a
# constant column) compares False here and therefore never flags a row.
is_outlier = pd.Series((z_scores >= 3).any(axis=1), index=data.index)
# Visualize outliers with box plots (drawn BEFORE removal so they are visible)
for col in ['monthly_minutes', 'monthly_bill']:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=data[col])
    plt.title(f'Box Plot of {col.capitalize()}')
    plt.xlabel(col.capitalize())
    plt.show()
#step 7
# Scatter plots to visualize outliers: flagged rows in red, the rest in blue
point_colors = is_outlier.map({True: 'red', False: 'blue'})
for col in ['monthly_minutes', 'monthly_bill']:
    plt.figure(figsize=(12, 6))
    plt.scatter(data.index, data[col], c=point_colors, alpha=0.5)
    plt.title(f'Scatter Plot of {col.capitalize()} with Outliers Highlighted')
    plt.xlabel('Index')
    plt.ylabel(col.capitalize())
    plt.show()
# Remove the flagged rows; copy() so later column assignments act on an
# independent frame instead of a view of the unfiltered data.
data = data[~is_outlier].copy()
# Churn class balance: number of rows per churn value
_, churn_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, x='churn', ax=churn_ax)
churn_ax.set(
    title='Churn Distribution',
    xlabel='Churn (1: Left, 0: Stayed)',
    ylabel='Count',
)
plt.show()
# Distribution of one feature (monthly_bill) split by churn status
_, bill_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='churn', y='monthly_bill', data=data, ax=bill_ax)
bill_ax.set(
    title='Monthly Bill by Churn Status',
    xlabel='Churn (1: Left, 0: Stayed)',
    ylabel='Monthly Bill',
)
plt.show()
#step 9
# splitting data into target and features
# This happens BEFORE scaling so the 0/1 churn label is never standardized;
# a scaled target would make the 0.5 decision threshold below meaningless.
X = data.drop('churn', axis=1)  # X is the features
y = data['churn']  # y is the target
# One-hot encode any non-numeric feature (e.g. a text 'region' column) so the
# scaler and the regression only ever see numbers.
X = pd.get_dummies(X, dtype=float).astype(float)
feature_columns = X.columns
#step 10
# splitting data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#step 8
# Normalizing or standardizing scales
# The scaler is fit on the training rows only, so no statistics from the test
# rows leak into training; the test rows reuse the training mean and scale.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_columns, index=X_test.index)
#step 11
# training
# NOTE(review): LinearRegression on a 0/1 target gives an unbounded score, not
# a probability; LogisticRegression would be the usual choice for churn.
model = LinearRegression()
model.fit(X_train, y_train)
# Step 12: Make predictions for a single instance
input_data = pd.DataFrame({
    "region": ["North"],
    "income": [75913],
    "monthly_minutes": [2071],
    "monthly_data_gb": [18.12],
    "support_tickets": [8],
    "monthly_bill": [189.67],
    "outstanding_balance": [221.73]
})
# Apply the same preprocessing as the training data: identical one-hot
# encoding, identical column order, identical scaling.
encoded_input = pd.get_dummies(input_data, dtype=float)
# Dummy columns of a categorical feature that WAS supplied (e.g. the other
# 'region_*' categories) are genuinely 0 for this row.
dummy_prefixes = tuple(f'{name}_' for name in input_data.select_dtypes(exclude='number').columns)
# Features that were not supplied at all (e.g. 'age') cannot be invented, so
# they fall back to the training mean, which is neutral after scaling.
missing_features = [
    name for name in feature_columns
    if name not in encoded_input.columns and not name.startswith(dummy_prefixes)
]
if missing_features:
    print(f"Features missing from input_data, imputed with training mean: {missing_features}")
fill_values = {
    name: 0.0 if name.startswith(dummy_prefixes) else mean
    for name, mean in zip(feature_columns, scaler.mean_)
}
# reindex() also drops input columns the model was never trained on
encoded_input = encoded_input.reindex(columns=feature_columns).fillna(value=fill_values)
input_features = pd.DataFrame(scaler.transform(encoded_input), columns=feature_columns)
# Predict the churn score for the single instance
y_pred = model.predict(input_features)
print(y_pred)
# Treat scores at or above the threshold as churn (1), otherwise stay (0)
threshold = 0.5
predicted_churn = (y_pred >= threshold).astype(int)
print(predicted_churn)
# Step 13: Visualize predictions
# Score the held-out test set here. The earlier y_pred holds the prediction
# for one hand-written instance, so it cannot be compared element-wise with
# y_test (the sizes differ, and it is not a prediction for those rows).
y_test_pred = model.predict(X_test)
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_test_pred, alpha=0.5)
plt.title('Actual vs. Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # Diagonal line for reference
plt.show()
# Calculate residuals (actual minus predicted) for the test rows
residuals = y_test - y_test_pred
# Step 14: Residual Plot
plt.figure(figsize=(10, 6))
plt.scatter(y_test_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='red', linestyle='--')  # Horizontal line at 0
plt.title('Residuals vs. Predicted Values')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()
# Step 15: Histogram of residuals
plt.figure(figsize=(10, 6))
plt.hist(residuals, bins=30, edgecolor='k')
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()