forked from google-deepmind/funsearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproblem2.py
93 lines (77 loc) · 2.77 KB
/
problem2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
"""
This dataset consists of customer activity data from a telecom service. It will be used to develop predictive models to predict churn.
These are the attributes of the dataset:
State: string
Account length: integer
Area code: integer
International plan: string
Voice mail plan: string
Number vmail messages: integer
Total day minutes: double
Total day calls: integer
Total day charge: double
Total eve minutes: double
Total eve calls: integer
Total eve charge: double
Total night minutes: double
Total night calls: integer
Total night charge: double
Total intl minutes: double
Total intl calls: integer
Total intl charge: double
Customer service calls: integer
The target column has been removed from the dataset
Complete the function to preprocesses the dataset, returning a dataset with the 8 most important columns.
Process each column in any way that seems plausible, like one-hot-encoding or droping NA values.
Import libraries as needed.
"""
@funsearch.run
def run_evaluate():
path="dataset/churn-bigml-80.csv"
target_label = 'Churn'
df = pd.read_csv(path)
y = df[target_label].values
df = df.drop(columns=[target_label])
df : pd.DataFrame = select_columns_and_return_dataframe(df)
column_names: list[str] = df.columns.tolist()
column_names.sort()
df = df[column_names]
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
models = generate_models(X_train, y_train)
return [evaluate(models, X_test, y_test), column_names]
def generate_models(X_train, y_train):
logistic_regression = LogisticRegression()
svm = SVC()
k_neighbors = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier()
naive_bayes = GaussianNB()
logistic_regression.fit(X_train, y_train)
svm.fit(X_train, y_train)
k_neighbors.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
naive_bayes.fit(X_train, y_train)
return [logistic_regression, svm, k_neighbors, decision_tree, naive_bayes]
def evaluate(models, X_test, y_test) -> float:
"""Returns the model accuracy."""
f1s = []
for model in models:
y_pred = model.predict(X_test)
f1s.append(f1_score(y_test, y_pred))
f1_avg = sum(f1s) / len(f1s)
return f1_avg
@funsearch.evolve
def select_columns_and_return_dataframe(df: pd.DataFrame) -> pd.DataFrame:
# Preprocess columns as needed and the return df with up to 8 columns
df = df[['Total day minutes']]
return df