forked from google-deepmind/funsearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproblem1.py
88 lines (72 loc) · 2.85 KB
/
problem1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
"""
A model to detect higher chance of heart attack is being trained on a dataset.
These are the attributes of the dataset:
- age : Age of the patient
- sex : Sex of the patient
- cp : Chest pain type ~ 0 = Typical Angina, 1 = Atypical Angina, 2 = Non-anginal Pain, 3 = Asymptomatic
- trtbps : Resting blood pressure (in mm Hg)
- chol : Cholesterol in mg/dl fetched via BMI sensor
- fbs : (fasting blood sugar > 120 mg/dl) ~ 1 = True, 0 = False
- restecg : Resting electrocardiographic results ~ 0 = Normal, 1 = ST-T wave abnormality, 2 = Left ventricular hypertrophy
- thalachh : Maximum heart rate achieved
- oldpeak : Previous peak
- slp : Slope
- caa : Number of major vessels
- thall : Thalium Stress Test result ~ (0,3)
- exng : Exercise induced angina ~ 1 = Yes, 0 = No
The target column has been removed from the dataset.
Complete the function to preprocess the dataset, returning a dataset with up to FIVE important columns.
Process each column in any way that seems plausible.
Import libraries as needed.
"""
@funsearch.run
def run_evaluate():
    """Score the current column-selection function on the heart dataset.

    Reads ``dataset/heart.csv``, separates the ``output`` column as the
    target, and hands the remaining columns to
    ``select_columns_and_return_dataframe``. The columns it returns are put
    in sorted name order, split 80/20 with a fixed seed, used to fit the
    classifiers from ``generate_models``, and scored with ``evaluate``.

    Returns:
        A two-element list: the score from ``evaluate`` followed by the
        sorted list of selected column names.
    """
    label_column = 'output'
    raw = pd.read_csv("dataset/heart.csv")
    labels = raw[label_column].values

    # The selector only ever sees the feature columns, never the target.
    features = select_columns_and_return_dataframe(
        raw.drop(columns=[label_column])
    )

    # Sorting the names makes the feature order independent of the order in
    # which the selector happened to emit its columns.
    ordered_names = sorted(features.columns.tolist())
    matrix = features[ordered_names].values

    train_X, test_X, train_y, test_y = train_test_split(
        matrix, labels, test_size=0.2, random_state=42
    )
    fitted = generate_models(train_X, train_y)
    score = evaluate(fitted, test_X, test_y)
    return [score, ordered_names]
def generate_models(X_train, y_train):
    """Fit a fixed set of default-configured classifiers on the training data.

    Args:
        X_train: Training feature matrix passed to each estimator's ``fit``.
        y_train: Training labels passed to each estimator's ``fit``.

    Returns:
        The fitted estimators as a list, in this order: logistic regression,
        SVC, k-nearest neighbours, decision tree, Gaussian naive Bayes.
    """
    # All estimators are created first, then fitted in list order.
    classifiers = [
        LogisticRegression(),
        SVC(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        GaussianNB(),
    ]
    for classifier in classifiers:
        classifier.fit(X_train, y_train)
    return classifiers
def evaluate(models, X_test, y_test) -> float:
    """Return the mean F1 score of ``models`` on the test data.

    Each model predicts on ``X_test``; the prediction is scored against
    ``y_test`` with ``f1_score`` using its default arguments, and the
    per-model scores are averaged with equal weight.

    Args:
        models: Iterable of fitted estimators exposing ``predict``.
        X_test: Test feature matrix passed to each model's ``predict``.
        y_test: True labels for ``X_test``.

    Returns:
        The arithmetic mean of the per-model F1 scores.

    Raises:
        ValueError: If ``models`` yields no estimators, since an average
            over zero scores is undefined.
    """
    scores = [f1_score(y_test, model.predict(X_test)) for model in models]
    # Checking the collected scores (not ``models``) also covers generators.
    if not scores:
        raise ValueError("evaluate() requires at least one fitted model")
    return sum(scores) / len(scores)
@funsearch.evolve
def select_columns_and_return_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return the feature columns to train on (baseline: ``age`` only).

    Args:
        df: The heart dataset with the target column already removed.

    Returns:
        A DataFrame holding only the chosen columns.
    """
    # Preprocess as needed, select the 5 best columns and return df
    chosen_columns = ['age']
    return df[chosen_columns]