-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathsimple_linear_regression.py
106 lines (95 loc) · 2.87 KB
/
simple_linear_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from random import seed
from random import randrange
from csv import reader
from math import sqrt
import matplotlib.pyplot as plt
# Load a CSV file
def load_csv(filename):
dataset = list()
with open(filename, 'r') as file:
csv_reader = reader(file)
for row in csv_reader:
row=row[0].split()
if not row:
continue
dataset.append(row)
return dataset
# Convert string column to float
def str_column_to_float(dataset, column):
for row in dataset:
row[column] = float(row[column].strip())
# Split a dataset into a train and test set
def train_test_split(dataset, split):
train = list()
train_size = split * len(dataset)
dataset_copy = list(dataset)
while len(train) < train_size:
index = randrange(len(dataset_copy))
train.append(dataset_copy.pop(index))
return train, dataset_copy
# Calculate root mean squared error
def rmse_metric(actual, predicted):
sum_error = 0.0
for i in range(len(actual)):
prediction_error = predicted[i] - actual[i]
sum_error += (prediction_error ** 2)
mean_error = sum_error / float(len(actual))
return sqrt(mean_error)
# Evaluate an algorithm using a train/test split
def evaluate_algorithm(dataset, algorithm, split, *args):
train, test = train_test_split(dataset, split)
test_set = list()
y=[]
for row in test:
row_copy = list(row)
row_copy[-1] = None
test_set.append(row_copy)
y.append(row[-1])
predicted = algorithm(train, test_set, y, *args)
actual = [row[-1] for row in test]
rmse = rmse_metric(actual, predicted)
return rmse
# Calculate the mean value of a list of numbers
def mean(values):
return sum(values) / float(len(values))
# Calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
covar = 0.0
for i in range(len(x)):
covar += (x[i] - mean_x) * (y[i] - mean_y)
return covar
# Calculate the variance of a list of numbers
def variance(values, mean):
return sum([(x-mean)**2 for x in values])
# Calculate coefficients
def coefficients(dataset):
x = [row[0] for row in dataset]
y = [row[1] for row in dataset]
x_mean, y_mean = mean(x), mean(y)
b1 = covariance(x, x_mean, y, y_mean) / variance(x, x_mean)
b0 = y_mean - b1 * x_mean
return [b0, b1]
# Simple linear regression algorithm
def simple_linear_regression(train, test, y):
predictions = list()
b0, b1 = coefficients(train)
x=[]
for row in test:
yhat = b0 + b1 * row[0]
predictions.append(yhat)
x.append(row[0])
plt.figure()
plt.plot(x,predictions,'r',x,y,'bo')
plt.show()
return predictions
# Simple linear regression on insurance dataset
seed(1)
# load and prepare data
filename = 'insurance.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
str_column_to_float(dataset, i)
# evaluate algorithm
split = 0.6
rmse = evaluate_algorithm(dataset, simple_linear_regression, split)
print('RMSE: %.3f' % (rmse))