"""Linear Regression is a statistical method used to model the relationship between a dependent variable and one or more independent variables. It minimizes the cost
function, typically the mean squared error or the mean absolute error, by adjusting parameters (slope/weight and intercept/bias) through gradient descent. Predictions
are made based on these optimized parameters. For this implementation, we have used the Placement dataset from Kaggle, in which we will implement simple linear regression
to predict the Package (target) a student could get based on its CGPA (feature). The purpose of this implementation is to try not to use any inbuilt libraries to
demonstrate fundamental concepts like gradient descent for optimization and error calculation using a cost function and implement linear regression from scratch.
NOTE: The sklearn library is used just to compare our results with the in-built Linear Regression function.
Lets begin !!
"""
# IMPORTING NECESSARY LIBRARIES
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# GET DATA
def get_data():
    """Get the feature and target values from the dataset."""
    data = pd.read_csv('datasets/linear-regression-data.csv')  # Forward slashes keep the path portable across OSes
    x = data['cgpa'].values.reshape(-1, 1)  # Feature matrix of shape (m, 1)
    y = data['package'].values              # Target vector of shape (m,)
    return x, y
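
# NOTE: get_data() assumes the CSV at datasets/linear-regression-data.csv has
# numeric columns named 'cgpa' (the feature) and 'package' (the target); any
# dataset with that layout would work here.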

# COST FUNCTION
def cost_function(x, y, w, b):
    """Compute the cost, i.e. the error between the predicted and actual values.
    Cost function formula: J(w, b) = sum((predicted - actual)^2) / len_data
    :param x : dataset feature values
    :param y : dataset target values
    :param w : weight assigned to the feature (slope)
    :param b : bias (intercept)
    :return : cost
    """
    m = len(y)
    fx = np.dot(w, x.transpose()) + b             # Predicted values, shape (m,)
    cost_j = (1 / m) * np.sum(np.square(fx - y))  # Cost function formula (mean squared error)
    return cost_j
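
# A minimal worked example of cost_function on hypothetical numbers (not from the
# Placement dataset): with x = [[1.0], [2.0]], y = [2.0, 3.0], w = [0.5], b = 1.0,
# the predictions are [1.5, 2.0], so J = ((-0.5)^2 + (-1.0)^2) / 2 = 0.625.
# >>> cost_function(np.array([[1.0], [2.0]]), np.array([2.0, 3.0]), np.array([0.5]), 1.0)  # -> 0.625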

# GRADIENT DESCENT ALGORITHM TO MINIMIZE J (COST)
def run_gradient_descent(x, y, w, b, alpha):
    """Gradient Descent is an iterative first-order optimisation algorithm used to find a local minimum/maximum
    of a given function. Here it is used to minimize the cost function.
    Formula for updating w: w = w - [ alpha/len_data * sum( (predicted - actual) * x ) ]
    Formula for updating b: b = b - [ alpha/len_data * sum( (predicted - actual) ) ]
    :param x : dataset feature values
    :param y : dataset target values
    :param w : weight assigned to the feature (slope)
    :param b : bias (intercept)
    :param alpha : learning rate of the model
    :return : updated values for model parameters w, b
    """
    m = len(y)
    fx = np.dot(w, x.transpose()) + b             # Predicted values, shape (m,)
    dw = (1 / m) * np.dot((fx - y), x).flatten()  # Derivative of cost w.r.t. w (the factor 2 from differentiating the square is absorbed into alpha)
    db = (1 / m) * np.sum(fx - y)                 # Derivative of cost w.r.t. b
    w = w - alpha * dw                            # Update w
    b = b - alpha * db                            # Update b
    return w, b
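
# A single worked step of run_gradient_descent on the same hypothetical numbers as
# above (x = [[1.0], [2.0]], y = [2.0, 3.0], w = [0.5], b = 1.0, alpha = 0.1):
# the predictions are [1.5, 2.0], so dw = ((-0.5)*1 + (-1.0)*2) / 2 = -1.25 and
# db = (-0.5 + -1.0) / 2 = -0.75, giving w = 0.5 - 0.1*(-1.25) = [0.625] and
# b = 1.0 - 0.1*(-0.75) = 1.075. Re-running cost_function with the updated
# parameters gives ~0.273, down from 0.625: one step has reduced the error.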

# LINEAR REGRESSION IMPLEMENTATION
def run_linear_regression(x, y):
    """Implement linear regression over the dataset."""
    iterations = 100000
    alpha = 0.0013  # Learning rate: controls how big a step is taken towards the minimum, should be between 0 and 1
    no_features = x.shape[1]
    np.random.seed(0)
    w = np.random.randn(no_features) * 0.01  # Initialize the weight with a small random value
    b = 1  # Initialize the bias
    # Run gradient descent and compute the cost for each iteration
    for i in range(iterations):
        w, b = run_gradient_descent(x, y, w, b, alpha)
        error = cost_function(x, y, w, b)
        # print(f"At Iteration {i + 1}, Error is {error:.5f}")  # Uncomment to watch the error shrink
    return w, b
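
# A quick sanity check for the training loop (hypothetical data, not part of the
# original pipeline): on noise-free points generated from y = 2x + 1, the learned
# parameters should come out very close to w = [2.0] and b = 1.0.
# >>> x_demo = np.arange(1.0, 6.0).reshape(-1, 1)
# >>> y_demo = 2 * x_demo.flatten() + 1
# >>> run_linear_regression(x_demo, y_demo)  # -> roughly (array([2.]), 1.0)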

# PREDICTION FUNCTION FOR NEW DATA
def make_predictions(new_data, w, b):
    """Predict the target variable (package) for new data."""
    prediction = np.dot(new_data, w) + b
    return prediction
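
# Example usage with hypothetical CGPA values, assuming w and b were learned by
# run_linear_regression: one predicted package per row of new_data.
# >>> make_predictions(np.array([[8.5], [6.0]]), w, b)  # -> 1-D array of two predictions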

# MAIN FUNCTION
def main():
    """Driver function."""
    x, y = get_data()
    # Split data into training and testing sets (80% training, 20% testing)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
    # Train the LR model without ML libraries (i.e. our implementation) and get the model parameters
    w, b = run_linear_regression(x_train, y_train)
    print(f"\nModel Parameters W/O using ML Libraries\nw: {w}\nb: {b}")
    # Train the LR model using the sklearn library and get the model parameters
    model = LinearRegression()
    model.fit(x_train, y_train)
    wLR = model.coef_
    bLR = model.intercept_
    print(f"\nModel Parameters Using ML Libraries:\nw: {wLR}\nb: {bLR}")
    # Make predictions on the testing data using both models
    predictions = make_predictions(x_test, w, b)
    print("\nOur Predictions:\n", predictions)
    scikit_predictions = model.predict(x_test)
    print("\nSklearn Model Predictions:\n", scikit_predictions)
    # Calculate the Mean Squared Error (MSE) for both models
    custom_mse = np.mean(np.square(predictions - y_test))
    print("\nOur Linear Regression MSE:", custom_mse)
    scikit_mse = np.mean(np.square(scikit_predictions - y_test))
    print("Sklearn Linear Regression MSE:", scikit_mse)


if __name__ == "__main__":
    main()