import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score  # RMSE is computed from the MSE loss below
import torch
import torch.nn as nn
import torch.optim as optim
# Load the dataset
# The dataset is loaded with a semicolon separator and comma as the decimal point.
df = pd.read_csv("USB/Air-Quality-Study/AirQuality.csv", sep=";", decimal=",")
print("First few rows of the dataset:")
print(df.head())
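# The UCI copy of this file typically carries two empty 'Unnamed' columns and a
# few fully empty trailing rows; the cleaning steps below handle both, but an
# explicit early guard would be (sketch, not applied here):
# df.dropna(how='all', inplace=True)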
# Visualize missing data using a heatmap
# This helps in understanding the extent and pattern of missing data in the dataset.
plt.figure(figsize=(12, 6))
sns.heatmap(df.isna(), yticklabels=False, cmap='crest', cbar=False)
plt.title("Heatmap of Missing Data")
plt.show()
# Remove unnecessary columns (e.g., unnamed index columns)
# Unnamed columns are usually generated when the index is stored as a column during CSV creation.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
print("Dataset after removing unnecessary columns:")
print(df.head())
# Replace -200 values with NaN (missing data)
# -200 is used in this dataset to represent missing or invalid data.
df.replace(to_replace=-200, value=np.nan, inplace=True)
# Drop rows with missing data
# This is a straightforward approach to handle missing data, but might result in data loss.
# Consider using imputation techniques if too much data is being dropped.
df.dropna(inplace=True)
print("Dataset after dropping rows with missing data:")
print(df.tail())
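# Alternative sketch (not used here): linear interpolation keeps rows instead of
# dropping them; assumes the rows are in chronological order.
# num = df.select_dtypes(include=np.number).columns
# df[num] = df[num].interpolate(method='linear')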
# Feature Engineering: Extract 'Month' from 'Date'
# This helps in analyzing seasonal trends in air quality.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['Month'] = df['Date'].dt.month
# Visualization: Histogram of CO(GT)
# Visualize the distribution of CO levels to understand its frequency and range.
plt.figure(figsize=(10, 6))
df['CO(GT)'].plot(kind='hist', bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of CO(GT)')
plt.xlabel('CO(GT)')
plt.ylabel('Frequency')
plt.gca().spines[['top', 'right']].set_visible(False)
plt.show()
# Visualization: Average CO(GT) by Month
# Analyze how CO levels vary month by month to check for seasonal patterns.
monthly_avg = df.groupby('Month')['CO(GT)'].mean()
plt.figure(figsize=(10, 6))
sns.barplot(x=monthly_avg.index, y=monthly_avg.values, palette='crest')
plt.title('Average CO(GT) by Month')
plt.xlabel('Month')
plt.ylabel('Average CO(GT)')
plt.gca().spines[['top', 'right']].set_visible(False)
plt.show()
# Additional Data Exploration: Summary statistics
# Provides insights into central tendency, dispersion, and overall data distribution.
print("Summary statistics of the dataset:")
print(df.describe())
# Impute any remaining missing values with the column mean
# (rows with NaN were already dropped above, so this acts as a safeguard;
# restricted to numeric columns so datetime/text columns do not raise a TypeError)
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].mean())
print("Missing values after imputation:")
print(df.isna().sum())
# Box plot for outlier checking
# Visualize outliers in the data; outliers are potential anomalies and can hurt model performance.
plt.figure(figsize=(12, 8))
sns.boxplot(data=df.select_dtypes(include=np.number))  # numeric columns only
plt.title('Box Plot for Outlier Checking')
plt.xticks(rotation=45)
plt.show()
# Calculate the Interquartile Range (IQR) on the numeric columns for outlier detection
numeric_cols = df.select_dtypes(include=np.number).columns
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
# Flag values beyond 1.5 * IQR below Q1 or above Q3
outlier_mask = (df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))
# Replace flagged outliers with the median of the respective column
for col in outlier_mask.columns:
    median = df[col].median()
    df.loc[outlier_mask[col], col] = median
# Verify outlier handling
print("Outliers after replacement:")
print(((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).sum())
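# Alternative sketch (not used above): winsorize by clipping to the IQR fences
# instead of replacing outliers with the median.
# df[numeric_cols] = df[numeric_cols].clip(lower=Q1 - 1.5 * IQR,
#                                          upper=Q3 + 1.5 * IQR, axis=1)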
# Box Plot after handling outliers
plt.figure(figsize=(12, 8))
sns.boxplot(data=df[numeric_cols])
plt.title('Box Plot after Outlier Handling')
plt.xticks(rotation=45)
plt.show()
# Correlation Heatmap
# Visualize correlations between features to identify potential relationships.
plt.figure(figsize=(10, 5))
sns.heatmap(df.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.title('Correlation Matrix')
plt.show()
# Feature Selection and Target Variable
# Select features (independent variables) and the target variable for model training.
X = df[['CO(GT)', 'PT08.S1(CO)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)']]
y = df['C6H6(GT)']
# Convert the features and target variable to PyTorch tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)
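# Note: the features are fed to the network unscaled. Standardizing usually
# speeds up and stabilizes ANN training; a sketch (not applied here):
# from sklearn.preprocessing import StandardScaler
# X_tensor = torch.tensor(StandardScaler().fit_transform(X.values), dtype=torch.float32)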
# Splitting the data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
# Define an Artificial Neural Network (ANN) model
class ANN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # input -> hidden layer
        self.fc2 = nn.Linear(hidden_size, 1)           # hidden -> single regression output

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # ReLU activation on the hidden layer
        x = self.fc2(x)              # linear output layer (regression)
        return x
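# Quick sanity check (illustrative): a dummy batch of 2 samples with 8 features
# should map to an output of shape (2, 1).
# assert ANN(8, 64)(torch.zeros(2, 8)).shape == (2, 1)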
# Initialize the model
input_size = X_train.shape[1] # Number of input features
hidden_size = 64 # Number of neurons in the hidden layer
model = ANN(input_size, hidden_size)
# Define loss function and optimizer
criterion = nn.MSELoss() # Mean Squared Error Loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001) # Adam optimizer
# Training the model with validation
num_epochs = 100
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    # Validation pass (no gradients needed)
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')
# Evaluation on Training and Test Data
with torch.no_grad():
    y_pred_train = model(X_train)
    train_rmse = torch.sqrt(criterion(y_pred_train, y_train)).item()
    train_r2 = r2_score(y_train.numpy(), y_pred_train.numpy())
    y_pred_test = model(X_test)
    test_rmse = torch.sqrt(criterion(y_pred_test, y_test)).item()
    test_r2 = r2_score(y_test.numpy(), y_pred_test.numpy())
print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R^2 Score:", train_r2)
print("Test R^2 Score:", test_r2)
# Combine 'Date' and 'Time' into a single datetime column.
# 'Date' was already parsed to datetime above, so format it back to a string
# before concatenating with the 'HH.MM.SS' time strings.
df['Datetime'] = pd.to_datetime(df['Date'].dt.strftime('%d/%m/%Y') + ' ' + df['Time'],
                                format='%d/%m/%Y %H.%M.%S')
# Extract various time-based features
df['Year'] = df['Datetime'].dt.year
df['Month'] = df['Datetime'].dt.month
df['Day'] = df['Datetime'].dt.day
df['Hour'] = df['Datetime'].dt.hour
df['DayOfWeek'] = df['Datetime'].dt.dayofweek
df['Weekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0) # 1 if weekend else 0
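# Optional sketch: cyclical encoding of 'Hour', so 23:00 and 00:00 end up close
# in feature space (not used by the model above).
# df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
# df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)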
# Bonus time-based feature (just for fun): season
# Define seasons based on month (Northern Hemisphere definitions)
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'
df['Season'] = df['Month'].apply(get_season)
print("Extracted time-based features:")
print(df[['Year', 'Month', 'Day', 'Hour', 'DayOfWeek', 'Weekend', 'Season']].head())
# Create lag features for pollutants and weather data (example with CO(GT) and NO2(GT))
# Note: shift() is row-based; since rows were dropped above, the series has gaps,
# so a 1-row lag only approximates a 1-hour lag.
lags = [1, 2, 3, 6, 12, 24]  # lag intervals in hours
for lag in lags:
    df[f'CO(GT)_lag_{lag}'] = df['CO(GT)'].shift(lag)
    df[f'NO2(GT)_lag_{lag}'] = df['NO2(GT)'].shift(lag)
# Add more lagged features as needed
# Drop rows with NaN values generated by lagging
df.dropna(inplace=True)
print("Lag features added:")
print(df.head())
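# Related sketch: rolling statistics complement lag features by summarizing a
# recent window rather than a single past reading (column name is illustrative).
# df['CO(GT)_roll24_mean'] = df['CO(GT)'].rolling(window=24).mean()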