-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
196 lines (140 loc) · 5.52 KB
/
preprocessing.py
File metadata and controls
196 lines (140 loc) · 5.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import os
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset
from torchvision import transforms
# Columns removed from the raw dataframe before modeling.
COLS_TO_DROP = ["No.", "Id"]
# Key column used by sort_dataframe to align rows with image filenames.
COL_ID = ["Id"]
# Nominal target column; label-encoded by encode_nominal_data.
COL_LABEL = ["wheatvariety"]
# Raw measurement columns kept as model features.
COLS_TO_KEEP = [
    "kernelarea",
    "kernelperimeter",
    "compactness",
    "kernellength",
    "kernelwidth",
    "asymmetry",
    "groovelength",
    "germarea",
    "germlength",
]
# Engineered ratio features created by add_indirect_features.
COLS_TO_ADD = [
    "germarea_kernelarea",
    "germlength_kernellength",
    "kernelwidth_kernellength",
]
# Full feature set fed to the model: engineered ratios first, then raw columns.
COL_FEATURES = COLS_TO_ADD + COLS_TO_KEEP
def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without the bookkeeping columns listed in COLS_TO_DROP."""
    return df.drop(labels=COLS_TO_DROP, axis="columns")
def encode_nominal_data(df: pd.DataFrame) -> pd.DataFrame:
    """Label-encode each nominal column listed in COL_LABEL.

    Fix: the original wrote the encoded values back into the caller's
    dataframe in place, unlike add_indirect_features which copies; this
    version copies first so the input frame is left untouched. The return
    value is unchanged for existing callers (preprocess_data reassigns it).
    """
    encoded = df.copy()
    encoder = LabelEncoder()
    for col in COL_LABEL:
        # fit_transform refits the encoder per column, so reusing one
        # LabelEncoder instance across columns is safe.
        encoded[col] = encoder.fit_transform(encoded[col])
    return encoded
def normalize_data(df: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale every column in COL_FEATURES into [0, 1].

    Fixes over the original:
    - a constant column (max == min) no longer divides by zero and fills
      the column with NaN; it is mapped to 0.0 instead;
    - the input dataframe is no longer mutated in place (matches the
      copy-then-modify style of add_indirect_features).
    """
    scaled = df.copy()
    for col in COL_FEATURES:
        col_min = scaled[col].min()
        col_range = scaled[col].max() - col_min
        if col_range == 0:
            # Zero-variance feature: any constant maps to the same point.
            scaled[col] = 0.0
        else:
            scaled[col] = (scaled[col] - col_min) / col_range
    return scaled
def add_indirect_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* extended with three ratio features.

    Adds germarea/kernelarea, germlength/kernellength and
    kernelwidth/kernellength as new columns; the input frame is untouched.
    """
    augmented = df.copy()
    # new column name -> (numerator column, denominator column)
    ratio_specs = {
        "germarea_kernelarea": ("germarea", "kernelarea"),
        "germlength_kernellength": ("germlength", "kernellength"),
        "kernelwidth_kernellength": ("kernelwidth", "kernellength"),
    }
    for new_col, (num, den) in ratio_specs.items():
        augmented[new_col] = df[num] / df[den]
    return augmented
def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Run the full tabular pipeline and split into features X and labels y.

    Pipeline: drop bookkeeping columns -> label-encode the target ->
    add ratio features -> min-max normalize -> select COL_FEATURES / COL_LABEL.
    """
    cleaned = drop_columns(df)
    cleaned = encode_nominal_data(cleaned)
    cleaned = normalize_data(add_indirect_features(cleaned))
    return cleaned[COL_FEATURES], cleaned[COL_LABEL]
def pd_to_numpy_X_y(X: pd.DataFrame, y: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """Convert feature/label dataframes to NumPy; labels become a 1-D vector."""
    return X.to_numpy(), y.to_numpy().ravel()
def numpy_to_tensor_X_y(
    X: np.ndarray, y: np.ndarray
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Wrap NumPy features/labels as float32 / int64 torch tensors (copies)."""
    features = torch.tensor(X, dtype=torch.float32)
    labels = torch.tensor(y, dtype=torch.long)
    return features, labels
def get_tensor_dataset(X: np.ndarray, y: np.ndarray) -> TensorDataset:
    """Bundle features (float32) and labels (long) into one TensorDataset."""
    return TensorDataset(
        torch.tensor(X, dtype=torch.float32),
        torch.tensor(y, dtype=torch.long),
    )
### IMAGE PREPROCESSING ########################################
def to_grayscale(img: Image.Image) -> Image.Image:
    """Return a single-channel ("L" mode) copy of *img*."""
    grayscale_mode = "L"
    return img.convert(grayscale_mode)
def center_crop(img: Image.Image, target_width: int, target_height: int) -> Image.Image:
    """Crop a centered target_width x target_height window out of *img*.

    Floor division places any odd leftover pixel on the right/bottom edge.
    """
    src_w, src_h = img.size
    x0 = (src_w - target_width) // 2
    y0 = (src_h - target_height) // 2
    return img.crop((x0, y0, x0 + target_width, y0 + target_height))
def pad_image(img: Image.Image, target_width: int, target_height: int) -> Image.Image:
    """Center *img* on a new target_width x target_height canvas of the same mode.

    The border is left at the canvas default fill; odd leftover pixels go to
    the right/bottom edge because of the floor-divided offsets.

    Raises:
        ValueError: if either target dimension is smaller than the image.
    """
    src_w, src_h = img.size
    if target_height < src_h or target_width < src_w:
        raise ValueError(
            f"Target dimensions ({target_width}, {target_height}) can't be smaller than original dimensions ({src_w}, {src_h})"
        )
    canvas = Image.new(img.mode, (target_width, target_height))
    offset = ((target_width - src_w) // 2, (target_height - src_h) // 2)
    canvas.paste(img, offset)
    return canvas
def image_to_tensor(img: Image.Image) -> torch.Tensor:
    """Convert a PIL image to a torch tensor via torchvision's ToTensor.

    NOTE(review): per torchvision docs ToTensor yields a CHW float tensor
    scaled into [0, 1] for standard image modes.
    """
    return transforms.ToTensor()(img)
def get_max_sizes(images: List[Image.Image]) -> Tuple[int, int]:
    """Return (max width, max height) over *images*.

    Raises ValueError (from max) when *images* is empty, as the original did.
    """
    widest = max(img.width for img in images)
    tallest = max(img.height for img in images)
    return widest, tallest
def get_min_sizes(images: List[Image.Image]) -> Tuple[int, int]:
    """Return (min width, min height) over *images*.

    Raises ValueError (from min) when *images* is empty, as the original did.
    """
    narrowest = min(img.width for img in images)
    shortest = min(img.height for img in images)
    return narrowest, shortest
def preprocess_images(images: List[Image.Image]) -> torch.Tensor:
    """Grayscale and pad every image to the batch's max size, then stack.

    Returns a single tensor stacked along a new leading batch dimension
    (one channel per image after the grayscale conversion).
    """
    max_width, max_height = get_max_sizes(images)
    batch = [
        image_to_tensor(pad_image(to_grayscale(img), max_width, max_height))
        for img in images
    ]
    return torch.stack(batch, dim=0)
### IMAGE REDUCTION ########################################
def img_tensor_to_ndarray(images: torch.Tensor) -> np.ndarray:
    """Copy a (possibly autograd-tracked, possibly GPU-resident) tensor to NumPy."""
    # detach-then-cpu is equivalent to the original cpu-then-detach order.
    return images.detach().cpu().numpy()
def dim_reduction(images: np.ndarray | torch.Tensor, target_dim: int) -> torch.Tensor:
    """Flatten each sample and project it onto *target_dim* PCA components."""
    n_samples = len(images)
    flat = images.reshape(n_samples, -1)
    projected = PCA(n_components=target_dim).fit_transform(flat)  # type: ignore
    return torch.from_numpy(projected)
### JOINED PREPROCESSING ########################################
def sort_dataframe(df: pd.DataFrame, ids: List[str]) -> pd.DataFrame:
    """Reorder *df* rows to follow the order of image filenames in *ids*.

    Each entry of *ids* is an image filename whose extension is stripped to
    recover the "Id" key used for the lookup.

    Fix: the original chopped a fixed 4 characters (`iid[:-4]`), which only
    works for 3-letter extensions like ".png" and silently corrupts keys for
    ".jpeg", ".tif", or extension-less names; os.path.splitext strips the
    actual extension whatever its length.
    """
    keys = [os.path.splitext(iid)[0] for iid in ids]
    return df.set_index(COL_ID).loc[keys].reset_index()
def preprocess_all(
    df: pd.DataFrame, imgs: List[Image.Image], ids: List[str]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Run both pipelines and return (tabular features, images, labels) tensors.

    The dataframe is first reordered by *ids* so its rows line up with *imgs*.
    """
    aligned = sort_dataframe(df, ids)
    features_df, labels_df = preprocess_data(aligned)
    features_np, labels_np = pd_to_numpy_X_y(features_df, labels_df)
    features_t, labels_t = numpy_to_tensor_X_y(features_np, labels_np)
    images_t = preprocess_images(imgs)
    return features_t, images_t, labels_t
def tensors_to_dataset(
    features: torch.Tensor, imgs: torch.Tensor, labels: torch.Tensor
) -> TensorDataset:
    """Zip per-sample tabular features, images, and labels into one TensorDataset."""
    sources = (features, imgs, labels)
    return TensorDataset(*sources)
def join_multisource(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
    """Concatenate two per-sample feature tensors along dim 1 (the feature axis)."""
    return torch.cat([x1, x2], dim=1)