-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
196 lines (140 loc) · 5.52 KB
/
preprocessing.py
File metadata and controls
196 lines (140 loc) · 5.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import os
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset
from torchvision import transforms
# Columns removed from the raw dataframe before modeling.
COLS_TO_DROP = ["No.", "Id"]
# Key column used by sort_dataframe to align rows with image filenames.
COL_ID = ["Id"]
# Nominal target column; label-encoded by encode_nominal_data.
COL_LABEL = ["wheatvariety"]
# Raw measurement columns kept as model features.
COLS_TO_KEEP = [
    "kernelarea",
    "kernelperimeter",
    "compactness",
    "kernellength",
    "kernelwidth",
    "asymmetry",
    "groovelength",
    "germarea",
    "germlength",
]
# Engineered ratio features created by add_indirect_features.
COLS_TO_ADD = [
    "germarea_kernelarea",
    "germlength_kernellength",
    "kernelwidth_kernellength",
]
# Full feature set fed to the model: engineered ratios first, then raw columns.
COL_FEATURES = COLS_TO_ADD + COLS_TO_KEEP
def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without the bookkeeping columns listed in COLS_TO_DROP."""
    return df.drop(labels=COLS_TO_DROP, axis="columns")
def encode_nominal_data(df: pd.DataFrame) -> pd.DataFrame:
    """Label-encode each nominal column listed in COL_LABEL.

    Fix: the original wrote the encoded values back into the caller's
    dataframe in place, unlike add_indirect_features which copies; this
    version copies first so the input frame is left untouched. The return
    value is unchanged for existing callers (preprocess_data reassigns it).
    """
    encoded = df.copy()
    encoder = LabelEncoder()
    for col in COL_LABEL:
        # fit_transform refits the encoder per column, so reusing one
        # LabelEncoder instance across columns is safe.
        encoded[col] = encoder.fit_transform(encoded[col])
    return encoded
def normalize_data(df: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale every column in COL_FEATURES into [0, 1].

    Fixes over the original:
    - a constant column (max == min) no longer divides by zero and fills
      the column with NaN; it is mapped to 0.0 instead;
    - the input dataframe is no longer mutated in place (matches the
      copy-then-modify style of add_indirect_features).
    """
    scaled = df.copy()
    for col in COL_FEATURES:
        col_min = scaled[col].min()
        col_range = scaled[col].max() - col_min
        if col_range == 0:
            # Zero-variance feature: any constant maps to the same point.
            scaled[col] = 0.0
        else:
            scaled[col] = (scaled[col] - col_min) / col_range
    return scaled
def add_indirect_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* extended with three ratio features.

    Adds germarea/kernelarea, germlength/kernellength and
    kernelwidth/kernellength as new columns; the input frame is untouched.
    """
    augmented = df.copy()
    # new column name -> (numerator column, denominator column)
    ratio_specs = {
        "germarea_kernelarea": ("germarea", "kernelarea"),
        "germlength_kernellength": ("germlength", "kernellength"),
        "kernelwidth_kernellength": ("kernelwidth", "kernellength"),
    }
    for new_col, (num, den) in ratio_specs.items():
        augmented[new_col] = df[num] / df[den]
    return augmented
def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Run the full tabular pipeline and split into features X and labels y.

    Pipeline: drop bookkeeping columns -> label-encode the target ->
    add ratio features -> min-max normalize -> select COL_FEATURES / COL_LABEL.
    """
    cleaned = drop_columns(df)
    cleaned = encode_nominal_data(cleaned)
    cleaned = normalize_data(add_indirect_features(cleaned))
    return cleaned[COL_FEATURES], cleaned[COL_LABEL]
def pd_to_numpy_X_y(X: pd.DataFrame, y: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """Convert feature/label dataframes to NumPy; labels become a 1-D vector."""
    return X.to_numpy(), y.to_numpy().ravel()
def numpy_to_tensor_X_y(
    X: np.ndarray, y: np.ndarray
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Wrap NumPy features/labels as float32 / int64 torch tensors (copies)."""
    features = torch.tensor(X, dtype=torch.float32)
    labels = torch.tensor(y, dtype=torch.long)
    return features, labels
def get_tensor_dataset(X: np.ndarray, y: np.ndarray) -> TensorDataset:
    """Bundle features (float32) and labels (long) into one TensorDataset."""
    return TensorDataset(
        torch.tensor(X, dtype=torch.float32),
        torch.tensor(y, dtype=torch.long),
    )
### IMAGE PREPROCESSING ########################################
def to_grayscale(img: Image.Image) -> Image.Image:
    """Return a single-channel ("L" mode) copy of *img*."""
    grayscale_mode = "L"
    return img.convert(grayscale_mode)
def center_crop(img: Image.Image, target_width: int, target_height: int) -> Image.Image:
    """Crop a centered target_width x target_height window out of *img*.

    Floor division places any odd leftover pixel on the right/bottom edge.
    """
    src_w, src_h = img.size
    x0 = (src_w - target_width) // 2
    y0 = (src_h - target_height) // 2
    return img.crop((x0, y0, x0 + target_width, y0 + target_height))
def pad_image(img: Image.Image, target_width: int, target_height: int) -> Image.Image:
    """Center *img* on a new target_width x target_height canvas of the same mode.

    The border is left at the canvas default fill; odd leftover pixels go to
    the right/bottom edge because of the floor-divided offsets.

    Raises:
        ValueError: if either target dimension is smaller than the image.
    """
    src_w, src_h = img.size
    if target_height < src_h or target_width < src_w:
        raise ValueError(
            f"Target dimensions ({target_width}, {target_height}) can't be smaller than original dimensions ({src_w}, {src_h})"
        )
    canvas = Image.new(img.mode, (target_width, target_height))
    offset = ((target_width - src_w) // 2, (target_height - src_h) // 2)
    canvas.paste(img, offset)
    return canvas
def image_to_tensor(img: Image.Image) -> torch.Tensor:
    """Convert a PIL image to a torch tensor via torchvision's ToTensor.

    NOTE(review): per torchvision docs ToTensor yields a CHW float tensor
    scaled into [0, 1] for standard image modes.
    """
    return transforms.ToTensor()(img)
def get_max_sizes(images: List[Image.Image]) -> Tuple[int, int]:
    """Return (max width, max height) over *images*.

    Raises ValueError (from max) when *images* is empty, as the original did.
    """
    widest = max(img.width for img in images)
    tallest = max(img.height for img in images)
    return widest, tallest
def get_min_sizes(images: List[Image.Image]) -> Tuple[int, int]:
    """Return (min width, min height) over *images*.

    Raises ValueError (from min) when *images* is empty, as the original did.
    """
    narrowest = min(img.width for img in images)
    shortest = min(img.height for img in images)
    return narrowest, shortest
def preprocess_images(images: List[Image.Image]) -> torch.Tensor:
    """Grayscale and pad every image to the batch's max size, then stack.

    Returns a single tensor stacked along a new leading batch dimension
    (one channel per image after the grayscale conversion).
    """
    max_width, max_height = get_max_sizes(images)
    batch = [
        image_to_tensor(pad_image(to_grayscale(img), max_width, max_height))
        for img in images
    ]
    return torch.stack(batch, dim=0)
### IMAGE REDUCTION ########################################
def img_tensor_to_ndarray(images: torch.Tensor) -> np.ndarray:
    """Copy a (possibly autograd-tracked, possibly GPU-resident) tensor to NumPy."""
    # detach-then-cpu is equivalent to the original cpu-then-detach order.
    return images.detach().cpu().numpy()
def dim_reduction(images: np.ndarray | torch.Tensor, target_dim: int) -> torch.Tensor:
    """Flatten each sample and project it onto *target_dim* PCA components."""
    n_samples = len(images)
    flat = images.reshape(n_samples, -1)
    projected = PCA(n_components=target_dim).fit_transform(flat)  # type: ignore
    return torch.from_numpy(projected)
### JOINED PREPROCESSING ########################################
def sort_dataframe(df: pd.DataFrame, ids: List[str]) -> pd.DataFrame:
    """Reorder *df* rows to follow the order of image filenames in *ids*.

    Each entry of *ids* is an image filename whose extension is stripped to
    recover the "Id" key used for the lookup.

    Fix: the original chopped a fixed 4 characters (`iid[:-4]`), which only
    works for 3-letter extensions like ".png" and silently corrupts keys for
    ".jpeg", ".tif", or extension-less names; os.path.splitext strips the
    actual extension whatever its length.
    """
    keys = [os.path.splitext(iid)[0] for iid in ids]
    return df.set_index(COL_ID).loc[keys].reset_index()
def preprocess_all(
    df: pd.DataFrame, imgs: List[Image.Image], ids: List[str]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Run both pipelines and return (tabular features, images, labels) tensors.

    The dataframe is first reordered by *ids* so its rows line up with *imgs*.
    """
    aligned = sort_dataframe(df, ids)
    features_df, labels_df = preprocess_data(aligned)
    features_np, labels_np = pd_to_numpy_X_y(features_df, labels_df)
    features_t, labels_t = numpy_to_tensor_X_y(features_np, labels_np)
    images_t = preprocess_images(imgs)
    return features_t, images_t, labels_t
def tensors_to_dataset(
    features: torch.Tensor, imgs: torch.Tensor, labels: torch.Tensor
) -> TensorDataset:
    """Zip per-sample tabular features, images, and labels into one TensorDataset."""
    sources = (features, imgs, labels)
    return TensorDataset(*sources)
def join_multisource(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
    """Concatenate two per-sample feature tensors along dim 1 (the feature axis)."""
    return torch.cat([x1, x2], dim=1)