@@ -116,6 +116,7 @@ def extract_features(
116
116
117
117
if variants_kmers is None :
118
118
variants_kmers = load_variants_kmers (save_path = save_path )
119
+ message .info_founded_features (len (variants_kmers ))
119
120
120
121
file_list = [
121
122
dir_path + '/' + name
@@ -137,8 +138,6 @@ def extract_features(
137
138
def process_dataframe (
138
139
data_frame : pd .DataFrame ,
139
140
dir_path : str = None ,
140
- should_save_data : bool = False ,
141
- should_save_model : bool = False ,
142
141
) -> tuple [pd .DataFrame , np .ndarray ]:
143
142
"""
144
143
Process a DataFrame and optionally save data and model.
@@ -151,9 +150,6 @@ def process_dataframe(
151
150
data_frame (pd.DataFrame): The DataFrame to be processed.
152
151
dir_path (str, optional): The directory path for saving data and model. \
153
152
Default is None.
154
- should_save_data (bool, optional): Whether to save processed data. \
155
- Default is False.
156
- should_save_model (bool, optional): Whether to save a model. Default is False.
157
153
158
154
Returns:
159
155
tuple[pd.DataFrame, np.ndarray]: A tuple containing the processed \
@@ -169,37 +165,38 @@ def process_dataframe(
169
165
data_frame .drop (columns = ['CLASS' ], axis = 1 , inplace = True )
170
166
data_frame .replace ([np .inf , - np .inf ], 0 , inplace = True )
171
167
data_frame .replace (np .nan , 0 , inplace = True )
172
- df_col_names = data_frame .columns
168
+
169
+ df = data_frame .sort_index (axis = 1 )
170
+ del data_frame
171
+
172
+ df_col_names = df .columns
173
173
174
174
# MinMax Scaler
175
175
minMax_scaler = MinMaxScaler ()
176
- minMax_scaler .fit (data_frame )
177
- df_minmax = minMax_scaler .transform (data_frame )
178
- data_frame = pd .DataFrame (df_minmax )
176
+ minMax_scaler .fit (df )
177
+ df_minmax = minMax_scaler .transform (df )
178
+ df = pd .DataFrame (df_minmax )
179
179
del df_minmax
180
180
label_encdr = LabelEncoder ()
181
181
class_values = label_encdr .fit_transform (class_values )
182
182
183
- data_frame .columns = df_col_names
184
- data_frame ['CLASS' ] = class_values
183
+ df .columns = df_col_names
184
+ df ['CLASS' ] = class_values
185
185
186
- if should_save_data :
187
- save_data (
188
- data_frame = data_frame ,
189
- class_names_to_save = class_names_to_save ,
190
- dir_path = dir_path ,
191
- )
192
- if should_save_model :
193
- save_ranges (ranges = minMax_scaler , dir_path = dir_path )
194
- return data_frame , name_class
186
+ save_data (
187
+ data_frame = df ,
188
+ class_names_to_save = class_names_to_save ,
189
+ dir_path = dir_path ,
190
+ )
191
+
192
+ save_ranges (ranges = minMax_scaler , dir_path = dir_path )
193
+ return df , name_class
195
194
196
195
197
196
def sequence_classification (
198
197
data_frame : pd .DataFrame ,
199
198
name_class : np .ndarray ,
200
199
dir_path : str ,
201
- should_save_model : bool = False ,
202
- should_save_confusion_matrix : bool = False ,
203
200
) -> None :
204
201
"""
205
202
Perform sequence classification based on provided data and options.
@@ -213,10 +210,6 @@ def sequence_classification(
213
210
data_frame (pd.DataFrame): The data frame containing sequence data and features.
214
211
name_class (np.ndarray): The array of class names corresponding to the data.
215
212
dir_path (str): The path to the directory for saving model and plot files.
216
- should_save_model (bool, optional): Whether to save the trained model. \
217
- Default is False.
218
- should_save_confusion_matrix (bool, optional): Whether to save the \
219
- confusion matrix plot. Default is False.
220
213
221
214
Returns:
222
215
None
@@ -235,8 +228,7 @@ def sequence_classification(
235
228
rf_classifier = RandomForestClassifier (n_estimators = 100 )
236
229
rf_classifier .fit (x_train , y_train )
237
230
238
- if should_save_model :
239
- save_model (model = rf_classifier , dir_path = dir_path )
231
+ save_model (model = rf_classifier , dir_path = dir_path )
240
232
241
233
# Make predictions on the test set
242
234
y_pred = rf_classifier .predict (x_test )
@@ -266,15 +258,14 @@ def sequence_classification(
266
258
save_metrics (acc = acc , metrics = metrics , dir_path = dir_path )
267
259
del acc , metrics
268
260
269
- if should_save_confusion_matrix :
270
- conf_mtx = confusion_matrix (y_true = y_test , y_pred = y_pred )
271
- vmax = max (np .unique (y_test , return_counts = True )[1 ])
272
- save_confusion_matrix (
273
- conf_mtx = conf_mtx ,
274
- name_class = name_class ,
275
- vmax = vmax ,
276
- dir_path = dir_path ,
277
- )
261
+ conf_mtx = confusion_matrix (y_true = y_test , y_pred = y_pred )
262
+ vmax = max (np .unique (y_test , return_counts = True )[1 ])
263
+ save_confusion_matrix (
264
+ conf_mtx = conf_mtx ,
265
+ name_class = name_class ,
266
+ vmax = vmax ,
267
+ dir_path = dir_path ,
268
+ )
278
269
return
279
270
280
271
@@ -283,12 +274,9 @@ def classify(
283
274
step : int ,
284
275
save_path : str ,
285
276
dir_path : str ,
286
- should_get_kmers : bool = False ,
277
+ get_kmers : bool = False ,
287
278
reference_path : str | None = None ,
288
279
dictonary : str = 'DNA' ,
289
- should_save_data : bool = True ,
290
- should_save_model : bool = True ,
291
- should_save_confusion_matrix : bool = True ,
292
280
chunk_size : int = 100 ,
293
281
):
294
282
"""
@@ -305,25 +293,19 @@ def classify(
305
293
step (int): The step size for moving the sliding window.
306
294
save_path (str): The path to save the processed data and model files.
307
295
dir_path (str): The path to the directory containing sequence data.
308
- should_get_kmers (bool, optional): Whether to extract exclusive k-mers. \
296
+ get_kmers (bool, optional): Whether to extract exclusive k-mers. \
309
297
Default is False.
310
298
reference_path (str, optional): The path to the reference sequence data file. \
311
299
Default is None.
312
300
dictonary (str): The DNA dictionary for k-mer analysis. Default is 'DNA'.
313
- should_save_data (bool, optional): Whether to save processed data. \
314
- Default is True.
315
- should_save_model (bool, optional): Whether to save the trained model. \
316
- Default is True.
317
- should_save_confusion_matrix (bool, optional): Whether to save the \
318
- confusion matrix plot. Default is True.
319
301
chunk_size (int, optional): The chunk size for loading sequences. \
320
302
Default is 100.
321
303
322
304
Returns:
323
305
Message class: A message confirming the classification pipeline has completed.
324
306
"""
325
307
exclusive_kmers = None
326
- if should_get_kmers :
308
+ if get_kmers :
327
309
file_list = [
328
310
name for name in listdir (dir_path ) if fnmatch (name , '*.fasta' )
329
311
]
@@ -338,14 +320,16 @@ def classify(
338
320
word = word ,
339
321
step = step ,
340
322
dictonary = dictonary ,
341
- save_kmers = False ,
323
+ save_path = save_path ,
342
324
chunk_size = chunk_size ,
343
325
)
344
326
for file in files
345
327
)
346
328
347
329
exclusive_kmers = np .unique (np .concatenate (exclusive_kmers ))
348
330
331
+ message .info_founded_features (len (exclusive_kmers ))
332
+
349
333
data_frame = extract_features (
350
334
word = word ,
351
335
step = step ,
@@ -360,15 +344,11 @@ def classify(
360
344
df_process , name_class = process_dataframe (
361
345
data_frame = data_frame ,
362
346
dir_path = save_path ,
363
- should_save_data = should_save_data ,
364
- should_save_model = should_save_model ,
365
347
)
366
348
sequence_classification (
367
349
data_frame = df_process ,
368
350
name_class = name_class ,
369
351
dir_path = save_path ,
370
- should_save_model = should_save_model ,
371
- should_save_confusion_matrix = should_save_confusion_matrix ,
372
352
)
373
353
return message .info_done ()
374
354
@@ -430,6 +410,7 @@ def extract_features_to_predict(
430
410
)
431
411
432
412
data_frame = pd .DataFrame (features )
413
+
433
414
return data_frame
434
415
435
416
@@ -459,16 +440,20 @@ def process_dataframe_predict(
459
440
data_frame .drop (columns = ['ID' ], axis = 1 , inplace = True )
460
441
data_frame .replace ([np .inf , - np .inf ], 0 , inplace = True )
461
442
data_frame .replace (np .nan , 0 , inplace = True )
462
- df_col_names = data_frame .columns
463
443
464
- df_minmax = minMax_scaler .transform (data_frame )
465
- data_frame = pd .DataFrame (df_minmax )
444
+ df = data_frame .sort_index (axis = 1 )
445
+ del data_frame
446
+
447
+ df_col_names = df .columns
448
+
449
+ df_minmax = minMax_scaler .transform (df )
450
+ df = pd .DataFrame (df_minmax )
466
451
del df_minmax
467
452
468
- data_frame .columns = df_col_names
469
- data_frame ['ID' ] = id_values
453
+ df .columns = df_col_names
454
+ df ['ID' ] = id_values
470
455
471
- return data_frame
456
+ return df
472
457
473
458
474
459
def predict_data (
0 commit comments