Skip to content

Commit 9359351

Browse files
feat: methodology upgrades. include positions analysis. remove phylo module.
1 parent 2ec2b17 commit 9359351

18 files changed

+677
-975
lines changed

Cargo.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
[package]
22
name = "gramep"
3-
version = "1.0.9"
3+
version = "1.1.0"
44
edition = "2021"
55

66
[lib]
77
name = "utilrs"
88
crate-type = ["cdylib"]
99

1010
[dependencies]
11-
pyo3 = "0.22.2"
11+
pyo3 = "0.22.5"
1212
rayon = "1.10"
1313
num_cpus = "1.16"
1414
rustc-hash = "2.0.0"
1515
bio = "2.0.1"
16-
regex = "1.10.6"
16+
regex = "1.11.1"
1717
concat-string = "1.0.1"
1818
itertools = "0.13.0"
1919
spinners = "4.1.1"
20-
anyhow = "1.0.89"
20+
anyhow = "1.0.91"
2121

2222
[profile.release]
2323
lto = 'thin'

data/alpha_kmers.txt

Lines changed: 0 additions & 4 deletions
This file was deleted.

data/parameters.ini

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,6 @@ snps_max = 1
1717
dictonary = ACTG
1818
# Create report
1919
create_report = True
20-
# Save Exclusive kmers
21-
save_kmers = True
22-
# Load Exclusive kmers
23-
load_exclusive_kmers = False
24-
# Path Exclusive kmers
25-
path_exclusive_kmers = None
2620
# Chunk size
2721
chunk_size = 100
2822

@@ -41,14 +35,10 @@ step = 1
4135
save_path = data/output/mutations/
4236
# Dir path
4337
dir_path = data/VOCs/
38+
# Get kmers
39+
get_kmers = False
4440
# dictonary (ACTG/ACGU)
4541
dictonary = ACTG
46-
# Save data
47-
should_save_data = True
48-
# Save model
49-
should_save_model = True
50-
# Save confusion matrix
51-
should_save_confusion_matrix = True
5242
# Chunk size
5343
chunk_size = 100
5444

@@ -71,9 +61,3 @@ load_ranges_path = data/output/classify/model/ranges.sav
7161
load_model_path = data/output/classify/model/model.sav
7262
# Chunk size
7363
chunk_size = 100
74-
75-
[phylogenic]
76-
# Save path
77-
save_path = data/output/mutations/
78-
# Save model
79-
save_heatmap = False

data/tutorial_data.tar.xz

17.5 KB
Binary file not shown.

pyproject.toml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@ build-backend = "maturin"
44

55
[project]
66
name = "gramep"
7-
version = "1.0.9"
7+
version = "1.1.0"
88
requires-python = ">=3.12"
99
description = "GRAMEP - Genome vaRiation Analysis from the Maximum Entropy Principle"
1010
authors = [
11-
{ name="Matheus Pimenta", email="matheus.pimenta@outlook.com"},
11+
{ name="Matheus Pimenta", email="omatheuspimenta@outlook.com"},
1212
{ name="Fabricio Lopes", email="fabricio@utfpr.edu.br"},
1313
]
1414
maintainers = [
15-
{ name="Matheus Pimenta", email="matheus.pimenta@outlook.com"},
15+
{ name="Matheus Pimenta", email="omatheuspimenta@outlook.com"},
1616
]
1717
readme = "README.md"
1818
license = {text = "MIT License"}
@@ -40,8 +40,6 @@ dependencies = [
4040
"upsetplot >=0.9.0",
4141
"scikit-learn >=1.5.1",
4242
"seaborn >=0.13.2",
43-
"polars >=1.5.0",
44-
"toytree >=3.0.5",
4543
"thefuzz >=0.22.1",
4644
]
4745

python/gramep/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '1.0.9'
1+
__version__ = '1.1.0'

python/gramep/analysis.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def mutations_analysis(
4343
seq_path: str,
4444
ref_path: str,
4545
seq_kmers_exclusive: list[str],
46+
kmers_positions: defaultdict[str, list[int]],
4647
word: int,
4748
step: int,
4849
snps_max: int,
@@ -100,6 +101,7 @@ def mutations_analysis(
100101
seq_path=seq_path,
101102
ref_path=ref_path,
102103
exclusive_kmers=seq_kmers_exclusive,
104+
final_positions=kmers_positions,
103105
k=word,
104106
step=step,
105107
max_dist=snps_max,

python/gramep/classify_utils.py

Lines changed: 45 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ def extract_features(
116116

117117
if variants_kmers is None:
118118
variants_kmers = load_variants_kmers(save_path=save_path)
119+
message.info_founded_features(len(variants_kmers))
119120

120121
file_list = [
121122
dir_path + '/' + name
@@ -137,8 +138,6 @@ def extract_features(
137138
def process_dataframe(
138139
data_frame: pd.DataFrame,
139140
dir_path: str = None,
140-
should_save_data: bool = False,
141-
should_save_model: bool = False,
142141
) -> tuple[pd.DataFrame, np.ndarray]:
143142
"""
144143
Process a DataFrame and optionally save data and model.
@@ -151,9 +150,6 @@ def process_dataframe(
151150
data_frame (pd.DataFrame): The DataFrame to be processed.
152151
dir_path (str, optional): The directory path for saving data and model. \
153152
Default is None.
154-
should_save_data (bool, optional): Whether to save processed data. \
155-
Default is False.
156-
should_save_model (bool, optional): Whether to save a model. Default is False.
157153
158154
Returns:
159155
tuple[pd.DataFrame, np.ndarray]: A tuple containing the processed \
@@ -169,37 +165,38 @@ def process_dataframe(
169165
data_frame.drop(columns=['CLASS'], axis=1, inplace=True)
170166
data_frame.replace([np.inf, -np.inf], 0, inplace=True)
171167
data_frame.replace(np.nan, 0, inplace=True)
172-
df_col_names = data_frame.columns
168+
169+
df = data_frame.sort_index(axis=1)
170+
del data_frame
171+
172+
df_col_names = df.columns
173173

174174
# MinMax Scaler
175175
minMax_scaler = MinMaxScaler()
176-
minMax_scaler.fit(data_frame)
177-
df_minmax = minMax_scaler.transform(data_frame)
178-
data_frame = pd.DataFrame(df_minmax)
176+
minMax_scaler.fit(df)
177+
df_minmax = minMax_scaler.transform(df)
178+
df = pd.DataFrame(df_minmax)
179179
del df_minmax
180180
label_encdr = LabelEncoder()
181181
class_values = label_encdr.fit_transform(class_values)
182182

183-
data_frame.columns = df_col_names
184-
data_frame['CLASS'] = class_values
183+
df.columns = df_col_names
184+
df['CLASS'] = class_values
185185

186-
if should_save_data:
187-
save_data(
188-
data_frame=data_frame,
189-
class_names_to_save=class_names_to_save,
190-
dir_path=dir_path,
191-
)
192-
if should_save_model:
193-
save_ranges(ranges=minMax_scaler, dir_path=dir_path)
194-
return data_frame, name_class
186+
save_data(
187+
data_frame=df,
188+
class_names_to_save=class_names_to_save,
189+
dir_path=dir_path,
190+
)
191+
192+
save_ranges(ranges=minMax_scaler, dir_path=dir_path)
193+
return df, name_class
195194

196195

197196
def sequence_classification(
198197
data_frame: pd.DataFrame,
199198
name_class: np.ndarray,
200199
dir_path: str,
201-
should_save_model: bool = False,
202-
should_save_confusion_matrix: bool = False,
203200
) -> None:
204201
"""
205202
Perform sequence classification based on provided data and options.
@@ -213,10 +210,6 @@ def sequence_classification(
213210
data_frame (pd.DataFrame): The data frame containing sequence data and features.
214211
name_class (np.ndarray): The array of class names corresponding to the data.
215212
dir_path (str): The path to the directory for saving model and plot files.
216-
should_save_model (bool, optional): Whether to save the trained model. \
217-
Default is False.
218-
should_save_confusion_matrix (bool, optional): Whether to save the \
219-
confusion matrix plot. Default is False.
220213
221214
Returns:
222215
None
@@ -235,8 +228,7 @@ def sequence_classification(
235228
rf_classifier = RandomForestClassifier(n_estimators=100)
236229
rf_classifier.fit(x_train, y_train)
237230

238-
if should_save_model:
239-
save_model(model=rf_classifier, dir_path=dir_path)
231+
save_model(model=rf_classifier, dir_path=dir_path)
240232

241233
# Make predictions on the test set
242234
y_pred = rf_classifier.predict(x_test)
@@ -266,15 +258,14 @@ def sequence_classification(
266258
save_metrics(acc=acc, metrics=metrics, dir_path=dir_path)
267259
del acc, metrics
268260

269-
if should_save_confusion_matrix:
270-
conf_mtx = confusion_matrix(y_true=y_test, y_pred=y_pred)
271-
vmax = max(np.unique(y_test, return_counts=True)[1])
272-
save_confusion_matrix(
273-
conf_mtx=conf_mtx,
274-
name_class=name_class,
275-
vmax=vmax,
276-
dir_path=dir_path,
277-
)
261+
conf_mtx = confusion_matrix(y_true=y_test, y_pred=y_pred)
262+
vmax = max(np.unique(y_test, return_counts=True)[1])
263+
save_confusion_matrix(
264+
conf_mtx=conf_mtx,
265+
name_class=name_class,
266+
vmax=vmax,
267+
dir_path=dir_path,
268+
)
278269
return
279270

280271

@@ -283,12 +274,9 @@ def classify(
283274
step: int,
284275
save_path: str,
285276
dir_path: str,
286-
should_get_kmers: bool = False,
277+
get_kmers: bool = False,
287278
reference_path: str | None = None,
288279
dictonary: str = 'DNA',
289-
should_save_data: bool = True,
290-
should_save_model: bool = True,
291-
should_save_confusion_matrix: bool = True,
292280
chunk_size: int = 100,
293281
):
294282
"""
@@ -305,25 +293,19 @@ def classify(
305293
step (int): The step size for moving the sliding window.
306294
save_path (str): The path to save the processed data and model files.
307295
dir_path (str): The path to the directory containing sequence data.
308-
should_get_kmers (bool, optional): Whether to extract exclusive k-mers. \
296+
get_kmers (bool, optional): Whether to extract exclusive k-mers. \
309297
Default is False.
310298
reference_path (str, optional): The path to the reference sequence data file. \
311299
Default is None.
312300
dictonary (str): The DNA dictionary for k-mer analysis. Default is 'DNA'.
313-
should_save_data (bool, optional): Whether to save processed data. \
314-
Default is True.
315-
should_save_model (bool, optional): Whether to save the trained model. \
316-
Default is True.
317-
should_save_confusion_matrix (bool, optional): Whether to save the \
318-
confusion matrix plot. Default is True.
319301
chunk_size (int, optional): The chunk size for loading sequences. \
320302
Default is 100.
321303
322304
Returns:
323305
Message class: A message confirming the classification pipeline has completed.
324306
"""
325307
exclusive_kmers = None
326-
if should_get_kmers:
308+
if get_kmers:
327309
file_list = [
328310
name for name in listdir(dir_path) if fnmatch(name, '*.fasta')
329311
]
@@ -338,14 +320,16 @@ def classify(
338320
word=word,
339321
step=step,
340322
dictonary=dictonary,
341-
save_kmers=False,
323+
save_path=save_path,
342324
chunk_size=chunk_size,
343325
)
344326
for file in files
345327
)
346328

347329
exclusive_kmers = np.unique(np.concatenate(exclusive_kmers))
348330

331+
message.info_founded_features(len(exclusive_kmers))
332+
349333
data_frame = extract_features(
350334
word=word,
351335
step=step,
@@ -360,15 +344,11 @@ def classify(
360344
df_process, name_class = process_dataframe(
361345
data_frame=data_frame,
362346
dir_path=save_path,
363-
should_save_data=should_save_data,
364-
should_save_model=should_save_model,
365347
)
366348
sequence_classification(
367349
data_frame=df_process,
368350
name_class=name_class,
369351
dir_path=save_path,
370-
should_save_model=should_save_model,
371-
should_save_confusion_matrix=should_save_confusion_matrix,
372352
)
373353
return message.info_done()
374354

@@ -430,6 +410,7 @@ def extract_features_to_predict(
430410
)
431411

432412
data_frame = pd.DataFrame(features)
413+
433414
return data_frame
434415

435416

@@ -459,16 +440,20 @@ def process_dataframe_predict(
459440
data_frame.drop(columns=['ID'], axis=1, inplace=True)
460441
data_frame.replace([np.inf, -np.inf], 0, inplace=True)
461442
data_frame.replace(np.nan, 0, inplace=True)
462-
df_col_names = data_frame.columns
463443

464-
df_minmax = minMax_scaler.transform(data_frame)
465-
data_frame = pd.DataFrame(df_minmax)
444+
df = data_frame.sort_index(axis=1)
445+
del data_frame
446+
447+
df_col_names = df.columns
448+
449+
df_minmax = minMax_scaler.transform(df)
450+
df = pd.DataFrame(df_minmax)
466451
del df_minmax
467452

468-
data_frame.columns = df_col_names
469-
data_frame['ID'] = id_values
453+
df.columns = df_col_names
454+
df['ID'] = id_values
470455

471-
return data_frame
456+
return df
472457

473458

474459
def predict_data(

0 commit comments

Comments (0)