-
Notifications
You must be signed in to change notification settings - Fork 1
/
partial_motif_corrector.py
162 lines (124 loc) · 4.94 KB
/
partial_motif_corrector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 8 14:41:13 2024
@author: lafields2
"""
import pandas as pd
import csv
from Bio.SeqIO.FastaIO import SimpleFastaParser
# ---------------------------------------------------------------------------
# User settings
# ---------------------------------------------------------------------------
# Every input and output lives under one AMP database folder, so the
# individual paths are assembled from this shared prefix.
_amp_db_dir = r"D:\Manuscripts\2024_MotifAutomation\Algorithm_Validation\Databases\AMP"

# CSV whose 'motif' column holds the partial motifs ('X' is substituted later).
partial_motif_path = _amp_db_dir + r"\partial_motif_0.40_AMP_scores_2.csv"
# CSV whose 'motif' column holds the full motifs.
full_motif_path = _amp_db_dir + r"\full_motif_0amp_scores_2.csv"
# FASTA database whose sequences are searched for candidate motifs.
fasta_path = _amp_db_dir + r"\AMP_database_UNMC.fasta"
# Folder that receives the generated motif CSV files.
output_path = _amp_db_dir + r"\processed_dbs"
#manual_db_path = r"D:\Manuscripts\2024_MotifAutomation\Algorithm_Validation\Databases\Manual_NP_motif_DB.csv"

# Motifs shorter than this many characters are left out of the output files.
min_len = 2

# One-letter residue codes tried in place of 'X', in this order.
aa_list = list("AGILPVFWYDERHKSTCMNQ")
# Table of partial motifs read from partial_motif_path.
scores_rep = pd.read_csv(partial_motif_path)

# FASTA titles are parsed below but never stored, so this list stays empty.
title_to_df = []

# Keep only the sequence of every FASTA record; the handle is closed as soon
# as the with-block is left.
with open(fasta_path) as fasta_file:
    fasta_to_df = [sequence for _title, sequence in SimpleFastaParser(fasta_file)]

# Single-column frame holding the database sequences.
fasta_df = pd.DataFrame().assign(Sequence=fasta_to_df)
#fasta_df['Title'] = title_to_df
# Expand every partial motif by substituting each residue of aa_list for the
# wildcard 'X', and keep each distinct candidate that occurs as a substring of
# at least one database sequence. Candidates are stored in first-seen order.
#
# NOTE(review): str.replace substitutes *every* 'X' with the same residue, so
# a motif with several 'X' positions is only tried with one residue repeated
# at all of them — confirm this is the intended expansion.
partial_motif_list = scores_rep['motif'].values.tolist()
all_motif_options_storage = []

# Candidates that were already looked up (whether found or not). A set gives
# O(1) duplicate detection and lets the database scan run once per distinct
# candidate instead of once per (motif, residue) pair.
_tested_candidates = set()

for partial_motif in partial_motif_list:
    for residue in aa_list:
        candidate = partial_motif.replace('X', residue)
        if candidate in _tested_candidates:
            # Same string was checked before; the answer cannot have changed.
            continue
        _tested_candidates.add(candidate)
        if any(candidate in sequence for sequence in fasta_to_df):
            all_motif_options_storage.append(candidate)
# Combined database: the full motifs followed by the expanded partial motifs.
full_motif = pd.read_csv(full_motif_path)
full_motif_list = full_motif['motif'].values.tolist()

# NOTE(review): plain concatenation keeps a motif twice when it appears in
# both lists — confirm duplicates are acceptable in the combined file.
all_motifs = full_motif_list + all_motif_options_storage

# Drop motifs shorter than min_len characters.
all_motifs_filtered = [motif for motif in all_motifs if len(motif) >= min_len]

# Write the result as a one-column ('Sequence') CSV without the index.
motif_rep_out = pd.DataFrame({'Sequence': all_motifs_filtered})
out = output_path + '\\combined_AMP_motif_DB.csv'
with open(out, 'w', newline='') as filec:
    motif_rep_out.to_csv(filec, index=False)
# Partial-only database: the expanded partial motifs that are at least
# min_len characters long.
partial_motifs_filtered = [
    motif for motif in all_motif_options_storage if len(motif) >= min_len
]

# Write the result as a one-column ('Sequence') CSV without the index.
partial_motif_rep_out = pd.DataFrame({'Sequence': partial_motifs_filtered})
out = output_path + '\\partial_AMP_motif_DB.csv'
with open(out, 'w', newline='') as filec:
    partial_motif_rep_out.to_csv(filec, index=False)
# Full-only database: the full motifs that are at least min_len characters
# long.
full_motifs_filtered = [
    motif for motif in full_motif_list if len(motif) >= min_len
]

# Write the result as a one-column ('Sequence') CSV without the index.
full_motif_rep_out = pd.DataFrame({'Sequence': full_motifs_filtered})
out = output_path + '\\full_AMP_motif_DB.csv'
with open(out, 'w', newline='') as filec:
    full_motif_rep_out.to_csv(filec, index=False)
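# NOTE: everything from here to the end of the file is disabled (commented-out)
# code: a motif-to-sequence pairing export and a check of a manual motif
# database. It is never executed, and the second part relies on
# manual_db_path, which is itself commented out in the settings.
# paired_seq_storage = []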
# paired_mot_storage = []
# for value in all_motifs:
# if len(value) >= min_len:
# for string in fasta_to_df:
# if value in string:
# paired_seq_storage.append(string)
# paired_mot_storage.append(value)
# paired_rep = pd.DataFrame()
# paired_rep['Motif'] = paired_mot_storage
# paired_rep['Sequence'] = paired_seq_storage
# paired_rep_fam = paired_rep.merge(fasta_df, on='Sequence')
# out = output_path + '\\full_NP_motif_DB_3_w_fam_new.csv'
# with open(out,'w',newline='') as filec:
# writerc = csv.writer(filec)
# paired_rep_fam.to_csv(filec,index=False)
# # #%%
# manual_db = pd.read_csv(manual_db_path)
# manual_db_motif_list = manual_db['Sequence'].values.tolist()
# man_motif_storage = []
# man_seq_storage = []
# true_storage = []
# for mot in manual_db_motif_list:
# for seq in fasta_to_df:
# if mot in seq:
# man_motif_storage.append(mot)
# man_seq_storage.append(seq)
# if mot not in seq:
# pass
# for mot in manual_db_motif_list:
# if mot in man_motif_storage:
# pass
# elif mot not in man_motif_storage:
# man_motif_storage.append(mot)
# man_seq_storage.append('Matching seq not found')
# for x in man_seq_storage:
# if x in fasta_to_df:
# true_storage.append(True)
# if x not in fasta_to_df:
# true_storage.append(False)
# man_motif_check = pd.DataFrame()
# man_motif_check['Motif'] = man_motif_storage
# man_motif_check['Sequence'] = man_seq_storage
# man_motif_check['Status'] = true_storage
# out = output_path + '\\manual_motif_db_corrected_new.csv'
# with open(out,'w',newline='') as filec:
# writerc = csv.writer(filec)
# man_motif_check.to_csv(filec,index=False)