-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGC-content-streamlit.py
355 lines (317 loc) · 18 KB
/
GC-content-streamlit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from Bio import SeqIO
from io import StringIO
import tempfile
import zipfile
import os
#functions
def f_smoothcal(dataframe, column, f_smooth_points=100):
    """Return the LOWESS `frac` that yields about `f_smooth_points` points.

    frac = f_smooth_points / number_of_rows, capped at 1 (LOWESS requires
    0 < frac <= 1 when there is data to smooth).

    Parameters
    ----------
    dataframe : pandas.DataFrame holding the GC-content series.
    column : str, name of the column whose length sets the denominator.
    f_smooth_points : int, target number of points used per local fit.

    Returns
    -------
    float in (0, 1], or 1 when the dataframe is empty (nothing to smooth;
    the original code raised ZeroDivisionError in that case).
    """
    points = len(dataframe[column])
    if points == 0:
        # Empty input: avoid dividing by zero; any frac is acceptable here.
        return 1
    return min(f_smooth_points / points, 1)
def calculate_gc_content(sequence, window, step):
    """Compute GC fraction over a sliding window along `sequence`.

    The sequence is upper-cased first so soft-masked (lowercase) FASTA
    residues are counted too — the original implementation counted only
    uppercase 'G'/'C' and under-reported GC in masked regions.

    Parameters
    ----------
    sequence : str, nucleotide sequence.
    window : int, window length in bp (the last window may be shorter).
    step : int, distance between consecutive window starts (must be > 0).

    Returns
    -------
    list of (start, end, gc_fraction) tuples with 1-based inclusive
    coordinates; empty list for an empty sequence.
    """
    sequence = sequence.upper()
    sequence_length = len(sequence)
    gc_content = []
    for start in range(0, sequence_length, step):
        end = min(start + window, sequence_length)
        window_sequence = sequence[start:end]
        if window_sequence:
            gc_count = window_sequence.count('G') + window_sequence.count('C')
            gc_content.append((start + 1, end, gc_count / len(window_sequence)))
    return gc_content
def map_regions(df, fasta_len, cutoff):
    """Partition the sequence into alternating "Core"/"Disruptive" regions.

    Thresholds the smoothed GC signal at `cutoff` and merges consecutive
    windows on the same side of the threshold into a single region.
    NOTE: mutates `df` in place (adds helper columns 'Core' and 'Changes').

    Parameters
    ----------
    df : pandas.DataFrame with 'Start' and 'Smoothed_GC' columns
        (one row per window, ordered by position).
    fasta_len : int, full sequence length; end of the final region.
    cutoff : float; windows with Smoothed_GC below it are labelled "Core",
        the rest "Disruptive" (consistent with the labelling used elsewhere
        in this app).

    Returns
    -------
    list of (start, end, label) tuples covering positions 1..fasta_len.
    """
    # 1 where the smoothed signal is below the cutoff, 0 otherwise.
    df['Core'] = np.where(df["Smoothed_GC"] < cutoff, 1, 0)
    # |diff| == 1 exactly at rows where the signal crosses the cutoff.
    df['Changes'] = df['Core'].diff().fillna(0).abs()
    hits = df[df['Changes'] == 1].index
    regions = []
    # The first region's label is decided by the first window's side of the cutoff.
    cutoff_status = "Disruptive" if df["Smoothed_GC"].iloc[0] >= cutoff else "Core"
    for i, change in enumerate(hits):
        # Each region spans from just after the previous crossing to this one.
        start = df['Start'].iloc[hits[i-1]] + 1 if i > 0 else 1
        end = df['Start'].iloc[change]
        regions.append((start, end, cutoff_status))
        # Labels alternate at every crossing.
        cutoff_status = "Core" if cutoff_status == "Disruptive" else "Disruptive"
    # Close the trailing region out to the end of the sequence.
    final_start = regions[-1][1] + 1 if regions else 1
    regions.append((final_start, fasta_len, cutoff_status))
    return regions
def legend_without_duplicate_labels(ax):
    """Draw the axes legend keeping only the first handle for each label.

    Repeated artists (e.g. one axvspan per GOI hit, all sharing one label)
    would otherwise produce one legend entry each.
    """
    handles, labels = ax.get_legend_handles_labels()
    first_seen = {}
    for handle, label in zip(handles, labels):
        # dict preserves insertion order; keep only the first occurrence.
        first_seen.setdefault(label, handle)
    ax.legend(first_seen.values(), first_seen.keys(), loc='upper right')
### SIDEBAR
# All user-tunable parameters live in the sidebar; Streamlit re-reads them on
# every rerun of the script.
with st.sidebar:
    st.subheader("Select analysis")
    # Toggle between plain GC analysis and gene-of-interest (GOI) mapping mode.
    analysis_type = st.toggle("GOI-based analysis")
    if analysis_type:
        GOI= st.text_input("Search by gene of interest (keyword)", value="MASP", max_chars=100,label_visibility="visible")
        GOI= GOI.upper()  # GFF lines are matched case-insensitively later
    st.subheader("General settings")
    min_len = st.number_input("Minimum sequence length to analyze:", min_value=0, max_value=1000000, value=50000)  # sequences shorter than this are skipped
    window_size = st.number_input("Window size (0-1000):", min_value=0, max_value=1000, value=500)
    step_size = st.number_input("Step size (0-500):", min_value=0, max_value=500, value=300)
    smooth_f = st.slider("Smoothing factor (Lowess):", 0, 200, 100)  # target number of smoothing points (see f_smoothcal)
    cutoff_value = st.slider("Core/Disruptive Cutoff:", 0.1, 1.0, 0.51)
    num_plots = st.slider("Number of GC content plots to display:", 0, 20, 1)  # caps how many per-sequence plots are rendered
    st.subheader("Graph settings")
    ymin_graph = st.number_input("Y min:", min_value=0.00, step=0.01, max_value=0.50, value=0.20)
    ymax_graph = st.number_input("Y max:", min_value=0.50, step=0.01, max_value=1.00, value=0.80)
###BODY
# Page title and the FASTA upload widget.
st.markdown("<h1 style='text-align: center; color: red; line-height: 0.1'>GCanner</h1><h5 style='text-align: center; color: grey;line-height:1 '> An interactive tool for genome-wide GC analysis and isochore identification</h5>", unsafe_allow_html=True)
st.subheader("", divider="red")
st.markdown("<h4 style='text-align: left; color: black;line-height:0.3; margin-top: 0px; '>Upload FASTA file</h5>", unsafe_allow_html=True)
fasta_file = st.file_uploader("", type=["fasta", "fa", "fna"])
st.subheader("", divider="red")
######################FASTA FILE ONLY
# Accumulators for figures and regions, used by the SVG export below.
all_figs = []
all_regions = []
# Analysis runs once the file is uploaded (plain, non-GOI mode).
if fasta_file is not None and not analysis_type:
    fasta_content = StringIO(fasta_file.getvalue().decode("utf-8"))
    sequences = SeqIO.parse(fasta_content, "fasta")
    plot_count = 0
    st.subheader("Results: Core and Disruptive regions", divider=False)
    filtered_Contigs_to_analyze=[]
    for record in sequences:
        # Only analyze sequences at least min_len bp long.
        if len(str(record.seq))>=min_len:
            filtered_Contigs_to_analyze.append(record.id)
            gc_content = calculate_gc_content(str(record.seq), window_size, step_size)
            df_gc_content = pd.DataFrame(gc_content, columns=["Start", "End", "GC_Content"])
            smooth_factor = f_smoothcal(df_gc_content, "GC_Content", smooth_f)
            # LOWESS-smooth the raw GC signal; [:, 1] keeps the fitted values.
            df_gc_content['Smoothed_GC'] = sm.nonparametric.lowess(df_gc_content["GC_Content"], df_gc_content["Start"], frac=smooth_factor)[:, 1]
            regions = map_regions(df_gc_content, len(record.seq), cutoff_value)
            for start, end, region_type in regions:
                all_regions.append({"Sequence name" : record.id, "Start": start, "End": end, "Region Type": region_type})
            df_gc_content["Core/Disruptive"] = np.where(df_gc_content["Smoothed_GC"] < cutoff_value, "Core", "Disruptive")
            # Data visualization
            regions_df = pd.DataFrame(all_regions)
            # Cap the number of rendered plots; regions keep accumulating above.
            if plot_count >= num_plots:
                continue
            fig, ax = plt.subplots(figsize=(10, 5))
            sns.lineplot(data=df_gc_content, x="Start", y="GC_Content", ax=ax, label="GC-Content",alpha=0.5).set_title(f"Sequence name: {record.id}, window size:{window_size}, step size:{step_size}, smoothing points:{smooth_f}, cutoff:{cutoff_value}")
            sns.lineplot(data=df_gc_content, x="Start", y="Smoothed_GC", color="red", ax=ax, label="Lowess smoothing")
            ax.axhline(y=cutoff_value, xmin=0, xmax=max(df_gc_content["End"]), color="white")  # cutoff line
            # Shade each region with its classification color.
            for t in regions:
                if t[-1] == "Disruptive":
                    plt.axvspan(t[0], t[1], facecolor='salmon', alpha=0.2)
                elif t[-1] == "Core":
                    plt.axvspan(t[0], t[1], facecolor='springgreen', alpha=0.2)
            ax.set_ylim(ymin_graph,ymax_graph)
            ax.set_ylabel("GC Content")
            # NOTE(review): this second set_ylabel overwrites the one above — confirm which label is intended.
            ax.set_ylabel("GC content and Lowess smoothing")
            ax.set_xlabel("Sequence position")
            ax.margins(x=0)
            st.pyplot(fig)
            all_figs.append((record.id, fig))
            plot_count += 1
    # No sequence passed the length filter: warn and halt the script run.
    if len(all_regions)== 0:
        st.markdown(f" **:red-background[The sequence length is shorter than {min_len} bp]**")
        st.stop()
    # Summary statistics over all analyzed sequences (lengths in bp).
    regions_df["Region Length"]=regions_df["End"]-regions_df["Start"]+1
    dis = sum(regions_df[regions_df["Region Type"]=="Disruptive"]["Region Length"])
    Core= sum(regions_df[regions_df["Region Type"]=="Core"]["Region Length"])
    tot = sum(regions_df["Region Length"])
    df = pd.DataFrame({'Compartment': ['Core', 'Disruptive'],'%': [Core/tot*100, dis/tot*100], "bases":[Core,dis]})
    col6,col7 = st.columns(2)
    #plots Summary Core Disruptive
    with col6:
        fig, ax = plt.subplots(figsize=(4, 4))
        sns.barplot(df, x='Compartment', y=round(df['%'],1), hue= 'Compartment',palette=["green", "red"], alpha=0.2, ax=ax)
        # Annotate each bar with its size in Kb and its percentage.
        for i, p in enumerate(ax.patches):
            h, w, x = p.get_height(), p.get_width(), p.get_x()
            xy = (x + w / 2.,max(df["%"])*1.2)
            text1 = f' {round(df.iloc[i]["bases"]/1000,2)}Kb\n {round(df.iloc[i]["%"],2)}%'
            ax.annotate(text=text1, xy=xy, ha='center', va='top')
        ax.set_ylim(0,110)
        if len(filtered_Contigs_to_analyze)== 1:
            ax.set_title(f"Compartment proportions in {filtered_Contigs_to_analyze[0]} ")
        else:
            ax.set_title(f"Compartment proportions in all sequences\n>{min_len} from the FASTA file")
        all_figs.append((f"Compartment_porportions_in_>{min_len}", fig))
        st.pyplot(fig)
    # Export/Download graphs as SVG
    with tempfile.TemporaryDirectory() as tmpdirname:
        svg_files = []
        for figure in all_figs:
            svg_path = f"{tmpdirname}/plot_{figure[0]}.svg"
            figure[1].savefig(svg_path, format="svg")
            svg_files.append(svg_path)
        zip_path = f"{tmpdirname}/plots_svg.zip"
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for svg_file in svg_files:
                zipf.write(svg_file, arcname=os.path.basename(svg_file))
        with open(zip_path, "rb") as f:
            st.download_button("Download figures as SVG", data=f, file_name="plots_svg.zip", mime="application/zip")
    st.dataframe(data=regions_df)
    # NOTE(review): analysis_type is False in this branch, so the next two lines
    # look like dead code copied from the GOI branch — confirm and remove.
    if analysis_type:
        st.dataframe(data=filtered_DF_GOI)
################### FASTA FILE AND GFF
# GOI mode: additionally ask for a GFF annotation file and scan it for hits.
if fasta_file is not None and analysis_type:
    st.markdown("<h4 style='text-align: left; color: black;line-height:0.1; margin-top: 0px; '>Upload GFF file</h5>", unsafe_allow_html=True)
    GFF_file = st.file_uploader("", type=["gff"])
    st.subheader("", divider="red")
    if GFF_file is not None:
        GFF_file_content = StringIO(GFF_file.getvalue().decode("utf-8"))
        all_GOI = []
        contigs_GOI=[]
        # Keep every GFF line mentioning the keyword (case-insensitive match).
        for line in GFF_file_content:
            if GOI in line.upper():
                line=line.strip().split("\t")
                if len(line)>8:  # well-formed GFF feature rows have 9 tab-separated columns
                    Contig=line[0].strip()
                    pbi=int(line[3])  # feature start (bp)
                    pbf=int(line[4])  # feature end (bp)
                    desc=line[8]  # attributes column
                    contigs_GOI.append(Contig)
                    all_GOI.append({"Sequence name" : Contig, "pbi": pbi, "pbf": pbf, "Description": desc})
        df_GOI = pd.DataFrame(data=all_GOI, columns=["Sequence name" , "pbi", "pbf", "Description"])
        set_contigs_GOI= set(contigs_GOI)
        c_to_map= set(df_GOI["Sequence name"])
        # map==1 flags the single-contig case for nicer plot titles later.
        # NOTE(review): `map` shadows the builtin of the same name.
        if len(c_to_map) == 1:
            map=1
        elif len(c_to_map) == 0:
            st.markdown(f" **:red-background[No ***{GOI}*** were found]**")
            st.stop()
        elif GOI == "":
            # NOTE(review): likely unreachable — an empty keyword also yields
            # len(c_to_map) == 0 above, which stops first. Confirm ordering.
            st.markdown(f" **:red-background[Write a keyword for the GOI-based search]**")
            st.stop()
# Reset the figure/region accumulators for the GOI-mode export.
all_figs = []
all_regions = []
# Analysis runs once both files are uploaded and a keyword was given.
if fasta_file is not None and analysis_type and GFF_file is not None and len(GOI)>=1:
    fasta_content = StringIO(fasta_file.getvalue().decode("utf-8"))
    sequences = SeqIO.parse(fasta_content, "fasta")
    st.subheader("Results: GOI-based search")
    filtered_Contigs_to_analyze=[]
    plot_count = 0
    for record in sequences:
        # Only analyze sequences that are long enough AND contain a GOI hit.
        if len(str(record.seq))>=min_len and record.id in set_contigs_GOI:
            filtered_Contigs_to_analyze.append(record.id)
            gc_content = calculate_gc_content(str(record.seq), window_size, step_size)
            df_gc_content = pd.DataFrame(gc_content, columns=["Start", "End", "GC_Content"])
            smooth_factor = f_smoothcal(df_gc_content, "GC_Content", smooth_f)
            df_gc_content['Smoothed_GC'] = sm.nonparametric.lowess(df_gc_content["GC_Content"], df_gc_content["Start"], frac=smooth_factor)[:, 1]
            regions = map_regions(df_gc_content, len(record.seq), cutoff_value)
            for start, end, region_type in regions:
                all_regions.append({"Sequence name" : record.id, "Start": start, "End": end, "Region Type": region_type})
            df_gc_content["Core/Disruptive"] = np.where(df_gc_content["Smoothed_GC"] < cutoff_value, "Core", "Disruptive")
            regions_df = pd.DataFrame(all_regions)
            # Map each GFF hit onto the Core/Disruptive regions of its contig.
            for hit in all_GOI:
                f_df = regions_df[regions_df["Sequence name" ] == hit["Sequence name" ] ]  # keep only this contig's regions
                d_Contig = f_df.to_dict('records')
                for region in d_Contig :  # list of dicts, one per region
                    if int(hit["pbi"]) >= region["Start"] and int(hit["pbf"]) <= region["End"]:
                        # Hit fully contained in one region: inherit its type.
                        hit["Region Type"]= region["Region Type"]
                        hit["Pbi-Region"]= region["Start"]
                        hit["Pbf-Region"]= region["End"]
                        hit["Contig-len"]= max(regions_df[regions_df["Sequence name" ] == hit["Sequence name" ] ]["End"] )
                    elif int(hit["pbi"]) >= region["Start"] and not int(hit["pbf"]) <= region["End"]:
                        # Hit starts here but ends past the region boundary.
                        hit["Region Type"]= "Edge"
                        hit["Pbi-Region"]= region["Start"]
                        hit["Pbf-Region"]= region["End"]
                        hit["Contig-len"]= max(regions_df[regions_df["Sequence name" ] == hit["Sequence name" ] ]["End"] )
            if plot_count >= num_plots :  # data visualization (capped at num_plots)
                continue
            fig, ax = plt.subplots(figsize=(10, 5))
            sns.lineplot(data=df_gc_content, x="Start", y="GC_Content", ax=ax, label="GC content",alpha=0.5).set_title(f"Sequence name: {record.id}, window size:{window_size}, step size:{step_size}, smoothing points:{smooth_f}, cutoff:{cutoff_value}")
            sns.lineplot(data=df_gc_content, x="Start", y="Smoothed_GC", color="red", ax=ax, label="Lowess smoothing")
            ax.axhline(y=cutoff_value, xmin=0, xmax=max(df_gc_content["End"]), color="white")  # cutoff line
            for t in regions:
                if t[-1] == "Disruptive":
                    plt.axvspan(t[0], t[1], facecolor='salmon', alpha=0.2)
                elif t[-1] == "Core":
                    plt.axvspan(t[0], t[1], facecolor='springgreen', alpha=0.2)
            # Mark every GOI hit on this contig as a black band near the x-axis.
            for pb in df_GOI[df_GOI["Sequence name" ]==record.id]["pbi"]:
                ax.axvspan(xmin=pb, ymax=0.05,xmax=df_GOI[ (df_GOI["pbi"]==pb) & (df_GOI["Sequence name" ]==record.id) ]["pbf"].iloc[0], color="black", alpha=0.5,label=GOI)
            legend_without_duplicate_labels(ax)
            ax.set_ylim(ymin_graph,ymax_graph)
            ax.set_ylabel("GC Content")
            # NOTE(review): this second set_ylabel overwrites the one above — confirm which label is intended.
            ax.set_ylabel("GC content and Lowess smoothing")
            ax.set_xlabel("Sequence position")
            ax.margins(x=0)
            st.pyplot(fig)
            plot_count+=1
            all_figs.append((record.id, fig))
if fasta_file is not None and analysis_type and GFF_file is not None and len(GOI)>=1:
    try:  # handles empty dataframes due to the lack of GOI within it or non annotated contigs
        PRU= regions_df["End"]
    except:
        # NOTE(review): bare except — also catches NameError when no contig was
        # analyzed, which appears intentional but hides other failures too.
        regions_df= None
        st.markdown(f" **:red-background[Inconsistencies were found]**")
if fasta_file is not None and analysis_type and GFF_file is not None and regions_df is not None and len(GOI)>=1:
    # Region-length bookkeeping and compartment totals (in bp).
    regions_df["Region Length"]=regions_df["End"]-regions_df["Start"]+1
    dis = sum(regions_df[regions_df["Region Type"]=="Disruptive"]["Region Length"])
    Core= sum(regions_df[regions_df["Region Type"]=="Core"]["Region Length"])
    tot = sum(regions_df["Region Length"])
    df = pd.DataFrame({'Compartment': ['Core', 'Disruptive'],'%': [Core/tot*100, dis/tot*100], "bases":[Core,dis]})
    col6,col7 = st.columns(2)
    #plots Summary Core Disruptive
    with col6:
        fig, ax = plt.subplots(figsize=(4, 5))
        sns.barplot(df, x='Compartment', y=round(df['%'],1), hue= 'Compartment',palette=["green", "red"], alpha=0.2, ax=ax)
        # Annotate each bar with its size in Kb and its percentage.
        for i, p in enumerate(ax.patches):
            h, w, x = p.get_height(), p.get_width(), p.get_x()
            xy = (x + w / 2.,max(df["%"])*1.2)
            text1 = f' {round(df.iloc[i]["bases"]/1000,2)}Kb\n {round(df.iloc[i]["%"],2)}%'
            ax.annotate(text=text1, xy=xy, ha='center', va='top')
        ax.set_ylim(0,110)
        if len(filtered_Contigs_to_analyze)== 1:
            ax.set_title(f"Compartment proportions in {filtered_Contigs_to_analyze[0]}\n ")
        else:
            ax.set_title(f"Compartments proportions in all sequences\n>{min_len} from the FASTA file")
        all_figs.append((f"Compartments proportions in >{min_len} sequences", fig))
        st.pyplot(fig)
    # Rebuild the GOI table including the region annotations added in the
    # mapping loop above (Region Type / Pbi-Region / Pbf-Region / Contig-len).
    df_GOI = pd.DataFrame(data=all_GOI , columns=["Sequence name" , "pbi", "pbf", "Description", "Region Type", "Pbi-Region", "Pbf-Region" , "Contig-len"])
    filtered_DF_GOI= df_GOI[df_GOI['Contig-len']>min_len][["Sequence name" , "pbi", "pbf", "Description", "Region Type"]]
    # Per-region-type hit counts via describe()'s "count" statistic.
    df_trans=filtered_DF_GOI.groupby('Region Type').describe()["pbi"]
    tot_counts= sum(df_trans["count"])
    df_trans["%count"]= df_trans["count"]/tot_counts*100
    # col8,col9= st.columns(2)
    with col7:
        try:
            fig, ax = plt.subplots(figsize=(4, 5))
            sns.barplot(df_trans, x='Region Type', y="%count", hue= 'Region Type',palette={"Disruptive":"red","Core":"green","Edge":"grey"}, alpha=0.2, ax=ax)
            for i, p in enumerate(ax.patches):
                h, w, x = p.get_height(), p.get_width(), p.get_x()
                xy = (x + w / 2.,max(df_trans["%count"])*1.15)
                text1 = f' {round(df_trans.iloc[i]["count"],2)}\n {round(df_trans.iloc[i]["%count"],2)}%'
                ax.annotate(text=text1, xy=xy, ha='center', va='top')
            ax.set_ylim(0,120)
            ax.set_ylabel(f"% {GOI}")
            if map==1:
                c=df_GOI["Sequence name" ].iloc[0]
                ax.set_title(f"Compartment proportions for\n{GOI} in sequence {c}")
            else:
                ax.set_title(f"Compartment proportions for {GOI}")
            all_figs.append((f"{GOI} counts", fig))
            st.pyplot(fig)
        except:
            # NOTE(review): bare except hides plotting errors, and `map` is never
            # assigned 0 anywhere, so this message cannot currently be shown —
            # confirm the intended condition.
            if map ==0:
                st.write(f"No {GOI} were found in GFF file, try another keyword")
    # st.write(f"No {GOI} were found in GFF file, try another keyword")
    # Export/Download graphs as SVG
    with tempfile.TemporaryDirectory() as tmpdirname:
        svg_files = []
        for figure in all_figs:
            svg_path = f"{tmpdirname}/plot_{figure[0]}.svg"
            figure[1].savefig(svg_path, format="svg")
            svg_files.append(svg_path)
        zip_path = f"{tmpdirname}/plots_svg.zip"
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for svg_file in svg_files:
                zipf.write(svg_file, arcname=os.path.basename(svg_file))
        with open(zip_path, "rb") as f:
            st.download_button("Download figures as SVG", data=f, file_name="plots_svg.zip", mime="application/zip")
    st.dataframe(data=regions_df)
    if analysis_type:
        st.dataframe(data=filtered_DF_GOI)
st.caption("This program is part of the Disruptomics project. Please cite Balouz V. et al. 2025")