-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFilterFasta.py
More file actions
32 lines (24 loc) · 914 Bytes
/
FilterFasta.py
File metadata and controls
32 lines (24 loc) · 914 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/python
#Program to filter out problem protein sequences, remove '*' from ends of predicted sequences and remove sequences shorter than 20 aa
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
# Name of the combined output file, written to the current working directory.
OUTPUT_FILENAME = "goodProteins.fasta"

# Sequences shorter than this many residues (after trimming '*') are dropped.
MIN_LENGTH = 20

# Only files with these extensions are parsed, so that walking a directory
# tree does not feed scripts or other non-FASTA files to the parser.
# NOTE(review): the extension list is an assumption - extend it if the input
# proteomes use other suffixes.
FASTA_EXTENSIONS = (".fasta", ".fa", ".faa")


def strip_terminal_stop(sequence):
    """Return the sequence string without trailing '*' stop markers.

    Only '*' characters at the very end are removed; internal ones are kept.
    """
    return sequence.rstrip("*")


def _iter_fasta_paths(top):
    """Yield the path of every FASTA file found under the directory *top*.

    The output file itself is skipped so that re-running the script in the
    same directory does not re-ingest its own previous result.
    """
    output_path = os.path.abspath(OUTPUT_FILENAME)
    # os.walk yields (dirpath, dirnames, filenames) tuples, not file paths.
    for dirpath, _dirnames, filenames in os.walk(top, topdown=False):
        # Sorted so the record order in the output is reproducible.
        for filename in sorted(filenames):
            if not filename.lower().endswith(FASTA_EXTENSIONS):
                continue
            path = os.path.join(dirpath, filename)
            if os.path.abspath(path) == output_path:
                continue
            yield path


def _filter_records(path):
    """Return the acceptable records from the FASTA file at *path*.

    Exits the program (via sys.exit) on a zero-length sequence or on a
    record ID without a '|' separator. A trailing '*' is stripped from each
    sequence, and sequences that are then shorter than MIN_LENGTH are
    reported and left out.
    """
    kept = []
    for rec in SeqIO.parse(path, "fasta"):
        if len(rec.seq) == 0:
            sys.exit("zero length sequence in file {}".format(path))
        if '|' not in rec.id:
            sys.exit("problem with ID field in {}".format(path))

        residues = str(rec.seq)
        if residues.endswith('*'):
            rec.seq = Seq(strip_terminal_stop(residues))

        # Checked after trimming so the stop marker does not count towards
        # the length, and independently of whether trimming happened.
        if len(rec.seq) < MIN_LENGTH:
            print("Very short sequence in file {}, excluding".format(path))
            continue

        kept.append(rec)
    return kept


def main(top="."):
    """Filter every FASTA file under *top* and write OUTPUT_FILENAME."""
    sequences = []
    for path in _iter_fasta_paths(top):
        sequences.extend(_filter_records(path))

    # Context manager guarantees the output is flushed and closed.
    with open(OUTPUT_FILENAME, 'w') as outfile:
        SeqIO.write(sequences, outfile, "fasta")


if __name__ == "__main__":
    # Optional first argument: directory to scan (defaults to the current one).
    main(sys.argv[1] if len(sys.argv) > 1 else ".")