-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstrip_vcf_scaffolds_10kb.py
executable file
·84 lines (71 loc) · 2.88 KB
/
strip_vcf_scaffolds_10kb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#author=Tyler Fording
import argparse
def main(fasta_path, vcf_path, new_vcf):
'''
strip_vcf_scaffolds_10kb.py
:param fasta_path:
:param vcf_path:
:param new_vcf:
:return:
'''
scaflist = scaffold_list(fasta_path)
fh = open(vcf_path, 'r')
fh_out = open(str(new_vcf), 'w')
for line in fh:
if line[0] == '#':
fh_out.write(line)
else:
line1 = line.split('\t')
if line1[0] in scaflist:
fh_out.write(line)
else:
pass
def scaffold_list(fasta_path):
'''
This function takes a path to a .fasta file passes the path to readFastaIntoMemory, checks the values under each
key in the dictionary to see if they meet a lenth requirement. It then stores the key for each value that is
above the cutoff and returns it.
:param fasta_path: PATH
:return: list of scaffold names
'''
chrDict = readFastaIntoMemory(fasta_path)
scaffold_list = []
for item in chrDict:
if len(chrDict[item]) < 10000:
pass
else:
scaffold_list.append(item)
return scaffold_list # in the first use case (Asp marmorata) 223 scaffolds
def readFastaIntoMemory(fasta):
'''
This function stores a .fasta into a dictionary with the scaffold name being the key and the value being the
sequence.
:param fasta: PATH
:return: Dictionary of scaffolds with the scaffold name being the key and the sequence being the value
:author: Aaron Odell
'''
chrDict = {}
fasta = open(fasta,'r')
### This loops through our fasta file and creates a new dictionary entry for each new chromosome it comes across. The value is inittially a list that gets appended too for each new line sequence ###
for line in fasta:
line = line.strip('\r')
line = line.strip('\n')
if line[0] == ">":
chrName = line[1::]
if chrName not in chrDict:
chrDict.update({chrName:[]})
else:
return "Error"+'\t'+"Multiple chromosomes with the same name"
else:
chrDict[chrName].append(line)
### Now we have the whole fasta in memory. We loop through the keys/values and "join" the list of seuqnces together into a single string for each chromsome entry in the dictionary
for chromosome in chrDict:
chrDict[chromosome] = ''.join(chrDict[chromosome])
return chrDict
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="VCF to pre-makeped")
parser.add_argument("fasta_path", type=str, help="Enter file path to a fasta file", nargs='?')
parser.add_argument("vcf_path", type=str, help="Enter file path to the vcf to be parsed", nargs='?')
parser.add_argument("new_vcf", type=str, help="Enter file name for output", nargs='?')
args = parser.parse_args()
main(args.fasta_path, args.vcf_path, args.new_vcf)