-
Notifications
You must be signed in to change notification settings - Fork 1
/
scaflist.py
executable file
·65 lines (56 loc) · 2.27 KB
/
scaflist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#author=Tyler Fording
def getLengths(fasta, min_length=10000):
    '''
    Print and return the scaffolds of a fasta that are at least ``min_length`` bases long, ordered from
    longest to shortest.
    :param fasta: PATH to a fasta
    :param min_length: minimum sequence length a scaffold needs in order to be reported (default 10000)
    :return: list of [scaffold name, length] pairs, longest first; scaffolds of equal length are ordered by
    name, descending
    :raises ValueError: if readFastaIntoMemory returns an error message string instead of a dictionary
    '''
    scaffDict = readFastaIntoMemory(fasta)
    # readFastaIntoMemory signals a duplicate scaffold name by returning a message string rather than a
    # dictionary; surface that message instead of failing later with an unrelated TypeError.
    if not isinstance(scaffDict, dict):
        raise ValueError(scaffDict)
    long_scaffolds = [
        [name, len(sequence)]
        for name, sequence in scaffDict.items()
        if len(sequence) >= min_length
    ]
    # Only the lengths are needed from here on; drop the reference to the full sequences.
    del scaffDict
    # Sort on (length, name) so the order matches sorting [length, name] pairs in reverse.
    long_scaffolds.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    # Parenthesised single-argument print behaves the same under Python 2 and Python 3.
    print(long_scaffolds)
    return long_scaffolds
def readFastaIntoMemory(fasta):
    '''
    This function stores a .fasta into a dictionary with the scaffold name being the key and the value being the
    sequence. The scaffold name is everything after the ">" on a header line; the sequence is every following
    line up to the next header, joined into a single string. Blank lines are ignored.
    :param fasta: PATH
    :return: Dictionary of scaffolds with the scaffold name being the key and the sequence being the value. If
    the same scaffold name appears on more than one header line, an error message string is returned instead.
    :raises ValueError: if sequence data appears before the first ">" header line
    :author: Aaron Odell
    '''
    chrDict = {}
    chrName = None
    # The with-block guarantees the file is closed, including on the early error return below.
    with open(fasta, 'r') as handle:
        for line in handle:
            # Strip both line-ending characters together so "\r\n" endings do not leave a stray "\r"
            # in the sequence.
            line = line.strip('\r\n')
            if not line:
                # A blank line carries no header or sequence data.
                continue
            if line[0] == ">":
                chrName = line[1:]
                if chrName in chrDict:
                    # Kept as a returned string (not an exception) so existing callers see the same result.
                    return "Error"+'\t'+"Multiple chromosomes with the same name"
                # Collect sequence lines in a list and join once at the end.
                chrDict[chrName] = []
            elif chrName is None:
                raise ValueError("Sequence data found before the first '>' header line")
            else:
                chrDict[chrName].append(line)
    # The whole fasta is now in memory; join each scaffold's list of lines into one string.
    for chromosome in chrDict:
        chrDict[chromosome] = ''.join(chrDict[chromosome])
    return chrDict
if __name__ == "__main__":
    # Run the report only when this file is executed as a script, so importing the module does not
    # trigger a read of a machine-specific path.
    import sys

    # An optional first command-line argument overrides the original hard-coded fasta path.
    if len(sys.argv) > 1:
        fasta_path = sys.argv[1]
    else:
        fasta_path = '/Volumes/projects/tfording/genomes/Asp/AspMar_1.0.fasta'
    getLengths(fasta_path)