-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathNullSeq_Functions.py
805 lines (637 loc) · 19.3 KB
/
NullSeq_Functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
'''
File: NullSeq_Functions.py
Author: Sophia Liu
Date: May 16, 2016
Description: This script will generate random nucleotide sequences with a
given GC content either based on a nucleotide or amino acid
sequence or amino acid usage probabitliy
'''
import collections
from Bio.Seq import Seq
from Bio.Data import CodonTable
import random
import math
import bisect
import numpy as np
import pandas as pd
import os.path
def Codons_for_AA(n):
''' Determines the codons used to code an amino acid given a codon table
Parameters
----------
n : int
the codon table index according to NCBI
Returns
-------
CodonforAA : dictionary
Key : str
Amino acids in Single letter notation (caps)
Values : str
Three letter codon (in caps)
'''
GivenCodonTable = CodonTable.unambiguous_dna_by_id[n]
nucleotides = ['A', 'T', 'C', 'G']
CodonforAA = {}
for first in nucleotides:
for second in nucleotides:
for third in nucleotides:
Codon = first + second + third
if Codon not in CodonTable.unambiguous_dna_by_id[n].stop_codons:
if GivenCodonTable.forward_table[Codon] in CodonforAA.keys():
CodonforAA[GivenCodonTable.forward_table[Codon]].append(Codon)
else:
CodonforAA[GivenCodonTable.forward_table[Codon]] = [Codon]
else:
pass
return CodonforAA
def AAUsage_from_csv(filename):
''' Gets AA usage dictionary from csv filename
Parameters:
-----------
filename : str
path to csv file
Returns:
--------
AAusage : dictionary
Keys : str
AA abrv
Values : float
P(AA)
AAUsagedf : pandas dataframe
'''
AAUsageDf = pd.read_csv(filename)
return AAUsageDf
def get_Equal_AA_Prob(n):
'''Determines the Equal AA frequency according to a codon table
Parameters:
-----------
n :
codon table
Returns:
--------
AAdict : dictionary
Keys : str
amino acid
Values : float
Probability of using the amino acid
'''
AAdict = Codons_for_AA(n)
EqualAAdf = pd.DataFrame()
AAlist = AAdict.keys()
freqlist = [1/len(AAlist)]*len(AAlist)
EqualAAdf['AA'] = AAlist
EqualAAdf['f'] = freqlist
return EqualAAdf
def df_to_dict(df, n):
''' Translates pandas dataframe to dictionary
Parameters:
----------
df : pandas DataFrame
Returns : dictionary
Keys : str
AA
Values : float
P(AA)
'''
AAUsageDict = {}
valuelist = df.values
for item in valuelist:
AAUsageDict[item[0]] = item[1]
AAUsageDict = clean_AA_dict(AAUsageDict, n)
return AAUsageDict
def cdf(weights):
''' Translates weights to cdf
Parameters:
-----------
weights : list
list of weights
Returns:
-------
result : list
cdf of the weights
'''
total = sum(weights)
result = []
cumsum = 0
for w in weights:
cumsum += w
result.append(cumsum / total)
return result
def choice(population, weights):
'''Chooses from the population with probablities
determined by the weights
Parameters:
-----------
population : list
list of item to choose from
weights : list
weights of each item in population
'''
assert len(population) == len(weights)
cdf_vals = cdf(weights)
x = random.random()
idx = bisect.bisect(cdf_vals, x)
return population[idx]
def get_Random_AA_Seq(AAprob, length):
'''Creates a random amino acid sequence based on a given AA frequency
Parameters:
-----------
AAprob : dictionary
Keys : str
Amino acid
Values : float
Probability of using the corresponding amino acid
length : int
Number of Amino acids in the sequence
Returns:
--------
AAseq : str
random AA sequence drawn from a specified amino acid Probability and length
Notes:
------
Start/Stop codon not included
'''
AAlist = []
Problist = []
for AA, Prob in AAprob.items():
AAlist.append(AA)
Problist.append(Prob)
AAseq = ''
for i in range(length):
AAseq += choice(AAlist, Problist)
return AAseq
def get_AA_Count(seq, n, Nuc=True):
'''Determines the number of each Amino acids used in the sequence
Parameters:
-----------
seq : str
The sequence to be evaluated
if Nuc is True:
seq is the nucleotide sequence
if Nuc is False:
seq is the amino acid sequence
n : int
Codon table to be used
Nuc : bool (optional)
True:
evaulates seq as a nucleotide sequence
False:
evaulates seq as a Amino acid sequence
Returns:
--------
AAUsageDict : dictionary
Keys : str
Amino acid symbol
Values : float
the number of the corresponding AA in the sequence
Notes:
-----
Does not include first AA
'''
if Nuc:
# print(seq)
AA = str(Seq(seq).translate(table=n)[1:-1])
else:
AA = seq[1:]
AAUsageDict = collections.Counter(AA)
return AAUsageDict
def get_AA_Freq(seq, n, nucleotide=True):
'''Determines the frequency of each Amino acids used in the sequence
Parameters:
-----------
seq : str
The sequence to be evaluated
if nucleotide is True:
seq is the nucleotide sequence
if nucleotide is False:
seq is the amino acid sequence
n : int
Codon table to be used
nucleotide : bool (optional)
True:
evaulates seq as a nucleotide sequence
False:
evaulates seq as a Amino acid sequence
Returns:
--------
AAUsageDict : dictionary
Keys : str
Amino acid symbolget_AA_Count
Values : float
the frequency of the corresponding amino acid
'''
AAUsageDict = get_AA_Count(seq, n, Nuc=nucleotide)
for AA in AAUsageDict.keys():
if nucleotide:
AAUsageDict[AA] = AAUsageDict[AA]/((len(seq)-6)/3)
else:
AAUsageDict[AA] = AAUsageDict[AA]/(len(seq)-1)
AAUsageDict = clean_AA_dict(AAUsageDict, n)
return AAUsageDict
def clean_AA_dict(AAUsageDict, n):
'''
Recently added function to add in amino acids with zero frequency and ensure only usage of the
standard alphabet.
--AJH
'''
CforAA = Codons_for_AA(n)
for aa in CforAA.keys():
if aa not in AAUsageDict.keys():
AAUsageDict[aa] = 0.0
assert np.isclose(1.0, np.sum(list(AAUsageDict.values()))), 'The amino acid frequencies from your file do not sum to 1. Exiting'
assert set(list(CforAA.keys())) == set(list(AAUsageDict.keys())), 'Problematic amino acid usage frequency dictionary, '\
'I found {} amino acids instead of the {} included in the indicated translation table (which, for the record is {})'.format(len(AAUsageDict.keys()), len(CforAA.keys()), n)
return AAUsageDict
def get_ATCG_Count(seq):
'''Determines the number of each nucleotide in the sequence
Parameters:
-----------
seq : str
nucleotide sequence to be evaluated
Returns:
--------
ATCGcount : dictionary
Keys : str
A, T, C, G
Values : float
number of corresponding nucleotide in the sequence
Note:
-----
input sequence should include start and stop Codons
calculated values do not take into accout the start and stop codons
'''
ATCGcount = collections.Counter([seq[x] for x in range(3, len(seq)-3)])
return ATCGcount
def get_GC_Count(seq):
'''Determines the number GC nucleotides in the sequence
Parameters:
-----------
seq : str
nucleotide sequence to be evaluated
Returns:
--------
GCcount : float
the number of GC nucleotides in the sequence
Note:
-----
input sequence should include start and stop Codons
calculated values do not take into accout the start and stop codons
'''
ATCGcount = get_ATCG_Count(seq)
GCcount = ATCGcount['G'] + ATCGcount['C']
return GCcount
def get_GC_Freq(seq):
'''Determines the frequency of GC nucleotide in the sequence
Parameters:
-----------
seq : str
nucleotide sequence to be evaluated
Returns:
--------
GCfreq : float
the frequency of GC nucleotides in the sequence
Note:
-----
input sequence should include start and stop Codons
calculated values do not take into accout the start and stop codons
'''
GCfreq = get_GC_Count(seq)/(len(seq)-6)
return GCfreq
def GC_Content_for_Codons(n):
''' Determines the number of G/C nucletides for all codons in the
codon table
Parameters
----------
n : int
The codon table index according to NCBI
Returns
-------
CodonGCContent : dictionary
Key : str
Three letter codon (in caps)
Values : int
Number of G/C nucleotides in the codon
'''
CodonforAA = Codons_for_AA(n)
CodonGCContent = collections.defaultdict(int)
for AA in CodonforAA:
for codon in CodonforAA[AA]:
codonusage = collections.Counter(codon)
CodonGCContent[codon] = codonusage['G'] + codonusage['C']
return CodonGCContent
def minmax_GC_for_AA(n):
''' Determines the maximum or minimum number
of G/C nucleotide for a given AAsequence
Parameters:
-----------
n : int
NCBI translation table
Returns:
--------
minGCDict : dictionary
Keys : str
AA
Values : int
smallest number of G/C nucleotides
maxGCDict : dictionary
Keys : str
AA
Values : int
largest number of G/C nucleotides
'''
CodonforAA = Codons_for_AA(n)
CodonGCContent = GC_Content_for_Codons(n)
maxGCDict = {}
minGCDict = {}
for AA in CodonforAA.keys():
templist = [CodonGCContent[codon] for codon in CodonforAA[AA]]
maxGCDict[AA] = max(templist)
minGCDict[AA] = min(templist)
return (minGCDict, maxGCDict)
def get_maxmin_GC_count(AAfreq, n):
'''Determines the maximum and minimum GC ratio for
specified AA compostion
Parameters:
-----------
AAfreq : dictionary
Keys : str
AA
Values : float
frequency of AA in sequence
n : int
NCBI translation table
Returns:
--------
low : float
lowest possible GC ratio
high : float
highest possible GC ratio
'''
(minGCDict, maxGCDict) = minmax_GC_for_AA(n)
high = 0
low = 0
for AA in AAfreq:
high += AAfreq[AA]*maxGCDict[AA]/3
low += AAfreq[AA]*minGCDict[AA]/3
return (low, high)
def evaluate_possibility(AAfreq, GC, l, n):
'''Determines whether is it possible to obtain the GC content
for the given AA compostion
Parameters:
-----------
AAfreq : dictionary
Keys : str
AA
Values : float
frequency of AA in sequence
GC : float
desired GC content [0, 100]
l : int
length of sequence
n : int
NCBI translation table
Returns:
--------
bool
True : if the GC content is obtainable
False: if the GC content is not obtainable
'''
(L, H) = get_maxmin_GC_count(AAfreq, n)
if GC/100 > L and GC/100 < H:
return True
else:
return False
def get_beta(given, length, CforAA, setAAProb, n):
'''Determines the value of beta given the GC content of the sequence
Parameters:
-----------
given : float
GC content can be speficied (%)
length : float
number of AA in the sequnece
CforAA : dictionary
the codon table being used
key : str
AA
Values : list
list of codons that can be used to code for the AA
setAAProb : dictionary
the specified AA usage probability
key : str
AAprob
Values : float
P(AA)
Returns:
-------
beta : float
the constant in the equation P = exp(-beta*N)/Z
'''
m = given*length*3
rl = -20
rr = 20
avl = compute_average_GC(rl, length, CforAA, setAAProb, n) - m
avr = compute_average_GC(rr, length, CforAA, setAAProb, n) - m
if avl*avr > 0:
if avl < 0:
return rl
else:
return rr
else:
rm = (rl+rr)/2
avv = compute_average_GC(rm, length, CforAA, setAAProb, n) - m
while math.fabs(avv) > 1e-7:
if avv*avl > 0:
rl = rm
else:
rr = rm
rm = 0.5*(rl+rr)
avv = compute_average_GC(rm, length, CforAA, setAAProb, n) - m
return rm
def Probability_Given_Beta(beta, CforAA, n):
''' Gives the probability of using each codon given beta.
The probabilites are determined using the bolzmann distribtion
where P = exp(-beta*N)/Z
N is the number of G/C nucleotides in the codon
Parameters:
----------
beta : float
The constant beta used in the calulations of the probability
n : int
The codon table index according to NCBI
Returns:
-------
probs : dictionary
Key : str
Codons (caps)
Values : float
The probability of using the codon (P(codon|AA))
'''
probs = {}
CodonforAA = CforAA
CodonGC = GC_Content_for_Codons(n)
for AA in CodonforAA:
Z = sum([math.exp(-beta*CodonGC[codon]) for codon in CodonforAA[AA]])
for codon in CodonforAA[AA]:
probs[codon] = math.exp(-beta*CodonGC[codon])/Z
return probs
def compute_average_GC(b, length, CforAA, setAAProb, n, given='beta'):
'''Determines the average GC of the given sequence given codon
probablities or a given beta
Parameters:
-----------
b : float or dictionary
When calulating for a given beta b is a float.
When calulating for a codon distribution is a dictionary
key : str
codon
Values : float
P(codon)
length : float
number of AA in the sequnece
CforAA : dictionary
the codon table being used
key : str
AA
Values : list
list of codons that can be used to code for the AA
setAAProb : dictionary
the specified AA usage probability
key : str
AAprob
Values : float
P(AA)
given : str (optional)
When given is 'beta' the fuction calculates average GC for beta.
When given is 'probability' the the fuction calculates average GC
for a given codon usage proability
Returns:
--------
sum(GCnumber) : float
The expected number GC nucleotides for the given sequence given
the conditions
'''
CodonGC = GC_Content_for_Codons(n)
GCnumber = []
if given == 'beta':
Probability = Probability_Given_Beta(b, CforAA, n)
CodonforAA = CforAA
AAcount = {}
for AA in setAAProb:
AAcount[AA] = setAAProb[AA]*length
for AA in CodonforAA:
GCnumber.append(AAcount[AA]*sum([Probability[codon]*CodonGC[codon] \
for codon in CodonforAA[AA]]))
return sum(GCnumber)
def Normalize_Probabilites(rawprob, n):
''' Normalizes the Probabilities of synonymus codons so that
it sums to 1
Parameters:
-----------
rawprob : dictionary
Keys : str
three letter codon CodonFreq
Values :
raw Probabilities associated with using that codonProb
n : int
codon table
Returns:
--------
NormProb : dictionary
Keys : str
AA
Values : tuple
[0] : list of codons
[1] : associated Normalized Probabilities
'''
CodonforAA = Codons_for_AA(n)
NormProb = {}
for AA, Codonlist in CodonforAA.items():
rawproblist = []
for codon in Codonlist:
rawproblist.append(rawprob[codon])
s = sum(rawproblist)
NormProb[AA] = (Codonlist, [i/s for i in rawproblist])
return NormProb
def parse_fastafile(filename):
''' parses fasta files into list of sequence
Parameters:
-----------
filename : str
path to fasta file
Returns:
--------
seqlist : list
list of sequences in the fasta file
'''
with open(filename, 'r') as f:
lines = f.readlines()
seqlist = []
for l in lines:
if l[0] == '>':
pass
else:
seqlist.append(l.split('\n')[0])
return seqlist[0]
def get_Random_Nuc_Seq(AAfreq=None, AAseq=None, nseq=None, GC=None, length=None, n=None):
'''Gets list of Random nucleotide sequence given specified Parameters
Parameters:
-----------
AAfreq : dictionary (opt)
The amino acid frequency of the random sequence will be drawn from
If not specified, equal AA usage is assumed
Keys : str
Amino acid
Values : float
amino acid usage frequency
AAseq : str (opt)
when specified, the random sequences have exactally the same AA usage as the sequence
with start and stop codons
nseq : int
the number of random sequences generated
GC : float
GC ratio of the sequence
length : int
length of the sequence (number of amino acids)
n : int
codon table
Returns:
--------
SeqListBeta : list
list of randomly generated nucleotide sequences
'''
CodonforAA = Codons_for_AA(n)
def CC(tupl):
return 'ATG' + ''.join(tupl) + 'TGG'
AAsequence = None
if AAfreq is None and AAseq is None:
AAFrequency = get_Equal_AA_Prob(n)
elif AAseq is None and AAfreq is not None:
AAFrequency = AAfreq
else:
AAsequence = AAseq
AAFrequency = get_AA_Freq(AAseq, n, nucleotide=False)
length = len(AAseq)-1
b = get_beta(GC, length, CodonforAA, AAFrequency, n)
codonProb = Probability_Given_Beta(b, CodonforAA, n)
betaProb = Normalize_Probabilites(codonProb, n)
if AAsequence is None:
pofC = []
pofAA = []
codonlist = []
for AA, clistp in betaProb.items():
pofAA = pofAA + [AAFrequency[AA]]*len(CodonforAA[AA])
pofC = pofC + clistp[1]
codonlist = codonlist + clistp[0]
prob = np.array(pofC)*np.array(pofAA)
prob = prob/sum(prob)
SeqListBeta = np.random.choice(codonlist, size=(nseq, length), replace=True, p=prob)
SeqListBeta = np.apply_along_axis(CC, 1, np.array(SeqListBeta))
return SeqListBeta
else:
SeqListBeta = [['ATG']*nseq]
for AA in AAsequence[1:]:
RandCforAA = np.random.choice(betaProb[AA][0], size=(nseq),
replace=True, p=betaProb[AA][1])
SeqListBeta.append(list(RandCforAA))
SeqListBeta = np.apply_along_axis(CC, 0, np.array(SeqListBeta[1:]))
return SeqListBeta