import re
import csv
import os
import copy
import snpload
from bs4 import BeautifulSoup
import urllib.request
import json
import glob
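# haploy.py: Y-DNA haplogroup matching against a combined SNP database built from
# ISOGG, YFull, YBrowse and FTDNA sources. The functions below roughly fall into three
# groups: report/printing helpers, the tree-matching search itself (yfind*, ymatch),
# and database import/conversion/caching (load_*, convert_*, save_*).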
def print_uptree(snpset, ut, do_print=True, b3x='b37'):
    prev_gl=[]
    rep=''
    y=snpset['Y']
    pos=0
    neg=0
    tot=0
    for i, mut in enumerate(ut):
        txt=''
        if 'txt' in mut:
            txt=mut['txt']
            #txt+="%s %s"%(mut[b3x], mut['t'])
        otherg=mut['isog']
        if mut['ftg'] != '?':
            if mut['isog']:
                otherg+=', '
            otherg+=mut['ftg']
        if mut[b3x] in y:
            rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], otherg, txt)
        else:
            rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], otherg, txt)
        if not mut['g'] in prev_gl:
            if mut['g'] in annotations_by_g:
                for anno in annotations_by_g[mut['g']]:
                    rep += "%15s* %s\n"%('',anno['txt'])
            prev_gl.append(mut['g'])
        if not mut['ftg'] in prev_gl:
            if mut['ftg'] in annotations_by_g:
                for anno in annotations_by_g[mut['ftg']]:
                    rep += "%15s* %s\n"%('',anno['txt'])
            if mut['ftg'] != '?':
                prev_gl.append(mut['ftg'])
        for m in mut['raw'].split('/'):
            m2=m.replace('(H)','')
            if m2 in annotations_by_m:
                for anno in annotations_by_m[m2]:
                    rep += "%15s* %s\n"%('',anno['txt'])
    if do_print:
        print(rep)
    return rep
def print_extras(snpset, bt, do_print=True, b3x='b37'):
    rep=''
    last=''
    out=[]
    outs=''
    sbt=sorted(bt['extras'], key=lambda x: int(x[b3x]))
    for e in sbt:
        out.append(e['raw'])
        outs += e['raw'] + ':' + e['t'] + ' '
        #outs += '%8s %s %s\n'%(e[b3x], e['t'], e['raw'])
        q=e['raw']
    rep += 'Extra: %s\n'%(outs)
    if do_print:
        print(rep)
    return rep
def print_extras2(snpset, bt, do_print=True, b3x='b37'):
    rep=''
    last=''
    out=[]
    outs=''
    sbt=sorted(bt['extras2'].keys(), key=lambda x: int(x))
    for ek in sbt:
        e=bt['extras2'][ek]
        #print(e)
        out.append(e['n'])
        #outs += e['n'] + ' '
        outs += '%8s %s %s\n'%(e[b3x], e['t'], e['n'])
        q=e['n']
    rep += 'Extra2: '+outs+'\n'
    if do_print:
        print(rep)
    return rep
#TODO
def print_all(snpset, bt, do_print=True):
    rep=''
    last=''
    out=[]
    outs=''
    c=0
    for e in bt['all']:
        out.append(e['raw'])
        outs += '%-40s'%(e['raw']+',')
        c+=1
        if c == 3:
            outs += '\n'
            c=0
        last=e['raw']
    rep += 'All:\n'+outs+'\n'
    if do_print:
        print(rep)
    return rep
def print_data(do_print=True):
    rep=''
    rep += 'Based on data from:\n'
    rep += '* yfull.com %s [CC BY]\n'%haplo_muts_yfull_info
    rep += '* isogg.org/tree Y-DNA Tree %s [CC BY-NC-SA 3.0]\n'%haplo_isogg_info
    rep += '* ybrowse.org %s [CC BY-NC-SA Thomas Krahn]\n'%haplo_ybrowse_info
    rep += '* ftdna.com %s\n'%ftdna_info
    rep += '* all-ancient-dna-2-07-73.csv [CC-BY indo-european.eu Carlos Quiles, Jean Manco %s]\n'%''
    if do_print:
        print(rep)
    return rep
def print_links(gr, do_print=True):
    rep=''
    rep += 'Links:\n'
    rep += 'https://www.yfull.com/tree/%s/\n'%(gr)
    rep += 'https://phylogeographer.com/scripts/heatmap.php?newlookup=%s\n'%(gr)
    rep += 'http://scaledinnovation.com/gg/snpTracker.html?snp=%s\n'%(gr)
    if gr.startswith('N-') or gr.startswith('I-'):
        rep += 'https://haplotree.info/n/?node=%s\n'%(gr)
    if do_print:
        print(rep)
    return rep
def path_str(ut, n):
    rep=''
    c=0
    prev=''
    #for i, mut in enumerate(ut):
    for mut in reversed(ut):
        if mut['g'] != prev:
            if c > 0:
                rep = ' -> ' + rep
            else:
                rep = ' (ISOGG: %s)'%mut['isog']
            rep = mut['g'] + rep
            c+=1
            prev = mut['g']
        if c >= n:
            break
        if not '-' in mut['g']:
            break
    return rep
def report(fname, n, do_uptree=True, do_extra=True, do_all=False, filt='', force='', min_match_level=0, vcf_sample='', force_build=0):
    rep=''
    snpset, meta = snpload.load(fname, ['Y'], vcf_sample=vcf_sample, force_build=force_build)
    if 'Y' not in snpset:
        return "No Y data found\n"
    rep += print_data(False)
    rep += "%s: Total SNPs: %d\n"%(fname, meta['total'])
    b3x='b36'
    if meta['build']==37:
        b3x='b37'
    if meta['build']==38:
        b3x='b38'
    best_trees = yfind2(snpset, n, filt, force, b3x, min_match_level)
    for bt in best_trees:
        if do_all:
            rep += print_all(snpset, bt, False)
        if do_uptree:
            rep += print_uptree(snpset, bt['ut'], False, b3x)
        leaf_mut = bt['ut'][len(bt['ut'])-1]
        rep += "Result (%.1f%% %d -%d +%d): %-8s\n"%(bt['score'], bt['tot'], bt['neg'], bt['nextras'], leaf_mut['g'])
        rep += "%s\n"%(path_str(bt['ut'], 20))
        if do_extra:
            rep += print_extras(snpset, bt, False, b3x)
            if bt['extras2']:
                rep += print_extras2(snpset, bt, False, b3x)
        rep += print_links(leaf_mut['g'], False)
    return rep
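# Minimal usage sketch (assumptions: the cached databases written by save_db2j()/save_alldbj()
# are present in the working directory, 'genome.txt' is a placeholder for a raw DNA export in a
# format snpload understands, and the annotation tables used by print_uptree() are set up
# elsewhere in this file):
#
#   load_db2j()
#   load_alldbj()
#   print(report('genome.txt', 3))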
#for analysing separate ftdna roots, not used currently
def yfind_roots(snpset):
    roots_list = []
    s=-1
    rflag=0
    for i, m in enumerate(haplo_muts_list):
        #print(i, m, s)
        if m['l'] == 0:
            if s >= 0 and rflag==0:
                roots_list.append(haplo_muts_list[s:i-1])
            s=i
            rflag=1
        else:
            rflag=0
    roots_list.append(haplo_muts_list[s:])
    for r in roots_list:
        print(len(r))
    return roots_list
#Create a list of mutations on a path upwards from a mutation
def yfind_uptree(snpset, find_g):
    found = 0
    uptree = []
    sames = []
    sameg = ''
    #print("Scanning uptree", find_g)
    for mut in reversed(haplo_muts_list):
        if found == 0:
            if mut['g'] == find_g: #mut_leaf['g']: #TODO: g?
                found = 1
                depth = mut['l']
                g = mut['g']
                uptree.append(mut)
        else:
            if mut['g'] == g:
                uptree.insert(0, mut)
                continue
            if mut['l'] < depth:
                depth = mut['l']
                g = mut['g']
                uptree.insert(0, mut)
    return uptree
def ymatch(snpset, mut, b3x):
    y=snpset['Y']
    if mut['t'] == y[mut[b3x]]['gen']:
        return True
    else:
        return False
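# Each entry of haplo_muts_list is a dict describing one tree node/SNP; the matcher below
# relies on these keys: 'g' (haplogroup name), 'l' (tree depth), 'raw' (slash-separated SNP
# names), 't' (derived allele, '?' if unknown), 'b36'/'b37'/'b38' (position per reference
# build) and 'isog'/'ftg' (ISOGG/FTDNA group names); yfind2() adds 'tag' to the copies it returns.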
# Finds n best Y-DNA matches by tracing all possible mutation paths in database
def yfind2(snpset, nbest=5, filt='', force='', b3x='b37', min_match_level=0):
    mt=snpset['Y']
    all_mutations = []
    uptrees=[]
    extras_all = []
    #TODO: support more exotic mutations?
    #find the route uptree for each mutation in snpset
    prev_uptree_g = ''
    for mut in haplo_muts_list:
        pos = mut[b3x]
        if pos in mt:
            #print(mt[pos], mut)
            if mt[pos]['gen'] == mut['t']:
                #print(mt[pos], mut)
                #if mut['!']==0:
                all_mutations.append(mut)
                if mut['l'] < min_match_level:
                    continue
                #TODO: faster uptree algo for big databases
                extras_all.append(mut)
                if mut['g'] == prev_uptree_g:
                    #no need to add same group many times, will speed up considerably
                    continue
                prev_uptree_g = mut['g']
                uptree = yfind_uptree(snpset, mut['g'])
                #print(uptree)
                #print_uptree(snpset, uptree, do_print=True, b3x=b3x)
                uptrees.append(uptree)
    #if there is a forced path set, find a path for it too
    if force:
        uptree = yfind_uptree(snpset, force)
        uptrees.append(uptree)
    extras_all2 = {}
    for mut in haplo_allmuts_list:
        pos = mut[b3x]
        if pos in mt:
            if mt[pos]['gen'] == mut['t']:
                extras_all2[pos]=mut
    #find the uptree route that is most consistent
    #random mutations can have many negative matches above them until the path
    #reaches the common segment shared with the correct ones
    best_trees=[]
    for ut in uptrees:
        verbose=0
        pos=0
        neg=0
        tot=0
        bm=[]
        dbm=[]
        tbm=[]
        #prepare list of back mutations that need special handling
        #for mut in ut:
        #    #TODO triple bm
        #    if mut['!'] == 1:
        #        bm.append(mut)
        #    if mut['!'] == 2:
        #        dbm.append(mut)
        #    if mut['!'] == 3:
        #        tbm.append(mut)
        tag=''
        ut_copy=[]
        for i, mut in enumerate(ut):
            if mut[b3x] in mt and mut['t'] != '?':
                bm_matched = 0 #bm_match(snpset, i, ut, bm)
                dbm_matched = 0 #bm_match(snpset, i, ut, dbm)
                if ymatch(snpset, mut, b3x):
                    if bm_matched:
                        ##Double back mutation
                        if verbose: print("!!%-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw']))
                        tag='!!'
                    else:
                        #TODO: should not match double without a single back mutation
                        if verbose: print("+ %-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw']))
                        tag='+'
                        pos+=1
                        tot+=1
                else:
                    if bm_matched:
                        if verbose: print("! %-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw']))
                        tag='!'
                    elif dbm_matched:
                        if verbose: print("!!%-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw']))
                        tag='!!'
                    else:
                        if verbose: print("- %-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw']))
                        tag='-'
                        neg+=1
                        tot+=1
            else:
                tag=''
                #if verbose: print(" (%s)"%mut['g'])
            mut_copy = mut.copy()
            mut_copy['tag']=tag
            ut_copy.append(mut_copy)
        #find extras: mutations that are found in snpset but not in uptree
        toremove={}
        extras=extras_all.copy()
        for mut in ut_copy:
            toremove[mut['raw']]=1
        extras = filter(lambda e: not e['raw'] in toremove, extras)
        extras = sorted(extras, key=lambda i: int(i[b3x]))
        last = ''
        de_duplicate=[]
        for e in extras:
            if last != e['raw']: # and e['!'] == 0:
                de_duplicate.append(e)
            last = e['raw']
        extras = de_duplicate
        extras2=extras_all2.copy()
        for mut in ut_copy:
            if mut[b3x] in extras2 and mut['t'] == extras2[mut[b3x]]['t']:
                del extras2[mut[b3x]]
        nextras = len(extras)
        #extract unique all mutations
        last = ''
        de_duplicate=[]
        #for e in sorted(all_mutations, key=lambda x: int(re.compile(r'[^\d.]+').sub('', x['raw']))):
        for e in sorted(all_mutations, key=lambda x: x['raw']):
            if last != e['raw']:
                de_duplicate.append(e)
            last = e['raw']
        all_mutations = de_duplicate
        #calculate score giving penalty from negative matches, favoring longest matches
        score = 100.0*(pos - 2.0*neg + 0.00*tot - 0.0*nextras)/(tot + nextras)
        bt={
            'ut': ut_copy,
            'score': score,
            'nextras': nextras,
            'extras': extras,
            'extras2': extras2,
            'all': all_mutations,
            'pos': pos,
            'neg': neg,
            'tot': tot,
        }
        best_trees.append(bt)
    if filt:
        if filt[0] == '=':
            best_trees = filter(lambda bt: filt[1:] == bt['ut'][-1]['g'], best_trees)
        else:
            best_trees = filter(lambda bt: filt in bt['ut'][-1]['g'], best_trees)
    best_trees=sorted(best_trees, key=lambda i: -i['score'])
    return best_trees[:nbest]
def yfind(snpset):
    y=snpset['Y']
    found_mutations = []
    for snp in y:
        if 'b36' in y[snp] and y[snp]['b36'] in haplo_muts_by_b36:
            if y[snp]['gen'] == haplo_muts_by_b36[y[snp]['b36']]['p']:
                m = haplo_muts_by_b36[y[snp]['b36']]
                found_mutations.append(m)
                #print("Match loc: %s - %s (b38:%s = %s)"%(m['g'], m['m'], m['b38'], m['p']))
        if 'b37' in y[snp] and y[snp]['b37'] in haplo_muts_by_b37:
            if y[snp]['gen'] == haplo_muts_by_b37[y[snp]['b37']]['p']:
                m = haplo_muts_by_b37[y[snp]['b37']]
                found_mutations.append(m)
                #print("Match loc: %s - %s (b38:%s = %s)"%(m['g'], m['m'], m['b38'], m['p']))
        if 'b38' in y[snp] and y[snp]['b38'] in haplo_muts_by_b38:
            if y[snp]['gen'] == haplo_muts_by_b38[y[snp]['b38']]['p']:
                m = haplo_muts_by_b38[y[snp]['b38']]
                found_mutations.append(m)
                #print("Match loc: %s - %s (b38:%s = %s)"%(m['g'], m['m'], m['b38'], m['p']))
    found_mutations.sort(key=_mk_sort_key)
    return found_mutations
def _mk_sort_key(mut):
    #sort by mutation age - alphabetical order is roughly age order
    g = mut['g']
    g = g.rstrip('~')
    #some tweaks to improve ordering
    if g == 'CT':
        pr = 'C2'
    elif g.startswith('GH'):
        pr = 'G2'
    elif g.startswith('IJK'):
        pr = 'I2'
    elif g.startswith('IJ'):
        pr = 'I3'
    elif g.startswith('NO'):
        pr = 'N2'
    else:
        pr = g[0]+'5'
    return pr + g
haplo_isogg_info = 'Version: 15.73 Date: 11 July 2020'
haplo_muts_yfull_info = ''
haplo_muts_by_b38 = {}
haplo_muts_by_b37 = {}
haplo_muts_by_b36 = {}
haplo_muts_by_name = {}
haplo_muts_list = []
haplo_ftdna_muts_list = []
haplo_ftdna_muts_by_name = {}
haplo_allmuts_list = []
#Imports database from ISOGG spreadsheet
def load_snp():
    lnum = 0
    pnum = 0
    # https://isogg.org/tree/ISOGG_YDNA_SNP_Index.html
    # Save as csv:
    # https://docs.google.com/spreadsheets/d/1UY26FvLE3UmEmYFiXgOy0uezJi_wOut-V5TD0a_6-bE/edit#gid=1934392066&fvid=105380649
    # Name,Subgroup Name,Alternate Names,rs numbers,Build 37 Number,Build 38 Number,Mutation Info
    with open('SNP Index - Human.csv') as f:
        for line in f:
            lnum += 1
            if lnum <= 2:
                continue
            sline = [ '{}'.format(x) for x in list(csv.reader([line], delimiter=',', quotechar='"'))[0] ]
            if len(sline) < 7:
                continue
            if len(sline[6]) < 3:
                print('TODO1:', line) #TODO: fix other bugs in db
                continue
            try:
                #int(sline[4]) #may be missing
                int(sline[5])
            except:
                print('TODO2:', line) #TODO: ranges are not SNPs
                #try:
                #    sline[5] = re.findall(r'\d+', sline[5])[0]
                #except:
                continue
            b37=sline[4]
            b38=sline[5]
            mname=sline[0].split('.')[0] ##TODO test multiple hg with same mut
            #p=sline[6].replace('-.', '->',1).split('->')
            p=sline[6].split('->')
            if len(p) < 2:
                print('TODO3:', line) #TODO: format errors
                continue
            if len(p[0]) > 1 or len(p[1]) > 1:
                print('TODO4:', line) #TODO: multibase
                continue
            p=p[1][0].upper()
            mut = {
                'm': mname,
                'mall': sline[0],
                'g': sline[1],
                'rs': sline[3],
                'b37': b37,
                'b38': b38,
                'p': p,
            }
            #print(mname)
            if mname not in haplo_muts_by_name:
                haplo_muts_by_name[mname] = mut
            if b38 not in haplo_muts_by_b38:
                haplo_muts_by_b38[b38] = mut
            else:
                haplo_muts_by_b38[b38]['mall'] += ", " + sline[0]
            if b37 not in haplo_muts_by_b37:
                haplo_muts_by_b37[b37] = mut
            pnum+=1
    print("Lines in ISOGG mut DB: %d (%d)"%(lnum, pnum))
haplo_yfull_muts_by_name = {}
haplo_yfull_muts_by_b38 = {}
#Imports database from YFull /snp-list/ URL
def load_yfull_snp_file(fname):
    pnum = 0
    with open(fname) as f:
        print('Importing file: ' +fname)
        soup = BeautifulSoup(f.read(), features="html.parser")
        d = soup.find('div', id='t1')
        t = d.find('table')
        rows = t.find_all('tr')
        for row in rows:
            pnum+=1
            if pnum == 1:
                continue
            cells = row.find_all('td')
            mname=cells[0].text
            b38=cells[3].text
            if b38 == '':
                continue
            der=cells[5].text
            if len(der) > 1:
                #TODO: multibase "SNP"
                der='?'
            mut = {
                'm': mname,
                'mall': '?',
                'g': '',
                'rs': '?',
                'b37': cells[2].text,
                'b38': b38,
                'p': der,
            }
            if mname not in haplo_yfull_muts_by_name:
                haplo_yfull_muts_by_name[mname] = mut
            if b38 not in haplo_yfull_muts_by_b38:
                haplo_yfull_muts_by_b38[b38] = mut
def load_yfull_snp(pages):
    try:
        os.mkdir('yfull')
    except OSError:
        pass
    for page in range(1, pages):
        fname='yfull/yfull-snp-'+str(page)+'.html'
        url='https://www.yfull.com/snp-list/?page='+str(page)
        try:
            with open(fname) as f:
                pass
        except OSError:
            print('File not found: ' +fname)
            print('Downloading ' + url + ' to file: ' + fname)
            urllib.request.urlretrieve(url, fname)
        load_yfull_snp_file(fname)
    print("Lines in YFull mut DB: ", len(haplo_yfull_muts_by_name))
haplo_ybrowse_info = ''
haplo_ybrowse_muts_by_name = {}
# Source: http://www.ybrowse.org/gbrowse2/gff/
def load_ybrowse_snp():
    ln=0
    with open('snps_hg38.gff3') as f:
        for line in f:
            ln+=1
            if ln % 100000 == 0:
                print('Line: %d...'%ln)
            if len(line) < 2:
                continue
            if line.startswith('## date'):
                global haplo_ybrowse_info
                haplo_ybrowse_info=line[8:].strip('\n')
            if line[0] == '#':
                continue
            tlv=line.split('\t')
            if len(tlv) < 8:
                print('TODO:', tlv)
                continue
            #print(tlv)
            #TODO: indel works only partially due to varying coding, disabled importing
            #if tlv[0] != 'chrY' or tlv[2] != 'snp' or (tlv[1] != 'point' and tlv[1] != 'indel'):
            if tlv[0] != 'chrY' or tlv[2] != 'snp' or (tlv[1] != 'point'):
                if tlv[1] != 'primate' and tlv[1] != 'indel':
                    print(tlv)
                continue
            #TODO quality based filtering
            b38 = tlv[3]
            fields=tlv[8].split(';') #GFF attribute column; renamed from 'f' to avoid shadowing the file handle
            if len(fields) < 4:
                continue
            mname=fields[0].split('=')[1]
            der=fields[3].split('=')[1]
            der=der[0].upper()
            isog='' #fields[7].split('=')[1]
            #print(b38, mname, der)
            mut = {
                'm': mname,
                'mall': '?',
                'g': isog,
                'rs': '?',
                'b38': b38,
                'p': der,
            }
            if mname not in haplo_ybrowse_muts_by_name:
                haplo_ybrowse_muts_by_name[mname] = mut
    print("Lines in YBrowse snp DB: ", len(haplo_ybrowse_muts_by_name))
# Convert formats with CrossMap and chain file in crossmap/
# http://crossmap.sourceforge.net/
# ftp://ftp.ensembl.org/pub/assembly_mapping/homo_sapiens/
def convert_build38_mkinput():
    haplo_mut = haplo_muts_by_name
    haplo_mut2 = haplo_yfull_muts_by_name
    haplo_mut3 = haplo_ybrowse_muts_by_name
    haplo_mut4 = haplo_ftdna_muts_by_name
    with open('crossmap/conv_in.bed', 'w') as f:
        for mut in haplo_mut:
            f.write("chrY %d %d %s\n"%(int(haplo_mut[mut]['b38']), int(haplo_mut[mut]['b38']), haplo_mut[mut]['m']))
        for mut in haplo_mut2:
            try:
                f.write("chrY %d %d %s\n"%(int(haplo_mut2[mut]['b38']), int(haplo_mut2[mut]['b38']), haplo_mut2[mut]['m']))
            except:
                pass
        for mut in haplo_mut3:
            f.write("chrY %d %d %s\n"%(int(haplo_mut3[mut]['b38']), int(haplo_mut3[mut]['b38']), haplo_mut3[mut]['m']))
        for mut in haplo_mut4:
            f.write("chrY %d %d %s\n"%(int(haplo_mut4[mut]['b38']), int(haplo_mut4[mut]['b38']), 'none'))
def convert_build38to36():
    convert_build38_mkinput()
    os.system("cd crossmap; CrossMap.py bed GRCh38_to_NCBI36.chain.gz conv_in.bed > conv_out.bed")
    i=0
    with open('crossmap/conv_out.bed', 'r') as f:
        for line in f:
            con=line.split('->')
            if len(con) < 2:
                #print('TODO:', line)
                continue
            b36 = con[1].split()[1]
            b38 = con[0].split()[1]
            mname=''
            if len(con[0].split()) > 3:
                mname = con[0].split()[3]
            if mname in haplo_muts_by_name:
                haplo_muts_by_name[mname]['b36'] = b36
            if mname in haplo_yfull_muts_by_name:
                haplo_yfull_muts_by_name[mname]['b36'] = b36
            if mname in haplo_ybrowse_muts_by_name:
                haplo_ybrowse_muts_by_name[mname]['b36'] = b36
            if mname in haplo_ftdna_muts_by_name:
                haplo_ftdna_muts_by_name[mname]['b36'] = b36
            i+=1
    os.system("cd crossmap; rm conv_in.bed conv_out.bed")
def convert_build38to37():
    convert_build38_mkinput()
    os.system("cd crossmap; CrossMap.py bed GRCh38_to_GRCh37.chain.gz conv_in.bed > conv_out.bed")
    i=0
    with open('crossmap/conv_out.bed', 'r') as f:
        for line in f:
            con=line.split('->')
            if len(con) < 2:
                #print('TODO:', line)
                continue
            b37 = con[1].split()[1]
            b38 = con[0].split()[1]
            mname=''
            if len(con[0].split()) > 3:
                mname = con[0].split()[3]
            if mname in haplo_muts_by_name:
                haplo_muts_by_name[mname]['b37'] = b37
            if mname in haplo_yfull_muts_by_name:
                haplo_yfull_muts_by_name[mname]['b37'] = b37
            if mname in haplo_ybrowse_muts_by_name:
                haplo_ybrowse_muts_by_name[mname]['b37'] = b37
            if mname in haplo_ftdna_muts_by_name:
                haplo_ftdna_muts_by_name[mname]['b37'] = b37
            i+=1
    os.system("cd crossmap; rm conv_in.bed conv_out.bed")
#Load and save full converted database in a local cache file for faster loading
def save_db():
    #pickle.dump( haplo_muts_by_b36, open( "haploy_map.txt", "wb" ) )
    with open('haploy_map.txt', 'w') as f:
        for mut in haplo_muts_by_name:
            print(haplo_muts_by_name[mut], file = f)

def load_db():
    #haplo_muts_by_b36 = pickle.load( open( "haploy_map.txt", "rb" ) )
    with open('haploy_map.txt', 'r') as f:
        for line in f:
            mut = eval(line)
            haplo_muts_by_b36[mut['b36']] = mut
            haplo_muts_by_b37[mut['b37']] = mut
            haplo_muts_by_b38[mut['b38']] = mut
use_ftdna_tree=0
def load_db2j(min_tree_load_level=0):
    fname='haploy_map2j.txt'
    if use_ftdna_tree:
        fname='haploy_map3j.txt'
    with open(fname, 'r') as f:
        jdata = json.load(f)
    global haplo_ybrowse_info
    global haplo_isogg_info
    global haplo_muts_yfull_info
    global ftdna_info
    haplo_ybrowse_info = jdata['info_ybrowse']
    haplo_isogg_info = jdata['info_isogg']
    haplo_muts_yfull_info = jdata['info_yfull']
    ftdna_info = jdata['info_ftdna']
    for mut in jdata['muts']:
        if mut['l'] >= min_tree_load_level:
            haplo_muts_list.append(mut)
def save_db2j():
    jroot={
        'info_ybrowse': haplo_ybrowse_info,
        'info_isogg': haplo_isogg_info,
        'info_yfull': haplo_muts_yfull_info,
        'info_ftdna': ftdna_info,
        'muts': haplo_muts_list }
    with open('haploy_map2j.txt', 'w') as f:
        json.dump(jroot, f, indent=1)
def save_db3j():
    jroot={
        'info_ybrowse': haplo_ybrowse_info,
        'info_isogg': haplo_isogg_info,
        'info_yfull': haplo_muts_yfull_info,
        'info_ftdna': ftdna_info,
        'muts': haplo_ftdna_muts_list }
    with open('haploy_map3j.txt', 'w') as f:
        json.dump(jroot, f, indent=1)
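# The JSON caches written above have the shape
#   {"info_ybrowse": ..., "info_isogg": ..., "info_yfull": ..., "info_ftdna": ..., "muts": [...]}
# where each element of "muts" is one entry of haplo_muts_list (haploy_map2j.txt) or
# haplo_ftdna_muts_list (haploy_map3j.txt).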
#TODO: results in excessive extras - errors in database or back mutations needing proper handling?
def save_alldbj():
    allmuts=[]
    for mutkey in haplo_ybrowse_muts_by_name:
        mut = haplo_ybrowse_muts_by_name[mutkey]
        #for mutkey in haplo_muts_by_name:
        #    mut = haplo_muts_by_name[mutkey]
        if is_unreliable_b38(int(mut['b38'])):
            continue
        if is_blacklisted_name(mutkey):
            continue
        allmut={
            'n': mutkey,
            'b38': mut['b38'],
            't': mut['p'] }
        allmuts.append(allmut)
    jroot={
        'info': 'todo',
        'muts': allmuts }
    with open('haploy_alldbj.txt', 'w') as f:
        json.dump(jroot, f, indent=1)

def load_alldbj():
    with open('haploy_alldbj.txt', 'r') as f:
        jdata = json.load(f)
    for mut in jdata['muts']:
        haplo_allmuts_list.append(mut)
def show_db2():
    for m in sorted(haplo_muts_list, key=lambda e: int(e['b37'])):
        print(m)
blacklist_etc='M8990' #str etc
blacklist_yb='M3745'
blacklist_yf='Z8834 Z7451 YP1757 YP2129 YP1822 YP1795 YP2228 YP1809 YP2229 YP1948 YP2226 YP1827 L508'
blacklist_yf+=' Y125394 Y125393 Y125392 Y125391 Y125390 Y125389 Y125396 Y125397 Y125408 [report-spacer] V1896 PAGE65.1 Y2363 PF3515 PF3512 PF3507 PF3596 Z6023 M547 A3073 Z1716 PF5827 PF1534 PF6011 PF2634 PF2635'
blacklist_yf+=' YP3755 FGC26799 FGC25766 FGC25761 FGC27299 FGC25769 FGC27434 FGC27818 L1114 FGC25649 FGC26816 A3281 ALK431 A2953 YP3792 \
FGC25649 FGC26816 A3281 FGC27688 FGC25512 FGC27324 L1113 Y49138 FGC25634 M4507 FGC27864 FGC25716 YP3773 FGC26539\
FGC25472 FGC26328 FGC27725 AF4 FGC27435 FGC25546 FGC26921 FGC24700 FGC25917 YP3766 YP3771 FGC25743 FGC26773 V2271\
FGC25748 YP3754 V3872 FGC26706 FGC25651 Y17329 FGC27857 FGC25782 FGC26895 V1957 FGC24743 A2969 FGC26812 FGC26409 FGC24722 \
FGC24780 V2040 FGC25635 FGC25742 FGC27823 YP3459 FGC27321 Y9854 Y17293 YP1838 YP1740 YP2250 YP1807 YP2228 YP1841' #Not defining for A0-T but A00
blacklist_yf+=' YP2174 FGC25497 FGC25562 YP1808 YP1832 FGC26383 YP2133 FGC26577 A2674 FGC24796 L1132 FGC24795 FGC25408 YP1756 FGC26394\
FGC25797 YP2159 FGC27497 YP2251 YP1813 YP1819 YP1820 FGC25816 FGC26267 FGC26688 FGC27081 FGC27116 YP2248 YP2231 YP2182\
YP2180 YP2134 YP2152 YP1828 FGC26487 FGC27290 FGC25704 L1143 FGC27473 FGC25677 YP2254 Y125419 CTS12490 M6176 \
Y125369 L1129 AF12 Y125404 Y125406 Y125417 L1095 A3073' #Not defining for A00 but A0-T
blacklist_yf+=' Z798' #Not same as M1076/V1836
blacklist_unreliable='S782 M8963 Z6132 S27746 Y1477 BY2829'
blacklist=blacklist_etc+' '+blacklist_yb+' '+blacklist_yf+' '+blacklist_unreliable
blacklist=blacklist.split()
#blacklist=[]
unreliable_region_b38 = [
    [1, 2781479],
    [10072350, 11686750],
    [20054914, 20296729], [20296731, 20351054],
    [26637971, 26673210],
    [56887903, 57217415]]
def is_unreliable_b38(loc):
    if loc is None:
        return False
    try:
        loc = int(loc)
    except ValueError:
        return False
    for region in unreliable_region_b38:
        if loc >= region[0] and loc <= region[1]:
            return True
    return False

def is_blacklisted_name(name):
    if name in blacklist:
        return True
    return False
def decode_entry(e):
    global haplo_muts_by_name
    e2=[]
    for e1 in e.split('/'):
        e2.append(e1.replace('(H)',''))
    m={}
    for e1 in e2:
        if is_blacklisted_name(e1):
            return m
    for e1 in e2:
        if e1 in haplo_muts_by_name:
            mut = haplo_muts_by_name[e1]
            m['f']=''
            if not 'isog' in m:
                m['isog']=''
            m['isog']+=mut['g']+''
            m['t']=mut['p']
            m['b38']=mut['b38']
            if 'b37' in mut:
                m['b37']=mut['b37']
            if 'b36' in mut:
                m['b36']=mut['b36']
            break
    for e1 in e2:
        if e1 in haplo_ybrowse_muts_by_name:
            mut = haplo_ybrowse_muts_by_name[e1]
            m['f']=''
            if not 'isog' in m:
                m['isog']=''
            #m['isog']+=mut['g']+'+(YB)'
            m['t']=mut['p']
            m['b38']=mut['b38']
            if 'b37' in mut:
                m['b37']=mut['b37']
            if 'b36' in mut:
                m['b36']=mut['b36']
            break
    for e1 in e2:
        if e1 in haplo_yfull_muts_by_name:
            mut = haplo_yfull_muts_by_name[e1]
            m['f']=''
            if not 'isog' in m:
                m['isog']=''
            #m['isog']+=mut['g']+'+(YF)'
            if 't' in m and mut['p'] != '?' and m['t'] != mut['p']:
                print('Mismatch:', m, mut)
            m['t']=mut['p']
            m['b38']=mut['b38']
            if 'b37' in mut:
                m['b37']=mut['b37']
            if 'b36' in mut:
                m['b36']=mut['b36']
            break
    #cross-checking
    for e1 in e2:
        if e1 in haplo_ftdna_muts_by_name:
            mut = haplo_ftdna_muts_by_name[e1]
            if 'b38' in m and 't' in m and mut['t'] != '?':
                if int(mut['b38']) > 0:
                    if mut['b38'] != m['b38']:
                        print('FTDNA pos mismatch:', e, mut['b38'], m['b38'], mut, m)
                else:
                    if mut['t'] != m['t']:
                        print('FTDNA der mismatch:', e, mut['t'], m['t'], mut, m)
            if 'b38' in m:
                m['ftg']=mut['g']
    return m
def yfull_fname(group):
    if group:
        return 'yfull/yfull-ytree-'+group+'.html'
    else:
        return 'yfull/yfull-ytree.html'

def yfull_url(group):
    if group:
        return 'https://www.yfull.com/tree/' + group + '/'
    else:
        return 'https://www.yfull.com/tree/'
# YFull ytree import (experimental)
def download_yfull_file(group):
    try:
        os.mkdir('yfull')
    except OSError:
        pass
    fname = yfull_fname(group)
    url = yfull_url(group)
    print('Downloading ' + url + ' to file: ' + fname)
    urllib.request.urlretrieve(url, fname) #use the url built above so an empty group maps to the tree root
def yfull_parse_muts(li):
    s=''
    snpforhg=li.find('span', class_='yf-snpforhg', recursive=False)
    if snpforhg:
        s+=snpforhg.text
    plussnps=li.find('span', class_='yf-plus-snps', recursive=False)
    if plussnps:
        s += ' * ' + plussnps['title']
    o=[]
    if len(s) > 0:
        for m in s.split('*'):
            o.append(m.strip())
    return o

def yfull_parse_age(li):
    s=''
    agespan=li.find('span', class_='yf-age', recursive=False)
    if agespan:
        s+=agespan.text
    return s
#these have subtree in their url even when they have persons listed
def yfull_is_tree_quirk(group_name, fileroot):
    if fileroot:
        return False
    if group_name=='R-P312':
        return True
    if group_name=='R-Z2118':
        return True
    if group_name=='R-M335':
        return True
    if group_name=='R-U106':
        return True
    if group_name=='K-Y28299':
        return True
    if group_name=='H':
        return True
    return False
def yfull_recurse_list(ul_in, level, fileroot):
    lis = ul_in.find_all('li', recursive=False)
    for li in lis:
        muts={}
        muts['l']=level
        g=li.find('a', recursive=False)
        group_name=''
        if g:
            group_name=g.text
            muts['g']=g.text
        l=li.find('a', href=True, recursive=False)
        if l:
            muts['link']=l['href']
            #print(g.text, g['href'])
        mlist = yfull_parse_muts(li)
        age = yfull_parse_age(li)
        #if 'g' in muts and not muts['g'].endswith('*') and not fileroot:
        if 'g' in muts and not muts['g'].endswith('*'):
            #add separate entries for each mut in same hg