-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMouseContamEstimator.py
63 lines (41 loc) · 1.63 KB
/
MouseContamEstimator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import argparse
import numpy
parser = argparse.ArgumentParser()
parser.add_argument('input', help='input file name with its path (TSV formatted output of GATK CollectAllelicCounts)')
args = parser.parse_args()
## Read HAMAlist ##
HAMAlist_file = open('HAMAlist.v002.vcf','r')
HAMAlist = {}
for HAMAlist_line in HAMAlist_file :
if HAMAlist_line[0] == '#' : continue
hama_col = HAMAlist_line.split('\t')
hama_id = hama_col[0]+'.'+hama_col[1]+hama_col[3]+'>'+hama_col[4]
HAMAlist.update({hama_col[0]+'.'+hama_col[1]:hama_col[3]+'>'+hama_col[4]})
## Estimate input file ##
input_file = open('%s'%args.input,'r')
HAMAF_list = []
for input_line in input_file :
if input_line[0] == '@' : continue
if input_line[:6] == 'CONTIG' : continue
col = input_line.rstrip('\n').split('\t')
pos = col[0]+'.'+col[1]
base = col[4]+'>'+col[5]
if pos in HAMAlist :
if base == HAMAlist[pos] :
HAMAF = float(col[3])/(float(col[2])+float(col[3]))
else : continue
HAMAF_list.append(HAMAF)
else : continue
HAMAF_median = numpy.median(HAMAF_list)
## Print result ##
print ('## MouseContamEstimator v0.1-beta.1 ##')
print ('Input File:\t'+args.input.split('/')[-1])
print ('Number of HAMA:\t'+str(len(HAMAF_list)))
print ('Median of HAMAFs:\t'+str(HAMAF_median))
if len(HAMAF_list) < 120000 :
print ('Estimated Mouse Contamination Ratio:\t The number of HAMA is insufficient to perform the estimation. (possibly no contamination of murine genome.)')
else :
if (HAMAF_median/0.7396)*100 >= 100 :
print ('Estimated Mouse Contamination Ratio:\t100 %')
else :
print ('Estimated Mouse Contamination Ratio:\t'+str(round((HAMAF_median/0.7396)*100,1))+' %')