-
Notifications
You must be signed in to change notification settings - Fork 0
/
recurrence_first_v2.py
136 lines (126 loc) · 8.03 KB
/
recurrence_first_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#Hufsah Ashraf, 9.11.2020
#latest script for getting (per single cell) Watson and Crick read counts for each bi allelic SNP allele that lies within an inversion
import argparse
import sys
import pysam
from pathlib import Path
import glob
import pdb
from collections import defaultdict
import vcf
from pysam import VariantFile
def counts(input_bam, input_bed,count_output, input_vcf,sample):
dictionary={}
#change 'a' to 'w' in following line if the file doesn't already exist, 'a' appends to an already existing file
counts_file= open(count_output, 'w')
# Write header to output file
counts_file.write('sample'+'\t'+'cell'+'\t'+'chrom'+'\t'+'inv_start'+'\t'+'inv_end'+'\t'+'inv_ID'+'\t'+'snp_pos'+ "\t"+'ref_allele'+ "\t" +'alt_allele'+ "\t"+'seen_allele'+ "\t" + 'orientation' + "\t" + 'count' + "\t" + 'snp_AF' +"\n")
# Get all bam file paths
path = Path(input_bam)
glob_path = path.glob('*.bam')
# Iterate over each cell / bam file
for file in glob_path:
# Load the according file
file_name = str(file).strip().split("/")[-1]
cell=file_name.strip().split(".bam")[0]
bam_file = pysam.AlignmentFile(file, "rb")
# Also get the bed_file
with open(input_bed, 'r') as bed_file:
# Iterate over each bed file entry
for line in bed_file:
line_r = line.strip().split("\t")
chromosome= line_r[0]
interval_start= int(line_r[1])
interval_end= int(line_r[2])
inversion_id=line_r[3]
# Fetch all bam entries that fall in this bin.
vcf_in = VariantFile(input_vcf) # auto-detect input format
recs=[]
for variant in vcf_in.fetch(chromosome, interval_start, interval_end):
assert len(variant.alts)==1 #to make sure all snps are bi allelic
#if variant.info["AF"][0] >=0.2 and variant.info["AF"][0] <=0.8: #to consider only frequent snps
recs.append((variant.pos,variant.ref, variant.alts[0], variant.info["AF"][0]))
for read in bam_file.fetch(chromosome, interval_start, interval_end):
orientation= None
allele=None
readname= read.query_name
query_seq = read.query_sequence
query_seq_index= (read.query_alignment_start)-1
if read.reference_start is not None:
coord_ref_start = (read.reference_start)+1 # leftmost ref_coordinate where the read maping started
if read.reference_end is not None:
coord_ref_end = (read.reference_end)-1 #rightmost ref_coordinate where the read maping ended
readinfo=read.get_aligned_pairs(matches_only=False, with_seq=False)
if len(readinfo)>0:
print(readname, readinfo[0])
all_tuples=list()
i=0
snp_count=0 #no. of snps a read covers
index_counter=0
c1 = 'read.is_read2'
c2 = 'read.is_qcfail'
c3 = 'read.is_secondary'
c4 = 'read.is_duplicate'
c5 = 'read.mapq < 10'
c6 = 'read.pos < interval_start'
c7 = 'read.pos >= interval_end'
if any([eval(c1),eval(c2),eval(c3),eval(c4),eval(c5),eval(c6),eval(c7)]):
pass
else:
if read.is_reverse:
orientation='Watson'
elif not read.is_reverse:
orientation='Crick'
else:
orientation=None
if orientation is not None:
seen_positions = set()
for r in range(len(recs)):
rec_pos,rec_ref,rec_alt,AF= recs[r]
assert rec_pos not in seen_positions
seen_positions.add(rec_pos)
# a variant should be processed only if it lies somehere within the read mapping region
if rec_pos>=coord_ref_start and rec_pos<=coord_ref_end:
index_all_tuples=index_counter
for t in range(len(readinfo)):
snp_count+=1
tup=readinfo[t]
index_counter+=1
if tup[1] is not None:
all_tuples.append(tup[0])
i+=1
if tup[1]==rec_pos:
posit= all_tuples[i-2]
if posit is not None:
allele= query_seq[posit]
print('allele was obtained from position', posit , 'in the query and all tuples was accessed at ', i-2 )
else:
allele=None
print(cell,readname,coord_ref_start, coord_ref_end,inversion_id,len(recs),rec_pos,rec_ref, rec_alt, allele, orientation)
if allele is not None and (allele==rec_ref or allele==rec_alt):
if (sample, cell,chromosome, interval_start, interval_end, inversion_id,rec_pos,rec_ref,rec_alt,allele,orientation) in dictionary:
dictionary[(sample, cell,chromosome, interval_start, interval_end, inversion_id,rec_pos,rec_ref,rec_alt,allele,orientation,AF)]+=1
else:
dictionary[(sample, cell,chromosome, interval_start, interval_end, inversion_id,rec_pos,rec_ref,rec_alt,allele,orientation,AF)]=1
else:
print(rec_pos,rec_ref, rec_alt, read.query_name, 'neither of the two alleles but carries ', allele)
break
#if it is before the read start, move to the next variant
elif rec_pos<coord_ref_start:
pass
#if it lies after the read end, move to the next read
elif rec_pos>coord_ref_start:
print(cell, readname, 'covered ',snp_count, 'snps')
break
for line in dictionary:
counts_file.write(str(line[0])+ "\t"+ str(line[1])+ "\t" + str(line[2])+"\t" + str(line[3])+ "\t"+ str(line[4])+ "\t" +
str(line[5]) + "\t"+ str(line[6])+"\t"+str(line[7])+ "\t"+ str(line[8])+"\t"+str(line[9])+"\t"+str(line[10])+"\t" + str(dictionary[line])+"\t"+str(line[11])+"\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-i", "--input_bam", help="The input bam files path", required=True)
parser.add_argument("-b", "--input_bed", type=str, help="The bed file with inversion segments",required=True)
parser.add_argument("-v", "--input_vcf",type=str, help="The vcf file containing only bi allelic SNVs, sorted by bcftools", required=True)
parser.add_argument("-o", "--count_output",type=str, help="The output file with watson and crick reads for each SNP allele (one per sample)", required=True)
parser.add_argument("-s", "--sample", help="The sample name", required=True)
args = parser.parse_args()
counts(args.input_bam, args.input_bed, args.count_output, args.input_vcf, args.sample)