UMI_seq_fragment_length.py
"""Count, for each position in a set of BED regions, the number of properly
paired fragments supporting each nucleotide, binned by fragment length."""

import argparse
import csv
import os

import numpy as np
import pysam


class Variation:
    """Simple container for a single variant (currently unused in this script)."""

    def __init__(self, pos, ref, alt):
        self.pos = pos
        self.ref = ref
        self.alt = alt


def str_set_join(s1, s2):
    """Join the distinct characters of s1 and s2 into one string (currently unused)."""
    return "".join(set(s1 + s2))
class FragmentFinder:
    def __init__(self, bam_file):
        # Create a BAM index if one does not exist yet; pileup() requires it.
        bai_filename = f"{bam_file}.bai"
        if not os.path.exists(bai_filename):
            print(f"No index file found ({bai_filename}), generating...")
            pysam.index(bam_file)
        # Open the BAM file for reading.
        self.bam_file = pysam.AlignmentFile(bam_file, "rb")
    def find_fragments(self, chrom, start, stop, mapq=20):
        """Yield (reference_position, allele, fragment_length) for properly
        paired fragments covering the region, keeping only pairs where both
        mates report the same A/T/G/C allele at the pileup position."""
        for pileupcolumn in self.bam_file.pileup(
            contig=chrom,
            start=start,
            stop=stop,
            truncate=True,
            min_mapping_quality=mapq,
            ignore_overlaps=False,
            flag_require=2,  # properly paired
        ):
            ref_pos = pileupcolumn.reference_pos
            # Remember the first mate of each pair; emit a fragment once the
            # second mate with the same read name is seen in this column.
            mem = {}
            for read in pileupcolumn.pileups:
                if read.is_del or read.is_refskip:
                    continue
                query_position = read.query_position
                read = read.alignment
                if read.is_duplicate or read.is_secondary or read.is_supplementary:
                    continue
                query_name = read.query_name
                if query_name not in mem:
                    mem[query_name] = (
                        read.reference_start,
                        read.reference_end,
                        read.is_reverse,
                        read.is_read1,
                        read.query_sequence[query_position],
                    )
                else:
                    mem_start, mem_end, mem_reverse, mem_is_read1, mem_read_allel = mem[query_name]
                    del mem[query_name]
                    # Skip pairs with missing coordinates or both mates on the same strand.
                    if (
                        mem_start is None
                        or mem_end is None
                        or read.reference_start is None
                        or read.reference_end is None
                        or read.is_reverse == mem_reverse
                    ):
                        continue
                    # The fragment spans from the forward mate's start to the reverse mate's end.
                    if read.is_reverse:
                        frag_start = mem_start
                        frag_end = read.reference_end
                        start_is_first = mem_is_read1
                    else:
                        frag_start = read.reference_start
                        frag_end = mem_end
                        start_is_first = not mem_is_read1
                    length = abs(read.template_length)
                    if frag_start >= frag_end or length == 0:
                        continue
                    read_allel = read.query_sequence[query_position]
                    # Require a concordant A/T/G/C call from both mates.
                    if read_allel not in "ATGC" or read_allel != mem_read_allel:
                        continue
                    yield ref_pos, read_allel, length
def regions(bed_file):
    """Yield one 1-bp window (chromosome, position, position + 1) for every
    position in every interval of a tab-separated BED-like file."""
    with open(bed_file) as fd:
        reader = csv.reader(fd, delimiter="\t")
        for line in reader:
            # Columns: chromosome, start position, end position.
            chromosome, start, end = line[0], int(line[1]), int(line[2])
            for position in range(start, end + 1):
                yield chromosome, position, position + 1
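# The BED-like input is read as plain tab-separated text: column 1 is the
# chromosome, columns 2 and 3 the start and end coordinates of a region.
# A minimal illustrative input (coordinates are hypothetical) could be:
#
#   chr1    1000000 1000010
#   chr2    2500000 2500004
#
# Each position in a region is expanded into its own 1-bp query window above.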
nucleotide_index_map = {"A": 0, "T": 1, "G": 2, "C": 3}
index_nucleotide_map = {0: "A", 1: "T", 2: "G", 3: "C"}
def main(bam_file, bed_file, output_file, max_length=700):
    finder = FragmentFinder(bam_file)
    region_lst = list(regions(bed_file))
    # Count tensor: one row per queried position, one plane per nucleotide,
    # one column per fragment length from 1 to max_length.
    tensor = np.zeros((len(region_lst), 4, max_length), dtype=np.uint16)
    for index, region in enumerate(region_lst):
        chrom, start, stop = region
        for ref_pos, read_allel, length in finder.find_fragments(chrom, start, stop):
            if 1 <= length <= max_length:
                nucle = nucleotide_index_map.get(read_allel)
                tensor[index, nucle, length - 1] += 1
    with open(output_file, "w", newline="") as fd:
        writer = csv.writer(fd, delimiter="\t")
        for i in range(len(tensor)):
            chrom, start, _ = region_lst[i]
            for j in range(4):
                writer.writerow([chrom, start, index_nucleotide_map.get(j)] + list(map(str, tensor[i, j])))
    print(tensor.sum())
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("bam_file", help="indexed (or indexable) BAM file")
    parser.add_argument("bed_file", help="tab-separated BED-like file with regions to query")
    parser.add_argument("output_file", help="tab-separated output file of fragment-length counts")
    args = parser.parse_args()
    main(
        args.bam_file,
        args.bed_file,
        args.output_file,
    )
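# Example invocation (file names are illustrative, not part of the repository):
#
#   python UMI_seq_fragment_length.py sample.bam regions.bed fragment_lengths.tsv
#
# The output is tab-separated with one row per (position, nucleotide) pair:
# chromosome, position, allele, followed by the count of fragments observed
# at each length from 1 up to max_length (default 700).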