-
Notifications
You must be signed in to change notification settings - Fork 1
/
maf_iterate.py
137 lines (116 loc) · 5.9 KB
/
maf_iterate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/python
def maf_iterate(handle, gene_ranges, fo1, fo2, block_id=1):
"""Iterates over lines in a MAF file. Extract interested range of sequences,
with species, chromosome, block coordinates, and sign an incremented fragment
id to the block."""
print gene_ranges
# gets the first gene range, and remove it from the gene_ranges.
gene_range = gene_ranges.pop(0)
#ucsc is 0 indexing basis, AVA is 1 indexing basis, convert AVA index to ucsc.
gene_start, gene_end = gene_range[0] - 1, gene_range[1] - 1
is_match = False
in_a_block = False
while True:
# allows parsing of the last block
try:
line = handle.next()
except StopIteration:
line = ""
if in_a_block:
if line.startswith('s'):
#process the sequence 's' line, where all informations are presented
line_split = line.strip().split()
if len(line_split) != 7:
raise ValueError("Error parsing alignment - 's' line must have 7 fields")
species, chromosome = line_split[1].split('.')[0], line_split[1].split('.')[1]
strand, sequence = line_split[4], line_split[6]
#check block coordinates of hg19 matches gene range.
if species == "hg19":
block_start, block_size = int(line_split[2]), int(line_split[3])
block_end = block_start + block_size - 1
if block_start <= gene_end and block_end >= gene_start:
#remove the first entry of gene ranges
is_match = True
fo1.write("{0},{1},{2},{3}\n".format(str(block_id), chromosome.strip('chr'), str(block_start), str(block_end)))
else:
is_match = False
if block_end > gene_end:
# iteration passes the matched block, get next gene range by pop(0)
if len(gene_ranges) > 0:
gene_range = gene_ranges.pop(0)
#ucsc is 0 indexing basis, AVA is 1 indexing basis, convert AVA index to ucsc.
gene_start, gene_end = gene_range[0] - 1, gene_range[1] - 1
else:
break
if is_match:
fo2.write("{0},{1},{2},{3}\n".format(str(block_id), species, strand, sequence))
elif line.startswith("e") or \
line.startswith("i") or \
line.startswith("q"):
# not implemented
pass
#empty line as break of block
elif not line.strip():
in_a_block = False
else:
raise ValueError("Error parsing alignment - unexpected line:\n%s" % (line,))
elif line.startswith("a"):
# start a bundle of records
in_a_block = True
if is_match:
block_id += 1
if len(line.strip().split()[1:]) != line.count("="):
raise ValueError("Error parsing alignment - invalid key in 'a' line")
elif line.startswith("#"):
# ignore comments
pass
elif not line:
break
return block_id
def chr_iterate(bed_input, out_dir, block_id, start_chr, end_chr):
"""iterates over all the chromosomes from hg19. Read in bed_input file "chr# gend_start gene_end gene_id"
and gzip maf files; outputs block range file and block sequence file for each chromosome.
block_id will be defined as the starting block_id, chr_start will be the first chromosome to be processed, it
will be converted to list index for all chromosomes, end_chr will be included into the processing."""
import os
import gzip
#gets the directory of the script
path = os.path.dirname(os.path.abspath(__file__))
#gets directory that the chr*.maf.gz sit.
chr_path = '/' + path.strip('/tools') + '/ref/multiz100way/maf/'
bed_list = []
for line in bed_input:
if line.strip():
line_split = line.strip().split()
chr_index = line_split[0]
gene_start = int(line_split[1])
gene_end = int(line_split[2])
bed_list.append((gene_start, gene_end, chr_index))
#note that AVA call mitochondra chromosome as chrMT, ucsc calls it chrM.
chrs = ['chr' + str(x) for x in range(1,23)] + ['chrX', 'chrY', 'chrMT']
# chrs = ['chr' + str(x) for x in range(22,23)]
#range of chromosomes, X will be 23, and Y will be 24, and MT will be 25.
for chr in chrs[start_chr -1 : end_chr]:
print chr
#gets the gene ranges from bed_list for the iterated chr
gene_ranges = [x[:2] for x in bed_list if x[2] == chr]
#sorts the list from smallest start to largest.
gene_ranges.sort(key=lambda x: x[0])
fo1_file = 'multiz100way_genome_alignment_coord' + chr + '.csv'
fo2_file = 'multiz100way_genome_alignment_seq' + chr + '.csv'
fo1 = open(os.path.join(out_dir, fo1_file), 'w')
fo2 = open(os.path.join(out_dir, fo2_file), 'w')
maf_gz_file = chr + ".maf.gz"
handle = gzip.open(os.path.join(chr_path, maf_gz_file), 'rb')
#run the maf_iterate on the iterated chr, and get the ending block_id for next iteration.
block_id = maf_iterate(handle, gene_ranges, fo1, fo2, block_id)
fo1.close()
fo2.close()
if __name__ == "__main__":
import sys
bed_input = open(sys.argv[1], 'rU')
out_dir = str(sys.argv[2])
block_id = int(sys.argv[3])
start_chr = int(sys.argv[4])
end_chr = int(sys.argv[5])
chr_iterate(bed_input, out_dir, block_id, start_chr, end_chr)