forked from akahanaton/SpliceGrapher
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mafOrtho.py
executable file
·161 lines (148 loc) · 8.33 KB
/
mafOrtho.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
from __future__ import print_function
from __future__ import division
import psyco_full
import bx.align.maf
from Bio import SeqIO
from BCBio import GFF
from collections import defaultdict
import argparse
import sys
import pprint
import GTF # https://gist.github.com/slowkow/8101481
from docopt import docopt
import pandas as pd
import gzip
import time
from contextlib import contextmanager
import re
from collections import namedtuple
import os.path
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def nested_dict(n, type):
if n == 1:
return defaultdict(type)
else:
return defaultdict(lambda: nested_dict(n-1, type))
@contextmanager
def log(message):
"""Log a timestamp, a message, and the elapsed time to stderr."""
start = time.time()
sys.stderr.write("{} # {}\n".format(time.asctime(), message))
yield
elapsed = int(time.time() - start + 0.5)
sys.stderr.write("{} # done in {} s\n".format(time.asctime(), elapsed))
sys.stderr.flush()
def main(args):
with log("Reading compare Gencode annotation file: {}".format(args.compGffFile)):
gc = GTF.dictionary(args.compGffFile,"ID")
compGeneInfo = gc['gene']
#--------------------------------------------------
# gene['Name'] = gene['ID'].map(lambda x: re.sub(r':maker.*','',x))
#--------------------------------------------------
with log("Reading reference Gencode annotation file: {}".format(args.refGffFile)):
gc = GTF.dataframe(args.refGffFile)
# Select just genes of protein coding genes, and columns that we want to use.
idx = (gc.feature == 'gene')
gene = gc.ix[idx, ['seqname','start','end','Name']]
#--------------------------------------------------
# print(gene)
#--------------------------------------------------
# Convert columns to proper types.
gene.start = gene.start.astype(int)
gene.end = gene.end.astype(int)
for geneID in gene['Name']:
if geneID in compGeneInfo:
# gene annotated in both species, read coordinates projecting information in maf
mafFile = args.mafPath + "/" + geneID + ".maf"
if not os.path.exists(mafFile):
continue
with log("Reading the Maf file: {}".format(mafFile)):
with open(mafFile) as maf:
out_files = dict()
geneCoords = dict()
for block in bx.align.maf.Reader(maf):
ref_comp = block.components[0]
refSpecies, refChrom = ref_comp.src.split('.')[:2]
if refSpecies not in geneCoords:
geneCoords[refSpecies] = nested_dict(2,str)
geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start
geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end
geneCoords[refSpecies]['refInfo']['chr'] = refChrom
for comp in block.components[1:]:
comp_species, compChrom = comp.src.split('.')[:2]
if comp_species not in geneCoords:
geneCoords[comp_species] = nested_dict(2,str)
geneCoords[comp_species][compChrom]['start'] = comp.start
geneCoords[comp_species][compChrom]['end'] = int(comp.end)
if compChrom not in geneCoords[comp_species]:
geneCoords[comp_species][compChrom]['start'] = comp.start
geneCoords[comp_species][compChrom]['end'] = int(comp.end)
if comp_species not in out_files:
bedfile = "%s/%s.%s.bed" % (args.mafPath, geneID, comp_species )
f = open( bedfile , "w" )
out_files[comp_species] = f
pid = block_pid( ref_comp, comp )
if pid:
#--------------------------------------------------
# print("%s\t%s" % (comp.end, geneCoords[comp_species][compChrom]))
#--------------------------------------------------
if geneCoords[refSpecies]['refInfo']['start'] > ref_comp.forward_strand_start:
geneCoords[refSpecies]['refInfo']['start'] = ref_comp.forward_strand_start
if geneCoords[refSpecies]['refInfo']['end'] < ref_comp.forward_strand_end:
geneCoords[refSpecies]['refInfo']['end'] = ref_comp.forward_strand_end
if geneCoords[comp_species][compChrom]['start'] > comp.start:
geneCoords[comp_species][compChrom]['start'] = comp.start
if geneCoords[comp_species][compChrom]['end'] <= int(comp.end):
geneCoords[comp_species][compChrom]['end'] = int(comp.end)
out_files[comp_species].write( "%s\t%d\t%d\t%s:%d-%d,%s\t%f\n" %
( refChrom, ref_comp.forward_strand_start, ref_comp.forward_strand_end, \
compChrom, comp.start, comp.end, comp.strand, pid ) )
for f in out_files.values():
f.close()
if args.compSpecies in geneCoords:
for chrom in geneCoords[args.compSpecies]:
if chrom in compGeneInfo[geneID]:
annoStart = int(compGeneInfo[geneID][chrom]['start'])
annoEnd = int(compGeneInfo[geneID][chrom]['end'])
compStart = int(geneCoords[args.compSpecies][chrom]['start'])
compEnd = int(geneCoords[args.compSpecies][chrom]['end'])
if compEnd > annoEnd and compStart < annoEnd or compEnd > annoStart and compStart < annoEnd :
print("Matched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d\t%s\t%s\t%s: %s - %s" % \
(geneID, args.compSpecies, chrom, annoStart,annoEnd, compStart, compEnd, compGeneInfo[geneID][chrom]['ID'], \
args.refSpecies,geneCoords[args.refSpecies]['refInfo']['chr'], geneCoords[args.refSpecies]['refInfo']['start'], geneCoords[args.refSpecies]['refInfo']['end'] ))
else:
eprint("unMatched\t%s\t%s\t%s\tanno: %d - %d\tmapped: %d - %d" % \
(geneID, args.compSpecies, chrom, annoStart, annoEnd, compStart, compEnd))
else:
eprint("Error Chrom\t%s\t%s\t%s\tmapped: %s - %s" % \
(geneID, args.compSpecies, chrom, geneCoords[args.compSpecies][chrom]['start'],geneCoords[args.compSpecies][chrom]['end']))
def block_pid( comp1, comp2 ):
match = 0
total = 0
t1 = comp1.text.lower()
t2 = comp2.text.lower()
for i in range( 0, len(t1) ):
a, b = t1[i], t2[i]
if a == '-' or b == '-':
continue
elif a == b:
match += 1
total += 1
if total == 0: return None
return ( match / total )
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Get the ortholog region according to maf alignment')
parser.add_argument('refGffFile', type=str, help='reference gff file')
parser.add_argument('compGffFile', type=str, help='compare gff file')
parser.add_argument('mafPath', type=str, help='maf dir path')
parser.add_argument('refSpecies', type=str, help='name of reference species')
parser.add_argument('compSpecies', type=str, help='name of compare species')
parser.add_argument('-f', dest='fastaFile', type=str, help='fasta file matching the gff file')
#--------------------------------------------------
# parser.add_argument('-c', dest='cleanCodon', type=str, default='Y', choices=['Y','N'], help='clean stop codon: Y/N')
# parser.add_argument('-r', dest='cleanByRef', type=str, default=None, help='aligment format')
#--------------------------------------------------
args = parser.parse_args()
main(args)