"""
@authors: Juan L. Trincado
@email: juanluis.trincado@upf.edu
A5_A3_ISOTOPE_part1.py: get significant alternative splice site events
"""
from lib.A5_A3.extract_exonized_junctions import *
from lib.A5_A3.get_reads_exonizations import *
from lib.A5_A3.overlap_with_repeats import *
from lib.A5_A3.get_significant_exonizations import *
from lib.A5_A3.compare_reads_random_junctions import *
from lib.A5_A3.check_mutations_nearby import *
from lib.A5_A3.filter_exonizations import *
from lib.A5_A3.filter_exonizations_CHESS import *
from lib.A5_A3.get_peptide_sequence import *
from lib.A5_A3.select_fasta_candidates import *
from lib.A5_A3.run_netMHC_classI_slurm_part1 import *
from lib.A5_A3.run_netMHCpan_classI_slurm_part1 import *
from itertools import chain, islice
import os
import sys
import csv
import logging
import argparse
from argparse import ArgumentParser, RawTextHelpFormatter
# pandas, csv, logging and sys are used directly below; imported explicitly
# rather than relying on the wildcard imports above
import pandas as pd
# create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# create console handler and set level to info
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
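# argparse's type=bool treats any non-empty string (including "False") as True,
# so boolean command-line flags are parsed explicitly: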
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
description = \
"Description: Get alternative splice site events\n\n"
parser = ArgumentParser(description=description, formatter_class=RawTextHelpFormatter,
add_help=True)
parser.add_argument("-r", "--reads", required=True, help = "reads mapped to junctions")
parser.add_argument("-trans", "--transcript", required=True, help = "transcript expression file")
parser.add_argument("-g", "--gtf", required=True, help = "gtf annotation")
parser.add_argument("-conv", "--conversion", required=True, help = "gene name conversion")
parser.add_argument("-m", "--max", required=False, type=int, default=500)
parser.add_argument("-t", "--thres", required=False, type=int, default=5, help="Minimum number of reads mapping the event")
parser.add_argument("-chunks", "--size_chunks", required=False, type=int, default=1, help="For paralellization, this values indicates number of jobs to run")
parser.add_argument("-rep", "--repeats", required=True, help = "Regions of the genome with repeats from maskerDB",default=None)
parser.add_argument("-mut","--mutations", required=False, default="Missing", help = "Mutations path")
parser.add_argument("--chessA5", required=False, default="Missing", help = "Path to annotated A5 from CHESSdb (GTEX)")
parser.add_argument("--chessA3", required=False, default="Missing", help = "Path to annotated A3 from CHESSdb (GTEX)")
parser.add_argument("-mosea", "--mosea", required=True, help = "MoSEA path")
parser.add_argument("-mxfinder", "--mxfinder", required=True, help = "MxFinder path")
parser.add_argument("-genome", "--genome", required=True, help = "Genome annotation")
parser.add_argument("-HLAclass", "--HLAclass", required=True, help = "HLA genotype of the samples")
parser.add_argument("-HLAtypes", "--HLAtypes", required=True, help = "HLA alelles recognized by NetMHC")
parser.add_argument("-HLAtypespan", "--HLAtypespan", required=True, help = "HLA alelles recognized by NetMHCpan")
parser.add_argument("-netMHC", "--netMHC", required=True, help = "netMHC path")
parser.add_argument("-netMHCpan", "--netMHCpan", required=True, help = "netMHCpan path")
parser.add_argument("--temp", type=str2bool, nargs='?',const=True, default=False, required=False,help="Remove temp files")
parser.add_argument("--tumor_specific", type=str2bool, nargs='?',const=True, default=False,help="Tumor specific mode")
parser.add_argument("-control_path", "--control_path", required=False, default="Missing", help = "reads mapped to junctions controls")
parser.add_argument("-Intropolis", "--Intropolis", required=False, default="Missing", help = "reads mapped to junctions from Intropolis db")
parser.add_argument("--Rudin", type=str2bool, nargs='?',const=True, default=False,help="Rudin mode")
parser.add_argument("--username", required=True, help = "Cluster user name")
parser.add_argument("-o", "--output", required=True, help = "Output path")
parser.add_argument("-c", "--cluster", type=str2bool, nargs='?',const=True, default=False,help="Run in parallel on a cluster")
def chunks(iterable, n):
    """chunks(ABCDE, 2) => AB CD E"""
    iterable = iter(iterable)
    while True:
        try:
            first = next(iterable)
        except StopIteration:
            # PEP 479: return cleanly instead of leaking StopIteration
            return
        yield chain([first], islice(iterable, n - 1))
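# Note: all chunks share the same underlying iterator, so each chunk must be
# consumed before requesting the next one, e.g.:
#   list(map(list, chunks("ABCDE", 2)))  # -> [['A', 'B'], ['C', 'D'], ['E']]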
def main(readcounts_path, transcript_expression_path, gtf_path, conversion_names, max_length,
threshold, size_chunks, repeats_path, mutations_path, CHESS_A5_path, CHESS_A3_path,
tumor_specific, control_path, Intropolis_path, mosea_path, mxfinder, genome_path, HLAclass_path, HLAtypes_path,
HLAtypes_pan_path, netMHC_path, netMHC_pan_path, remove_temp_files, flag_Rudin,
name_user, output_path, cluster):
try:
logger.info("Starting execution A5_A3_ISOTOPE_part1")
# 0. Create a gtf with only the exon information
dir_path = os.path.dirname(os.path.realpath(__file__))
gtf_path_exon = '{}.{}'.format(gtf_path, "exon")
gtf = pd.read_table(gtf_path, delimiter="\t",header=None,comment="#")
        # Keep only exon entries on chromosomes 1 to 22, X and Y
        gtf.columns = ['chr', 'type1', 'type2', 'start', 'end', 'dot', 'strand', 'dot2', 'rest_information']
        gtf = gtf[gtf['type2'].isin(["exon"])]
        # The chromosome column is read as strings, so compare against string names
        gtf = gtf[gtf['chr'].astype(str).isin([str(x) for x in range(1, 23)] + ["X", "Y"])]
        # Add the chr prefix
        gtf['chr'] = 'chr' + gtf['chr'].astype(str)
#Save the gtf in external file
gtf.to_csv(gtf_path_exon,index=False,header=False,sep ='\t',quoting=csv.QUOTE_NONE)
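        # QUOTE_NONE keeps the attribute column (which contains double quotes,
        # e.g. gene_id "ENSG...") unquoted, preserving valid GTF formatting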
# 1. Identify the junctions that could generate an alternative splice site
logger.info("Part1...")
output_path_aux = output_path+"/new_A5_A3_junctions.tab"
extract_exonized_junctions(readcounts_path, gtf_path_exon, genome_path, max_length, output_path_aux, mosea_path)
        # 2. Given the list of possible A5_A3 events, get the reads associated with each of them
logger.info("Part2...")
output_path_aux2 = output_path+"/new_A5_A3_junctions_reads.tab"
get_reads_exonizations(output_path_aux, readcounts_path, output_path_aux2, False)
        # 3. Find the overlap between the new A5_A3 events and repeats (RepeatMasker)
logger.info("Part3...")
output_path_aux3 = output_path + "/new_A5_A3_junctions_reads_repeatitions.tab"
overlap_with_repeats(output_path_aux2, repeats_path, output_path_aux3)
        # 4. Given the table of A5_A3 events with read counts, get those above the threshold
logger.info("Part4...")
get_significant_exonizations(output_path_aux3, threshold, output_path + "/A5_A3_by_sample.tab")
        # 5. To filter the list of A5_A3 junctions, compare the read counts of each
        # junction against other new junctions associated with the same gene
logger.info("Part5...")
compare_reads_random_junctions(output_path + "/A5_A3_by_sample.tab", readcounts_path, gtf_path_exon, output_path + "/A5_A3_by_sample_coverage.tab")
        # 6. Check whether there are mutations near the A5_A3 events
logger.info("Part6...")
check_mutations_nearby(output_path + "/A5_A3_by_sample_coverage.tab", mutations_path, 200, output_path + "/A5_A3_by_sample_coverage_mut.tab")
# 7. Separate the mutated from the non-mutated cases
logger.info("Part7...")
command1 = "Rscript " + dir_path + "/lib/A5_A3/separate_mutated_cases.R " + output_path + "/A5_A3_by_sample_coverage_mut.tab " \
+ output_path + "/A5_A3_mutated.tab " + output_path + "/A5_A3_non_mutated.tab"
os.system(command1)
# 8. Get the tumor specific events
if (tumor_specific):
logger.info("Get the tumor specific events...")
# Get the significant exonizations from Intropolis (control)
logger.info("Intropolis...")
output_path_aux = output_path + "/new_A5_A3_junctions.tab"
output_Intropolis_path_aux2 = output_path + "/new_A5_A3_junctions_Intropolis_reads.tab"
get_reads_exonizations(output_path_aux, Intropolis_path,
output_Intropolis_path_aux2, True)
output_Intropolis_path_aux3 = output_path + "/new_A5_A3_junctions_Intropolis_reads_repeatitions.tab"
overlap_with_repeats(output_Intropolis_path_aux2, repeats_path, output_Intropolis_path_aux3)
output_Intropolis_path_aux4 = output_path + "/A5_A3_by_sample_Intropolis.tab"
get_significant_exonizations(output_Intropolis_path_aux3, threshold, output_Intropolis_path_aux4)
if(control_path!="Missing"):
# Get the significant A5_A3 from normal samples
logger.info("Additional controls...")
extract_exonized_junctions(control_path, gtf_path, genome_path, max_length, output_path + "/new_A5_A3_junctions_control.tab", mosea_path)
get_reads_exonizations(output_path + "/new_A5_A3_junctions_control.tab", control_path, output_path + "/new_A5_A3_junctions_control_reads.tab", False)
get_significant_exonizations(output_path + "/new_A5_A3_junctions_control_reads.tab", threshold, output_path + "/A5_A3_by_sample_control.tab")
#Filter A5_A3
logger.info("Filtering events...")
filter_exonizations(output_path + "/A5_A3_non_mutated.tab", output_path + "/A5_A3_by_sample_control.tab",
output_path + "/A5_A3_by_sample_Intropolis.tab", output_path + "/A5_A3_non_mutated_filtered.tab")
filter_exonizations_CHESS(output_path + "/A5_A3_non_mutated_filtered.tab", CHESS_A5_path, CHESS_A3_path, output_path + "/A5_A3_non_mutated_filtered2.tab")
else:
#Filter A5_A3
logger.info("Filtering events...")
filter_exonizations(output_path + "/A5_A3_non_mutated.tab", "Missing",
output_path + "/A5_A3_by_sample_Intropolis.tab", output_path + "/A5_A3_non_mutated_filtered.tab")
filter_exonizations_CHESS(output_path + "/A5_A3_non_mutated_filtered.tab", CHESS_A5_path, CHESS_A3_path, output_path + "/A5_A3_non_mutated_filtered2.tab")
# 9. Join the mutated and non_mutated cases
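            # ('cat' keeps the header from the mutated table; 'tail -n+2' appends
            # the non-mutated rows without repeating their header)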
output_path_aux13 = output_path + "/all_A5_A3.tab"
command3 = "cat " + output_path + "/A5_A3_mutated.tab" + " > " + output_path_aux13 + ";tail -n+2 " + output_path + "/A5_A3_non_mutated_filtered2.tab" + " >> " + output_path_aux13
os.system(command3)
else:
# 9. Join the mutated and non_mutated cases
output_path_aux13 = output_path + "/all_A5_A3.tab"
command3 = "cat " + output_path + "/A5_A3_mutated.tab" + " > " + output_path_aux13 + ";tail -n+2 " + output_path + "/A5_A3_non_mutated.tab" + " >> " + output_path_aux13
os.system(command3)
# 10. Get the peptide sequence associated
logger.info("Part9...")
get_peptide_sequence(output_path_aux13, transcript_expression_path, gtf_path,
output_path + "/A5_A3_peptide_sequence.fa", output_path + "/A5_A3_fasta_sequence.fa",
output_path + "/A5_A3_ORF.tab", output_path + "/A5_A3_ORF_sequences.tab",
mosea_path, genome_path, mxfinder, remove_temp_files)
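        # A chunked, SLURM-parallelized alternative to the call above is kept
        # commented out below (split the input, submit one job per chunk and
        # wait for all of them to finish)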
# # 10.1. Split the input file into n pieces. Run a job per piece. When all jobs have finished, we will assemble all the pieces
# logger.info("Part9...")
# logger.info("get_peptide_sequence: Split the file into pieces and run get_peptide_sequence by chunk")
# dir_path = os.path.dirname(os.path.realpath(__file__))
#
# with open(output_path_aux13) as f_aux:
# header = f_aux.readline()
#
# dict_jobs = {}
# with open(output_path_aux13) as bigfile:
# logger.info("Specified chunk size "+str(size_chunks))
# for i, lines in enumerate(chunks(bigfile, size_chunks)):
# file_split = '{}.{}'.format(output_path_aux13, i)
# f = open(file_split, 'w')
# #Output the header, if it's not the first chunk
# if(i!=0):
# f.writelines(header)
# with f:
# f.writelines(lines)
# f.close()
# #Run a job per file
# logger.info("Processing " + "chunk_" + str(i) + "...")
# command1 = "python " + dir_path + "/lib/A5_A3/get_peptide_sequence.py " + file_split + " " + \
# transcript_expression_path + " " + gtf_path + " " + '{}.{}'.format(output_path + "/A5_A3_peptide_sequence.fa", i) + " " + \
# '{}.{}'.format(output_path + "/A5_A3_fasta_sequence.fa", i) + " " + '{}.{}'.format(output_path + "/A5_A3_ORF.tab", i) + " " + \
# '{}.{}'.format(output_path + "/A5_A3_ORF_sequences.tab", i) + " " + mosea + " " + genome_path + " " + mxfinder + " " + str(remove_temp_files)
# open_peptides_file = open(output_path + "/aux.sh", "w")
# open_peptides_file.write("#!/bin/sh\n")
# # open_peptides_file.write("#SBATCH --partition=normal\n")
# open_peptides_file.write("#SBATCH --mem 3000\n")
# open_peptides_file.write(
# "#SBATCH -e " + output_path + "/" + "get_peptide_sequence" + "_chunk_" + str(i) + ".err" + "\n")
# open_peptides_file.write(
# "#SBATCH -o " + output_path + "/" + "get_peptide_sequence" + "_chunk_" + str(i) + ".out" + "\n")
# open_peptides_file.write(command1 + ";\n")
# open_peptides_file.close()
# command2 = "sbatch -J " + "get_peptide_sequence" + "_chunk_" + str(i) + " " + output_path + "/aux.sh; sleep 0.5;"
# # os.system(command2)
# job_message = subprocess.check_output(command2, shell=True)
# # Get the job id and store it
# job_id = (str(job_message).rstrip().split(" ")[-1])[:-3]
# dict_jobs[job_id] = 1
#
# logger.info("get_peptide_sequence: Waiting for all the jobs to finished...")
# flag_exit = False
# while (not flag_exit):
# # Initialize the dictionary with the pending jobs in the cluster
# pending_jobs = {}
# os.system("sleep 10")
# p = subprocess.Popen(["squeue", "-u", name_user], stdout=subprocess.PIPE)
# # Skip the first line (the header)
# line = p.stdout.readline()
# for line in p.stdout:
# flag_exit = True
# # Get the id of the job
# job_id_aux = str(line).rstrip().split()[1]
# # Save the id of the jobs
# pending_jobs[job_id_aux] = 1
        # # If any job from dict_jobs is still on the cluster, break the loop and wait
        # # another 10 seconds before re-checking the status of the jobs in the cluster
# if (job_id_aux in dict_jobs):
# flag_exit = False
# break
#
# logger.info("get_peptide_sequence:All jobs finished.\n\n")
# 11. Filter the relevant results
command4 = "Rscript " + dir_path + "/lib/A5_A3/filter_results.R " + output_path + "/A5_A3_ORF.tab " \
+ conversion_names + " " + output_path + "/A5_A3_ORF_filtered.tab " + output_path + "/A5_A3_ORF_filtered_peptide_change.tab " \
+ str(max_length)
os.system(command4)
# 12. Select the fasta candidates for being run to the epitope analysis
logger.info("Part10...")
        # Create the folder if it doesn't exist
if not os.path.exists(output_path + "/A5_A3_fasta_files"):
os.makedirs(output_path + "/A5_A3_fasta_files")
select_fasta_candidates(output_path + "/A5_A3_ORF_filtered_peptide_change.tab",
output_path + "/A5_A3_peptide_sequence.fa", output_path + "/A5_A3_peptide_sequence_filtered.fa",
output_path + "/A5_A3_fasta_files")
# 13. Run netMHC-4.0_part1
logger.info("Part11...")
if not os.path.exists(output_path + "/A5_A3_NetMHC-4.0_files"):
os.makedirs(output_path + "/A5_A3_NetMHC-4.0_files")
run_netMHC_classI_slurm_part1(output_path + "/A5_A3_ORF_filtered_peptide_change.tab", HLAclass_path, HLAtypes_path,
output_path + "/A5_A3_fasta_files",
output_path + "/A5_A3_NetMHC-4.0_files",
output_path + "/A5_A3_NetMHC-4.0_neoantigens_type_gained.tab",
output_path + "/A5_A3_NetMHC-4.0_neoantigens_type_gained_all.tab",
output_path + "/A5_A3_NetMHC-4.0_neoantigens_type_lost.tab",
output_path + "/A5_A3_NetMHC-4.0_neoantigens_type_lost_all.tab",
output_path + "/A5_A3_NetMHC-4.0_junctions_ORF_neoantigens.tab",
netMHC_path, cluster)
# 14. Run netMHCpan-4.0_part1
logger.info("Part12...")
if not os.path.exists(output_path + "/A5_A3_NetMHCpan-4.0_files"):
os.makedirs(output_path + "/A5_A3_NetMHCpan-4.0_files")
run_netMHCpan_classI_slurm_part1(output_path + "/A5_A3_ORF_filtered_peptide_change.tab", HLAclass_path, HLAtypes_pan_path,
output_path + "/A5_A3_fasta_files",
output_path + "/A5_A3_NetMHCpan-4.0_files",
output_path + "/A5_A3_NetMHCpan-4.0_neoantigens_type_gained.tab",
output_path + "/A5_A3_NetMHCpan-4.0_neoantigens_type_gained_all.tab",
output_path + "/A5_A3_NetMHCpan-4.0_neoantigens_type_lost.tab",
output_path + "/A5_A3_NetMHCpan-4.0_neoantigens_type_lost_all.tab",
output_path + "/A5_A3_NetMHCpan-4.0_junctions_ORF_neoantigens.tab",
netMHC_pan_path, cluster)
logger.info("Wait until all jobs have finished. Then, go on with part2")
        sys.exit(0)
except Exception as error:
logger.error('ERROR: ' + repr(error))
logger.error("Aborting execution")
sys.exit(1)
if __name__ == '__main__':
args = parser.parse_args()
main(args.reads,args.transcript,args.gtf,args.conversion,args.max,args.thres,args.size_chunks,
args.repeats,args.mutations,args.chessA5,args.chessA3,args.tumor_specific,args.control_path,args.Intropolis,
args.mosea,args.mxfinder,args.genome,args.HLAclass,args.HLAtypes,args.HLAtypespan,args.netMHC,args.netMHCpan,
args.temp,args.Rudin,args.username, args.output, args.cluster)
# main("/home/trincadojl/Projects/SCLC/Smart/data/readCounts_TEST.tab",
# "/home/trincadojl/Projects/SCLC/Smart/data/iso_tpm.txt",
# "/home/trincadojl/Projects/SCLC/Smart/annotation/Homo_sapiens.GRCh37.75.gtf"
# , "/home/trincadojl/Projects/SCLC/Smart/annotation/Ensembl_gene_conversion.txt", 500, 5, 1,
# "/home/trincadojl/Projects/SCLC/Smart/annotation/hg19_repeats.bed", "Missing", "Missing", "Missing", False,
# "Missing", "Missing",
# "/home/trincadojl/Software/MoSEA", "/home/trincadojl/Software/MxFinder",
# "/media/trincadojl/data/Projects/annotation/hg19.fa",
# "/home/trincadojl/Projects/SCLC/Smart/data/PHLAT_summary_ClassI.out",
# "/home/trincadojl/Projects/SCLC/Smart/data/NetMHC-4.0_HLA_types_accepted.tab",
# "/home/trincadojl/Projects/SCLC/Smart/data/NetMHCpan-4.0_HLA_types_accepted.tab",
# "/home/trincadojl/Software/netMHC-4.0/netMHC", "/home/trincadojl/Software/netMHCpan-4.0/netMHCpan",
# False, False, "juanluis", "/home/trincadojl/Projects/SCLC/Smart/test_ISOTOPE/A5_A3", False)