-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSnakefile
143 lines (117 loc) · 6.35 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# The main entry point of your workflow.
# After configuring, running snakemake -n in a clone of this repository should successfully execute a dry-run of the workflow.
import os
import yaml
import pandas as pd

configfile: 'config.yaml'

# Sample sheet. The columns read in this section are target_sample,
# target_sample_category, app, file_type and path.
metadata = pd.read_csv(config['metadata'])

# Remaining workflow settings, all taken from the config file.
oncokb_api_key = config['oncokb_api_key']
intermediate_dir = config['intermediate_dir']
results_dir = config['results_dir']
scripts_dir = config['scripts_dir']

# Split the sample sheet into tumour rows (any category other than NORMAL)
# and normal rows.
_tumor_rows = metadata.query('target_sample_category != "NORMAL"')
_normal_rows = metadata.query('target_sample_category == "NORMAL"')

# Unique tumour sample ids; the somatic rules expand over these.
tumor_sample_ids = _tumor_rows['target_sample'].unique()

# Unique normal sample ids; the germline rules expand over these.
normal_sample_ids = _normal_rows['target_sample'].unique()

# Input file path for every (target_sample, app, file_type) combination.
_path_key_columns = ['target_sample', 'app', 'file_type']
paths = metadata.set_index(_path_key_columns)['path'].to_dict()
# Every tumour sample needs its own copy of the germline calls, so when a
# patient has several tumour samples the calls from each of that patient's
# normal samples must be replicated. Build, for each normal sample id, the
# list of tumour sample ids belonging to the same individual. Normal samples
# whose individual has no tumour sample get no entry at all.
normal_tumor_map = {}
for _, patient_rows in metadata.groupby('individual'):
    patient_normals = patient_rows.query('target_sample_category == "NORMAL"')['target_sample'].unique()
    patient_tumors = patient_rows.query('target_sample_category != "NORMAL"')['target_sample'].unique()
    for normal_id in patient_normals:
        for tumor_id in patient_tumors:
            # setdefault creates the list the first time a normal id is seen.
            normal_tumor_map.setdefault(normal_id, []).append(tumor_id)
# Map each tumour sample id to the reference_sample recorded for it in the
# metadata, i.e. the normal sample used during somatic calling. These ids
# are written into the MAF in place of the existing sample ids.
_tumour_rows = metadata.query('target_sample_category != "NORMAL"')
_reference_by_tumour = _tumour_rows.set_index(['target_sample'])['reference_sample']
tumour_normal_map = _reference_by_tumour.to_dict()
# Default target: requests the two cohort-level MAF files in results_dir,
# the full cohort MAF and its filtered counterpart.
rule all:
    input:
        os.path.join(results_dir, 'cohort.maf'),
        os.path.join(results_dir, 'cohort_filtered.maf')
# Somatic
#
def get_somatic_input_paths(wildcards):
    """Return the named somatic-calling input files for one tumour sample.

    Looks up, in the module-level ``paths`` mapping, the three
    WGS-SOMATICCALLING files registered for ``wildcards.tumor_sample_id``
    and returns them as a dict keyed by file type, suitable for ``unpack()``.

    Raises:
        KeyError: if any of the three files is missing from ``paths``.
    """
    somatic_file_types = (
        'consensus_somatic_maf',
        'museq_paired_annotated',
        'strelka_snv_annotated',
    )
    return {
        file_type: paths[(wildcards.tumor_sample_id, 'WGS-SOMATICCALLING', file_type)]
        for file_type in somatic_file_types
    }
# Filter one tumour sample's somatic calls: runs filter_snv_vcf.py from
# scripts_dir on the consensus somatic MAF together with the museq and
# strelka annotated files registered for the sample, writing a per-sample
# filtered MAF into intermediate_dir.
rule somatic_filter_maf:
    input: unpack(get_somatic_input_paths)
    output: os.path.join(intermediate_dir, 'somatic_{tumor_sample_id}.filtered.maf')
    params: scripts_dir=scripts_dir
    singularity: "docker://amcpherson/filtermafs"
    # The :q format spec shell-quotes each substituted value, so a path that
    # contains spaces or shell metacharacters still reaches the script as a
    # single argument. Values without such characters are left unchanged.
    shell:
        'python {params.scripts_dir:q}/filter_snv_vcf.py '
        '{input.consensus_somatic_maf:q} {input.museq_paired_annotated:q} '
        '{input.strelka_snv_annotated:q} {output:q}'
# Run scripts/fix_sample_id.py on one tumour sample's filtered somatic MAF,
# handing it the tumour and normal sample ids to use for that file.
# NOTE(review): what the script does with these ids is not visible here.
rule somatic_fix_sample_id:
    input: os.path.join(intermediate_dir, 'somatic_{tumor_sample_id}.filtered.maf')
    output: os.path.join(intermediate_dir, 'somatic_{tumor_sample_id}.filtered.fixup.maf')
    params:
        # Single-element list holding the tumour sample this MAF belongs to.
        tumor_sample_ids = lambda wildcards: [wildcards.tumor_sample_id],
        # reference_sample recorded in the metadata for this tumour sample.
        normal_sample_id = lambda wildcards: tumour_normal_map[wildcards.tumor_sample_id],
    singularity: "docker://amcpherson/filtermafs"
    script: "scripts/fix_sample_id.py"
# Annotate one tumour sample's fixed-up somatic MAF by running
# MafAnnotator.py inside the oncokb-annotator container.
rule somatic_annotate_maf:
    input: os.path.join(intermediate_dir, 'somatic_{tumor_sample_id}.filtered.fixup.maf')
    output: os.path.join(intermediate_dir, 'somatic_{tumor_sample_id}.filtered.fixup.annotated.maf')
    singularity: "docker://amcpherson/oncokb-annotator"
    # oncokb_api_key is the module-level value read from the config and is
    # passed to the annotator's -b option. The :q format spec shell-quotes
    # every substituted value so unusual characters cannot split an argument.
    shell: 'python /oncokb-annotator/MafAnnotator.py -i {input:q} -o {output:q} -b {oncokb_api_key:q}'
# Combine the annotated somatic MAFs of all tumour samples into a single
# somatic.maf using scripts/merge_mafs.py.
rule somatic_merge_mafs:
    # One annotated MAF per entry of tumor_sample_ids.
    input: expand(os.path.join(intermediate_dir, 'somatic_{tumor_sample_id}.filtered.fixup.annotated.maf'), tumor_sample_id=tumor_sample_ids)
    output: os.path.join(intermediate_dir, 'somatic.maf')
    singularity: "docker://amcpherson/filtermafs"
    script: "scripts/merge_mafs.py"
# Germline
#
def get_germline_input_paths(wildcards):
    """Return the germline-calling input file for one normal sample.

    Looks up the WGS-GERMLINECALLING consensus germline MAF registered for
    ``wildcards.normal_sample_id`` in the module-level ``paths`` mapping and
    returns it wrapped in a single-element list.

    Raises:
        KeyError: if no such file is registered for the sample.
    """
    germline_key = (wildcards.normal_sample_id, 'WGS-GERMLINECALLING', 'consensus_germline_maf')
    return [paths[germline_key]]
# Filter one normal sample's germline calls with
# scripts/filter_germline_maf.py, writing a per-sample filtered MAF.
rule germline_filter_maf:
    # get_germline_input_paths returns a one-element list, so the script
    # receives a single positional input file.
    input: unpack(get_germline_input_paths)
    output: os.path.join(intermediate_dir, 'germline_{normal_sample_id}.filtered.maf')
    singularity: "docker://amcpherson/filtermafs"
    script: "scripts/filter_germline_maf.py"
# Run scripts/fix_sample_id.py on one normal sample's filtered germline MAF,
# handing it the normal sample id and the tumour sample ids of the same
# individual.
# NOTE(review): what the script does with these ids is not visible here.
rule germline_fix_sample_id:
    input: os.path.join(intermediate_dir, 'germline_{normal_sample_id}.filtered.maf')
    output: os.path.join(intermediate_dir, 'germline_{normal_sample_id}.filtered.fixup.maf')
    params:
        normal_sample_id = lambda wildcards: wildcards.normal_sample_id,
        # Falls back to an empty list when the normal sample has no entry in
        # normal_tumor_map.
        tumor_sample_ids = lambda wildcards: normal_tumor_map.get(wildcards.normal_sample_id, []),
    singularity: "docker://amcpherson/filtermafs"
    script: "scripts/fix_sample_id.py"
# Annotate one normal sample's fixed-up germline MAF by running
# MafAnnotator.py inside the oncokb-annotator container.
rule germline_annotate_maf:
    input: os.path.join(intermediate_dir, 'germline_{normal_sample_id}.filtered.fixup.maf')
    output: os.path.join(intermediate_dir, 'germline_{normal_sample_id}.filtered.fixup.annotated.maf')
    singularity: "docker://amcpherson/oncokb-annotator"
    # oncokb_api_key is the module-level value read from the config and is
    # passed to the annotator's -b option. The :q format spec shell-quotes
    # every substituted value so unusual characters cannot split an argument.
    shell: 'python /oncokb-annotator/MafAnnotator.py -i {input:q} -o {output:q} -b {oncokb_api_key:q}'
# Combine the annotated germline MAFs of all normal samples into a single
# germline.maf using scripts/merge_mafs.py.
rule germline_merge_mafs:
    # One annotated MAF per entry of normal_sample_ids.
    input: expand(os.path.join(intermediate_dir, 'germline_{normal_sample_id}.filtered.fixup.annotated.maf'), normal_sample_id=normal_sample_ids)
    output: os.path.join(intermediate_dir, 'germline.maf')
    singularity: "docker://amcpherson/filtermafs"
    script: "scripts/merge_mafs.py"
# Cohort somatic and germline
#
# Combine the cohort-wide somatic and germline MAFs into merged.maf using
# scripts/merge_somatic_germline.py.
rule merge_somatic_germline:
    # Positional inputs: the somatic MAF first, the germline MAF second.
    input:
        os.path.join(intermediate_dir, 'somatic.maf'),
        os.path.join(intermediate_dir, 'germline.maf')
    output: os.path.join(intermediate_dir, 'merged.maf')
    singularity: "docker://amcpherson/filtermafs"
    script: "scripts/merge_somatic_germline.py"
# Run scripts/annotate_brca_exchange.py on the merged MAF; its output is the
# cohort.maf delivered in results_dir.
rule annotate_brcaexchange:
    input: os.path.join(intermediate_dir, 'merged.maf')
    output: os.path.join(results_dir, 'cohort.maf')
    singularity: "docker://amcpherson/filtermafs"
    script: "scripts/annotate_brca_exchange.py"
# Produce cohort_filtered.maf from cohort.maf using scripts/filter_maf.py.
# NOTE(review): the filtering criteria live in the script and are not
# visible here.
rule filter_maf:
    input: os.path.join(results_dir, 'cohort.maf')
    output: os.path.join(results_dir, 'cohort_filtered.maf')
    singularity: "docker://amcpherson/filtermafs"
    script: "scripts/filter_maf.py"