-
Notifications
You must be signed in to change notification settings - Fork 0
/
rna_seq.snake
124 lines (115 loc) · 3.21 KB
/
rna_seq.snake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# use the names variable to store your list of file names
sample=['AD', 'P0', 'P4', 'P7']
rep = ['rep1', 'rep2']
number=[1,2]
rule all:
input:
'results/multiqc_report.html',
'results/GRCm39.fa.gz',
expand('results/{sample}{rep}.txt', sample = ['AD', 'P0', 'P4', 'P7'], rep = ['rep1', 'rep2']),
'results/full_genemap.csv',
'results/full_filtered_counts_matrix.csv'
rule fastqc:
input:
fastq = 'samples/full_files/{sample}{rep}_R{number}.fastq.gz',
output:
fastqc = 'results/{sample}{rep}_R{number}_fastqc.html'
params:
outdir = 'results/'
shell:
'''
fastqc {input.fastq} -o {params.outdir}
'''
# remember that you want multiqc to run only after fastqc has run on all the files
rule multiqc:
input:
fastqc = 'results/{sample}{rep}_R{number}_fastqc.html'
output:
multiqc = 'results/multiqc_report.html'
params:
outdir = 'results/'
shell:
'''
multiqc -f {params.outdir} -o {params.outdir}
'''
rule get_m39:
output:
'results/GRCm39.fa.gz'
shell:
'''
wget -O {output} https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M34/GRCm39.primary_assembly.genome.fa.gz
'''
rule get_m39_gtf:
output:
'results/m39_GA.gtf'
params:
outdir = 'results/m39_GA.gtf.gz'
shell:
'''
wget -O {params.outdir} https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M34/gencode.vM34.primary_assembly.annotation.gtf.gz
gunzip -k {output}
'''
# make sure to read up on how STAR names it output files
rule star:
input:
fastq1 = 'samples/full_files/{sample}{rep}_R1.fastq.gz',
fastq2 = 'samples/full_files/{sample}{rep}_R2.fastq.gz',
Genome_index = 'results/m39_star/'
output:
bam = 'results/{sample}{rep}.Aligned.out.bam'
params:
outdir = 'results/'
shell:
'''
STAR --genomeDir {input.Genome_index} --readFilesIn {input.fastq1} {input.fastq2} --readFilesCommand zcat --outSAMtype BAM Unsorted --outFileNamePrefix {params.outdir}{wildcards.sample}{wildcards.rep}.
'''
rule samtools_flagstat:
input:
bam = 'results/{sample}{rep}.Aligned.out.bam'
output:
flagstat = 'results/{sample}{rep}.txt'
shell:
'''
samtools flagstat {input.bam} > {output.flagstat}
'''
rule verse:
input:
bam='results/{sample}{rep}.Aligned.out.bam',
gtf='results/m39_GA.gtf'
output:
exon='results/{sample}{rep}.exon.txt'
params:
prefix='results/{sample}{rep}',
sample=lambda wildcards: wildcards.sample
rep=lambda wildcards: wildcards.rep
shell:
'''
verse -S -a {input.gtf} -o {params.prefix} {input.bam}
'''
rule concat_df:
input:
verse_files = 'results/{sample}{rep}.exon.txt'
output:
cts_matrix = 'results/full_matrix.csv'
shell:
'''
python concat_df.py -i {input.verse_files} -o {output.cts_matrix}
'''
rule parse_gtf:
input:
gtf = 'results/m39_GA.gtf'
output:
map = 'results/full_genemap.csv'
shell:
'''
python parse_gtf.py -i {input.gtf} -o {output.map}
'''
rule filter_counts:
input:
matrix = 'results/full_matrix.csv'
output:
filter = 'results/full_filtered_counts_matrix.csv'
shell:
'''
python filter_cts_mat.py -i {input.matrix} -o {output.filter}
'''