
Commit

update
Zhuoqing Fang committed Mar 20, 2020
1 parent d93a7d4 commit bcdf1d1
Showing 9 changed files with 173 additions and 143 deletions.
6 changes: 3 additions & 3 deletions .gitignore
@@ -1,4 +1,4 @@
.snakemake
.DS_Store
.idea
.vscode
.DS_Store
.idea
.vscode
2 changes: 1 addition & 1 deletion bowtie2-macs2-homer.smk
@@ -160,7 +160,7 @@ rule macs_broad:
"--outdir macs2_highQuality_results -n {wildcards.sample} --broad"
"{params.extra} {params.extra2} 2> {log}"

rule annotatepeaks
rule annotatepeaks:
input:
bed="{sample}.bed"
output:
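The hunk above only shows the head of the corrected annotatepeaks rule. A minimal sketch of how such a rule could be completed with HOMER's annotatePeaks.pl — the output path, genome label (hg38), and log path are assumptions for illustration, not taken from this commit:

rule annotatepeaks:
    input:
        bed="{sample}.bed"
    output:
        # hypothetical path; the real rule's output may differ
        anno="macs2_highQuality_results/{sample}_annotated.txt"
    log: "logs/{sample}.annotatePeaks.log"
    shell:
        # annotatePeaks.pl <peaks.bed> <genome> writes a tab-delimited annotation table to stdout
        "annotatePeaks.pl {input.bed} hg38 > {output.anno} 2> {log}"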
29 changes: 11 additions & 18 deletions config.yml
@@ -1,4 +1,4 @@
#this is a snakemake config file.
# snakeflow config file.
# 2017-03-28

# dir contains all indexes, genome sequences
@@ -29,8 +29,6 @@ rseqc:
# adaptors:
# illumina: "/home/fangzq/github/snakeflow/adaptors/TruSeq3-PE.fa"



# Index dir
hisat2_index: "/home/fangzq/genome/hisat2Indices_hg38"
salmon_index: "/home/fangzq/genome/salmonIndices_hg38"
@@ -56,9 +54,13 @@ paired: True
# Stranded library ? True or False
stranded: False

# if you have a fastq file named Sample1_R1.fastq.gz,
# set the sample read pattern below accordingly
# information in `samples` is used for deseq2 and rMATS
# for trimmed fastq, use a suffix like: _trimmed.fq.gz
read_pattern:
r1: "{sample}_R1_001.fastq.gz" # don't change {sample}
r2: "{sample}_R2_001.fastq.gz" # don't change {sample}

# sample metadata
sample_meta: "/home/fangzq/projects/bulk/group.txt"
# the ``dataframe`` attribute works only if a file is given.
# each column name corresponds to a sample attribute described above.
@@ -68,21 +70,12 @@ sample_meta: "/home/fangzq/projects/bulk/group.txt"
## WGC096875R S75 Cancer 0
## WGC096876R S76 Normal 0
## WGC096877R S77 Cancer 0
# Before running with your samples,
# please rename all your samples with the same suffix below.
# for example, rename all your sample like this: "WGC096874R_combined_R1.fastq.gz"
# for trimed fastq, use suffix like: _trimmed.fq.gz
read_pattern:
r1: "{sample}_R1_001.fastq.gz" # don't change {sample}
r2: "{sample}_R2_001.fastq.gz" # don't change {sample}

enrichr_library: ['GO_Cellular_Component_2017','GO_Molecular_Function_2017',
'GO_Biological_Process_2017','Human_Phenotype_Ontology',
'MSigDB_Oncogenic_Signatures','WikiPathways_2016',
'KEGG_2016']

# RNA Binding Protein list
rbps: "/home/fangzq/github/snakeflow/221RBPs.csv"
# DESeq2 cutoff
log2fc: 1
fdr: 0.01
fdr: 0.01
enrichr_library: ['GO_Biological_Process_2018','GO_Cellular_Component_2018','GO_Molecular_Function_2018',
'Human_Phenotype_Ontology', 'MSigDB_Oncogenic_Signatures',
'KEGG_2016', 'KEGG_2019_Human'] # KEGG_2019_Mouse
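For orientation, a rough sketch (not part of the commit) of how these config entries are typically consumed downstream; SAMPLES here stands for the sample-name list returned by parse_samples in rules/common.smk, and the DESeq2 filter is only a paraphrase of the log2fc/fdr cutoffs above:

# build FASTQ paths by formatting the read_pattern with each sample name
R1 = [config['read_pattern']['r1'].format(sample=s) for s in SAMPLES]
R2 = [config['read_pattern']['r2'].format(sample=s) for s in SAMPLES]
# e.g. "WGC096874R" -> "WGC096874R_R1_001.fastq.gz"

# DESeq2 results would then be reported with roughly:
#   |log2FoldChange| >= config['log2fc'] and padj < config['fdr']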
120 changes: 120 additions & 0 deletions gatk.smk
@@ -0,0 +1,120 @@
from snakemake.shell import shell

GENOME="/home/fangzq/genome/mouse/GRCm38_68.fa"
dbSNP="/home/fangzq/genome/mouse/mgp.v5.merged.snps_all.dbSNP142.sorted.vcf"
STRAINS = "129P2 129S1 129S5 A_J AKR B10 B_C BPL BPN BTBR BUB C3H C57BL10J C57BL6NJ C57BRcd C57LJ C58 CBA CEJ DBA1J DBA FVB ILNJ KK LGJ LPJ MAMy NOD NON NOR NOR NUJ NZB NZO NZW PJ PLJ RBF RFJ RHJ RIIIS SEA SJL SMJ ST SWR TALLYHO"
STRAINS = STRAINS.split(" ")
TMPDIR = "/home/fangzq/TMPDATA"
#CHROMSOME = [ str(c) for c in range(1,20)] + ["X", "Y", "MT"]
CHROMSOME = ['1'] + [ str(c) for c in range(10,20)] + [ str(c) for c in range(2,10)]+ ["MT", "X", "Y"]
OUTPUT = expand("combined.chr{i}.vcf", i=CHROMSOME)

rule all:
input: OUTPUT


rule sample_calling:
input:
dbSNP=dbSNP,
genome=GENOME,
bam="/data/bases/fangzq/strains/{strain}/output.GATKrealigned.Recal.bam",
bai="/data/bases/fangzq/strains/{strain}/output.GATKrealigned.Recal.bai",
output:
gvcf="/data/bases/fangzq/GVCF/{strain}.raw.g.vcf",
gvcfi="/data/bases/fangzq/GVCF/{strain}.raw.g.vcf.idx"
# gvcf=expand("/data/bases/fangzq/strains/GATK_TMP/{strain}.chr{i}.raw.g.vcf", i=CHROMSOME),
# gvcfi=expand("/data/bases/fangzq/strains/GATK_TMP/{strain}.chr{i}.raw.g.vcf.idx", i=CHROMSOME)
threads: 2
log: "/data/bases/fangzq/strains/{strain}.haplotypecaller.log"
params:
#java_ops="-Xmx16G -Djava.io.tmpdir=%s"%TMPDIR,
chrs=CHROMSOME,
tmpdir=TMPDIR,
strain="{strain}"
shell:
"""gatk HaplotypeCaller \
-ERC GVCF --tmp-dir {params.tmpdir} \
--native-pair-hmm-threads {threads} \
--dbsnp {input.dbSNP} \
-R {input.genome} \
-I {input.bam} \
-O {output.gvcf} 2> {log}
"""
## split run into chromosomes
# run:
# for ch in params['chrs']:
# shell("""gatk HaplotypeCaller \
# -ERC GVCF --tmp-dir /home/fangzq/TMPDATA \
# --native-pair-hmm-threads 2 \
# --dbsnp {input.dbSNP} \
# -R {input.genome} \
# -I {input.bam} \
# -L {i} \
# -O /data/bases/fangzq/strains/GATK_TMP/{params.strain}.chr{i}.raw.g.vcf
# """.format(i=ch))

# rule gatherVCFs:
# input:
# gvcf=expand("/data/bases/fangzq/strains/GATK_TMP/{strain}.chr{i}.raw.g.vcf", i=CHROMSOME),
# gvcfi=expand("/data/bases/fangzq/strains/GATK_TMP/{strain}.chr{i}.raw.g.vcf.idx", i=CHROMSOME)
# output:
# gvcf="/data/bases/fangzq/GVCF/{strain}.raw.g.vcf",
# gvcfi="/data/bases/fangzq/GVCF/{strain}.raw.g.vcf.idx"
# log: "/data/bases/fangzq/strains/{strain}.gatherVCFs.log"
# run:
# g = " -I ".join(input.gvcf)
# shell("gatk GatherVcfs -I {gvcf} -O {output.gvcf}".format(gvcf=g))

# 1. CombineGVCFs requires genomic intervals, so run it separately for each chromosome
rule combineGVCFs:
input:
genome=GENOME,
gvcf=expand("/data/bases/fangzq/GVCF/{strain}.raw.g.vcf", strain=STRAINS),
gvcfi=expand("/data/bases/fangzq/GVCF/{strain}.raw.g.vcf.idx", strain=STRAINS)
output:
expand("combined.chr{i}.g.vcf", i=CHROMSOME),
expand("combined.chr{i}.g.vcf.idx", i=CHROMSOME)
params:
chrs=CHROMSOME
log: "/data/bases/fangzq/strains/combineGVCFs.log"
run:
variant = " --variant ".join(input.gvcf)
for i in params['chrs']:
shell("gatk CombineGVCFs -L {chr} -R {input.genome} --variant {var} -O combined.chr${chr}.g.vcf >> {log}".format(chr=i, var=variant))

rule joint_calling:
input:
gvcf="combined.chr{i}.g.vcf",
gvcfi="combined.chr{i}.g.vcf.idx",
genome=GENOME
output: "combined.chr{i}.vcf",
params:
tmpdir=TMPDIR,
java_ops= "-Xmx12G -Djava.io.tmpdir=%s"%TMPDIR
log: "/data/bases/fangzq/strains/chr{i}.GenotypeGVCFs.log"
shell:
"gatk --java-options '{params.java_ops}' "
"GenotypeGVCFs -R {input.genome} -V {input.gvcf} -O {output} 2> {log}"
# """
# gatk GenotypeGVCFs \
# --tmp-dir {params.tmpdir} \
# -R {input.genome} \
# -V {input.gvcf} \
# -O {output} 2> {log}
# """


# g.vcf files can be merged into one with either CombineGVCFs or GenomicsDBImport: the former (more traditional) yields a single combined g.vcf file, the latter a GenomicsDB workspace (XX.db)
# 2. GenomicsDBImport approach (note: it requires one or more genomic intervals as input, so it can conveniently be run per chromosome):
#for i in $(seq 1 19) X Y MT;
# do
# gatk GenomicsDBImport $(for v in $(ls *.vcf); do echo "-V $v"; done) \
# -L $i \
# --genomicsdb-workspace-path DB.chr${i}
#

#gatk --java-options "-Xmx32G -Djava.io.tmpdir=/home/fangzq/TMPDATA" GenotypeGVCFs \
# -R $GENOME \
# -V gendb://DB.chr${i} \
# -O chr${i}.combined.vcf
# done
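The commented block above is the GenomicsDBImport alternative in plain bash. A hedged Snakemake sketch of the same idea, per chromosome — this rule is not part of the commit, and the paths, log location, and workspace naming are assumptions:

rule genomicsDBImport:
    input:
        gvcf=expand("/data/bases/fangzq/GVCF/{strain}.raw.g.vcf", strain=STRAINS)
    output:
        # GenomicsDBImport creates one workspace directory per chromosome
        db=directory("DB.chr{i}")
    log: "/data/bases/fangzq/strains/chr{i}.genomicsDBImport.log"
    run:
        variants = " -V ".join(input.gvcf)
        # -L restricts the import to a single chromosome, since GenomicsDBImport requires intervals
        shell("gatk GenomicsDBImport -V {variants} -L {wildcards.i} "
              "--genomicsdb-workspace-path {output.db} 2> {log}")

GenotypeGVCFs can then read the workspace via -V gendb://DB.chr{i}, exactly as in the commented loop above.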
33 changes: 2 additions & 31 deletions hisat2-rmats-turbo-v0.1.smk
@@ -2,40 +2,11 @@ from os.path import join, isfile
from itertools import combinations
from snakemake.shell import shell

include: "rules/common.smk"

configfile: 'config.yml'
# Working directory
workdir: config['workdir']

# utils function
def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['samples']['coldata']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'rU') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue #skip blank line or comment linne
item = line.split(" ")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
if len(item) >3: TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME

################### globals #############################################

# Full path to an uncompressed FASTA file with all chromosome sequences.
33 changes: 1 addition & 32 deletions hisat2-rseqc-fastqc-multiqc.smk
@@ -1,41 +1,10 @@
from os.path import join, isfile

############# Globals ######################################
include: "rules/common.smk"

configfile: 'config.yml'
# Working directory
workdir: config['workdir']

# utils function
def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['sample_meta']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'rU') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue #skip blank line or comment linne
item = line.split(" ")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
if len(item) >3: TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME

################### globals #############################################

# Full path to an uncompressed FASTA file with all chromosome sequences.
31 changes: 31 additions & 0 deletions rules/common.smk
@@ -0,0 +1,31 @@
from os.path import join, isfile
from itertools import combinations

def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['sample_meta']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'r') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue # skip blank lines and comment lines
item = line.split(" ")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
if len(item) >3: TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME
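With the helpers factored out, each workflow now pulls them in through a single include. Roughly how the refactored Snakefiles use this module (a sketch mirroring the deleted per-file code; the exact variable names in each workflow may differ):

# top of a workflow .smk file
configfile: 'config.yml'
include: "rules/common.smk"   # provides unique() and parse_samples()
workdir: config['workdir']

SAMPLES, SAMPLES_ALIAS, GROUP, TIME = parse_samples(config['sample_meta'])
GROUPS = unique(GROUP)        # e.g. ['Cancer', 'Normal'], order preserved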
32 changes: 2 additions & 30 deletions salmon-tximport-deseq2-docker.smk
@@ -1,39 +1,11 @@
from os.path import join, isfile
from itertools import combinations

include: "rules/common.smk"

configfile: 'config.yml'
# Working directory
workdir: config['workdir']

def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['samples']['coldata']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'rU') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue #skip blank line or comment linne
item = line.split("\t")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME

################### globals #############################################

# Full path to an uncompressed FASTA file with all chromosome sequences.
30 changes: 2 additions & 28 deletions salmon-tximport-deseq2-v0.3.smk
@@ -1,38 +1,12 @@
from os.path import join, isfile
from itertools import combinations

include: "rules/common.smk"

configfile: 'config.yml'
# Working directory
workdir: config['workdir']

def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['sample_meta']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'rU') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue #skip blank line or comment linne
item = line.split(" ")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
if len(item) >3: TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME

################### globals #############################################

