
Commit

update
Zhuoqing Fang committed Mar 20, 2020
1 parent d93a7d4 commit bcdf1d1
Showing 9 changed files with 173 additions and 143 deletions.
6 changes: 3 additions & 3 deletions .gitignore
@@ -1,4 +1,4 @@
.snakemake
.DS_Store
.idea
.vscode
.DS_Store
.idea
.vscode
2 changes: 1 addition & 1 deletion bowtie2-macs2-homer.smk
@@ -160,7 +160,7 @@ rule macs_broad:
"--outdir macs2_highQuality_results -n {wildcards.sample} --broad"
"{params.extra} {params.extra2} 2> {log}"

rule annotatepeaks
rule annotatepeaks:
input:
bed="{sample}.bed"
output:
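The hunk above only shows the head of the corrected annotatepeaks rule. A minimal sketch of how such a rule could be completed with HOMER's annotatePeaks.pl — the output path, genome label (hg38), and log path are assumptions for illustration, not taken from this commit:

rule annotatepeaks:
    input:
        bed="{sample}.bed"
    output:
        # hypothetical path; the real rule's output may differ
        anno="macs2_highQuality_results/{sample}_annotated.txt"
    log: "logs/{sample}.annotatePeaks.log"
    shell:
        # annotatePeaks.pl <peaks.bed> <genome> writes a tab-delimited annotation table to stdout
        "annotatePeaks.pl {input.bed} hg38 > {output.anno} 2> {log}"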
29 changes: 11 additions & 18 deletions config.yml
@@ -1,4 +1,4 @@
#this is a snakemake config file.
# snakeflow config file.
# 2017-03-28

# dir contains all indexes, genome sequences
@@ -29,8 +29,6 @@ rseqc:
# adaptors:
# illumina: "/home/fangzq/github/snakeflow/adaptors/TruSeq3-PE.fa"



# Index dir
hisat2_index: "/home/fangzq/genome/hisat2Indices_hg38"
salmon_index: "/home/fangzq/genome/salmonIndices_hg38"
@@ -56,9 +54,13 @@ paired: True
# Stranded library ? True or False
stranded: False

# if you have a fastq file named Sample1_R1.fastq.gz,
# set the sample read pattern below accordingly
# information in `samples` is used for deseq2 and rMATS
# for trimmed fastq, use a suffix like: _trimmed.fq.gz
read_pattern:
r1: "{sample}_R1_001.fastq.gz" # don't change {sample}
r2: "{sample}_R2_001.fastq.gz" # don't change {sample}

# sample metadata
sample_meta: "/home/fangzq/projects/bulk/group.txt"
# the ``dataframe`` attribute works only if a file is given.
# each column name corresponds to a sample attribute described above.
@@ -68,21 +70,12 @@ sample_meta: "/home/fangzq/projects/bulk/group.txt"
## WGC096875R S75 Cancer 0
## WGC096876R S76 Normal 0
## WGC096877R S77 Cancer 0
# Before running with your samples,
# please rename all your samples with the same suffix below.
# for example, rename all your sample like this: "WGC096874R_combined_R1.fastq.gz"
# for trimed fastq, use suffix like: _trimmed.fq.gz
read_pattern:
r1: "{sample}_R1_001.fastq.gz" # don't change {sample}
r2: "{sample}_R2_001.fastq.gz" # don't change {sample}

enrichr_library: ['GO_Cellular_Component_2017','GO_Molecular_Function_2017',
'GO_Biological_Process_2017','Human_Phenotype_Ontology',
'MSigDB_Oncogenic_Signatures','WikiPathways_2016',
'KEGG_2016']

# RNA Binding Protein list
rbps: "/home/fangzq/github/snakeflow/221RBPs.csv"
# DESeq2 cutoff
log2fc: 1
fdr: 0.01
fdr: 0.01
enrichr_library: ['GO_Biological_Process_2018','GO_Cellular_Component_2018','GO_Molecular_Function_2018',
'Human_Phenotype_Ontology', 'MSigDB_Oncogenic_Signatures',
'KEGG_2016', 'KEGG_2019_Human'] # KEGG_2019_Mouse
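For orientation, a rough sketch (not part of the commit) of how these config entries are typically consumed downstream; SAMPLES here stands for the sample-name list returned by parse_samples in rules/common.smk, and the DESeq2 filter is only a paraphrase of the log2fc/fdr cutoffs above:

# build FASTQ paths by formatting the read_pattern with each sample name
R1 = [config['read_pattern']['r1'].format(sample=s) for s in SAMPLES]
R2 = [config['read_pattern']['r2'].format(sample=s) for s in SAMPLES]
# e.g. "WGC096874R" -> "WGC096874R_R1_001.fastq.gz"

# DESeq2 results would then be reported with roughly:
#   |log2FoldChange| >= config['log2fc'] and padj < config['fdr']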
120 changes: 120 additions & 0 deletions gatk.smk
@@ -0,0 +1,120 @@
from snakemake.shell import shell

GENOME="/home/fangzq/genome/mouse/GRCm38_68.fa"
dbSNP="/home/fangzq/genome/mouse/mgp.v5.merged.snps_all.dbSNP142.sorted.vcf"
STRAINS = "129P2 129S1 129S5 A_J AKR B10 B_C BPL BPN BTBR BUB C3H C57BL10J C57BL6NJ C57BRcd C57LJ C58 CBA CEJ DBA1J DBA FVB ILNJ KK LGJ LPJ MAMy NOD NON NOR NOR NUJ NZB NZO NZW PJ PLJ RBF RFJ RHJ RIIIS SEA SJL SMJ ST SWR TALLYHO"
STRAINS = STRAINS.split(" ")
TMPDIR = "/home/fangzq/TMPDATA"
#CHROMSOME = [ str(c) for c in range(1,20)] + ["X", "Y", "MT"]
CHROMSOME = ['1'] + [ str(c) for c in range(10,20)] + [ str(c) for c in range(2,10)]+ ["MT", "X", "Y"]
OUTPUT = expand("combined.chr{i}.vcf", i=CHROMSOME)

rule all:
input: OUTPUT


rule sample_calling:
input:
dbSNP=dbSNP,
genome=GENOME,
bam="/data/bases/fangzq/strains/{strain}/output.GATKrealigned.Recal.bam",
bai="/data/bases/fangzq/strains/{strain}/output.GATKrealigned.Recal.bai",
output:
gvcf="/data/bases/fangzq/GVCF/{strain}.raw.g.vcf",
gvcfi="/data/bases/fangzq/GVCF/{strain}.raw.g.vcf.idx"
# gvcf=expand("/data/bases/fangzq/strains/GATK_TMP/{strain}.chr{i}.raw.g.vcf", i=CHROMSOME),
# gvcfi=expand("/data/bases/fangzq/strains/GATK_TMP/{strain}.chr{i}.raw.g.vcf.idx", i=CHROMSOME)
threads: 2
log: "/data/bases/fangzq/strains/{strain}.haplotypecaller.log"
params:
#java_ops="-Xmx16G -Djava.io.tmpdir=%s"%TMPDIR,
chrs=CHROMSOME,
tmpdir=TMPDIR,
strain="{strain}"
shell:
"""gatk HaplotypeCaller \
-ERC GVCF --tmp-dir {params.tmpdir} \
--native-pair-hmm-threads {threads} \
--dbsnp {input.dbSNP} \
-R {input.genome} \
-I {input.bam} \
-O {output.gvcf} 2> {log}
"""
## split run into chromosomes
# run:
# for ch in params['chrs']:
# shell("""gatk HaplotypeCaller \
# -ERC GVCF --tmp-dir /home/fangzq/TMPDATA \
# --native-pair-hmm-threads 2 \
# --dbsnp {input.dbSNP} \
# -R {input.genome} \
# -I {input.bam} \
# -L {i} \
# -O /data/bases/fangzq/strains/GATK_TMP/{params.strain}.chr{i}.raw.g.vcf
# """.format(i=ch))

# rule gatherVCFs:
# input:
# gvcf=expand("/data/bases/fangzq/strains/GATK_TMP/{strain}.chr{i}.raw.g.vcf", i=CHROMSOME),
# gvcfi=expand("/data/bases/fangzq/strains/GATK_TMP/{strain}.chr{i}.raw.g.vcf.idx", i=CHROMSOME)
# output:
# gvcf="/data/bases/fangzq/GVCF/{strain}.raw.g.vcf",
# gvcfi="/data/bases/fangzq/GVCF/{strain}.raw.g.vcf.idx"
# log: "/data/bases/fangzq/strains/{strain}.gatherVCFs.log"
# run:
# g = " -I ".join(input.gvcf)
# shell("gatk GatherVcfs -I {gvcf} -O {output.gvcf}".format(gvcf=g))

# 1. CombineGVCFs requires genomic intervals, so run it separately for each chromosome
rule combineGVCFs:
input:
genome=GENOME,
gvcf=expand("/data/bases/fangzq/GVCF/{strain}.raw.g.vcf", strain=STRAINS),
gvcfi=expand("/data/bases/fangzq/GVCF/{strain}.raw.g.vcf.idx", strain=STRAINS)
output:
expand("combined.chr{i}.g.vcf", i=CHROMSOME),
expand("combined.chr{i}.g.vcf.idx", i=CHROMSOME)
params:
chrs=CHROMSOME
log: "/data/bases/fangzq/strains/combineGVCFs.log"
run:
variant = " --variant ".join(input.gvcf)
for i in params['chrs']:
shell("gatk CombineGVCFs -L {chr} -R {input.genome} --variant {var} -O combined.chr${chr}.g.vcf >> {log}".format(chr=i, var=variant))

rule joint_calling:
input:
gvcf="combined.chr{i}.g.vcf",
gvcfi="combined.chr{i}.g.vcf.idx",
genome=GENOME
output: "combined.chr{i}.vcf",
params:
tmpdir=TMPDIR,
java_ops= "-Xmx12G -Djava.io.tmpdir=%s"%TMPDIR
log: "/data/bases/fangzq/strains/chr{i}.GenotypeGVCFs.log"
shell:
"gatk --java-options '{params.java_ops}' "
"GenotypeGVCFs -R {input.genome} -V {input.gvcf} -O {output} 2> {log}"
# """
# gatk GenotypeGVCFs \
# --tmp-dir {params.tmpdir} \
# -R {input.genome} \
# -V {input.gvcf} \
# -O {output} 2> {log}
# """


# g.vcf files can be merged into one with either CombineGVCFs or GenomicsDBImport: the former (more traditional) yields a single combined g.vcf file, the latter a GenomicsDB workspace (XX.db)
# 2. GenomicsDBImport approach (note: it requires one or more genomic intervals as input, so it can conveniently be run per chromosome):
#for i in $(seq 1 19) X Y MT;
# do
# gatk GenomicsDBImport $(for v in $(ls *.vcf); do echo "-V $v"; done) \
# -L $i \
# --genomicsdb-workspace-path DB.chr${i}
#

#gatk --java-options "-Xmx32G -Djava.io.tmpdir=/home/fangzq/TMPDATA" GenotypeGVCFs \
# -R $GENOME \
# -V gendb://DB.chr${i} \
# -O chr${i}.combined.vcf
# done
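The commented block above is the GenomicsDBImport alternative in plain bash. A hedged Snakemake sketch of the same idea, per chromosome — this rule is not part of the commit, and the paths, log location, and workspace naming are assumptions:

rule genomicsDBImport:
    input:
        gvcf=expand("/data/bases/fangzq/GVCF/{strain}.raw.g.vcf", strain=STRAINS)
    output:
        # GenomicsDBImport creates one workspace directory per chromosome
        db=directory("DB.chr{i}")
    log: "/data/bases/fangzq/strains/chr{i}.genomicsDBImport.log"
    run:
        variants = " -V ".join(input.gvcf)
        # -L restricts the import to a single chromosome, since GenomicsDBImport requires intervals
        shell("gatk GenomicsDBImport -V {variants} -L {wildcards.i} "
              "--genomicsdb-workspace-path {output.db} 2> {log}")

GenotypeGVCFs can then read the workspace via -V gendb://DB.chr{i}, exactly as in the commented loop above.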
33 changes: 2 additions & 31 deletions hisat2-rmats-turbo-v0.1.smk
@@ -2,40 +2,11 @@ from os.path import join, isfile
from itertools import combinations
from snakemake.shell import shell

include: "rules/common.smk"

configfile: 'config.yml'
# Working directory
workdir: config['workdir']

# utils function
def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['samples']['coldata']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'rU') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue #skip blank line or comment linne
item = line.split(" ")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
if len(item) >3: TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME

################### globals #############################################

# Full path to an uncompressed FASTA file with all chromosome sequences.
33 changes: 1 addition & 32 deletions hisat2-rseqc-fastqc-multiqc.smk
@@ -1,41 +1,10 @@
from os.path import join, isfile

############# Globals ######################################
include: "rules/common.smk"

configfile: 'config.yml'
# Working directory
workdir: config['workdir']

# utils function
def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['sample_meta']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'rU') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue #skip blank line or comment linne
item = line.split(" ")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
if len(item) >3: TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME

################### globals #############################################

# Full path to an uncompressed FASTA file with all chromosome sequences.
31 changes: 31 additions & 0 deletions rules/common.smk
@@ -0,0 +1,31 @@
from os.path import join, isfile
from itertools import combinations

def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['sample_meta']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'r') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue # skip blank lines and comment lines
item = line.split(" ")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
if len(item) >3: TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME
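With the helpers factored out, each workflow now pulls them in through a single include. Roughly how the refactored Snakefiles use this module (a sketch mirroring the deleted per-file code; the exact variable names in each workflow may differ):

# top of a workflow .smk file
configfile: 'config.yml'
include: "rules/common.smk"   # provides unique() and parse_samples()
workdir: config['workdir']

SAMPLES, SAMPLES_ALIAS, GROUP, TIME = parse_samples(config['sample_meta'])
GROUPS = unique(GROUP)        # e.g. ['Cancer', 'Normal'], order preserved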
32 changes: 2 additions & 30 deletions salmon-tximport-deseq2-docker.smk
@@ -1,39 +1,11 @@
from os.path import join, isfile
from itertools import combinations

include: "rules/common.smk"

configfile: 'config.yml'
# Working directory
workdir: config['workdir']

def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['samples']['coldata']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'rU') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue #skip blank line or comment linne
item = line.split("\t")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME

################### globals #############################################

# Full path to an uncompressed FASTA file with all chromosome sequences.
30 changes: 2 additions & 28 deletions salmon-tximport-deseq2-v0.3.smk
@@ -1,38 +1,12 @@
from os.path import join, isfile
from itertools import combinations

include: "rules/common.smk"

configfile: 'config.yml'
# Working directory
workdir: config['workdir']

def unique(seq):
"""Remove duplicates from a list in Python while preserving order.
:param seq: a python list object.
:return: a list without duplicates while preserving order.
"""
seen = set()
seen_add = seen.add

return [x for x in seq if x not in seen and not seen_add(x)]

def parse_samples(tab=config['sample_meta']):
"""parse samples """
SAMPLES=[]
SAMPLES_ALIAS=[]
GROUP=[]
TIME=[]
with open(tab, 'rU') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not len(line) or line.startswith('#'): continue #skip blank line or comment linne
item = line.split(" ")
SAMPLES.append(item[0])
SAMPLES_ALIAS.append(item[1])
GROUP.append(item[2])
if len(item) >3: TIME.append(item[3])

return SAMPLES, SAMPLES_ALIAS, GROUP, TIME

################### globals #############################################

