update

ruilulab · Oct 28, 2021 · e6836b5 · e6836b5
1 parent c15bf51
commit e6836b5
Show file tree

Hide file tree

Showing 2 changed files with 217 additions and 32 deletions.
diff --git a/workflows/amplicon.smk b/workflows/amplicon.smk
@@ -0,0 +1,157 @@
+
+"""
+This is a pipeline for CRISPR Editing Amplicon analysis
+
+NGS sequencing with 250 bp in each end of amplicon
+
+R1: 
+    barcode| ----- primer ---- | ----- amplicon ----
+    >>>>>>>>===================AATCGCGTTCCGGAAAATCCTG......
+
+R2: 
+
+    barcode| ----- primer ---- | ----- amplicon ----
+    >>>>>>>>===================CCTGAACCTCGATCGGATAGCT......
+
+pipeline:
+>> demux barcode (split sample) >> trim primer >> join R1-R2 >> amplicon >> qc >> CRISPR Resso 
+-> cutadapt -> CRISPResso2 -> NGmerge or fastp (for join pair emd reads)
+
+NOTE: run demultiplexing first, then run CRISPResso2
+"""
+
+
+import os, sys, csv 
+import pandas as pd
+
+workdir: "/data/bases/fangzq/Amplicon_seq/30-585200003"
+
+
+POOL_SAMPLES = ['C3-ALB-Clone', 'PNPLA3-C3-clone','PNPLA3-Cellpool']
+SPLIT_SAMPLES = {}
+## NOTE: 
+# barcodes metadata file has following columns
+# ['sample-id', 'barcode-fwd', 'adaptor-fwd', 'barcode-rev', 'adaptor-rev', 'sgRNA','PCR-fwd','PCR-rev','reference']
+for s in POOL_SAMPLES:
+    _SAMPLES = []
+    with open(f"{s}.barcodes.txt", newline='') as f:
+        reader = csv.DictReader(f, delimiter='\t',)
+        for row in reader:
+            _SAMPLES.append(row['sample-id'])
+            #SPLIT_TEMPLATES[row['sample-id']] = row['template']
+    SPLIT_SAMPLES [s] = _SAMPLES
+
+
+
+
+#### OUTPUT FILES ##########
+OUTDIR = directory(expand("{sample}_Results", sample=POOL_SAMPLES))
+CRISPResso_OUTPUT = ["%s_Results/CRISPResso_on_%s/%s.Alleles_frequency_table.zip"%(sample, split, split) for sample in POOL_SAMPLES for split in SPLIT_SAMPLES[sample]]
+AMPLICONS = ["%s_Results/%s_amplicon.fastq.gz"%(sample, split) for sample in POOL_SAMPLES for split in SPLIT_SAMPLES[sample]]
+
+
+rule all:
+    input: OUTDIR, CRISPResso_OUTPUT, AMPLICONS# ALIGNMENT
+
+
+rule fastp:
+    input: 
+        R1="00_fastq/{sample}_R1_001.fastq.gz",
+        R2="00_fastq/{sample}_R2_001.fastq.gz",
+    output:
+        R1="{sample}/forward.fastq.gz",
+        R2="{sample}/reverse.fastq.gz",
+        j = "temp/{sample}.fastp.json",
+        h = "temp/{sample}.fastp.html"
+    shell:
+        "fastp -i {input.R1} -I {input.R2} "
+        "-o {output.R1} -O {output.R2} "
+        "--json {output.j} --html {output.h} "
+
+
+
+checkpoint demulex_barcodes:
+    input:
+        barcode = "{sample}.barcodes.txt",
+        R1="{sample}/forward.fastq.gz",
+        R2="{sample}/reverse.fastq.gz",
+    output:
+        t = temp(directory("temp/{sample}")),
+        d = directory("{sample}_Results")
+    run:
+        bfwd = []
+        brev = []
+        os.makedirs(output.t, exist_ok=True)
+        os.makedirs(output.d, exist_ok=True)
+        with open(input.barcode, newline='') as f:
+            reader = csv.DictReader(f, delimiter='\t',)
+            for row in reader:
+                bfwd.append(row['barcode-fwd'])
+                brev.append(row['barcode-rev']) 
+                s = row['sample-id']
+                # split sample by barcode
+                cmd1 = "cutadapt -e 0.15 --no-indels --pair-adapters -m 1 --trimmed-only " +\
+                    "-g ^%s -G ^%s "%(bfwd[-1], brev[-1]) +\
+                    "-o temp/%s/%s_R1.fastq.gz -p temp/%s/%s_R2.fastq.gz "%(wildcards.sample, s, wildcards.sample, s) +\
+                    "%s/forward.fastq.gz %s/reverse.fastq.gz"%(wildcards.sample, wildcards.sample)
+
+                shell(cmd1)
+                # remove adaptor per sample
+                cmd2 = "cutadapt -e 0.1 --no-indels --pair-adapters -m 1 --trimmed-only " +\
+                    "-g ^%s -G ^%s "%(row['adaptor-fwd'], row['adaptor-rev']) +\
+                    "-o %s_Results/%s_R1.fastq.gz -p %s_Results/%s_R2.fastq.gz "%(wildcards.sample, s, wildcards.sample, s) +\
+                    "temp/%s/%s_R1.fastq.gz temp/%s/%s_R2.fastq.gz "%(wildcards.sample, s, wildcards.sample, s)
+                shell(cmd2)
+
+
+
+rule CRISPResso_Analysis:
+    """
+    CRISPR Editing analysis
+    """
+    input: 
+        #R1=lambda wildcards: "%s/{split}_R1.fastq.gz"%(checkpoints.demulex_barcodes.get(sample=wildcards.sample)),
+        #R2=lambda wildcards: "%s/{split}_R2.fastq.gz"%(checkpoints.demulex_barcodes.get(sample=wildcards.sample)),
+        #R2=lambda wildcards: "%s_Results/{split}_R2.fastq.gz"%(wildcards.sample),
+        R1="{sample}_Results/{split}_R1.fastq.gz",
+        R2="{sample}_Results/{split}_R2.fastq.gz",
+        barcode = "{sample}.barcodes.txt",
+    output: 
+        msa="{sample}_Results/CRISPResso_on_{split}/{split}.Alleles_frequency_table.zip",
+        #outdir = directory("{sample}_demux/CRISPResso/{split}")
+    threads: 1
+    params: 
+        outdir = "{sample}_Results/",
+    run:
+        barcodes = pd.read_table(input.barcode, index_col=0)
+        amplicon = barcodes.loc[wildcards.split, 'reference']
+        sgRNA = barcodes.loc[wildcards.split, 'sgRNA']
+        cmd = "CRISPResso --amplicon_seq " + amplicon # this could be WT allel sequences.
+        # cmd = " --expected_hdr_amplicon_seq " + hdr_amplicon  # if you know the exepected sequence
+        cmd += " --fastq_r1 {input.R1} --fastq_r2 {input.R2} "
+        cmd += "--name {wildcards.split}  --output_folder {params.outdir} --file_prefix {wildcards.split} "
+        cmd += " --amplicon_name {wildcards.sample} " 
+        #cmd += " --guide_seq %s "%sgRNA   # exclude PAM
+        #cmd += " --guide_name %s "%name
+        cmd += "--max_paired_end_reads_overlap 250 " # 250 most for amplicon seq
+        cmd += "--min_paired_end_reads_overlap 10 "
+        cmd += " --quantification_window_size 10 " #--quantification_window_center -10 "
+        cmd += " --write_cleaned_report "# --place_report_in_output_folder"
+        shell(cmd)
+
+
+
+
+rule merge_paired_end:
+    """
+    merge paired end reads into one amplicons.
+    """
+    input:
+        #R1="{sample}_demux/{split}_R1.fastq.gz",
+        #R2="{sample}_demux/{split}_R2.fastq.gz",
+        R1=lambda wildcards: "%s/{split}_R1.fastq.gz"%(checkpoints.demulex_barcodes.get(**wildcards).output.d),
+        R2=lambda wildcards: "%s/{split}_R2.fastq.gz"%(checkpoints.demulex_barcodes.get(**wildcards).output.d),
+    output:
+        "{sample}_demux/{split}_amplicon.fastq.gz"
+    shell:
+        "NGmerge -1 {input.R1} -2 {input.R2} -m 10 -o {output}"
diff --git a/workflows/qiime2.amplicon.smk b/workflows/qiime2.amplicon.smk
@@ -1,5 +1,26 @@
 
-""" see a short tutorial here for metagenome analysis
+"""
+This is a pipeline for CRISPR Editing Amplicon analysis using QIIME2
+
+NGS sequencing with 250 bp in each end of amplicon
+
+R1: 
+    barcode| ----- primer ---- | ----- amplicon ----
+    >>>>>>>>===================AATCGCGTTCCGGAAAATCCTG......
+
+R2: 
+
+    barcode| ----- primer ---- | ----- amplicon ----
+    >>>>>>>>===================CCTGAACCTCGATCGGATAGCT......
+
+pipeline:
+>> demux barcode (split sample) >> trim primer >> join R1-R2 >> amplicon >> qc >> CRISPR Resso 
+
+
+two pipline ->  qiime2 +  CRISPRResso2 
+
+
+see a short tutorial here for metagenome analysis
 http://www.int-res.com/articles/suppl/m648p169_supp3.txt
 
 #What: Computational pipeline for the V3/V4 region of the 16S rRNA gene 
@@ -45,20 +66,6 @@ rule all:
     input: OUT, OUT2, CRISPRESSO# ALIGNMENT
 
 
-# rule rename_r1:
-#     input: "{sample}_R1.fastq.gz"
-#     output: "{sample}/forward.fastq.gz"
-#     shell:
-#         "mv {input} {output}"
-
-# rule rename_r2:
-#     input: "{sample}_R2.fastq.gz"
-#     output: "{sample}/reverse.fastq.gz"
-#     shell:
-#         "mv {input} {output}"
-
-
-
 rule data_import:
     """
     The import format for paired-end reads with the barcodes still in the sequence 
@@ -96,26 +103,40 @@ rule demulex_cutadapt:
         "--m-reverse-barcodes-file {input.barcode}  "
         "--o-per-sample-sequences {output.demulexed}  "
         "--o-untrimmed-sequences {output.unmatched} "
+        #"--p-mixed-orientation "
+        "--p-error-rate 0.20 "
+
 
-rule trim_primer:
+rule trim_index_primer:
     """
     If there are sequencing adapters or PCR primers in the reads which you’d like to remove, you can do that next as follows.
     """
-    input: "{sample}.demulexed.qza"
+    input: 
+        demux="{sample}.demulexed.qza",
+        barcode = "{sample}.barcodes.txt"
     output: "{sample}.trimmed.qza"
-    params:
-        primer_f= "GCACTCACTAGCCGTGACTATCGACTAATAGAT",#"GCACTCACTAGCCGTGACT",
-        primer_r= "CGACCGCACGTCTATTTAGTGGAGCCGAGATGT",#CGACCGCACGTCTATTTAGT",
     threads: 8
-    shell:
-        "qiime cutadapt trim-paired "
-        "--i-demultiplexed-sequences {input} "
-        "--p-front-f {params.primer_f} "
-        "--p-front-r {params.primer_r} "
-        "--p-error-rate 0.1 "
-        "--o-trimmed-sequences {output} "
-        "--verbose "
-        "--p-cores {threads} "
+    run:
+        fwd = set()
+        rev = set()
+        with open(input.barcode, newline='') as f:
+            reader = csv.DictReader(f, delimiter='\t',)
+            for row in reader:
+                fwd.add(row['adaptor-fwd'])
+                rev.add(row['adaptor-rev'])    
+
+        primer_f = fwd.pop()
+        primer_r = rev.pop()    
+
+        cmd =  "qiime cutadapt trim-paired " +\
+                "--i-demultiplexed-sequences {input.demux} " +\
+                f"--p-front-f {primer_f} " +\
+                f"--p-front-r {primer_r} " +\
+                "--p-error-rate 0.1 " +\
+                "--o-trimmed-sequences {output} " +\
+                "--verbose " +\
+                "--p-cores {threads} "
+        shell(cmd)
 
 rule join_paired:
     input: "{sample}.trimmed.qza"
@@ -186,22 +207,29 @@ rule amplicon_demux:
 rule CRISPResso_alignment:
     input: 
         fq="{sample}_demux/{split}_{i}.{final}.fastq",
-        ref = "template.ref.fa",
+        ref = "template.ref.fa", 
     output: 
         msa="{sample}_demux/CRISPResso/CRISPResso_on_{split}_{i}.{final}/Alleles_frequency_table.zip",
     threads: 1
     params: 
         outdir = "{sample}_demux/CRISPResso",
         final = "{final}",
+        tubb = "GAGGCCGAAGAGGAGGCCTA", # sgRNA
+        h4c3 = "GGGCGCACTCTGTATGGCTT",
     run:
         from Bio import SeqIO
         amplicons = list(SeqIO.parse("template.ref.fa", "fasta"))
         ids = { t.id : i for i, t in enumerate(amplicons) }
         idx = ids[params.final]
+        name = params.final.split("-")[0].lower()
+        sgRNA = params.tubb if name == 'tubb' else params.h4c3
         amplicon = str(amplicons[idx].seq)       
-        cmd = "CRISPResso --amplicon_seq " + amplicon
+        cmd = "CRISPResso --amplicon_seq " + amplicon # this could be WT allel sequences.
+        # cmd = " --expected_hdr_amplicon_seq " + hdr_amplicon  # if you know the exepected sequence
         cmd += " --fastq_r1 {input.fq} --output_folder {params.outdir} "
-        cmd += " --amplicon_name {wildcards.final} " #"--guide_seq GAGTCCGAGCAGAAGAAGAA  --guide_name xxx"
+        cmd += " --amplicon_name {wildcards.final} " 
+        #cmd += " --guide_seq %s "%sgRNA   # exclude PAM
+        #cmd += " --guide_name %s "%name
         cmd += " --quantification_window_size 20 " #--quantification_window_center -10 "
         cmd += " --write_cleaned_report "# --place_report_in_output_folder"
         shell(cmd)