forked from zqfang/snakeflow
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of https://github.com/zqfang/snakeflow
- Loading branch information
Showing
3 changed files
with
236 additions
and
234 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
# snakemake pipeline for 10x genomics single cell CITE-seq | ||
# @author: Zhuoqing Fang | ||
# @email: maxzqfang@stanford.edu | ||
# @date: May 7th, 2020 | ||
import glob, re, os | ||
|
||
WORKSPACE = "/data/bases/fangzq/20200505_HCC" | ||
workdir: WORKSPACE | ||
#### globals ######## | ||
#configfile: "config.yaml" | ||
#workdir: config["workdir"] | ||
|
||
#SAMPLES = config["samples"] | ||
#READ1 = condfig["r1"] | ||
#READ2 = config["r2"] | ||
#READi = config["ri"] | ||
#TRANSCRIPTOM10X = config["transcriptom"] | ||
|
||
########### INPUTS ########################################### | ||
|
||
# fastqs and hastag sequence | ||
HASHTAG = os.path.join(WORKSPACE, "hashtag.totalseqB.V3.feature_ref.csv") | ||
FASTQ_DIR = "/data/bases/shared/HCC/30-363886939/00_fastq" | ||
|
||
# automatic search sample names using regex | ||
pat = re.compile(r"([A-Z0-9-]+)-([A-Z0-9a-z]{2,4})_[0-9A-Z]{2}_(L[0-9]{3})_([RI][12])_[0-9]{3}.fastq.gz") | ||
# name, capture, lane, reads = pat.search(f).groups() | ||
files = glob.glob(os.path.join(FASTQ_DIR, "*fastq.gz")) | ||
files = [pat.search(f).groups() for f in files] | ||
# remove duplicate entries | ||
files = {s[0]: { x[1]: x[0] for x in files } for s in files} | ||
|
||
lane = "S1_L001" | ||
PROJECT = "HCC" | ||
SAMPLES = ["HCC1-GE", "HCC2-GE", "HCC3-2-GE"] | ||
SAMPLE_HASH=["HCC1-HASH", "HCC3-HASH", "HCC3-HASH"] | ||
|
||
# cellranger | ||
TRANSCRIPTOM10X = "/home/fangzq/genome/10XRefdata/refdata-cellranger-GRCh38-3.0.0" | ||
|
||
# vireo | ||
SNV = "/home/fangzq/genome" | ||
KNOWN_SNV = os.path.join(SNV, "genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf") | ||
DONOR_NUM_PER_SAMPLE = 3 # per 10x run samples/patients ... | ||
|
||
|
||
#### Target output files ##### | ||
MTX = expand("{sample}_cnt/outs/filtered_feature_bc_matrix/matrix.mtx.gz",sample=SAMPLES) | ||
BARCODE = expand("{sample}_cnt/outs/filtered_feature_bc_matrix/barcodes.tsv.gz", sample=SAMPLES) | ||
FEATURES = expand("{sample}_cnt/outs/filtered_feature_bc_matrix/features.tsv.gz",sample=SAMPLES) | ||
AGGR="{project}_aggr/outs/filtered_feature_bc_matrix.h5".format(project = PROJECT) | ||
BAM = expand("{sample}_cnt/outs/possorted_genome_bam.bam", sample=SAMPLES) | ||
VIREO = expand("{sample}_vireo/results.demux/donor_ids.tsv", sample=SAMPLES) | ||
|
||
#### Rules ######## | ||
rule target: | ||
input: MTX, BARCODE, FEATURES, BAM, AGGR, VIREO | ||
|
||
|
||
rule prepare_count: | ||
output: | ||
expand("{sample}.libraries.csv", sample=SAMPLES) | ||
params: | ||
samples=SAMPLES, | ||
fastqs=FASTQ_DIR, | ||
capture="HASH", # tag name represent which file is hashtag fastq | ||
files=files, | ||
run: | ||
for sample in params.samples: | ||
hashtag = params.files[sample] | ||
out = sample+".libraries.csv" | ||
with open(out, 'w') as w: | ||
w.write("fastqs,sample,library_type\n") | ||
for k, v in hashtag.items(): | ||
if k == params.capture: | ||
w.write("%s,%s,Antibody Capture\n"%(params.fastqs, sample)) | ||
else: | ||
w.write("%s,%s,Gene Expression\n"%(params.fastqs, sample)) | ||
|
||
|
||
|
||
# snakemake creates the folder if it does not exisit. but cellranger fails if the folder is | ||
# not created by itself. | ||
|
||
# trick 1: rm {sample}_cnt/_lock file and re-run pipeline, it will be OK. | ||
# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/troubleshooting | ||
# trick 2: rm output folder before cellranger run | ||
rule citeseq_count: | ||
input: | ||
read1 = os.path.join(FASTQ_DIR, "{sample}_%s_R1_001.fastq.gz"%lane), | ||
read2 = os.path.join(FASTQ_DIR, "{sample}_%s_R2_001.fastq.gz"%lane), | ||
readi = os.path.join(FASTQ_DIR, "{sample}_%s_I1_001.fastq.gz"%lane), | ||
transcriptom = os.path.join(TRANSCRIPTOM10X, "fasta/genome.fa"), | ||
antibody = HASHTAG, | ||
libraries = "{sample}.libraries.csv", | ||
#libraries = rules.pre_count.output | ||
output: | ||
bam = "{sample}_cnt/outs/possorted_genome_bam.bam", | ||
barcode = "{sample}_cnt/outs/filtered_feature_bc_matrix/barcodes.tsv.gz", | ||
features = "{sample}_cnt/outs/filtered_feature_bc_matrix/features.tsv.gz", | ||
mtx = "{sample}_cnt/outs/filtered_feature_bc_matrix/matrix.mtx.gz", | ||
mh5 = "{sample}_cnt/outs/molecule_info.h5", | ||
threads: 12 | ||
log: "logs/{sample}.count.log" | ||
version: "0.1" | ||
params: | ||
extra=" --expect-cells=2000 --nosecondary", # --expect-cells=12000 | ||
fastqs=FASTQ_DIR, | ||
transcriptom=TRANSCRIPTOM10X, | ||
run: | ||
# non antibody captured code | ||
# "cellranger count --id={wildcards.sample}.cnt" | ||
# "--fastqs={params.fastqs} " | ||
# "--sample={wildcards.sample} " | ||
# rm outputdir to solve "pipestance directory" error from cellranger | ||
shell("rm -rf {wildcards.sample}_cnt") | ||
shell("cellranger count --id={wildcards.sample}_cnt " +\ | ||
"--transcriptome={params.transcriptom} " +\ | ||
"--libraries {input.libraries} " +\ | ||
"--feature-ref {input.antibody} " +\ | ||
"--localcores={threads} {params.extra} > {log}") | ||
|
||
rule prepare_aggr: | ||
input: expand("{sample}_cnt/outs/molecule_info.h5", sample=SAMPLES) | ||
output: "%s.aggr.csv"%PROJECT | ||
params: | ||
samples = SAMPLES, | ||
ids = [s[:4] for s in SAMPLES], # only used for loupe browser | ||
condition = [lane]*3 #CONDITION, ### ->>>> | ||
run: | ||
run = [s[:4] for s in SAMPLES] | ||
with open(output[0],'w') as out: | ||
# only add 'batch' when you need Chemistry Batch Correction | ||
# batch is optional, see docs if you have bath info | ||
# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/aggregate | ||
out.write("library_id,molecule_h5,run\n") | ||
for s, b in zip(params.samples, params.ids): | ||
out.write(f"{s},{s}_cnt/outs/molecule_info.h5,{b}\n") | ||
|
||
|
||
rule citeseq_aggr: | ||
input: | ||
mh5 = expand("{sample}_cnt/outs/molecule_info.h5", sample=SAMPLES), | ||
aggr = "{project}.aggr.csv" | ||
output: | ||
barcode = "{project}_aggr/outs/filtered_feature_bc_matrix/barcodes.tsv.gz", | ||
features = "{project}_aggr/outs/filtered_feature_bc_matrix/features.tsv.gz", | ||
mtx = "{project}_aggr/outs/filtered_feature_bc_matrix/matrix.mtx.gz", | ||
mh5 = "{project}_aggr/outs/filtered_feature_bc_matrix.h5" | ||
log: "logs/{project}.aggr.log" | ||
threads: 12 | ||
run: | ||
# rm outputdir to solve "pipestance directory" error from cellranger | ||
shell("rm -rf {wildcards.project}_aggr") | ||
shell("cellranger aggr --id={wildcards.project}_aggr " +\ | ||
"--csv={input.aggr} " +\ | ||
"--normalize=mapped > {log}") | ||
|
||
# demuxlet --sam ${sample}citeseq/outs/possorted_genome_bam.bam \ | ||
# --vcf /home/fangzq/projects/merged.sorted.vcf \ | ||
# --out ${sample}.demuxlet \ | ||
# --min-mac 1 --field GT --alpha 0 \ | ||
# --group-list /home/fangzq/projects/SFGF-Peltz-YG-13709/${sample}citeseq/outs/filtered_feature_bc_matrix/barcodes.tsv.gz | ||
|
||
# call single cell SNP | ||
|
||
# a single BAM/SAM file, e.g., from cellranger, | ||
# a list of cell barcodes, a VCF file for common SNPs. | ||
# This mode is recommended comparing to mode 2, if a list of common SNP is known, e.g., human (see Candidate SNPs below) | ||
rule cell_snp: | ||
input: | ||
bam="{sample}_cnt/outs/possorted_genome_bam.bam", | ||
barcode="{sample}_cnt/outs/filtered_feature_bc_matrix/barcodes.tsv.gz", | ||
vcf=KNOWN_SNV, | ||
output: | ||
"{sample}_vireo/CELL_DATA/cellSNP.cells.vcf.gz", | ||
"{sample}_vireo/CELL_DATA/cellSNP.base.vcf.gz" | ||
threads: 12 | ||
log: "logs/{sample}.cellsnp.log" | ||
shell: | ||
"cellSNP -s {input.bam} -b {input.barcode} -R {input.vcf} " | ||
"-p {threads} --minMAF 0.1 --minCOUNT 20 " | ||
"-O {wildcards.sample}_vireo/CELL_DATA > {log}" | ||
|
||
# # deconvolution without known genotype | ||
# You have to know how many individals, e.g. n_donor=3 | ||
rule vireo_demux: | ||
input: | ||
"{sample}_vireo/CELL_DATA/cellSNP.cells.vcf.gz", | ||
"{sample}_vireo/CELL_DATA/cellSNP.base.vcf.gz", | ||
output: | ||
"{sample}_vireo/results.demux/donor_ids.tsv", | ||
"{sample}_vireo/results.demux/GT_donors.vireo.vcf.gz", | ||
params: | ||
donor_num = DONOR_NUM_PER_SAMPLE, | ||
CELL_DATA = "{sample}_vireo/CELL_DATA" | ||
# with known genotype, add below argument | ||
# known_gt = "-d {DONOR_GT_FILE} -t GT" # DONOR_GT_FILE(vcf) obtained from input samples | ||
log: "logs/{sample}.vireo.log" | ||
shell: | ||
"vireo -c {params.CELL_DATA} -N {params.donor_num} -o {wildcards.sample}_vireo/results.demux > {log}" | ||
|
||
# Mode 3: pileup a list of SNPs for one or multiple BAM/SAM files | ||
# rule cell_SNP3: | ||
# input: | ||
# bam=expand("{sample}_cnt/outs/possorted_genome_bam.bam", sample=SAMPLES) | ||
# barcode=expand("{sample}_cnt/outs/filtered_feature_bc_matrix/barcodes.tsv.gz", sample=SAMPLES) | ||
# vcf= | ||
# output: | ||
# params: | ||
# extra="--UMItag None" # only include this if smart-seq or bulk rna-seq | ||
# bams=",".join(expand("{sample}_cnt/outs/possorted_genome_bam.bam", sample=SAMPLES)), | ||
# ids=",".join(SAMPLES) | ||
# log: | ||
# shell: | ||
# "cellSNP -s {params.bams} -I {params.ids} -o $OUT_FILE -R {input.vcf} -p 20" | ||
|
||
# https://github.com/10XGenomics/vartrix | ||
rule vatrix: | ||
input: | ||
bam="{sample}_cnt/outs/possorted_genome_bam.bam", | ||
barcode="{sample}_cnt/outs/filtered_feature_bc_matrix/barcodes.tsv.gz", | ||
vcf=KNOWN_SNV, | ||
genome=os.path.join(TRANSCRIPTOM10X, "fasta/genome.fa") | ||
output: | ||
"{sample}.vatrix/matrix.mtx" | ||
threads: 12 | ||
shell: | ||
"vartrix -v {input.vcf} -b {input.bam} -f {input.genome} -c {input.barcode} " | ||
"--threads {threads} -o {wildcards.sample}.vatrix" | ||
|
||
rule vcf2SNVloci: | ||
input: KNOWN_SNV | ||
output: "SNV.loci.txt" | ||
shell: | ||
"awk '{{print $1,$2}}' {input} | sed -i 's/\s/:/g' > {output}" |
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.