From dbb5c53eff10d2ad70d90be4de2034015dd14e9a Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Mon, 16 Sep 2024 13:37:27 -0400 Subject: [PATCH 1/3] :pencil: updated docs with missing params :hammer: added extra args params to catch most STAR possible inputs --- docs/10X_STAR_Solo_alignment.md | 17 +++++++++++++++- tools/star_solo_2.7.10b.cwl | 1 + workflows/kf_STAR_Solo_10x_alignment_wf.cwl | 22 +++++++++++++++++++-- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/docs/10X_STAR_Solo_alignment.md b/docs/10X_STAR_Solo_alignment.md index 6e8c53c..4b6f250 100644 --- a/docs/10X_STAR_Solo_alignment.md +++ b/docs/10X_STAR_Solo_alignment.md @@ -32,6 +32,20 @@ Output QC is based on [this tutorial](https://github.com/hbctraining/scRNA-seq_o - Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) - default: "CB_UMI_Simple" - `soloCBwhitelist`: file with whitelist of cell barcodes + - `soloMultiMappers`: One or more of the following values: 'Unique', 'Uniform', 'PropUnique', 'EM', 'Rescue'. Including + multi-gene reads allows for more accurate gene quantification and, more importantly, enables detection of gene expression from + certain classes of genes that are supported only by multi-gene reads, such as overlapping genes and highly similar paralog families. + Unique: software default, count only reads that map to unique genes. Uniform: uniformly distributes the multi-gene UMIs to all + genes in its gene set. Each gene gets a fractional count of 1/N_genes, where N_genes is the number of genes in the set. This + is the simplest possible option, and it offers higher sensitivity for gene detection at the expense of lower precision. PropUnique: + distributes the multi-gene UMIs proportionally to the number of unique UMIs per gene. 
UMIs that map to genes that are not supported + by unique UMIs are distributed uniformly. EM: uses Maximum Likelihood Estimation (MLE) to distribute multi-gene UMIs among their + genes, taking into account other UMIs (both unique- and multi-gene) from the same cell (i.e. with the same CB). Expectation-Maximization + (EM) algorithm is used to find the gene expression values that maximize the likelihood function. Recovering multi-gene reads + via MLE-EM model was previously used to quantify transposable elements in bulk RNA-seq {TEtranscripts} and in scRNA-seq {Alevin; + Kallisto-bustools}. Rescue: distributes multi-gene UMIs to their gene set proportionally to the sum of the number of unique-gene + UMIs and uniformly distributed multi-gene UMIs in each gene (Mortazavi et al.). It can be thought of as the first step of the EM + algorithm. - `soloUMIlen`: UMI length, default: 12 - `clipAdapterType`: adapter clipping type. - Hamming: adapter clipping based on Hamming distance, with the number of mismatches controlled by -clip5pAdapterMMp @@ -74,8 +88,9 @@ Output QC is based on [this tutorial](https://github.com/hbctraining/scRNA-seq_o - CellRanger2.2: simple filtering of CellRanger 2.2. Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count. The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10 - EmptyDrops_CR: EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y, Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN. The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000 - default: "EmptyDrops_CR" - outSAMtype: type of SAM/BAM output. None: no SAM/BAM output. 
Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate) + - `outSAMtype`: type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate) - default: "None" + - `star_extra_args`: Any additional arguments for this tool. See STAR Documentation for complete list of options. Example input: `--limitSjdbInsertNsj 1000001` ### seurat Harvard Bioinformatics Core (HBC) qc - `qc_min_umi`: minimum number of umi for cell-level filtering - `qc_min_genes`: minimum number of genes for cell-level filtering diff --git a/tools/star_solo_2.7.10b.cwl b/tools/star_solo_2.7.10b.cwl index 72a5a05..02297c2 100644 --- a/tools/star_solo_2.7.10b.cwl +++ b/tools/star_solo_2.7.10b.cwl @@ -106,6 +106,7 @@ inputs: EM: uses Maximum Likelihood Estimation (MLE) to distribute multi-gene UMIs among their genes, taking into account other UMIs (both unique- and multi-gene) from the same cell (i.e. with the same CB). Expectation-Maximization (EM) algorithm is used to find the gene expression values that maximize the likelihood function. Recovering multi-gene reads via MLE-EM model was previously used to quantify transposable elements in bulk RNA-seq {TEtranscripts} and in scRNA-seq {Alevin; Kallisto-bustools}. \ Rescue: distributes multi-gene UMIs to their gene set proportionally to the sum of the number of unique-gene UMIs and uniformly distributed multi-gene UMIs in each gene Mortazavi et al. It can be thought of as the first step of the EM algorithm", inputBinding: { position: 5, prefix: "--soloMultiMappers" } } + extra_args: { type: 'string?', inputBinding: { position: 5, shellQuote: false }, doc: "Any additional arguments for this tool. See STAR Documentation for complete list of options. 
Example input: --limitSjdbInsertNsj 1000001" } outSAMtype: { type: [ 'null', {type: enum, name: outSAMtype, symbols: ["BAM Unsorted", "None", "BAM SortedByCoordinate", "SAM Unsorted", "SAM SortedByCoordinate"]}], default: "None", doc: "type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)", diff --git a/workflows/kf_STAR_Solo_10x_alignment_wf.cwl b/workflows/kf_STAR_Solo_10x_alignment_wf.cwl index 69df3b3..196632d 100644 --- a/workflows/kf_STAR_Solo_10x_alignment_wf.cwl +++ b/workflows/kf_STAR_Solo_10x_alignment_wf.cwl @@ -37,6 +37,20 @@ doc: | - Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) - default: "CB_UMI_Simple" - `soloCBwhitelist`: file with whitelist of cell barcodes + - `soloMultiMappers`: One or more of the following values: 'Unique', 'Uniform', 'PropUnique', 'EM', 'Rescue'. Including + multi-gene reads allows for more accurate gene quantification and, more importantly, enables detection of gene expression from + certain classes of genes that are supported only by multi-gene reads, such as overlapping genes and highly similar paralog families. + Unique: software default, count only reads that map to unique genes. Uniform: uniformly distributes the multi-gene UMIs to all + genes in its gene set. Each gene gets a fractional count of 1/N_genes, where N_genes is the number of genes in the set. This + is the simplest possible option, and it offers higher sensitivity for gene detection at the expense of lower precision. PropUnique: + distributes the multi-gene UMIs proportionally to the number of unique UMIs per gene. 
UMIs that map to genes that are not supported + by unique UMIs are distributed uniformly. EM: uses Maximum Likelihood Estimation (MLE) to distribute multi-gene UMIs among their + genes, taking into account other UMIs (both unique- and multi-gene) from the same cell (i.e. with the same CB). Expectation-Maximization + (EM) algorithm is used to find the gene expression values that maximize the likelihood function. Recovering multi-gene reads + via MLE-EM model was previously used to quantify transposable elements in bulk RNA-seq {TEtranscripts} and in scRNA-seq {Alevin; + Kallisto-bustools}. Rescue: distributes multi-gene UMIs to their gene set proportionally to the sum of the number of unique-gene + UMIs and uniformly distributed multi-gene UMIs in each gene (Mortazavi et al.). It can be thought of as the first step of the EM + algorithm. - `soloUMIlen`: UMI length, default: 12 - `clipAdapterType`: adapter clipping type. - Hamming: adapter clipping based on Hamming distance, with the number of mismatches controlled by -clip5pAdapterMMp @@ -79,8 +93,9 @@ doc: | - CellRanger2.2: simple filtering of CellRanger 2.2. Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count. The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10 - EmptyDrops_CR: EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y, Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN. The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000 - default: "EmptyDrops_CR" - outSAMtype: type of SAM/BAM output. None: no SAM/BAM output. 
Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate) + - `outSAMtype`: type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate) - default: "None" + - `star_extra_args`: Any additional arguments for this tool. See STAR Documentation for complete list of options. Example input: `--limitSjdbInsertNsj 1000001` ### seurat Harvard Bioinformatics Core (HBC) qc - `qc_min_umi`: minimum number of umi for cell-level filtering - `qc_min_genes`: minimum number of genes for cell-level filtering @@ -190,13 +205,15 @@ inputs: outSAMtype: {type: ['null', {type: enum, name: outSAMtype, symbols: ["BAM Unsorted", "None", "BAM SortedByCoordinate", "SAM Unsorted", "SAM SortedByCoordinate"]}], default: "None", doc: "type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)"} + star_extra_args: {type: 'string?', doc: "Any additional arguments for this tool. See STAR Documentation for complete list of options. 
+ Example input: --limitSjdbInsertNsj 1000001"} # Seurat HBC QC qc_min_umi: {type: 'int?', doc: "minimum number of umi for cell-level filtering", default: 500} qc_min_genes: {type: 'int?', doc: "minimum number of genes for cell-level filtering", default: 250} qc_min_complexity: {type: 'float?', doc: "minimum novelty score (log10GenesPerUMI)", default: 0.8} qc_max_mito_ratio: {type: 'float?', doc: "maximum ratio mitochondrial reads per cell", default: 0.2} qc_min_gene_prevalence: {type: 'int?', doc: "Minimum number of cells a gene must be expressed in to keep after filtering", default: 10} - qc_memory: { type: 'int?', doc: "Memory in GB that ought to be available to the script", default: 16 } + qc_memory: {type: 'int?', doc: "Memory in GB that ought to be available to the script", default: 16} outputs: star_solo_counts_dir: {type: File, outputSource: tar_solo_count_outdir/output, doc: "Tar gzipped counts output from STAR Solo"} @@ -240,6 +257,7 @@ steps: soloCellFilter: soloCellFilter soloMultiMappers: soloMultiMappers outSAMtype: outSAMtype + extra_args: star_extra_args out: [log_progress_out, log_out, log_final_out, genomic_bam_out, junctions_out, counts_dir] create_h5_output: run: ../tools/convert_to_h5.cwl From 4893c92fb67bf21eeea033f4e808874f53d35866 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Mon, 16 Sep 2024 14:51:25 -0400 Subject: [PATCH 2/3] :hammer: made EM default --- workflows/kf_STAR_Solo_10x_alignment_wf.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/kf_STAR_Solo_10x_alignment_wf.cwl b/workflows/kf_STAR_Solo_10x_alignment_wf.cwl index 196632d..8e82694 100644 --- a/workflows/kf_STAR_Solo_10x_alignment_wf.cwl +++ b/workflows/kf_STAR_Solo_10x_alignment_wf.cwl @@ -198,10 +198,10 @@ inputs: via MLE-EM model was previously used to quantify transposable elements in bulk RNA-seq {TEtranscripts} and in scRNA-seq {Alevin; Kallisto-bustools}. 
Rescue: distributes multi-gene UMIs to their gene set proportionally to the sum of the number of unique-gene UMIs and uniformly distributed multi-gene UMIs in each gene Mortazavi et al. It can be thought of as the first step of the EM - algorithm"} + algorithm", default: ["EM"]} raw_count_choice: {type: ['null', {type: enum, name: raw_count_choice, symbols: ["Unique", "Uniform", "PropUnique", "EM", "Rescue"]}], doc: "Based on `soloMultiMappers`, if you wish to include/handle multi-gene hits in downstream anaylsis instead of default (ignore - multi-gene mappers), pick the method you want to use", default: "Unique"} + multi-gene mappers), pick the method you want to use", default: "EM"} outSAMtype: {type: ['null', {type: enum, name: outSAMtype, symbols: ["BAM Unsorted", "None", "BAM SortedByCoordinate", "SAM Unsorted", "SAM SortedByCoordinate"]}], default: "None", doc: "type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)"} From 1b3843836f36106e79693a4855032176953c3ccb Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Mon, 16 Sep 2024 15:01:21 -0400 Subject: [PATCH 3/3] :broom: rm salmon wf --- salmon-rnaseq | 1 - 1 file changed, 1 deletion(-) delete mode 160000 salmon-rnaseq diff --git a/salmon-rnaseq b/salmon-rnaseq deleted file mode 160000 index 5620223..0000000 --- a/salmon-rnaseq +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5620223917df37573a0552f55b06c00357f5d2ad