From dbb5c53eff10d2ad70d90be4de2034015dd14e9a Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Mon, 16 Sep 2024 13:37:27 -0400
Subject: [PATCH 1/3] :pencil: updated docs with missing params :hammer: added
 extra args params to catch most STAR possible inputs

---
 docs/10X_STAR_Solo_alignment.md             | 17 +++++++++++++++-
 tools/star_solo_2.7.10b.cwl                 |  1 +
 workflows/kf_STAR_Solo_10x_alignment_wf.cwl | 22 +++++++++++++++++++--
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/docs/10X_STAR_Solo_alignment.md b/docs/10X_STAR_Solo_alignment.md
index 6e8c53c..4b6f250 100644
--- a/docs/10X_STAR_Solo_alignment.md
+++ b/docs/10X_STAR_Solo_alignment.md
@@ -32,6 +32,20 @@ Output QC is based on [this tutorial](https://github.com/hbctraining/scRNA-seq_o
    - Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)
    - default: "CB_UMI_Simple"
  - `soloCBwhitelist`: file with whitelist of cell barcodes
+ - `soloMultiMappers`: One or more of the following values: 'Unique', 'Uniform', 'PropUnique', 'EM', 'Rescue'. Including
+      multi-gene reads allows for more accurate gene quantification and, more importantly, enables detection of gene expression from
+      certain classes of genes that are supported only by multi-gene reads, such as overlapping genes and highly similar paralog families.
+      Unique: software default, counts only reads that map to unique genes. Uniform: uniformly distributes the multi-gene UMIs to all
+      genes in its gene set. Each gene gets a fractional count of 1/N_genes, where N_genes is the number of genes in the set. This
+      is the simplest possible option, and it offers higher sensitivity for gene detection at the expense of lower precision. PropUnique:
+      distributes the multi-gene UMIs proportionally to the number of unique UMIs per gene. UMIs that map to genes that are not supported
+      by unique UMIs are distributed uniformly. EM: uses Maximum Likelihood Estimation (MLE) to distribute multi-gene UMIs among their
+      genes, taking into account other UMIs (both unique- and multi-gene) from the same cell (i.e. with the same CB). Expectation-Maximization
+      (EM) algorithm is used to find the gene expression values that maximize the likelihood function. Recovering multi-gene reads
+      via MLE-EM model was previously used to quantify transposable elements in bulk RNA-seq {TEtranscripts} and in scRNA-seq {Alevin;
+      Kallisto-bustools}. Rescue: distributes multi-gene UMIs to their gene set proportionally to the sum of the number of unique-gene
+      UMIs and uniformly distributed multi-gene UMIs in each gene (Mortazavi et al.). It can be thought of as the first step of the EM
+      algorithm.
  - `soloUMIlen`: UMI length, default: 12
  - `clipAdapterType`: adapter clipping type.
    - Hamming: adapter clipping based on Hamming distance, with the number of mismatches controlled by -clip5pAdapterMMp
@@ -74,8 +88,9 @@ Output QC is based on [this tutorial](https://github.com/hbctraining/scRNA-seq_o
    - CellRanger2.2: simple filtering of CellRanger 2.2. Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count. The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10
    - EmptyDrops_CR: EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y, Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN. The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000
    - default: "EmptyDrops_CR"
-  outSAMtype: type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)
+ - `outSAMtype`: type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)
    - default: "None"
+ - `star_extra_args`: Any additional arguments for this tool. See STAR Documentation for complete list of options. Example input: `--limitSjdbInsertNsj 1000001`
 ### seurat Harvard Bioinformatics Core (HBC) qc
  - `qc_min_umi`: minimum number of umi for cell-level filtering
  - `qc_min_genes`: minimum number of genes for cell-level filtering
diff --git a/tools/star_solo_2.7.10b.cwl b/tools/star_solo_2.7.10b.cwl
index 72a5a05..02297c2 100644
--- a/tools/star_solo_2.7.10b.cwl
+++ b/tools/star_solo_2.7.10b.cwl
@@ -106,6 +106,7 @@ inputs:
     EM: uses Maximum Likelihood Estimation (MLE) to distribute multi-gene UMIs among their genes, taking into account other UMIs (both unique- and multi-gene) from the same cell (i.e. with the same CB). Expectation-Maximization (EM) algorithm is used to find the gene expression values that maximize the likelihood function. Recovering multi-gene reads via MLE-EM model was previously used to quantify transposable elements in bulk RNA-seq {TEtranscripts} and in scRNA-seq {Alevin; Kallisto-bustools}. \
     Rescue: distributes multi-gene UMIs to their gene set proportionally to the sum of the number of unique-gene UMIs and uniformly distributed multi-gene UMIs in each gene Mortazavi et al. It can be thought of as the first step of the EM algorithm",
     inputBinding: { position: 5, prefix: "--soloMultiMappers" } }
+  extra_args: { type: 'string?', inputBinding: { position: 5, shellQuote: false }, doc: "Any additional arguments for this tool. See STAR Documentation for complete list of options. Example input: --limitSjdbInsertNsj 1000001" }
   outSAMtype: { type: [ 'null', {type: enum, name: outSAMtype, symbols: ["BAM Unsorted", "None", "BAM SortedByCoordinate", "SAM Unsorted", "SAM SortedByCoordinate"]}],
     default: "None",
     doc: "type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)",
diff --git a/workflows/kf_STAR_Solo_10x_alignment_wf.cwl b/workflows/kf_STAR_Solo_10x_alignment_wf.cwl
index 69df3b3..196632d 100644
--- a/workflows/kf_STAR_Solo_10x_alignment_wf.cwl
+++ b/workflows/kf_STAR_Solo_10x_alignment_wf.cwl
@@ -37,6 +37,20 @@ doc: |
      - Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)
      - default: "CB_UMI_Simple"
    - `soloCBwhitelist`: file with whitelist of cell barcodes
+   - `soloMultiMappers`: One or more of the following values: 'Unique', 'Uniform', 'PropUnique', 'EM', 'Rescue'. Including
+        multi-gene reads allows for more accurate gene quantification and, more importantly, enables detection of gene expression from
+        certain classes of genes that are supported only by multi-gene reads, such as overlapping genes and highly similar paralog families.
+        Unique: software default, counts only reads that map to unique genes. Uniform: uniformly distributes the multi-gene UMIs to all
+        genes in its gene set. Each gene gets a fractional count of 1/N_genes, where N_genes is the number of genes in the set. This
+        is the simplest possible option, and it offers higher sensitivity for gene detection at the expense of lower precision. PropUnique:
+        distributes the multi-gene UMIs proportionally to the number of unique UMIs per gene. UMIs that map to genes that are not supported
+        by unique UMIs are distributed uniformly. EM: uses Maximum Likelihood Estimation (MLE) to distribute multi-gene UMIs among their
+        genes, taking into account other UMIs (both unique- and multi-gene) from the same cell (i.e. with the same CB). Expectation-Maximization
+        (EM) algorithm is used to find the gene expression values that maximize the likelihood function. Recovering multi-gene reads
+        via MLE-EM model was previously used to quantify transposable elements in bulk RNA-seq {TEtranscripts} and in scRNA-seq {Alevin;
+        Kallisto-bustools}. Rescue: distributes multi-gene UMIs to their gene set proportionally to the sum of the number of unique-gene
+        UMIs and uniformly distributed multi-gene UMIs in each gene (Mortazavi et al.). It can be thought of as the first step of the EM
+        algorithm.
    - `soloUMIlen`: UMI length, default: 12
    - `clipAdapterType`: adapter clipping type.
      - Hamming: adapter clipping based on Hamming distance, with the number of mismatches controlled by -clip5pAdapterMMp
@@ -79,8 +93,9 @@ doc: |
      - CellRanger2.2: simple filtering of CellRanger 2.2. Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count. The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10
      - EmptyDrops_CR: EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y, Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN. The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000
      - default: "EmptyDrops_CR"
-    outSAMtype: type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)
+   - `outSAMtype`: type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)
      - default: "None"
+   - `star_extra_args`: Any additional arguments for this tool. See STAR Documentation for complete list of options. Example input: `--limitSjdbInsertNsj 1000001`
   ### seurat Harvard Bioinformatics Core (HBC) qc
    - `qc_min_umi`: minimum number of umi for cell-level filtering
    - `qc_min_genes`: minimum number of genes for cell-level filtering
@@ -190,13 +205,15 @@ inputs:
   outSAMtype: {type: ['null', {type: enum, name: outSAMtype, symbols: ["BAM Unsorted", "None", "BAM SortedByCoordinate", "SAM Unsorted",
           "SAM SortedByCoordinate"]}], default: "None", doc: "type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word
       is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)"}
+  star_extra_args: {type: 'string?', doc: "Any additional arguments for this tool. See STAR Documentation for complete list of options.
+      Example input: --limitSjdbInsertNsj 1000001"}
   # Seurat HBC QC
   qc_min_umi: {type: 'int?', doc: "minimum number of umi for cell-level filtering", default: 500}
   qc_min_genes: {type: 'int?', doc: "minimum number of genes for cell-level filtering", default: 250}
   qc_min_complexity: {type: 'float?', doc: "minimum novelty score (log10GenesPerUMI)", default: 0.8}
   qc_max_mito_ratio: {type: 'float?', doc: "maximum ratio mitochondrial reads per cell", default: 0.2}
   qc_min_gene_prevalence: {type: 'int?', doc: "Minimum number of cells a gene must be expressed in to keep after filtering", default: 10}
-  qc_memory: { type: 'int?', doc: "Memory in GB that ought to be available to the script", default: 16 }
+  qc_memory: {type: 'int?', doc: "Memory in GB that ought to be available to the script", default: 16}
 
 outputs:
   star_solo_counts_dir: {type: File, outputSource: tar_solo_count_outdir/output, doc: "Tar gzipped counts output from STAR Solo"}
@@ -240,6 +257,7 @@ steps:
       soloCellFilter: soloCellFilter
       soloMultiMappers: soloMultiMappers
       outSAMtype: outSAMtype
+      extra_args: star_extra_args
     out: [log_progress_out, log_out, log_final_out, genomic_bam_out, junctions_out, counts_dir]
   create_h5_output:
     run: ../tools/convert_to_h5.cwl

From 4893c92fb67bf21eeea033f4e808874f53d35866 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Mon, 16 Sep 2024 14:51:25 -0400
Subject: [PATCH 2/3] :hammer: made EM default

---
 workflows/kf_STAR_Solo_10x_alignment_wf.cwl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/kf_STAR_Solo_10x_alignment_wf.cwl b/workflows/kf_STAR_Solo_10x_alignment_wf.cwl
index 196632d..8e82694 100644
--- a/workflows/kf_STAR_Solo_10x_alignment_wf.cwl
+++ b/workflows/kf_STAR_Solo_10x_alignment_wf.cwl
@@ -198,10 +198,10 @@ inputs:
       via MLE-EM model was previously used to quantify transposable elements in bulk RNA-seq {TEtranscripts} and in scRNA-seq {Alevin;
       Kallisto-bustools}. Rescue: distributes multi-gene UMIs to their gene set proportionally to the sum of the number of unique-gene
       UMIs and uniformly distributed multi-gene UMIs in each gene Mortazavi et al. It can be thought of as the first step of the EM
-      algorithm"}
+      algorithm", default: ["EM"]}
   raw_count_choice: {type: ['null', {type: enum, name: raw_count_choice, symbols: ["Unique", "Uniform", "PropUnique", "EM", "Rescue"]}],
     doc: "Based on `soloMultiMappers`, if you wish to include/handle multi-gene hits in downstream anaylsis instead of default (ignore
-      multi-gene mappers), pick the method you want to use", default: "Unique"}
+      multi-gene mappers), pick the method you want to use", default: "EM"}
   outSAMtype: {type: ['null', {type: enum, name: outSAMtype, symbols: ["BAM Unsorted", "None", "BAM SortedByCoordinate", "SAM Unsorted",
           "SAM SortedByCoordinate"]}], default: "None", doc: "type of SAM/BAM output. None: no SAM/BAM output. Otherwise, first word
       is output type (BAM or SAM), second is sort type (Unsorted or SortedByCoordinate)"}

From 1b3843836f36106e79693a4855032176953c3ccb Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Mon, 16 Sep 2024 15:01:21 -0400
Subject: [PATCH 3/3] :broom: rm salmon wf

---
 salmon-rnaseq | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 salmon-rnaseq

diff --git a/salmon-rnaseq b/salmon-rnaseq
deleted file mode 160000
index 5620223..0000000
--- a/salmon-rnaseq
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 5620223917df37573a0552f55b06c00357f5d2ad