Merge remote-tracking branch 'AlexsLemonade/main' into allyhawkins/me…

…rged-annotations-ewings
AlexsLemonade · Feb 5, 2025 · 1e4251e · 1e4251e
2 parents 3b620fb + 540bbc3
commit 1e4251e
Show file tree

Hide file tree

Showing 6 changed files with 87 additions and 8 deletions.
diff --git a/.github/workflows/run_cell-type-ewings.yml b/.github/workflows/run_cell-type-ewings.yml
@@ -42,6 +42,7 @@ jobs:
         run: |
           conda activate openscpca-cell-type-ewings
           ./download-data.py --test-data --format "sce,anndata" --project SCPCP000015
+          ./download-results.py --test-data --modules merge-sce --projects SCPCP000015
 
       - name: Run analysis
         run: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,20 @@ Add new release notes in reverse numerical order (newest first) below this comme
 You may want to add temporary notes here for tracking as features are added, before a new release is ready.
 -->
 
+## v0.2.1
+
+This release includes minor updates to the following modules: 
+
+- `cell-type-ewings`
+- `cell-type-consensus`
+
+In particular, the workflow for assigning consensus cell types has been created in `cell-type-consensus` module. 
+This workflow is now present in `OpenScPCA-nf`, and results will be made available to all contributors. 
+
+New FAQs covering how to work with ScPCA data as `Seurat` objects and using gene symbols instead of Ensembl IDs have been added to the [OpenScPCA documentation](https://openscpca.readthedocs.io/en/latest/troubleshooting-faq/faq/). 
+
+`Rcpp` package versions within modules were updated for compatibility. 
+
 ## v0.2.0
 
 This release adds the first set of community-contributed analyses to the repository.

diff --git a/analyses/cell-type-ewings/README.md b/analyses/cell-type-ewings/README.md
@@ -212,6 +212,7 @@ The full list of gene signatures used can be found in [the references `README.md
 
 `AUCell` is run for each gene signature, and AUC values along with the AUC threshold reported by `AUCell` are saved to a TSV file. 
 By default, `AUCell` is run with an `aucMaxRank` value equal to 1% of the detected genes in the processed object. 
+Results from `AUCell` will be generated for each library and for the merged object containing all samples in `SCPCP000015`. 
 
 ### Usage
 

diff --git a/analyses/cell-type-ewings/run-aucell-ews-signatures.sh b/analyses/cell-type-ewings/run-aucell-ews-signatures.sh
@@ -3,7 +3,8 @@
 # This script is used to get AUC values for a set of EWS-FLI1 high/low gene signatures using AUCell
 # The input gene signatures are those saved in `references/gene_signatures` and MSigDB gene sets noted in `references/msigdb-gene-sets.tsv`
 
-# For each library, a TSV is saved with the following columns: 
+# AUCell is run on each library and the merged object 
+# The output is a TSV for each library/object with the following columns: 
 # `gene_set`, `barcodes`, `auc`, and `auc_threshold`
 
 # Usage: ./run-aucell-ews-signatures.sh
@@ -28,9 +29,13 @@ max_rank_threshold=${max_rank_threshold:-0.01}
 
 # set up input and output file paths
 data_dir="../../data/current/SCPCP000015"
+merged_sce_file="../../data/current/results/merge-sce/SCPCP000015/SCPCP000015_merged.rds"
+
 results_dir="results/aucell-ews-signatures"
 mkdir -p ${results_dir}
 
+merged_results_file="${results_dir}/SCPCP000015_auc-ews-gene-signatures.tsv"
+
 # gene signature directory 
 gene_signatures_dir="${module_dir}/references/gene_signatures"
 msigdb_geneset_file="${module_dir}/references/msigdb-gene-sets.tsv"
@@ -64,3 +69,16 @@ for sample_id in $sample_ids; do
           --seed $seed
       done
 done
+
+
+# run AUCell on merged object 
+echo "Running AUCell for merged object" 
+Rscript scripts/aucell-ews-signatures/01-aucell.R \
+    --sce_file $merged_sce_file \
+    --custom_geneset_dir $gene_signatures_dir \
+    --msigdb_genesets $msigdb_geneset_file \
+    --max_rank_threshold $max_rank_threshold \
+    --is_merged \
+    --output_file $merged_results_file \
+    --threads $threads \
+    --seed $seed
diff --git a/analyses/cell-type-ewings/scripts/README.md b/analyses/cell-type-ewings/scripts/README.md
@@ -185,14 +185,23 @@ This can be changed using the `--max_rank_threshold` parameter.
 
 By default, this script uses 4 CPUs. 
 
-To run this script using the default parameters use the following command: 
+To run this script on a single library use the default parameters use the following command: 
 
 ```sh
 Rscript 01-aucell.R \
   --sce_file <path to processed SCE file> \
   --output_file <path to TSV file to save AUC results>
 ```
 
+To run this script with a merged object use the following command: 
+
+```sh
+Rscript 01-aucell.R \
+  --sce_file <path to processed SCE file> \
+  --output_file <path to TSV file to save AUC results> \
+  --is_merged
+```
+
 ## Scripts used in the CNV annotation workflow
 
 **NOTE:** This workflow is no longer used in the cell type annotation analysis, has been removed from CI, and is no longer maintained! 
@@ -301,7 +310,7 @@ This script also requires a gene order file (created by `00-make-gene-order-file
 To run this script use the following command:
 
 ```sh
-Rscript 04-run-infercnv.Rmd \
+Rscript 04-run-infercnv.R \
   --annotations_file <path to save annotations file> \
   --reference_cell_file <path to file with table of normal cell barcodes> \
   --output_dir <full path to folder to save results> \

diff --git a/analyses/cell-type-ewings/scripts/aucell-ews-signatures/01-aucell.R b/analyses/cell-type-ewings/scripts/aucell-ews-signatures/01-aucell.R
@@ -38,6 +38,12 @@ option_list <- list(
     help = "Percentage of genes detected to set as the `aucMaxRank`. 
       Must be a number between 0 and 1."
   ),
+  make_option(
+    opt_str = c("--is_merged"),
+    action = "store_true",
+    default = FALSE,
+    help = "Indicate whether or not the SCE object is a merged object."
+  ),
   make_option(
     opt_str = c("--output_file"),
     type = "character",
@@ -89,15 +95,26 @@ if (opt$threads > 1) {
 # check that max rank is number between 0-1
 
 # make sure directory exists for writing output
-output_dir <- dir(opt$output_file)
+output_dir <- dirname(opt$output_file)
 fs::dir_create(output_dir)
 
 # read in SCE
 sce <- readr::read_rds(opt$sce_file)
 
 # remove genes that are not detected from SCE object 
-genes_to_remove <- rowData(sce)$detected > 0 
-filtered_sce <- sce[genes_to_remove , ]
+if(!opt$is_merged){
+  genes_to_keep <- rowData(sce)$detected > 0 
+} else {
+
+  # if merged object then need to sum all columns to find genes not present in the object at all 
+  genes_to_keep <- rowData(sce) |> 
+    as.data.frame() |>
+    dplyr::select(ends_with("detected")) |> 
+    rowSums() > 0
+
+}
+
+filtered_sce <- sce[genes_to_keep , ] 
 
 # read in gene sets to use with msigdb
 genesets_df <- readr::read_tsv(opt$msigdb_genesets)
@@ -170,10 +187,29 @@ collection <- all_genes_list |>
 
 # Run AUCell -------------------------------------------------------------------
 
-# run AUCell
-counts_mtx <- counts(sce)
+# extract counts matrix
+counts_mtx <- counts(filtered_sce) |>
+  as("dgCMatrix") # account for merged objects not being sparse
+
+# check intersection with gene sets
+overlap_pct <- all_genes_list |> 
+  purrr::map_dbl(\(list){
+    num_genes <- length(list)
+    intersect(rownames(counts_mtx), list) |>
+      length()/num_genes 
+  })
+
+# if any gene sets don't have enough overlap (cutoff is 20%)
+# print a message and quit
+if(any(overlap_pct <= 0.20)){
+  message("Gene sets do not have at least 20% of genes present in SCE. 
+          AUCell will not be run.")
+  quit(save = "no")
+}
+
 max_rank <- ceiling(opt$max_rank_threshold*nrow(counts_mtx))
 
+# run aucell 
 auc_results <- AUCell::AUCell_run(
   counts_mtx, 
   collection,