Merge pull request #102 from AlexsLemonade/jashapiro/all-metadata

jashapiro · web-flow · commit c04c48ad05f8 · 2024-11-15T10:04:39.000-05:00
Update metadata to include project-specific fields
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ You may want to add temporary notes here for tracking as features are added, bef
 - Update simulations to match current (v0.8.5) `scpca-nf` output
   - Change reduced dimension names in AnnData output (to `X_pca` and `X_umap`) and update formatting to match scpca-nf v0.8.5
   - Use new age columns
+  - Metadata for simulated data now includes project-specific fields
 - Centralized docker image definitions in `config/containers.config`
 - Added initial documentation about porting modules
 
diff --git a/modules/simulate-sce/main.nf b/modules/simulate-sce/main.nf
@@ -116,7 +116,7 @@ workflow simulate_sce {
     permuted_metadata_ch = permute_metadata(metadata_ch)
 
     // get bulk files for each project, if present: [project_id, bulk_quant_file, bulk_metadata_file]
-    bulk_ch = project_ch.map{[it[0], it[1] / 'bulk_quant.tsv', it[1] / 'bulk_metadata.tsv']}
+    bulk_ch = project_ch.map{[it[0], it[1] / "${it[0]}_bulk_quant.tsv", it[1] / "${it[0]}_bulk_metadata.tsv"]}
       .filter{it[1].exists()}
     permute_bulk(bulk_ch)
 
diff --git a/modules/simulate-sce/readme.md b/modules/simulate-sce/readme.md
@@ -4,4 +4,4 @@ This workflow is designed to simulate single cell data, primarily using the [spl
 
 Scripts are derived from the `simulate-sce` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
 
-Permalink to the version used: https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/c903e51fe18f0e048ced9a4978bcf056f3f78999/analyses/simulate-sce
+Permalink to the version used: https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/0a3d96089991dea692a8485e3126ed6d69958028/analyses/simulate-sce
diff --git a/modules/simulate-sce/resources/usr/bin/permute-metadata.R b/modules/simulate-sce/resources/usr/bin/permute-metadata.R
@@ -46,9 +46,14 @@ library_fields <- c(
   "seq_unit",
   "technology",
   "filtered_cell_count",
+  "filtered_spots",
+  "unfiltered_spots",
+  "tissue_spots",
   "submitter",
   "pi_name",
-  "project_title"
+  "project_title",
+  "demux_samples",
+  "demux_cell_count_estimate"
 )
 
 # fields that apply at sample level
@@ -81,7 +86,9 @@ processing_fields <- c(
   "genome_assembly",
   "has_cellhash",
   "includes_anndata",
+  "is_cell_line",
   "is_multiplexed",
+  "is_xenograft",
   "has_citeseq",
   "adt_filtering_method",
   "adt_normalization_method",
@@ -92,23 +99,27 @@ processing_fields <- c(
   "prob_compromised_cutoff",
   "processed_cells",
   "salmon_version",
+  "spaceranger_version",
   "total_reads",
   "transcript_type",
   "unfiltered_cells",
+  "demux_method",
   "workflow",
   "workflow_commit",
   "workflow_version"
 )
 
-# Remove project-specific columns
-match_cols <- sort(match(c(library_fields, sample_fields, processing_fields), colnames(metadata)))
-metadata <- metadata[, match_cols]
+# get project-specific columns, which should also be sample-specific
+project_fields <- setdiff(colnames(metadata), c(library_fields, sample_fields, processing_fields))
 
 # get sample metadata only & reduce to one line per sample
-sample_metadata <- metadata[, sample_fields] |> dplyr::distinct()
+sample_metadata <- metadata[, c(sample_fields, project_fields)] |> dplyr::distinct()
 
 # check that sample data are not repeated
-stopifnot(length(unique(sample_metadata$scpca_sample_id)) == nrow(sample_metadata))
+stopifnot(
+  "Sample data seem to be repeated, metadata permutation failed" =
+    length(unique(sample_metadata$scpca_sample_id)) == nrow(sample_metadata)
+)
 
 # permute sample metadata -------------------------------------------------------------
 diagnosis_order <- sample(seq(1, nrow(sample_metadata)), nrow(sample_metadata))
@@ -130,6 +141,11 @@ sample_metadata <- sample_metadata |>
     submitter_id = "" # remove submitter_id,
   )
 
+# permute project-specific columns
+for (f in project_fields) {
+  sample_metadata[[f]] <- sample(sample_metadata[[f]])
+}
+
 metadata <- metadata |>
   dplyr::rows_update(sample_metadata, by = "scpca_sample_id")
 
diff --git a/modules/simulate-sce/resources/usr/bin/reformat_anndata.py b/modules/simulate-sce/resources/usr/bin/reformat_anndata.py
@@ -65,7 +65,7 @@ def reformat_anndata(anndata_file, pca_metafile):
     parser.add_argument(
         "-d",
         "--dir",
-        help="directory containing H5AD files and PCA metadaa",
+        help="directory containing H5AD files and PCA metadata",
         required=True,
     )
 

Original file line number	Diff line number	Diff line change
`@@ -4,4 +4,4 @@ This workflow is designed to simulate single cell data, primarily using the [spl`
`4`	`4`
`5`	`5`	Scripts are derived from the `simulate-sce` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
`6`	`6`
`7`		`-Permalink to the version used: https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/c903e51fe18f0e048ced9a4978bcf056f3f78999/analyses/simulate-sce`
	`7`	`+Permalink to the version used: https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/0a3d96089991dea692a8485e3126ed6d69958028/analyses/simulate-sce`
Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@ def reformat_anndata(anndata_file, pca_metafile):`
`65`	`65`	`parser.add_argument(`
`66`	`66`	`"-d",`
`67`	`67`	`"--dir",`
`68`		`- help="directory containing H5AD files and PCA metadaa",`
	`68`	`+ help="directory containing H5AD files and PCA metadata",`
`69`	`69`	`required=True,`
`70`	`70`	`)`
`71`	`71`