Skip to content

Commit c04c48a

Browse files
authored
Merge pull request #102 from AlexsLemonade/jashapiro/all-metadata
Update metadata to include project-specific fields
2 parents 462b6aa + cb5b181 commit c04c48a

File tree

5 files changed

+26
-9
lines changed

5 files changed

+26
-9
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ You may want to add temporary notes here for tracking as features are added, bef
1414
- Update simulations to match current (v0.8.5) `scpca-nf` output
1515
- Change reduced dimension names in AnnData output (to `X_pca` and `X_umap`) and update formatting to match scpca-nf v0.8.5
1616
- Use new age columns
17+
- Metadata for simulated data now includes project-specific fields
1718
- Centralized docker image definitions in `config/containers.config`
1819
- Added initial documentation about porting modules
1920

modules/simulate-sce/main.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ workflow simulate_sce {
116116
permuted_metadata_ch = permute_metadata(metadata_ch)
117117

118118
// get bulk files for each project, if present: [project_id, bulk_quant_file, bulk_metadata_file]
119-
bulk_ch = project_ch.map{[it[0], it[1] / 'bulk_quant.tsv', it[1] / 'bulk_metadata.tsv']}
119+
bulk_ch = project_ch.map{[it[0], it[1] / "${it[0]}_bulk_quant.tsv", it[1] / "${it[0]}_bulk_metadata.tsv"]}
120120
.filter{it[1].exists()}
121121
permute_bulk(bulk_ch)
122122

modules/simulate-sce/readme.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ This workflow is designed to simulate single cell data, primarily using the [spl
44

55
Scripts are derived from the `simulate-sce` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
66

7-
Permalink to the version used: https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/c903e51fe18f0e048ced9a4978bcf056f3f78999/analyses/simulate-sce
7+
Permalink to the version used: https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/0a3d96089991dea692a8485e3126ed6d69958028/analyses/simulate-sce

modules/simulate-sce/resources/usr/bin/permute-metadata.R

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,14 @@ library_fields <- c(
4646
"seq_unit",
4747
"technology",
4848
"filtered_cell_count",
49+
"filtered_spots",
50+
"unfiltered_spots",
51+
"tissue_spots",
4952
"submitter",
5053
"pi_name",
51-
"project_title"
54+
"project_title",
55+
"demux_samples",
56+
"demux_cell_count_estimate"
5257
)
5358

5459
# fields that apply at sample level
@@ -81,7 +86,9 @@ processing_fields <- c(
8186
"genome_assembly",
8287
"has_cellhash",
8388
"includes_anndata",
89+
"is_cell_line",
8490
"is_multiplexed",
91+
"is_xenograft",
8592
"has_citeseq",
8693
"adt_filtering_method",
8794
"adt_normalization_method",
@@ -92,23 +99,27 @@ processing_fields <- c(
9299
"prob_compromised_cutoff",
93100
"processed_cells",
94101
"salmon_version",
102+
"spaceranger_version",
95103
"total_reads",
96104
"transcript_type",
97105
"unfiltered_cells",
106+
"demux_method",
98107
"workflow",
99108
"workflow_commit",
100109
"workflow_version"
101110
)
102111

103-
# Remove project-specific columns
104-
match_cols <- sort(match(c(library_fields, sample_fields, processing_fields), colnames(metadata)))
105-
metadata <- metadata[, match_cols]
112+
# get project-specific columns, which should also be sample-specific
113+
project_fields <- setdiff(colnames(metadata), c(library_fields, sample_fields, processing_fields))
106114

107115
# get sample metadata only & reduce to one line per sample
108-
sample_metadata <- metadata[, sample_fields] |> dplyr::distinct()
116+
sample_metadata <- metadata[, c(sample_fields, project_fields)] |> dplyr::distinct()
109117

110118
# check that sample data are not repeated
111-
stopifnot(length(unique(sample_metadata$scpca_sample_id)) == nrow(sample_metadata))
119+
stopifnot(
120+
"Sample data seem to be repeated, metadata permutation failed" =
121+
length(unique(sample_metadata$scpca_sample_id)) == nrow(sample_metadata)
122+
)
112123

113124
# permute sample metadata -------------------------------------------------------------
114125
diagnosis_order <- sample(seq(1, nrow(sample_metadata)), nrow(sample_metadata))
@@ -130,6 +141,11 @@ sample_metadata <- sample_metadata |>
130141
submitter_id = "" # remove submitter_id,
131142
)
132143

144+
# permute project-specific columns
145+
for (f in project_fields) {
146+
sample_metadata[[f]] <- sample(sample_metadata[[f]])
147+
}
148+
133149
metadata <- metadata |>
134150
dplyr::rows_update(sample_metadata, by = "scpca_sample_id")
135151

modules/simulate-sce/resources/usr/bin/reformat_anndata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def reformat_anndata(anndata_file, pca_metafile):
6565
parser.add_argument(
6666
"-d",
6767
"--dir",
68-
help="directory containing H5AD files and PCA metadaa",
68+
help="directory containing H5AD files and PCA metadata",
6969
required=True,
7070
)
7171

0 commit comments

Comments
 (0)