Skip to content

Commit

Permalink
sample right after download
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Sep 25, 2024
1 parent f9cb326 commit cebecbd
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 84 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ param_list:
dataset_description: See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset.
dataset_organism: mus_musculus
sample_n_obs: 500000
sample_obs_weight: subclass
sample_transform: log
sample_seed: 42
output_dataset: "\$id/dataset.h5ad"
output_meta: "\$id/dataset_meta.yaml"
output_state: "\$id/state.yaml"
Expand Down
10 changes: 5 additions & 5 deletions scripts/create_resources/process_vizgen_merscope.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ param_list:
- id: "vizgen_merscope/2022_vizgen_human_breast_cancer_merfish/rep1"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanBreastCancerPatient1"
dataset_name: "2022 Vizgen Human Breast Cancer MERFISH Patient1"
dataset_name: "Vizgen Human Breast Cancer MERFISH Patient1"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Breast Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
Expand All @@ -26,7 +26,7 @@ param_list:
- id: "vizgen_merscope/2022_vizgen_human_liver_cancer_merfish/rep1"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanLiverCancerPatient1"
dataset_name: "2022 Vizgen Human Liver Cancer MERFISH Patient1"
dataset_name: "Vizgen Human Liver Cancer MERFISH Patient1"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Liver Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
Expand All @@ -36,7 +36,7 @@ param_list:
- id: "vizgen_merscope/2022_vizgen_human_liver_cancer_merfish/rep2"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanLiverCancerPatient2"
dataset_name: "2022 Vizgen Human Liver Cancer MERFISH Patient2"
dataset_name: "Vizgen Human Liver Cancer MERFISH Patient2"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Liver Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
Expand All @@ -46,7 +46,7 @@ param_list:
- id: "vizgen_merscope/2022_vizgen_human_lung_cancer_merfish/rep1"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanLungCancerPatient1"
dataset_name: "2022 Vizgen Human Lung Cancer MERFISH Patient1"
dataset_name: "Vizgen Human Lung Cancer MERFISH Patient1"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Lung Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
Expand All @@ -56,7 +56,7 @@ param_list:
- id: "vizgen_merscope/2022_vizgen_human_lung_cancer_merfish/rep2"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanLungCancerPatient2"
dataset_name: "2022 Vizgen Human Lung Cancer MERFISH Patient2"
dataset_name: "Vizgen Human Lung Cancer MERFISH Patient2"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Lung Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
Expand Down
22 changes: 17 additions & 5 deletions src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,28 @@ argument_groups:
- MB
- OLF
- TH
# Optional down-sampling of cells, applied to the cell metadata before the
# expression matrices are read.
- name: Sampling
  arguments:
    - type: integer
      name: --sample_n_obs
      required: false
      description: The number of cells to sample.
    - type: string
      name: --sample_obs_weight
      required: false
      description: The column to use for weighting the sampling of cells.
      choices:
        - donor_label
        - anatomical_division_label
        - class
        - subclass
    - type: string
      name: --sample_transform
      required: false
      description: The transformation to apply to the column used for weighting the sampling of cells.
      choices:
        - log
        - sqrt
    # Read by the script as par["sample_seed"] and forwarded by the workflow,
    # so it has to be declared here as well.
    - type: integer
      name: --sample_seed
      required: false
      description: A seed for the subsampling.
      example: 123
- name: Metadata
arguments:
- type: string
Expand Down
84 changes: 55 additions & 29 deletions src/datasets/loaders/allen_brain_cell_atlas/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,30 @@

from pathlib import Path
import pandas as pd
import numpy as np
from collections import defaultdict
import scipy as sp
import scanpy as sc
import anndata as ad
from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache

## VIASH START
# Debug defaults used when the script is run outside of Viash; the section
# between the VIASH START/END markers is replaced at build time.
par = {
    "abca_version": "20230630",
    # region codes must match the choices declared in config.vsh.yaml ("TH", not "TF")
    "regions": [
        "CTXsp",
        "HPF",
        "HY",
        "Isocortex-1",
        "Isocortex-2",
        "Isocortex-3",
        "Isocortex-4",
        "MB",
        "OLF",
        "TH",
    ],
    "sample_n_obs": 500000,
    "sample_obs_weight": "subclass",
    "sample_transform": "sqrt",
    "sample_seed": None,
    "output": "tmp_dataset.h5ad",
}
meta = {
    "temp_dir": "/tmp/allen_brain_cell_atlas",
}
## VIASH END

# Seed NumPy's global random state when a seed was supplied; this is intended
# to make the sampling of cells further down reproducible.
# Compare against None explicitly: a seed of 0 is valid but falsy.
if par["sample_seed"] is not None:
    np.random.seed(par["sample_seed"])

# helper variables
VERSION = par["abca_version"]
REGIONS = par["regions"]
Expand All @@ -35,8 +42,8 @@
metadata_files = [
"cell_metadata_with_cluster_annotation",
]
for file in metadata_files:
abc_cache.get_metadata_path(directory="WMB-10X", file_name=file)
for file_name in metadata_files:
abc_cache.get_metadata_path(directory="WMB-10X", file_name=file_name)

print("Reading obs", flush=True)
obs = pd.read_csv(
Expand All @@ -51,35 +58,54 @@
)
)

print("Downloading expression matrices", flush=True)
# From abc_cache.list_data_files("WMB-10Xv2") # TODO: potentially also load other chemistries (currently only 10Xv2)
for region in REGIONS:
print(f"Downloading h5ad file for region {region}", flush=True)
file = f"WMB-10Xv2-{region}/raw"
abc_cache.get_data_path(directory="WMB-10Xv2", file_name=file)
print("Filtering obs based on regions", flush=True)
obs = obs[obs["anatomical_division_label"].isin(REGIONS)]

# Optionally down-sample the cell metadata to at most `sample_n_obs` rows,
# either uniformly or weighted by the (transformed) size of each cell's group.
if par["sample_n_obs"]:
    print("Filtering obs based on n_obs", flush=True)
    col = par["sample_obs_weight"]

    # sample() draws without replacement, so never ask for more rows than
    # are left after the region filter
    n_sample = min(par["sample_n_obs"], len(obs))

    if col:
        # number of cells per group of the weighting column
        group_sizes = obs.groupby(col).size()

        transform = par["sample_transform"]
        if transform == "sqrt":
            group_weights = np.sqrt(group_sizes)
        elif transform == "log":
            # NOTE(review): log(1) == 0, so cells in groups of size 1 get a
            # zero weight and can never be drawn
            group_weights = np.log(group_sizes)
        else:
            group_weights = group_sizes

        # NOTE(review): every cell receives the transformed size of its own
        # group as weight, so cells from larger groups are individually more
        # likely to be drawn -- confirm this is the intended balancing
        cell_weights = obs[col].map(group_weights)
        obs = obs.sample(n=n_sample, weights=cell_weights)
    else:
        obs = obs.sample(n=n_sample)


# From abc_cache.list_data_files("WMB-10Xv2")
# TODO: potentially also load other chemistries (currently only 10Xv2)

print("Downloading and reading expression matrices", flush=True)
adatas = []
for region in REGIONS:
print(f"Reading h5ad for region {region}", flush=True)
adata = ad.read_h5ad(
TMP_DIR / f"expression_matrices/WMB-10Xv2/{VERSION}/WMB-10Xv2-{region}-raw.h5ad"
)
sc.pp.filter_cells(adata, min_genes=5)
sc.pp.filter_cells(adata, min_counts=50)

adata = adata[adata.obs_names.isin(obs.index)]
adata.obs["region"] = region
counts = adata.X
del adata.X

# make sure counts is sparse
if not isinstance(counts, sp.sparse.csr_matrix):
counts = sp.sparse.csr_matrix(counts)
adata.layers["counts"] = counts

# add anndata to list
adatas.append(adata)
try:
    print(f"Reading h5ad for region {region}", flush=True)

    # download the raw matrix for this region (or reuse the cached copy)
    file_name = f"WMB-10Xv2-{region}/raw"
    adata_path = abc_cache.get_data_path(directory="WMB-10Xv2", file_name=file_name)

    # open the file read-only in backed mode so the full matrix is not
    # loaded before the cells are filtered
    adata_backed = ad.read_h5ad(str(adata_path), backed="r")

    # keep only the cells that survived the obs filtering / sampling and
    # load just that subset into memory; the in-memory copy can then be
    # modified freely
    is_selected = adata_backed.obs_names.isin(obs.index)
    adata = adata_backed[is_selected].to_memory()
    adata_backed.file.close()

    adata.obs["region"] = region

    # move the raw counts from X into a layer
    counts = adata.X
    del adata.X

    # make sure counts is sparse
    if not isinstance(counts, sp.sparse.csr_matrix):
        counts = sp.sparse.csr_matrix(counts)
    adata.layers["counts"] = counts

    # add anndata to list
    adatas.append(adata)
except Exception as e:
    # best effort: report the failure and continue with the next region
    print(f"Error reading {region}: {e}", flush=True)

print("Concatenating data", flush=True)
adata = ad.concat(adatas, merge="first")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,36 +57,27 @@ argument_groups:
required: false
- name: Sampling options
arguments:
- name: "--do_subsample"
type: boolean
default: false
description: "Whether or not to subsample the dataset"
- name: "--n_obs"
type: integer
description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed.
default: 500
- name: "--n_vars"
type: integer
description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed.
default: 500
- name: "--keep_features"
type: string
multiple: true
description: A list of genes to keep.
- name: "--keep_cell_type_categories"
type: "string"
multiple: true
description: "Categories indexes to be selected"
- type: integer
name: --sample_n_obs
required: false
- name: "--keep_batch_categories"
type: "string"
multiple: true
description: "Categories indexes to be selected"
description: The number of cells to sample.
- type: string
name: --sample_obs_weight
required: false
description: The column to use for weighting the sampling of cells.
choices:
- donor_label
- anatomical_division_label
- class
- subclass
- type: string
name: --sample_transform
required: false
- name: "--even"
type: "boolean_true"
description: Subsample evenly from different batches
- name: "--seed"
description: The transformation to apply to the column used for weighting the sampling of cells.
choices:
- log
- sqrt
- name: "--sample_seed"
type: "integer"
description: "A seed for the subsampling."
example: 123
Expand Down
21 changes: 4 additions & 17 deletions src/datasets/workflows/process_allen_brain_cell_atlas/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,29 +31,16 @@ workflow run_wf {
"dataset_summary",
"dataset_description",
"dataset_organism",
"sample_n_obs",
"sample_obs_weight",
"sample_transform",
"sample_seed"
],
toState: [
"output_raw": "output"
]
)

// subsample if so desired
| subsample.run(
runIf: { id, state -> state.do_subsample },
fromState: [
"input": "output_raw",
"n_obs": "n_obs",
"n_vars": "n_vars",
"keep_features": "keep_features",
"keep_cell_type_categories": "keep_cell_type_categories",
"keep_batch_categories": "keep_batch_categories",
"even": "even",
"seed": "seed"
],
args: [output_mod2: null],
toState: ["output_raw": "output"]
)

| log_cp.run(
key: "log_cp10k",
fromState: [
Expand Down

0 comments on commit cebecbd

Please sign in to comment.