docs: change style and update MC2 reference

GfellerLab · Nov 20, 2023 · 76673fe · 76673fe
1 parent 8733d66
commit 76673fe
Show file tree

Hide file tree

Showing 5 changed files with 246 additions and 243 deletions.
diff --git a/01-Requirements.Rmd b/01-Requirements.Rmd
@@ -0,0 +1,214 @@
+# Requirements 
+
+```{r include=FALSE}
+TO_CACHE = FALSE
+```
+
+This chapter describes how to obtain the packages and data needed to reproduce the analyses performed in this tutorial.
+
+## Installations {#installations}
+
+### Using conda (recommended)
+To build a conda environment containing the three metacell building tools used in this tutorial (SuperCell, MC2 and SEACells), 
+please follow the instructions provided in the README of our MetacellAnalysisToolkit [github repository](https://github.com/GfellerLab/MetacellToolkit).
+
+
+```{r, eval = FALSE}
+library(reticulate)
+# Find the python binary of the "MetacellAnalysisToolkit" conda environment
+known_envs <- conda_list()
+is_toolkit_env <- known_envs$name == "MetacellAnalysisToolkit"
+conda_env <- known_envs[is_toolkit_env, "python"]
+
+# Point reticulate at that environment
+use_condaenv(conda_env)
+```
+
+### Without conda
+If you don't have conda, you can use the following instructions:
+
+Set up a python virtual environment with MC2 and SEACells installed:
+
+```{bash, eval = FALSE}
+pip install virtualenv
+virtualenv my_env
+source my_env/bin/activate
+
+# Installing SEACells
+git clone https://github.com/dpeerlab/SEACells.git
+cd SEACells
+python setup.py install
+cd ..
+pip install -r SEACells_requirements.txt
+pip install ipywidgets
+pip install jupyter
+
+# Install MC2
+pip install git+https://github.com/tanaylab/metacells
+```
+
+In R, install the SuperCell package:
+```{r, eval = FALSE, echo = TRUE}
+# Install the SuperCell package from the GfellerLab/SuperCell GitHub repository
+remotes::install_github("GfellerLab/SuperCell", force = TRUE, upgrade = FALSE)
+```
+
+To run Python functions in R, install reticulate:
+```{r, eval = FALSE, echo = TRUE}
+# reticulate provides the R interface used to call python from R
+install.packages("reticulate")
+```
+
+To use the Python libraries installed in the virtual environment, define the RETICULATE_PYTHON variable as follows:
+```{bash, eval = FALSE, echo = TRUE}
+echo 'RETICULATE_PYTHON=my_env/bin/python' > '.Renviron'
+```
+
+## Retrieve a discrete dataset (PBMCs dataset) {#PBMC-data}
+
+To test metacell construction on a discrete dataset, we retrieved the 3k PBMCs from scanpy datasets as follows:
+```{python, eval = T, collapse = T, cache = TO_CACHE}
+import scanpy as sc 
+import os
+
+adata = sc.datasets.pbmc3k()
+adata_proc = sc.datasets.pbmc3k_processed()
+
+adata       = adata[adata_proc.obs_names].copy()
+adata.obs   = adata_proc.obs.copy()
+adata.uns   = adata_proc.uns.copy()
+adata.obsm  = adata_proc.obsm.copy()
+adata.obsp  = adata_proc.obsp.copy()
+
+adata.X = adata.X.astype("float32")
+raw_ad = sc.AnnData(adata.X.copy())
+raw_ad.obs_names, raw_ad.var_names = adata.obs_names, adata.var_names
+adata.raw = raw_ad
+
+```
+
+The data are saved in the following file for future analyses in python (use of SEACells and MC2): "data/3k_pbmc/singlecell_anndata_filtered.h5ad".
+
+```{python, eval = T, collapse = T, cache = TO_CACHE}
+directory = os.path.join("data", "3k_pbmc")
+
+if not os.path.exists(directory):
+    os.makedirs(directory)
+  
+adata.write_h5ad(os.path.join("data", "3k_pbmc", "singlecell_anndata_filtered.h5ad"))
+```
+
+The data are saved in the following file for future analyses in R (use of SuperCell): "data/3k_pbmc/singlecell_seurat_filtered.rds".
+
+```{r, eval = T, collapse = T, cache = TO_CACHE}
+library(reticulate)
+library(Seurat)
+library(anndata)
+
+# Read the filtered AnnData file
+adata <- anndata::read_h5ad("data/3k_pbmc/singlecell_anndata_filtered.h5ad")
+
+# Transpose adata$raw$X and label it with var names (rows) and obs names (columns)
+raw_counts <- Matrix::t(adata$raw$X)
+dimnames(raw_counts) <- list(rownames(adata$var), rownames(adata$obs))
+
+# Build the Seurat object with the obs table as metadata and save it
+pbmc <- CreateSeuratObject(counts = raw_counts, meta.data = adata$obs)
+saveRDS(pbmc, file = "data/3k_pbmc/singlecell_seurat_filtered.rds")
+```
+
+
+## Retrieve a continuous dataset (CD34 dataset) {#CD34-data}
+
+To test metacell construction on a continuous dataset, we retrieved the CD34 dataset provided by Persad et al.[@SEACells]:
+```{bash, eval = FALSE, cache = TO_CACHE}
+# -p creates missing parent directories and does not fail if data/CD34 already exists
+mkdir -p data/CD34
+wget -O data/CD34/cd34_multiome_rna.h5ad 'https://zenodo.org/record/6383269/files/cd34_multiome_rna.h5ad?download=1'
+```
+
+```{python, eval = T, collapse = T, cache = TO_CACHE}
+import scanpy as sc 
+import os
+
+adata = sc.read(os.path.join("data", "CD34", "cd34_multiome_rna.h5ad"))
+adata.X.sort_indices()
+raw_ad = sc.AnnData(adata.X.copy())
+raw_ad.obs_names, raw_ad.var_names = adata.obs_names, adata.var_names
+adata.raw = raw_ad
+
+sc.pl.embedding(adata, 'X_umap', color='celltype')
+```
+
+The data are saved in the following file for future analyses in python (use of SEACells and MC2): "data/CD34/singlecell_anndata_filtered.h5ad".
+```{python, eval = T, collapse = T, cache = TO_CACHE}
+directory = os.path.join("data", "cd34_multiome")
+
+if not os.path.exists(directory):
+    os.makedirs(directory)
+    
+adata.write_h5ad(os.path.join("data", "CD34", "singlecell_anndata_filtered.h5ad"))
+```
+
+The data are saved in the following file for future analyses in R (use of SuperCell): "data/CD34/singlecell_seurat_filtered.rds".
+
+```{r, eval = T, collapse = T, cache = TO_CACHE}
+library(reticulate)
+library(Seurat)
+library(anndata)
+
+# Read the filtered AnnData file
+adata <- anndata::read_h5ad("data/CD34/singlecell_anndata_filtered.h5ad")
+
+# Transpose adata$raw$X and label it with var names (rows) and obs names (columns)
+raw_counts <- Matrix::t(adata$raw$X)
+dimnames(raw_counts) <- list(rownames(adata$var), rownames(adata$obs))
+
+# Build the Seurat object with the obs table as metadata and save it
+cd34 <- CreateSeuratObject(counts = raw_counts, meta.data = adata$obs)
+saveRDS(cd34, file = "data/CD34/singlecell_seurat_filtered.rds")
+```
+
+## Retrieve the lung atlas dataset {#HLCA-data}
+
+This dataset will be used for the integration of a large number of single-cell datasets at the level of metacells (see section \@ref(integration)).
+Given the large size of the data to download, you can skip this part of the tutorial if you do not plan to run the integration analysis.
+
+### Downloading the atlas
+
+To illustrate how metacells can be used in the context of single-cell data integration,
+we used a cell atlas of the human lung (core) available on [cellxgene](https://cellxgene.cziscience.com/collections/6f6d381a-7701-4781-935c-db10d30de293). 
+To download the data, please choose the `.h5ad` option after clicking on the download button for the core atlas (3 tissues, 584'944 cells).
+
+Save these data in the `data/HLCA/` directory. 
+
+Please note that this may take some time (\~45 mins) as the file is quite large (5.6 GB).
+
+###  Splitting atlas by datasets
+
+We will use anndata to read the whole atlas in backed mode (which saves a lot of memory) and write one h5ad file for each dataset. 
+This should take less than 10 minutes.
+
+If you are limited in time, feel free to process only a subset of the datasets.
+
+```{r , eval = FALSE, collapse = T, cache = TO_CACHE}
+t0.split <- Sys.time()
+
+library(anndata)
+# Backed mode avoids loading the whole atlas into memory
+adata <- read_h5ad("data/HLCA/local.h5ad", backed = "r")
+adata$var_names <- adata$var$feature_name # We will use gene short name for downstream analyses
+# Use the full column name (`$dat` only resolved through partial matching)
+datasets <- as.character(unique(adata$obs$dataset))
+
+# If you are limited in time you can process only half of the datasets (uncomment the following line)
+# datasets <- datasets[1:7]
+
+print(dim(adata))
+
+# Write one h5ad file per dataset; invisible() keeps the list of return values from being printed
+invisible(lapply(datasets, FUN = function(x) {
+  # showWarnings = FALSE: do not warn when the directory already exists (re-runs)
+  dir.create(paste0("data/HLCA/datasets/", x), recursive = TRUE, showWarnings = FALSE)
+  # Subset the atlas once and reuse the subset for X, var and obs
+  adata.subset <- adata[adata$obs$dataset == x]
+  adata.dataset <- AnnData(X = adata.subset$raw$X,
+                           var = adata.subset$var,
+                           obs = adata.subset$obs)
+  # This will allow us to construct supervised metacells for each cell type in each sample later in the tutorial
+  adata.dataset$obs$ann <- as.character(adata.dataset$obs$ann_level_3)
+  # For cells without an annotation at the 3rd level we will use the second level of annotation
+  no_level_3 <- adata.dataset$obs$ann_level_3 == "None"
+  adata.dataset$obs$ann[no_level_3] <- as.character(adata.dataset$obs$ann_level_2[no_level_3])
+  adata.dataset$obs$ann_sample <- paste0(adata.dataset$obs$ann, "_", adata.dataset$obs$sample)
+
+  write_h5ad(adata.dataset, paste0("data/HLCA/datasets/", x, "/sc_adata.h5ad"))
+}))
+
+# Drop the atlas object and trigger garbage collection
+remove(adata)
+gc()
+
+# Elapsed time for the split
+tf.split <- Sys.time()
+tf.split - t0.split
+```
diff --git a/_bookdown.yml b/_bookdown.yml
@@ -1,10 +1,8 @@
+book_filename: "MetacellAnalysisTutorial"
+new_session: true
+before_chapter_script: R/config.R
 delete_merged_file: true
 language:
   ui:
     chapter_name: "Chapter "
-new_session: yes
-before_chapter_script: './R/config.R'
-view: https://github.com/GfellerLab/Metacell_tutorial/blob/master/%s
-edit: https://github.com/GfellerLab/Metacell_tutorial/edit/master/%s
-output_dir: "docs"
-clean: ["my-book.bbl", "R-packages.bib"]
+output_dir: "docs"
diff --git a/_output.yml b/_output.yml
@@ -1,18 +1,15 @@
-bookdown::gitbook:
+bookdown::bs4_book:
   css: style.css
-  split_by: section
-  config:
-    toc:
-      before: |
-        <li><a href="./">Metacell Tutorial </a></li>
-      after: |
-        <li><a href="https://github.com/rstudio/bookdown" target="blank">Published with bookdown</a></li>
-    edit: https://github.com/GfellerLab/Metacell_tutorial/edit/master/%s
-    download: ["pdf", "epub"]
+  theme:
+    primary: "#096B72"
+  repo: 
+    base: https://github.com/GfellerLab/MetacellAnalysisTutorial
+    branch: main
 bookdown::pdf_book:
   includes:
     in_header: preamble.tex
   latex_engine: xelatex
   citation_package: natbib
   keep_tex: yes
 bookdown::epub_book: default
+
diff --git a/citations.bib b/citations.bib
@@ -9,19 +9,28 @@ @Book{xie2015
   url = {http://yihui.org/knitr/},
 }
 
+
+
 @article{MC2,
-	title = {A divide and conquer metacell algorithm for scalable {scRNA}-seq analysis},
-	url = {http://biorxiv.org/content/early/2021/08/08/2021.08.08.453314.abstract},
-	doi = {10.1101/2021.08.08.453314},
-	abstract = {Scaling scRNA-seq to profile millions of cells is increasingly feasible. Such data is crucial for the construction of high-resolution maps of transcriptional manifolds. But current analysis strategies, in particular dimensionality reduction and two-phase clustering, offers only limited scaling and sensitivity to define such manifolds. Here we introduce Metacell-2, a recursive divide and conquer algorithm allowing efficient decomposition of scRNA-seq datasets of any size into small and cohesive groups of cells denoted as metacells. We show the algorithm outperforms current solutions in time, memory and quality. Importantly, Metacell-2 also improves outlier cell detection and rare cell type identification, as we exemplify by analysis of human bone marrow cell atlas and mouse embryonic data. Metacell-2 is implemented over the scanpy framework for easy integration in any analysis pipeline.},
-	journal = {bioRxiv},
+	title = {Metacell-2: a divide-and-conquer metacell algorithm for scalable {scRNA}-seq analysis},
+	volume = {23},
+	issn = {1474-760X},
+	shorttitle = {Metacell-2},
+	url = {https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02667-1},
+	doi = {10.1186/s13059-022-02667-1},
+	language = {en},
+	number = {1},
+	urldate = {2023-04-20},
+	journal = {Genome Biology},
 	author = {Ben-Kiki, Oren and Bercovich, Akhiad and Lifshitz, Aviezer and Tanay, Amos},
-	month = aug,
-	year = {2021},
-	pages = {2021.08.08.453314},
+	month = dec,
+	year = {2022},
+	pages = {100},
+	file = {Full Text:/Users/mariiabilous/Zotero/storage/K7BFMH4G/Ben-Kiki et al. - 2022 - Metacell-2 a divide-and-conquer metacell algorith.pdf:application/pdf;MC2_review_comments_13059_2022_2667_MOESM2_ESM.docx:/Users/mariiabilous/Documents/PhD/UNIL/papers/MC2_review_comments_13059_2022_2667_MOESM2_ESM.docx:application/vnd.openxmlformats-officedocument.wordprocessingml.document},
 }
 
 
+
 @article{baran_metacell_2019,
 	title = {{MetaCell}: {Analysis} of single-cell {RNA}-seq data using {K}-nn graph partitions},
 	issn = {1474760X},