From 435d5544371a83cc7c090f8e304131f0e673e931 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 6 Feb 2025 17:46:41 -0600 Subject: [PATCH] re-render --- .../02-explore-consensus-results.nb.html | 139 +++++++++++------- .../03-osteosarcoma-consensus-celltypes.Rmd | 9 +- ...3-osteosarcoma-consensus-celltypes.nb.html | 31 ++-- 3 files changed, 117 insertions(+), 62 deletions(-) diff --git a/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html index 597fb76d9..c0f913593 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html +++ b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html @@ -11,7 +11,7 @@ - + Explore consensus cell types @@ -1750,7 +1750,7 @@

Explore consensus cell types

Ally Hawkins

-

2025-02-05

+

2025-02-06

@@ -1777,8 +1777,7 @@

2025-02-05

notebook.

- - +
suppressPackageStartupMessages({
   # load required packages
   library(ggplot2)
@@ -1789,15 +1788,13 @@ 

2025-02-05

theme_classic() )
-

Data setup

- - +
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
 repository_base <- rprojroot::find_root(rprojroot::is_git_root)
 module_base <- file.path(repository_base, "analyses", "cell-type-consensus")
@@ -1808,13 +1805,11 @@ 

Data setup

# diagnoses table used for labeling plots diagnoses_file <- file.path(module_base, "sample-info", "project-diagnoses.tsv")
- - - +
# list all results files 
 results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE)
 
@@ -1827,24 +1822,20 @@ 

Data setup

project_ids <- setdiff(project_ids, cell_line_projects) # remove cell line projects results_files <- results_files[project_ids]
- - - +
# source summarize_celltypes() function
 setup_functions <- file.path(module_base, "exploratory-notebooks", "utils", "setup-functions.R")
 source(setup_functions)
- - - +
# read in diagnoses
 diagnoses_df <- readr::read_tsv(diagnoses_file)
 
@@ -1858,10 +1849,8 @@ 

Data setup

dplyr::mutate( # create a label for plotting project_label = glue::glue("{project_id}:{diagnosis}") - ) -
+ ) -
@@ -1872,8 +1861,7 @@

Is it all just Unknown?

SingleR and CellAssign was identified.

- - +
unknown_only <- all_results_df |> 
   dplyr::filter(consensus_annotation == "Unknown")
 
@@ -1884,10 +1872,11 @@ 

Is it all just Unknown?

labs( x = "", y = "Percent of cells annotated as Unknown" - ) -
+ ) - + +

+

It looks like we do have some samples that aren’t just all “Unknown”! @@ -1898,8 +1887,7 @@

Is it all just Unknown?

have cells called as “Unknown”.

- - +
high_tumor_df <- unknown_only |> 
   dplyr::mutate(no_cells_identified = percent_cells_annotation == 100) |> 
   dplyr::group_by(project_label) |> 
@@ -1911,21 +1899,24 @@ 

Is it all just Unknown?

# set order for plots dplyr::mutate(project_label = forcats::fct_reorder(project_label, total_libraries, .desc = TRUE))
-

Which projects have the highest proportion of samples with all “Unknown”?

- - +
# table with percentage of samples 
 high_tumor_df |> 
   dplyr::select(project_label, percentage_unknown) |> 
-  dplyr::arrange(desc(percentage_unknown))
-
+ dplyr::arrange(desc(percentage_unknown)) + +
+ +
@@ -1937,8 +1928,7 @@

Is it all just Unknown?

patient tissue counterparts.

- - +
# list of projects with pdx 
 pdx_projects <- c(
   "SCPCP000003",
@@ -1969,7 +1959,9 @@ 

Is it all just Unknown?

y = "Percent of cells annotated as Unknown" )
- + +

+

It looks like in SCPCP000003 and @@ -1992,8 +1984,7 @@

Number of cell types observed

for all samples. This does not include cells labeled as “Unknown”.

- - +
num_celltypes_df <- all_results_df |> 
   # add a new line for facet labels 
   dplyr::mutate(facet_label = glue::glue("{project_id}\n{diagnosis}")) |>
@@ -2011,7 +2002,9 @@ 

Number of cell types observed

) + theme_bw()
- + +

+ @@ -2025,8 +2018,7 @@

Distribution of consensus cell types

types.

- - +
plot_df <- all_results_df |> 
     dplyr::group_by(project_id) |> 
     dplyr::mutate(
@@ -2036,9 +2028,17 @@ 

Distribution of consensus cell types

forcats::fct_infreq() |> # make sure all remaining and unknown are last, use this to assign colors in specific order forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf) - ) - -# get all unique cell types ordered by frequency + )
+ + +
Warning: There was 1 warning in `dplyr::mutate()`.
+ℹ In argument: `top_celltypes = forcats::fct_relevel(...)`.
+ℹ In group 19: `project_id = "SCPCP000021"`.
+Caused by warning:
+! 1 unknown level in `f`: All remaining cell types
+ + +
# get all unique cell types ordered by frequency 
 unique_celltypes <- plot_df |> 
   dplyr::filter(!top_celltypes %in% c("All remaining cell types", "Unknown")) |> 
   dplyr::pull(top_celltypes) |> 
@@ -2055,13 +2055,11 @@ 

Distribution of consensus cell types

) names(colors) <- c(unique_celltypes, "All remaining cell types", "Unknown")
- - - +
project_labels <- unique(all_results_df$project_label)
 
 # stacked bar chart showing the distribution of the top 9 cell types for each project, including Unknown
@@ -2094,7 +2092,9 @@ 

Distribution of consensus cell types

}) |> patchwork::wrap_plots(ncol = 1)
- + +

+

This looks really promising! A few observations:

@@ -2131,8 +2131,7 @@

Most frequently observed cell types

of libraries the cell type is observed.

- - +
all_results_df |> 
   dplyr::filter(consensus_annotation != "Unknown") |> 
   dplyr::group_by(consensus_annotation) |> 
@@ -2143,9 +2142,14 @@ 

Most frequently observed cell types

median_percentage = median(percent_cells_annotation), max_percentage = max(percent_cells_annotation) ) |> - dplyr::arrange(desc(total_libraries)) -
+ dplyr::arrange(desc(total_libraries)) + +
+ +
@@ -2154,11 +2158,42 @@

Most frequently observed cell types

Session info

- - +
# record the versions of the packages used in this analysis and other environment information
 sessionInfo()
+ +
R version 4.4.2 (2024-10-31)
+Platform: aarch64-apple-darwin20
+Running under: macOS Sequoia 15.3
+
+Matrix products: default
+BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
+LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
+
+locale:
+[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+
+time zone: America/Chicago
+tzcode source: internal
+
+attached base packages:
+[1] stats     graphics  grDevices datasets  utils     methods   base     
+
+other attached packages:
+[1] ggplot2_3.5.1
+
+loaded via a namespace (and not attached):
+ [1] sass_0.4.9          utf8_1.2.4          generics_0.1.3      renv_1.0.11         stringi_1.8.4       hms_1.1.3          
+ [7] digest_0.6.37       magrittr_2.0.3      evaluate_1.0.1      grid_4.4.2          fastmap_1.2.0       rprojroot_2.0.4    
+[13] jsonlite_1.8.9      BiocManager_1.30.25 purrr_1.0.2         fansi_1.0.6         scales_1.3.0        tweenr_2.0.3       
+[19] jquerylib_0.1.4     cli_3.6.3           rlang_1.1.4         crayon_1.5.3        polyclip_1.10-7     bit64_4.5.2        
+[25] munsell_0.5.1       withr_3.0.2         cachem_1.1.0        yaml_2.3.10         tools_4.4.2         parallel_4.4.2     
+[31] tzdb_0.4.0          dplyr_1.1.4         colorspace_2.1-1    forcats_1.0.0       vctrs_0.6.5         R6_2.5.1           
+[37] lifecycle_1.0.4     stringr_1.5.1       bit_4.5.0.1         vroom_1.6.5         MASS_7.3-61         pkgconfig_2.0.3    
+[43] pillar_1.9.0        bslib_0.8.0         gtable_0.3.6        Rcpp_1.0.13-1       glue_1.8.0          ggforce_0.4.2      
+[49] xfun_0.49           tibble_3.2.1        tidyselect_1.2.1    knitr_1.49          farver_2.1.2        patchwork_1.3.0    
+[55] htmltools_0.5.8.1   rmarkdown_2.29      labeling_0.4.3      readr_2.1.5         compiler_4.4.2     
diff --git a/analyses/cell-type-consensus/exploratory-notebooks/03-osteosarcoma-consensus-celltypes.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/03-osteosarcoma-consensus-celltypes.Rmd index c7266d6c2..2c3728d2d 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/03-osteosarcoma-consensus-celltypes.Rmd +++ b/analyses/cell-type-consensus/exploratory-notebooks/03-osteosarcoma-consensus-celltypes.Rmd @@ -194,7 +194,7 @@ We have already looked at this in `02-explore-consensus-results.Rmd`, but here w all_results_df <- all_results_df |> dplyr::mutate( # get most frequently observed cell types across libraries in that project - top_celltypes = forcats::fct_lump_n(consensus_annotation, 10, other_level = "All remaining cell types", ties.method = "first") |> + top_celltypes = forcats::fct_lump_n(consensus_annotation, 15, other_level = "All remaining cell types", ties.method = "first") |> # sort by frequency forcats::fct_infreq() |> # make sure all remaining and unknown are last, use this to assign colors in specific order @@ -236,6 +236,13 @@ Generally, we see most annotated cells are smooth muscle cells and endothelial c There also appears to be some samples that have macrophages and/or T cell populations. We also see a handful of samples that don't have any cells that are annotated. +```{r} +stacked_barchart(total_order_df, fill_color = "top_celltypes", facet_variable = "project_id", colors = all_celltype_colors) +``` + +It looks like both total number of cells that are classified and composition of those cells is project dependent. +This makes sense since sample prep is probably different across labs. + ## Immune cell populations Let's look specifically at immune cell populations. diff --git a/analyses/cell-type-consensus/exploratory-notebooks/03-osteosarcoma-consensus-celltypes.nb.html b/analyses/cell-type-consensus/exploratory-notebooks/03-osteosarcoma-consensus-celltypes.nb.html index 032f4072d..4645c1095 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/03-osteosarcoma-consensus-celltypes.nb.html +++ b/analyses/cell-type-consensus/exploratory-notebooks/03-osteosarcoma-consensus-celltypes.nb.html @@ -1975,12 +1975,12 @@

Composition of top cell types

cells labeled with a consensus label.

- +
# add column of "top cell types" for easier plotting 
 all_results_df <- all_results_df |> 
   dplyr::mutate(
     # get most frequently observed cell types across libraries in that project 
-    top_celltypes = forcats::fct_lump_n(consensus_annotation, 10, other_level = "All remaining cell types", ties.method = "first") |> 
+    top_celltypes = forcats::fct_lump_n(consensus_annotation, 15, other_level = "All remaining cell types", ties.method = "first") |> 
       # sort by frequency 
       forcats::fct_infreq() |> 
       # make sure all remaining and unknown are last, use this to assign colors in specific order
@@ -2032,7 +2032,7 @@ 

Composition of top cell types

stacked_barchart(total_order_df, fill_color = "top_celltypes", colors = all_celltype_colors)
-

+

@@ -2042,6 +2042,19 @@

Composition of top cell types

to be some samples that have macrophages and/or T cell populations. We also see a handful of samples that don’t have any cells that are annotated.

+ + + +
stacked_barchart(total_order_df, fill_color = "top_celltypes", facet_variable = "project_id", colors = all_celltype_colors)
+ + +

+ + + +

It looks like both total number of cells that are classified and +composition of those cells is project dependent. This makes sense since +sample prep is probably different across labs.

Immune cell populations

@@ -2258,7 +2271,7 @@

Is there any relationship between immune cell percentage and patchwork::wrap_plots(ncol = 1, guides = "collect")

-

+

@@ -2286,28 +2299,28 @@

Is there any relationship between immune cell percentage and
[[1]]
-

+


 [[2]]
-

+


 [[3]]
-

+


 [[4]]
-

+

@@ -2361,7 +2374,7 @@

Session info

-
---
title: "Consensus cell types across osteosarcoma samples"
author: Ally Hawkins
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    code_folding: hide
---

This notebook summarizes the consensus cell types identified in all osteosarcoma samples that are part of ScPCA. 
This includes all samples from `SCPCP000017`, `SCPCP000018`, and `SCPCP000023`. 

We are interested in looking at any differences in cell composition across samples, with a particular interest in looking at the immune cell populations. 

```{r packages}
suppressPackageStartupMessages({
  # load required packages
  library(ggplot2)
})

# Set default ggplot theme
theme_set(
  theme_classic()
)
```


## Functions

```{r}
# barchart with or without faceting
# each bar is a stacked barchart using the fill_color
# faceting is only done if a facet_variable is provided
stacked_barchart <- function(
    df, 
    fill_color, 
    facet_variable = NULL,
    colors
){
  
  barchart <- ggplot(df) + 
    aes(
      x = library_id, 
      y = percent_cells_annotation, 
      fill = !!sym(fill_color)
    ) +
    geom_col() + 
    scale_y_continuous(expand = c(0,0)) +
    scale_fill_manual(values = colors) +
    theme(axis.text.x = element_blank()) +
    labs(
      fill= "cell type"
    )
  
  if(!is.null(facet_variable)){
    barchart <- barchart +
      facet_wrap(vars(!!sym(facet_variable)), scales ="free")
  }
  
  return(barchart)
}

# sina plot looking at immune percentage on the y-axis 
sina_plot <- function(df, plot_column){
  
  ggplot(immune_plot_df, aes(x = !!sym(plot_column), y = percent_immune)) +
    ggforce::geom_sina() +
    stat_summary(fun.y=median, geom="crossbar" , color = "red", linewidth = 0.2) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.margin = margin(10,10,10,10)) +
    labs(
      x = "", 
      y = "Percent of cells annotated as Immune",
      title = plot_column
    )
  
}
```


## Data setup


```{r base paths}
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
module_base <- file.path(repository_base, "analyses", "cell-type-consensus")

# results directory with cell-type-consensus 
results_dir <- file.path(module_base, "results", "cell-type-consensus")

# data directory where project metadata files live
data_dir <- file.path(repository_base, "data", "current")
```

```{r}
# list all results files 
results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE)

# get project ids from file list and assign as names
project_ids <- stringr::str_remove(basename(results_files), "_consensus-cell-types.tsv.gz")
names(results_files) <- project_ids

# grab only osteo projects
osteo_project_ids <- c("SCPCP000017", "SCPCP000018", "SCPCP000023")
results_files <- results_files[osteo_project_ids]

# list project metadata files and grab only osteo metadata 
metadata_files <- list.files(data_dir, pattern = "single_cell_metadata\\.tsv$", full.names = TRUE, recursive = TRUE)
metadata_ids <- stringr::word(dirname(metadata_files), -1, sep = "/")
names(metadata_files) <- metadata_ids
osteo_metadata_files <- metadata_files[osteo_project_ids]

# list of all immune cell types 
consensus_immune_file <- file.path(module_base, "references", "consensus-immune-cell-types.tsv")
```

```{r}
# source summarize_celltypes() function
setup_functions <- file.path(module_base, "exploratory-notebooks", "utils", "setup-functions.R")
source(setup_functions)
```

```{r, message=FALSE}
# get immune cell types 
immune_df <- readr::read_tsv(consensus_immune_file)
immune_types <- immune_df$consensus_annotation

# read in metadata
all_metadata_df <- osteo_metadata_files |>
  purrr::map(readr::read_tsv) |>
  dplyr::bind_rows() |>
  # select columns that might be useful 
  dplyr::select(
    project_id = scpca_project_id,
    library_id = scpca_library_id,
    disease_timing,
    age,
    sex,
    tissue_location,
    primary_or_metastasis,
    seq_unit
  )

# read in results and prep data frame for plotting 
all_results_df <- results_files |> 
  purrr::imap(summarize_celltypes) |> 
  dplyr::bind_rows(.id = "project_id") |> 
  # join with sample metadata
  dplyr::left_join(all_metadata_df, by = c("project_id", "library_id")) |>
  # remove pdx samples 
  dplyr::filter(sample_type != "patient-derived xenograft")
```


```{r}
# assign a color scheme for all bar charts 

# get all possible cell types 
# there are exactly 25 of them so will use alphabet with two greys for unknown/all remaining 
unique_celltypes <- all_results_df |> 
  dplyr::filter(percent_cells_annotation > 0.1, # we're only ever going to plot cell types that are greater than 1%
                !consensus_annotation == "Unknown") |> 
  dplyr::pull(consensus_annotation) |> 
  unique() |>
  as.character()

# define colors for all cell types
all_celltype_colors <- c(
  palette.colors(palette = "alphabet")[1:25],
  "grey60", # all remaining
  "grey95", # unknown
  "grey60" # non-immune, use the same as all remaining since they are never in the same plot
)
names(all_celltype_colors) <- c(unique_celltypes, "All remaining cell types", "Unknown", "non-immune")

# define a three color scheme for immune, other, and unknown
three_color_scheme <- c(
  "immune" = "navy",
  "non-immune" = "grey60",
  "Unknown" = "grey95"
)
```

## Composition of top cell types 

Here we look at the top cell types identified across all samples. 
We have already looked at this in `02-explore-consensus-results.Rmd`, but here we will plot all samples from all three osteo projects ordered by the total percent of cells labeled with a consensus label. 

```{r}
# add column of "top cell types" for easier plotting 
all_results_df <- all_results_df |> 
  dplyr::mutate(
    # get most frequently observed cell types across libraries in that project 
    top_celltypes = forcats::fct_lump_n(consensus_annotation, 10, other_level = "All remaining cell types", ties.method = "first") |> 
      # sort by frequency 
      forcats::fct_infreq() |> 
      # make sure all remaining and unknown are last, use this to assign colors in specific order
      forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf),
  )
```


```{r}
# get a vector of library ids ordered by total percentage annotated
library_levels <- all_results_df |> 
  dplyr::filter(consensus_annotation != "Unknown") |> 
  dplyr::group_by(library_id) |> 
  dplyr::summarize(
    total_percent_annotated = sum(total_cells_per_annotation)/total_cells_per_library
  ) |>
  unique() |> 
  dplyr::arrange(desc(total_percent_annotated)) |> 
  dplyr::pull(library_id)

# append any libraries that have 0 cells annotated
non_annotated_libs <- setdiff(unique(all_results_df$library_id), library_levels)
library_levels <- c(library_levels, non_annotated_libs)

# reorder by total percentage annotated 
total_order_df <- all_results_df |> 
  dplyr::mutate(
    library_id = forcats::fct_relevel(library_id, library_levels)
  )
```


```{r}
stacked_barchart(total_order_df, fill_color = "top_celltypes", colors = all_celltype_colors)
```

It looks like there's definitely some variation between distributions of cell types within the osteo samples. 
Generally, we see most annotated cells are smooth muscle cells and endothelial cells. 
There also appears to be some samples that have macrophages and/or T cell populations. 
We also see a handful of samples that don't have any cells that are annotated.

## Immune cell populations

Let's look specifically at immune cell populations. 
To do this we will lump all immune cells as "immune", all non-immune as "non-immune", and all unknown as "unknown". 
Here we will sort by total percentage of immune cells. 

```{r}
# look at immune cell types vs. unknown vs. other 
all_results_df <- all_results_df |> 
  #dplyr::left_join(immune_df, by = c("consensus_annotation", "consensus_ontology")) |> 
  dplyr::mutate(
    # first get a column that is just immune, unknown, or other
    immune_category = dplyr::case_when(
      consensus_annotation %in% immune_types ~ "immune",
      consensus_annotation == "Unknown" ~ "Unknown",
      .default = "non-immune"
    )
  )
```

```{r}
# now sort just by immune percentage
immune_pct_df <- all_results_df |> 
  dplyr::filter(immune_category == "immune") |> 
  dplyr::group_by(library_id) |> 
  dplyr::summarize(
    percent_immune = sum(total_cells_per_annotation)/total_cells_per_library
  ) |>
  dplyr::select(library_id, percent_immune) |> 
  unique() 

library_levels <- immune_pct_df |> 
  dplyr::arrange(desc(percent_immune)) |> 
  dplyr::pull(library_id)

# append any libraries that have 0 immune cells annotated
non_annotated_libs <- setdiff(unique(all_results_df$library_id), library_levels)
library_levels <- c(library_levels, non_annotated_libs)

immune_results_df <- all_results_df |> 
  dplyr::left_join(immune_pct_df, by = c("library_id")) |> 
  dplyr::mutate(
    percent_immune = dplyr::if_else(is.na(percent_immune), 0, percent_immune),
    library_id = forcats::fct_relevel(library_id, library_levels)
  )

stacked_barchart(immune_results_df, fill_color = "immune_category", colors = three_color_scheme)
```
Let's see how the percentage of immune cells compares across projects. 

```{r}
stacked_barchart(immune_results_df, fill_color = "immune_category", facet_variable = "project_id", colors = three_color_scheme)
```

When looking at all samples together we do see variation in immune cells classified and it appears that `SCPCP000017` and `SCPCP000018` have more cells classified in general and have more cells classified as immune. 
It does appear that libraries that have more immune cell composition also have a higher percentage of non-immune cell types which could very well be a technical artifact and related to sample prep. 

Just for visualization purposes, let's classify these as "hot" and "cold", where "hot" tumors have an immune composition > 5%.

```{r}
immune_results_df <- immune_results_df |> 
  dplyr::mutate(
    # classify hot/cold based on immune percent > 5 %
    hot_cold = dplyr::if_else(percent_immune > .05, "hot", "cold")
  )
```

```{r}
stacked_barchart(immune_results_df, fill_color = "immune_category", facet_variable = "hot_cold", colors = three_color_scheme)
```

Our threshold is pretty arbitrary, but I think there are definitely some "cold" tumors where we see little to no immune involvement. 

Below we can look at all immune cell types vs. all non-immune. 

```{r}
immune_results_df <- immune_results_df |> 
  dplyr::mutate(
    is_immune = dplyr::if_else(
      immune_category == "immune" | top_celltypes == "Unknown",
      consensus_annotation,
      "non-immune"
    )
  )

stacked_barchart(immune_results_df, fill_color = "is_immune", colors = all_celltype_colors)
```

Generally, if immune cells are present they tend to be macrophages. 
There's also a lot of T cell populations, which makes sense since it has been previously noted that osteosarcoma has a high percentage of T cell infiltrate (https://doi.org/10.18632/oncotarget.19071).  

## Is there any relationship between immune cell percentage and clinical metadata? 

One thing I was curious about was whether or not there are any differences in the presence of immune cells based on clinical metadata such as primary vs. metastasis, initial diagnosis vs. recurrence, and tissue location (bone vs. soft tissue). 
I also expect there might be some differences between single-cell and single-nuclei samples. 

Below I look at the immune cell percentage stratified by each of the mentioned metadata. 
This is just a preliminary look and if we really want to dig into this further we should consider looking at algorithms for comparing cell type composition in single-cell RNA-seq data, but that is outside the scope of this notebook. 

```{r}
# do some data wrangling on the metadata columns we want to look at 
immune_results_df <- immune_results_df |> 
  dplyr::mutate(
    primary_or_metastasis = dplyr::if_else(!is.na(primary_or_metastasis), primary_or_metastasis, disease_timing),
    disease_timing_mod = dplyr::if_else(disease_timing %in% c("Initial diagnosis", "Recurrence"), disease_timing, "other"),
    tissue_location_mod = dplyr::case_when(
      stringr::str_detect(tissue_location, "Bone|femur|Humerus|Tibia") ~ "Bone",
      #stringr::str_detect(tissue_location, "femur") ~ "Bone",
      stringr::str_detect(tissue_location, "lung|Lung|Chest") ~ "lung or chest",
      .default = "other"
    )
  )
```


```{r, fig.height = 10}
# get a list of metadata we care about 
metadata_categories <- c(
  "primary_or_metastasis",
  "disease_timing_mod",
  "tissue_location_mod",
  "seq_unit"
)

# make stacked bar plots looking at only immune and non immune 
metadata_categories |> 
  purrr::map(\(category){
    
    stacked_barchart(
      immune_results_df,
      fill_color = "immune_category",
      facet_variable = category,
      colors = three_color_scheme
    )
}) |>
  patchwork::wrap_plots(ncol = 1, guides = "collect")
```

```{r, fig.height=10}
# now look at all top celltypes 
metadata_categories |> 
  purrr::map(\(category){
    
    stacked_barchart(
      immune_results_df,
      fill_color = "top_celltypes",
      facet_variable = category,
      colors = all_celltype_colors
    )
}) |>
  patchwork::wrap_plots(ncol = 1, guides = "collect")
```

Below are sina plots that compare the total immune percentage for all samples in a given category. 

```{r}
immune_plot_df <- immune_results_df |> 
  # just look at immune population across different diagnoses 
  dplyr::select(library_id, primary_or_metastasis, disease_timing_mod, tissue_location_mod, percent_immune, seq_unit) |> 
  unique()
```

```{r}
metadata_categories |> 
  purrr::map(\(category){
    sina_plot(immune_plot_df, category)
  })
```

It looks like there might be higher immune infiltrate in "bone" samples, but again I think if we want to make any conclusions we need to look at software that helps correct for technical artifacts. 
There may also be a difference in single-cell and single-nuclei, but there are much fewer single-cell samples so it's hard to say. 


## Session info 

```{r session info}
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
```

+
---
title: "Consensus cell types across osteosarcoma samples"
author: Ally Hawkins
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    code_folding: hide
---

This notebook summarizes the consensus cell types identified in all osteosarcoma samples that are part of ScPCA. 
This includes all samples from `SCPCP000017`, `SCPCP000018`, and `SCPCP000023`. 

We are interested in looking at any differences in cell composition across samples, with a particular interest in looking at the immune cell populations. 

```{r packages}
suppressPackageStartupMessages({
  # load required packages
  library(ggplot2)
})

# Set default ggplot theme
theme_set(
  theme_classic()
)
```


## Functions

```{r}
# barchart with or without faceting
# each bar is a stacked barchart using the fill_color
# faceting is only done if a facet_variable is provided
stacked_barchart <- function(
    df, 
    fill_color, 
    facet_variable = NULL,
    colors
){
  
  barchart <- ggplot(df) + 
    aes(
      x = library_id, 
      y = percent_cells_annotation, 
      fill = !!sym(fill_color)
    ) +
    geom_col() + 
    scale_y_continuous(expand = c(0,0)) +
    scale_fill_manual(values = colors) +
    theme(axis.text.x = element_blank()) +
    labs(
      fill= "cell type"
    )
  
  if(!is.null(facet_variable)){
    barchart <- barchart +
      facet_wrap(vars(!!sym(facet_variable)), scales ="free")
  }
  
  return(barchart)
}

# sina plot looking at immune percentage on the y-axis 
sina_plot <- function(df, plot_column){
  
  ggplot(immune_plot_df, aes(x = !!sym(plot_column), y = percent_immune)) +
    ggforce::geom_sina() +
    stat_summary(fun.y=median, geom="crossbar" , color = "red", linewidth = 0.2) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.margin = margin(10,10,10,10)) +
    labs(
      x = "", 
      y = "Percent of cells annotated as Immune",
      title = plot_column
    )
  
}
```


## Data setup


```{r base paths}
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
module_base <- file.path(repository_base, "analyses", "cell-type-consensus")

# results directory with cell-type-consensus 
results_dir <- file.path(module_base, "results", "cell-type-consensus")

# data directory where project metadata files live
data_dir <- file.path(repository_base, "data", "current")
```

```{r}
# list all results files 
results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE)

# get project ids from file list and assign as names
project_ids <- stringr::str_remove(basename(results_files), "_consensus-cell-types.tsv.gz")
names(results_files) <- project_ids

# grab only osteo projects
osteo_project_ids <- c("SCPCP000017", "SCPCP000018", "SCPCP000023")
results_files <- results_files[osteo_project_ids]

# list project metadata files and grab only osteo metadata 
metadata_files <- list.files(data_dir, pattern = "single_cell_metadata\\.tsv$", full.names = TRUE, recursive = TRUE)
metadata_ids <- stringr::word(dirname(metadata_files), -1, sep = "/")
names(metadata_files) <- metadata_ids
osteo_metadata_files <- metadata_files[osteo_project_ids]

# list of all immune cell types 
consensus_immune_file <- file.path(module_base, "references", "consensus-immune-cell-types.tsv")
```

```{r}
# source summarize_celltypes() function
setup_functions <- file.path(module_base, "exploratory-notebooks", "utils", "setup-functions.R")
source(setup_functions)
```

```{r, message=FALSE}
# get immune cell types 
immune_df <- readr::read_tsv(consensus_immune_file)
immune_types <- immune_df$consensus_annotation

# read in metadata
all_metadata_df <- osteo_metadata_files |>
  purrr::map(readr::read_tsv) |>
  dplyr::bind_rows() |>
  # select columns that might be useful 
  dplyr::select(
    project_id = scpca_project_id,
    library_id = scpca_library_id,
    disease_timing,
    age,
    sex,
    tissue_location,
    primary_or_metastasis,
    seq_unit
  )

# read in results and prep data frame for plotting 
all_results_df <- results_files |> 
  purrr::imap(summarize_celltypes) |> 
  dplyr::bind_rows(.id = "project_id") |> 
  # join with sample metadata
  dplyr::left_join(all_metadata_df, by = c("project_id", "library_id")) |>
  # remove pdx samples 
  dplyr::filter(sample_type != "patient-derived xenograft")
```


```{r}
# assign a color scheme for all bar charts 

# get all possible cell types 
# there are exactly 25 of them so will use alphabet with two greys for unknown/all remaining 
unique_celltypes <- all_results_df |> 
  dplyr::filter(percent_cells_annotation > 0.1, # we're only ever going to plot cell types that are greater than 1%
                !consensus_annotation == "Unknown") |> 
  dplyr::pull(consensus_annotation) |> 
  unique() |>
  as.character()

# define colors for all cell types
all_celltype_colors <- c(
  palette.colors(palette = "alphabet")[1:25],
  "grey60", # all remaining
  "grey95", # unknown
  "grey60" # non-immune, use the same as all remaining since they are never in the same plot
)
names(all_celltype_colors) <- c(unique_celltypes, "All remaining cell types", "Unknown", "non-immune")

# define a three color scheme for immune, other, and unknown
three_color_scheme <- c(
  "immune" = "navy",
  "non-immune" = "grey60",
  "Unknown" = "grey95"
)
```

## Composition of top cell types 

Here we look at the top cell types identified across all samples. 
We have already looked at this in `02-explore-consensus-results.Rmd`, but here we will plot all samples from all three osteo projects ordered by the total percent of cells labeled with a consensus label. 

```{r}
# add column of "top cell types" for easier plotting 
all_results_df <- all_results_df |> 
  dplyr::mutate(
    # get most frequently observed cell types across libraries in that project 
    top_celltypes = forcats::fct_lump_n(consensus_annotation, 15, other_level = "All remaining cell types", ties.method = "first") |> 
      # sort by frequency 
      forcats::fct_infreq() |> 
      # make sure all remaining and unknown are last, use this to assign colors in specific order
      forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf),
  )
```


```{r}
# get a vector of library ids ordered by total percentage annotated
library_levels <- all_results_df |> 
  dplyr::filter(consensus_annotation != "Unknown") |> 
  dplyr::group_by(library_id) |> 
  dplyr::summarize(
    total_percent_annotated = sum(total_cells_per_annotation)/total_cells_per_library
  ) |>
  unique() |> 
  dplyr::arrange(desc(total_percent_annotated)) |> 
  dplyr::pull(library_id)

# append any libraries that have 0 cells annotated
non_annotated_libs <- setdiff(unique(all_results_df$library_id), library_levels)
library_levels <- c(library_levels, non_annotated_libs)

# reorder by total percentage annotated 
total_order_df <- all_results_df |> 
  dplyr::mutate(
    library_id = forcats::fct_relevel(library_id, library_levels)
  )
```


```{r}
stacked_barchart(total_order_df, fill_color = "top_celltypes", colors = all_celltype_colors)
```

It looks like there's definitely some variation between distributions of cell types within the osteo samples. 
Generally, we see most annotated cells are smooth muscle cells and endothelial cells. 
There also appears to be some samples that have macrophages and/or T cell populations. 
We also see a handful of samples that don't have any cells that are annotated.

```{r}
stacked_barchart(total_order_df, fill_color = "top_celltypes", facet_variable = "project_id", colors = all_celltype_colors)
```

It looks like both total number of cells that are classified and composition of those cells is project dependent. 
This makes sense since sample prep is probably different across labs. 

## Immune cell populations

Let's look specifically at immune cell populations. 
To do this we will lump all immune cells as "immune", all non-immune as "non-immune", and all unknown as "unknown". 
Here we will sort by total percentage of immune cells. 

```{r}
# look at immune cell types vs. unknown vs. other 
all_results_df <- all_results_df |> 
  #dplyr::left_join(immune_df, by = c("consensus_annotation", "consensus_ontology")) |> 
  dplyr::mutate(
    # first get a column that is just immune, unknown, or other
    immune_category = dplyr::case_when(
      consensus_annotation %in% immune_types ~ "immune",
      consensus_annotation == "Unknown" ~ "Unknown",
      .default = "non-immune"
    )
  )
```

```{r}
# now sort just by immune percentage
immune_pct_df <- all_results_df |> 
  dplyr::filter(immune_category == "immune") |> 
  dplyr::group_by(library_id) |> 
  dplyr::summarize(
    percent_immune = sum(total_cells_per_annotation)/total_cells_per_library
  ) |>
  dplyr::select(library_id, percent_immune) |> 
  unique() 

library_levels <- immune_pct_df |> 
  dplyr::arrange(desc(percent_immune)) |> 
  dplyr::pull(library_id)

# append any libraries that have 0 immune cells annotated
non_annotated_libs <- setdiff(unique(all_results_df$library_id), library_levels)
library_levels <- c(library_levels, non_annotated_libs)

immune_results_df <- all_results_df |> 
  dplyr::left_join(immune_pct_df, by = c("library_id")) |> 
  dplyr::mutate(
    percent_immune = dplyr::if_else(is.na(percent_immune), 0, percent_immune),
    library_id = forcats::fct_relevel(library_id, library_levels)
  )

stacked_barchart(immune_results_df, fill_color = "immune_category", colors = three_color_scheme)
```
Let's see how the percentage of immune cells compares across projects. 

```{r}
stacked_barchart(immune_results_df, fill_color = "immune_category", facet_variable = "project_id", colors = three_color_scheme)
```

When looking at all samples together we do see variation in immune cells classified and it appears that `SCPCP000017` and `SCPCP000018` have more cells classified in general and have more cells classified as immune. 
It does appear that libraries that have more immune cell composition also have a higher percentage of non-immune cell types which could very well be a technical artifact and related to sample prep. 

Just for visualization purposes, let's classify these as "hot" and "cold", where "hot" tumors have an immune composition > 5%.

```{r}
immune_results_df <- immune_results_df |> 
  dplyr::mutate(
    # classify hot/cold based on immune percent > 5 %
    hot_cold = dplyr::if_else(percent_immune > .05, "hot", "cold")
  )
```

```{r}
stacked_barchart(immune_results_df, fill_color = "immune_category", facet_variable = "hot_cold", colors = three_color_scheme)
```

Our threshold is pretty arbitrary, but I think there are definitely some "cold" tumors where we see little to no immune involvement. 

Below we can look at all immune cell types vs. all non-immune. 

```{r}
immune_results_df <- immune_results_df |> 
  dplyr::mutate(
    is_immune = dplyr::if_else(
      immune_category == "immune" | top_celltypes == "Unknown",
      consensus_annotation,
      "non-immune"
    )
  )

stacked_barchart(immune_results_df, fill_color = "is_immune", colors = all_celltype_colors)
```

Generally, if immune cells are present they tend to be macrophages. 
There's also a lot of T cell populations, which makes sense since it has been previously noted that osteosarcoma has a high percentage of T cell infiltrate (https://doi.org/10.18632/oncotarget.19071).  

## Is there any relationship between immune cell percentage and clinical metadata? 

One thing I was curious about was whether or not there are any differences in the presence of immune cells based on clinical metadata such as primary vs. metastasis, initial diagnosis vs. recurrence, and tissue location (bone vs. soft tissue). 
I also expect there might be some differences between single-cell and single-nuclei samples. 

Below I look at the immune cell percentage stratified by each of the mentioned metadata. 
This is just a preliminary look and if we really want to dig into this further we should consider looking at algorithms for comparing cell type composition in single-cell RNA-seq data, but that is outside the scope of this notebook. 

```{r}
# do some data wrangling on the metadata columns we want to look at 
immune_results_df <- immune_results_df |> 
  dplyr::mutate(
    primary_or_metastasis = dplyr::if_else(!is.na(primary_or_metastasis), primary_or_metastasis, disease_timing),
    disease_timing_mod = dplyr::if_else(disease_timing %in% c("Initial diagnosis", "Recurrence"), disease_timing, "other"),
    tissue_location_mod = dplyr::case_when(
      stringr::str_detect(tissue_location, "Bone|femur|Humerus|Tibia") ~ "Bone",
      #stringr::str_detect(tissue_location, "femur") ~ "Bone",
      stringr::str_detect(tissue_location, "lung|Lung|Chest") ~ "lung or chest",
      .default = "other"
    )
  )
```


```{r, fig.height = 10}
# get a list of metadata we care about 
metadata_categories <- c(
  "primary_or_metastasis",
  "disease_timing_mod",
  "tissue_location_mod",
  "seq_unit"
)

# make stacked bar plots looking at only immune and non immune 
metadata_categories |> 
  purrr::map(\(category){
    
    stacked_barchart(
      immune_results_df,
      fill_color = "immune_category",
      facet_variable = category,
      colors = three_color_scheme
    )
}) |>
  patchwork::wrap_plots(ncol = 1, guides = "collect")
```

```{r, fig.height=10}
# now look at all top celltypes 
metadata_categories |> 
  purrr::map(\(category){
    
    stacked_barchart(
      immune_results_df,
      fill_color = "top_celltypes",
      facet_variable = category,
      colors = all_celltype_colors
    )
}) |>
  patchwork::wrap_plots(ncol = 1, guides = "collect")
```

Below are sina plots that compare the total immune percentage for all samples in a given category. 

```{r}
immune_plot_df <- immune_results_df |> 
  # just look at immune population across different diagnoses 
  dplyr::select(library_id, primary_or_metastasis, disease_timing_mod, tissue_location_mod, percent_immune, seq_unit) |> 
  unique()
```

```{r}
metadata_categories |> 
  purrr::map(\(category){
    sina_plot(immune_plot_df, category)
  })
```

It looks like there might be higher immune infiltrate in "bone" samples, but again I think if we want to make any conclusions we need to look at software that helps correct for technical artifacts. 
There may also be a difference in single-cell and single-nuclei, but there are much fewer single-cell samples so it's hard to say. 


## Session info 

```{r session info}
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
```
