# project_parameters.Config.yaml
# (the file name is kept as a comment: a bare scalar line ahead of the key/value pairs is not valid YAML)

# The following parameters are the same across the project and might be needed in more than one module.
root_dir: "./sc-rna-seq-snap" # path to the main dir of the project where the GitHub repo lives
data_dir: "./sc-rna-seq-snap/analyses/cellranger-analysis/results/02_cellranger_count/ForcedCells8000Parameters" # path to the data dir of the project with the CellRanger output results; NOTE(review): the last path component matches the CellRanger parameter-set name configured further down; presumably the two must be kept in sync
metadata_dir: "./mouse-test-dataset" # path to the metadata dir of the project
genome_name: "GRCm39" # define genome reference and versioning
# Project description fields (all plain strings).
PROJECT_NAME: "mouse-test-dataset"
PI_NAME: "Stanislav Zakharenko"
TASK_ID: "NA"
PROJECT_LEAD_NAME: "NA"
DEPARTMENT: "Developmental Neurobiology"
LEAD_ANALYSTS: "Antonia Chroni, PhD"
GROUP_LEAD: "Cody A. Ramirez, PhD"
CONTACT_EMAIL: "antonia.chroni@stjude.org"
PIPELINE: "Standard sc-/sn-RNA-Seq Analysis in 10X Genomics data"
START_DATE: "10/15/2024" # written as MM/DD/YYYY
COMPLETION_DATE: "ONGOING"


# The following parameters are set up as default values and/or are specific to the modules listed below.
# `./analyses/fastqc-analysis`
# Path(s) to the fastqc files for the `fastqc-analysis` module, given as a list with one entry per path.
# `/path1` looks like a placeholder to be replaced with a real path (TODO confirm).
fastqc_dir: 
  - /path1
  # To add another path, uncomment the entry below (keep the same indentation as the entry above).
  #- /path2

# `./analyses/cellranger-analysis`
genome_reference_path: "./" # path to the genome reference to be used for the `cellranger-analysis` module
cellranger_parameters: "ForcedCells8000Parameters" # define as: "DefaultParameters", "ForcedCells8000Parameters", or else
genome_name_cellranger: "GRCm39" # genome of preference for dual genomes; for single genomes, use the same value as for `genome_name`

# `./analyses/upstream-analysis`
# Settings are grouped by the script that reads them, as named in the inline comments
# (01B_run_seurat_qc_multiple_samples.R, 02A_run_seurat_qc.Rmd, 04_run_filter_object.Rmd, 05_run_summary_report.Rmd).
print_pdf_seurat_multiple_samples: "YES" # set for 01B_run_seurat_qc_multiple_samples.R; presumably "YES" or "NO" (TODO confirm)
use_condition_split_seurat_multiple_samples: "NO" # set for 01B_run_seurat_qc_multiple_samples.R; presumably "YES" or "NO" (TODO confirm)
grouping: "orig.ident" # define the grouping to use
Regress_Cell_Cycle_value: "NO" # whether to regress out the cell cycle and, if so, which method to use when scaling the data; acceptable values: "YES", "NO" and "DIFF"
assay_seurat_qc: "RNA" # define assay: this will always be "RNA"
use_SoupX_filtering_seurat_qc: "YES" # set for 02A_run_seurat_qc.Rmd; "YES", "NO"
min_genes: 300 # define minimum number of genes for filtering
min_count: 500 # define minimum number of UMIs for filtering
mtDNA_pct_default: 10 # define mtDNA percentage threshold for filtering; NOTE(review): previously described as a "minimum", but a cutoff like this is normally an upper bound — confirm against the filtering script
normalize_method: "log_norm" # define method for normalization of counts
num_pcs: 30 # define number of principal components
nfeatures_value: 3000 # define number of variable features
prefix: "lognorm" # label based on the normalization method used
use_miQC: "NO" # whether to use the miQC R package; see the `README.md` file for more information; acceptable values: "YES" and "NO"
use_only_step1: "YES" # whether to use only the first step ("YES") or both steps ("NO") for filtering low-quality cells; see the `README.md` file for more information
condition_value: "Genotype" # main condition of the project; can be used for visualization purposes on the UMAPs; must be a column name in the `project_metadata.tsv` file
num_dim_seurat_qc: [20, 25] # number of PCs to use in UMAP (list of values)
num_neighbors_seurat_qc: [10, 20, 30] # number of neighbors to use in UMAP (list of values)
soup_fraction_value_default: 0.05 # default rho value to use if the estimated rho is > 20%
assay_filter_object: "RNA_SoupX" # assay for 04_run_filter_object.Rmd; define as: "RNA" or "RNA_SoupX"
num_dim_filter_object: 30 # set one value for 04_run_filter_object.Rmd
num_neighbors_filter_object: 30 # set one value for 04_run_filter_object.Rmd
use_condition_split_filter_object: "YES" # set for 04_run_filter_object.Rmd
print_pdf_filter_object: "NO" # set for 04_run_filter_object.Rmd
use_SoupX_filtering_filter_object: "YES" # set for 04_run_filter_object.Rmd; "YES", "NO"
use_scDblFinder_filtering_filter_object: "NO" # set for 04_run_filter_object.Rmd; "YES", "NO"
# NOTE(review): the key commented out below is also set in the `integrative-analysis` section of this file;
# uncommenting it here would define the same key twice, and most YAML loaders silently keep only the last value.
#PCA_Feature_List_value: transcription.factor.gene.list # set for 04_run_filter_object.Rmd if necessary
use_SoupX_filtering_summary_report: "YES" # set for 05_run_summary_report.Rmd; "YES", "NO"
use_scDblFinder_filtering_summary_report: "NO" # set for 05_run_summary_report.Rmd; "YES", "NO"

# `./analyses/integrative-analysis`
# One "YES"/"NO" switch per integration method; as configured, only harmony is switched on,
# which agrees with the integration method selected below. Presumably these should always agree (TODO confirm).
use_seurat_integration: "NO"
use_harmony_integration: "YES"
use_liger_integration: "NO"
integration_method: "harmony" # options: "seurat", "harmony", "inmf"
num_dim_seurat: 30 # presumably the number of dimensions for the seurat method — TODO confirm
num_dim_seurat_integration: 50 # presumably the number of dimensions for the seurat integration step — TODO confirm
big_data_value: FALSE # options: TRUE or FALSE; NOTE(review): written unquoted, so YAML loaders read it as a boolean, not as the string "FALSE" — confirm which form the module expects
num_dim_harmony: 30 # presumably the number of dimensions for the harmony method — TODO confirm
n_neighbors_value: 20 # presumably the number of neighbors — TODO confirm against the module
variable_value: "ID" # presumably a metadata column name — TODO confirm against the module
reference_list_value: NULL # unquoted NULL is read as a YAML null (no value), not as the string "NULL"
PCA_Feature_List_value: NULL # unquoted NULL is read as a YAML null (no value), not as the string "NULL"
# Indicative value; might need to be adjusted based on cohort size. Harmony and Liger: 1048576000 (1000 * 1024^2).
# NOTE(review): the Seurat figure quoted below, 20971520000, equals 20000 * 1024^2, whereas 2000 * 1024^2 is 2097152000 — confirm which is intended.
#future_globals_value_integration_module: 1048576000 #Indicative but might need to be adjusted based on cohort size. Seurat: 20971520000, 2000 * 1024^2; Harmony: 1048576000, 1000 * 1024^2; Liger: 1048576000, 1000 * 1024^2

# `./analyses/cluster-cell-calling`
#future_globals_value_clustering_module: 1048576000 # indicative value (1000 * 1024^2); might need to be adjusted based on cohort size
resolution_clustering_module: "custom_multiple" # "custom_multiple", "default_multiple", or a single resolution value, e.g., 0.5
integration_method_clustering_module: "harmony" # options: "seurat", "harmony", "inmf"
num_dim_clustering_module: 30 # presumably the number of dimensions used for clustering — TODO confirm
reduction_value_clustering_module: "harmony" # seurat: "pca"; harmony: "harmony"; liger: "inmf"
assay_clustering_module: "RNA_SoupX" # define assay: "RNA_SoupX" to use the SoupX matrix, or "RNA"

# Initially, we use a list of multiple resolutions; we then run the module again with the single resolution that fits the data best.
# If no list is provided, the clusters are calculated with the default list.
# We recommend running with the default list first and then exploring a customized list of resolutions (if these are not already in the default list).
# In the latter case, the user needs to comment the `resolution_list_default_clustering_module` line in or out accordingly.
resolution_list_clustering_module: [0.1, 0.5, 1] # a single resolution, multiple resolutions, or NULL; e.g., [0.1, 0.5, 1]; [0.5]; NULL
resolution_list_default_clustering_module: [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
algorithm_value_clustering_module: 4 # Leiden algorithm

resolution_find_markers: "custom_multiple" # "custom_multiple", "default_multiple", or a single resolution value, e.g., 0.5
resolution_list_find_markers: 0.1 # the single resolution that fits the data best
n_value_find_markers: 10 # number of top genes to explore

# `./analyses/cell-contamination-removal-analysis`
#future_globals_value_step1_contamination_module: 8388608000 # 8000 * 1024^2
# Clusters to keep. As configured, clusters 6, 11 and 14 are not in the list; presumably those are the ones being removed (TODO confirm).
keep_clusters_contamination_module: [1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 15, 16]
# Indicative value; might need to be adjusted based on cohort size. Harmony and Liger: 1048576000 (1000 * 1024^2).
# NOTE(review): the Seurat figure quoted below, 20971520000, equals 20000 * 1024^2, whereas 2000 * 1024^2 is 2097152000 — confirm which is intended.
#future_globals_value_step2_contamination_module: 1048576000 #Indicative but might need to be adjusted based on cohort size. Seurat: 20971520000, 2000 * 1024^2; Harmony: 1048576000, 1000 * 1024^2; Liger: 1048576000, 1000 * 1024^2
assay_contamination_module: "RNA_SoupX" # define assay: "RNA_SoupX" to use the SoupX matrix; otherwise "RNA" (the default)

# `./analyses/cell-types-annotation`
#future_globals_value_annotation_module: 1048576000 # indicative value (1000 * 1024^2); might need to be adjusted based on cohort size
module_with_input_data: "cluster-cell-calling" # or "cell-contamination-removal-analysis"
input_data_folder_name: "01_cluster_cell_calling" # or "03_cluster_cell_calling" if the data come from the `cell-contamination-removal-analysis` module
redution_value_annotation_module: "umap" # for seurat and harmony use "umap"; for Liger use glue::glue("{integration_method}"); NOTE(review): the key is spelled "redution" (sic) — keep it as-is unless the scripts that read it are renamed too
min.diff.med_value_annotation_module: 0.1 # higher thresholds for pruning labels correspond to greater assignment certainty
use_min.diff.med_annotation_module: "NO" # options: "YES" or "NO"
assay_annotation_module: "RNA_SoupX" # define assay: "RNA_SoupX" to use the SoupX matrix, or "RNA"