Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulRegnier committed Jul 21, 2023
0 parents commit b5dfe82
Show file tree
Hide file tree
Showing 71 changed files with 6,724 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
^.*\.Rproj$
^\.Rproj\.user$
^doc$
^Meta$
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/doc/
/Meta/
.Rproj.user
.Rhistory
.Rdata
.httr-oauth
.DS_Store
.quarto
42 changes: 42 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
Package: PICAFlow
Title: PICAFlow: a complete R workflow dedicated to flow/mass cytometry data, from data pre-processing to deep and comprehensive analysis
Version: 1.0
Authors@R:
person("Paul", "Régnier", , "paul.regnier@aphp.fr", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-9713-1132"))
Description: PICAFlow (Pipeline for Integrative and Comprehensive Analysis of Flow/mass cytometry data) is a R-written integrative workflow dedicated to flow/mass cytometry data analysis. It contains all the necessary functions and packages to go from data pre-processing to deep mining and analysis, presented in a user-friendly and easy-to-use fashion.
License: GPL (>= 3)
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Imports:
parallel,
doSNOW,
foreach,
flowCore,
ggplot2,
pracma,
attempt,
flowStats,
uwot,
class,
readxl,
ROCit,
Biobase,
cowplot,
flowWorkspace,
ggcyto,
gplots,
matrixStats,
methods,
plotly,
rlang,
tcltk,
shiny,
utils,
stats
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0)
VignetteBuilder: knitr
57 changes: 57 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Generated by roxygen2: do not edit by hand

export(ROCanalysis)
export(UMAPFlowset)
export(UMAP_clusters)
export(UMAP_downsampledDataset)
export(analyzePeaks)
export(applyClusterModel)
export(bindData)
export(clustersPercentagesHeatmap)
export(clustersPhenotypesHeatmap)
export(collapseCloseClusters)
export(compensateData)
export(constructPlots)
export(convertToRDS)
export(determineOptimalUMAPParameters)
export(determineParameterThreshold)
export(exportClustersStatsAndPlots)
export(exportDataBind)
export(exportDownsamplingOutput)
export(exportFCS)
export(exportFeaturesTables)
export(exportGatingParameters)
export(exportGatingStatistics)
export(exportParametersUsed)
export(exportPerParameter)
export(finalClusteringTraining)
export(gateData)
export(heatmapAbundancesGroups)
export(hierarchicalClusteringData)
export(initialClusteringTraining)
export(keepPreviousPeaksData)
export(launchManualLogicleShinyApp)
export(mergeData)
export(mergeMetadata)
export(mergeParameters)
export(mergeSamples)
export(normalizeData)
export(openClusteredFullDataCollapsed)
export(openDownsampledData)
export(openUMAPData)
export(plotFacets)
export(plotFinalClusteringTraining)
export(plotInitialClusteringTraining)
export(plotUMAP_projectionCollapsed)
export(plotUMAP_projectionTraining)
export(poolData)
export(removeOutliers)
export(setupWorkingDirectory)
export(splitDataset)
export(subsetData)
export(subsetDataUMAP)
export(synthesizePeaks)
export(testDatasetNormality)
export(transformData)
importFrom(foreach,"%do%")
importFrom(foreach,"%dopar%")
22 changes: 22 additions & 0 deletions PICAFlow.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Version: 1.0

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
LineEndingConversion: Posix

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
253 changes: 253 additions & 0 deletions R/0_Preprocessing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
#' Setup the working directory
#'
#' This function sets up the working directory. It does not take any input argument but assumes that the working directory is set to the path of interest using the base R `setwd()` function. From this path, everything will be wiped and `input`, `output` and `rds` directories will be created. The `output` directory will also contain several other specific directories that will be used in the subsequent analyses.
#'
#' @return Nothing. Called for its side effect of (re-)creating the `input`, `output` (with its numbered step subdirectories) and `rds` directories inside the current working directory. Any previous content of these directories is deleted.
#'
#' @export

setupWorkingDirectory = function()
{
  workingDirectory = getwd()

  # Top-level directories are wiped first so every run starts from a clean state
  topLevelDirectories = c("input", "output", "rds")

  for (currentDirectory in topLevelDirectories)
  {
    unlink(file.path(workingDirectory, currentDirectory), recursive = TRUE)
    dir.create(file.path(workingDirectory, currentDirectory))
  }

  # One numbered subdirectory of `output` per step of the analysis workflow
  outputSubdirectories = c("1_Transformation", "2_Normalization", "3_Gating", "4_Downsampling", "5_UMAP", "6_FCS", "7_Clustering", "8_Analysis")

  for (currentSubdirectory in outputSubdirectories)
  {
    dir.create(file.path(workingDirectory, "output", currentSubdirectory))
  }
}

#' Convert `fcs` files to `rds` files
#'
#' This function converts every `fcs` file in the `input` directory to a `flowFrame` object (from `flowCore` package) encapsulated into a `rds` file. It eventually renames the parameters used in the dataset if desired. This step helps to decrease the overall computing time and complexity of several next steps.
#'
#' @param conversionTable A tabular-delimited text file containing the appropriate information to convert one or several channels to other ones. This is used when the overall staining mix presents the same specificities but slightly different fluorophores depending on the samples. This typically occurs when the cytometer configuration is modified or when the cytometer is different from one batch to another. Defaults to `NULL`.
#'
#' The `conversionTable` table follows a pre-defined format: 4 columns in any order (`from_desc`, `to_desc`, `from_name` and `to_name`) and any given number of line, each line referring to a specific matching to be treated. For instance, if a line has the values `from_desc = CXCR5 B610-ECD-A`, `to_desc = CXCR5 ECD-A`, `from_name = FL2-A` and `to_name = FL11-A`, it means that any occurrence of a parameter (in any `rds` file) named `FL2-A` which also matches the description `CXCR5 B610-ECD-A` will see its values respectively replaced with `FL11-A` and `CXCR5 ECD-A`.
#'
#' Please note that the renaming both affects the parameters of each file AND the parameters in each self-contained compensation matrix.
#'
#' @return Generated `rds` files are saved to `rds` directory. The function also returns a matrix of all unique parameters (once correctly renamed) used in the dataset.
#'
#' @importFrom foreach %dopar%
#'
#' @export

convertToRDS = function(conversionTable = NULL)
{
# Dummy bindings for the foreach iteration variables (avoids R CMD check NOTEs
# about undefined globals)
a = NULL
b = NULL

workingDirectory = getwd()

# Every file in `input` is treated as an fcs file to convert
fcs.dir = file.path(workingDirectory, "input")
filesToOpen = dir(fcs.dir, full.names = TRUE)

# Use all cores but one, falling back to a single core on single-core machines
if (parallel::detectCores() == 1)
{
coresNumber = 1
} else
{
coresNumber = parallel::detectCores() - 1
}

# PSOCK cluster registered through doSNOW, which is what allows the
# `.options.snow` progress callback below to work with %dopar%
cl = parallel::makeCluster(coresNumber, type = "PSOCK")
doSNOW::registerDoSNOW(cl)

pb = utils::txtProgressBar(min = 0, max = length(filesToOpen), style = 3)

progress = function(n) utils::setTxtProgressBar(pb, n)
opts = list(progress = progress)

# One parallel task per fcs file; each task returns a 2-element named list of
# parameter names/descriptions, concatenated across files with .combine = "c"
totalParametersInfos = foreach::foreach(a = filesToOpen, .packages = c("foreach", "flowCore", "tcltk"), .combine = "c", .options.snow = opts) %dopar%
{
# Raw (untransformed, untruncated) read of the fcs file
currentData = flowCore::read.FCS(a, transformation = FALSE, truncate_max_range = FALSE, ignore.text.offset = TRUE)
currentFilename = gsub("(.+)/(.+)\\.fcs", "\\2", a)

if (is.null(conversionTable) == FALSE)
{
currentFileParameterDescriptions = as.vector(currentData@parameters@data[, "desc"])
currentFileParameterNames = as.vector(currentData@parameters@data[, "name"])

# Locate the keyword slot(s) that actually hold a non-empty spillover matrix
compensationMatricesSlot = as.numeric(which(lengths(flowStats::spillover(currentData)) > 0))
compensationMatricesSlotName = names(flowStats::spillover(currentData))[compensationMatricesSlot]

# For each parameter of the file, look for a (desc, name) pair matching a
# conversionTable row, and rename it in three places: the parameters slot,
# the expression matrix columns and the spillover matrix columns.
# Renames are first tagged with a temporary "_replaced" suffix so that a
# target name equal to another still-unprocessed source name cannot be
# matched (and renamed) a second time; the suffix is stripped afterwards.
foreach::foreach(b = 1:length(currentFileParameterDescriptions)) %do% {
currentDescriptionFrom = currentFileParameterDescriptions[b]
currentDescriptionName = currentFileParameterNames[b]

# A conversion applies only if both the description AND the name match
currentNameFromMatching = which((conversionTable$from_desc == currentDescriptionFrom) & conversionTable$from_name == currentDescriptionName)

if (length(currentNameFromMatching) > 0)
{
currentDescriptionTo = conversionTable[conversionTable$from_desc == currentDescriptionFrom, "to_desc"]
currentNameTo = conversionTable[conversionTable$from_desc == currentDescriptionFrom, "to_name"]
currentNameFrom = conversionTable[conversionTable$from_desc == currentDescriptionFrom, "from_name"]

rowToReplaceID = which(currentData@parameters@data$desc == currentDescriptionFrom)

# 1) Rename in the parameters (desc + name) slot
currentData@parameters@data[rowToReplaceID, "desc"] = paste(currentDescriptionTo, "_replaced", sep = "")
currentData@parameters@data[rowToReplaceID, "name"] = paste(currentNameTo, "_replaced", sep = "")

# 2) Rename the corresponding column of the expression matrix
colToReplaceID = as.numeric(which(colnames(currentData@exprs) == currentNameFrom))
colnames(currentData@exprs)[colToReplaceID] = paste(currentNameTo, "_replaced", sep = "")

# 3) Rename the matching column of the self-contained compensation matrix,
# if the parameter appears in it
matchingCompensationNameID = which(colnames(currentData@description[compensationMatricesSlotName][[1]]) == currentNameFrom)

if (length(matchingCompensationNameID) > 0)
{
colnames(currentData@description[compensationMatricesSlotName][[1]])[matchingCompensationNameID] = paste(currentNameTo, "_replaced", sep = "")
}
}
}

# Second pass: strip the temporary "_replaced" suffix everywhere
currentData@parameters@data[, "desc"] = gsub("_replaced", "", currentData@parameters@data[, "desc"])
currentData@parameters@data[, "name"] = gsub("_replaced", "", currentData@parameters@data[, "name"])
colnames(currentData@exprs) = gsub("_replaced", "", colnames(currentData@exprs))

colnames(currentData@description[compensationMatricesSlotName][[1]]) = gsub("_replaced", "", colnames(currentData@description[compensationMatricesSlotName][[1]]))
}

# Placeholder descriptions for parameters with a missing (NA) description,
# numbered "Empty Description #1", "#2", ...
currentFileDescriptions = as.vector(currentData@parameters@data[, "desc"])
currentFileDescriptions[is.na(currentFileDescriptions)] = paste("Empty Description #", c(1:length(which(is.na(currentFileDescriptions)))), sep = "")

# Per-file result: names and descriptions, tagged with the file name so the
# combined list can be split back apart after the parallel loop
currentParametersInfos = list(as.vector(currentData@parameters@data[, "name"]), currentFileDescriptions)

names(currentParametersInfos) = c(paste("names_", currentFilename, sep = ""), paste("descriptions_", currentFilename, sep = ""))

saveRDS(currentData, file.path("rds", paste(currentFilename, ".rds", sep = "")))

gc()

return(currentParametersInfos)
}

close(pb)

parallel::stopCluster(cl)

# NOTE(review): only the first file's parameter names/descriptions ([[1]]) are
# kept to build the summary table — this assumes every fcs file shares the same
# parameter layout after renaming; confirm this invariant holds for all inputs
totalParametersNamesID = grep("names", names(totalParametersInfos))
totalParametersNames = totalParametersInfos[totalParametersNamesID]
totalParametersNames = totalParametersNames[[1]]

totalParametersDescriptionsID = grep("descriptions", names(totalParametersInfos))
totalParametersDescriptions = totalParametersInfos[totalParametersDescriptionsID]
totalParametersDescriptions = totalParametersDescriptions[[1]]

# Assemble the returned table: Parameter_ID / Parameter_Name / Parameter_Description
totalParametersData = data.frame(totalParametersNames, totalParametersDescriptions, stringsAsFactors = FALSE)
totalParametersData = cbind(1:nrow(totalParametersData), totalParametersData)
colnames(totalParametersData) = c("Parameter_ID", "Parameter_Name", "Parameter_Description")

return(totalParametersData)
}

#' Subset `rds` files
#'
#' This function subsets every `rds` file with the specified parameters of interest. It can also rename each parameter in a more user-friendly way.
#'
#' @param parametersToKeep A vector of all parameters to keep in each `rds` file. Defaults to `NULL`.
#'
#' @param customNames A vector of the same length as `parametersToKeep` containing the user-friendly parameter names. Defaults to `NULL`.
#'
#' @return Generated `rds` files are saved to `rds` directory, overwriting the previous ones.
#'
#' @importFrom foreach %dopar%
#'
#' @export

subsetData = function(parametersToKeep = NULL, customNames = NULL)
{
  # Dummy binding for the foreach iteration variable (avoids R CMD check NOTE)
  a = NULL

  filesToOpen = dir(file.path("rds"), full.names = TRUE)

  # Use all cores but one, falling back to a single core on single-core
  # machines (fix: `detectCores() - 1` alone would request a 0-worker cluster
  # on a 1-core machine; this mirrors the guard used in `convertToRDS()`)
  if (parallel::detectCores() == 1)
  {
    coresNumber = 1
  } else
  {
    coresNumber = parallel::detectCores() - 1
  }

  # PSOCK cluster registered through doSNOW so the `.options.snow` progress
  # callback works with %dopar%
  cl = parallel::makeCluster(coresNumber, type = "PSOCK")
  doSNOW::registerDoSNOW(cl)

  pb = utils::txtProgressBar(min = 0, max = length(filesToOpen), style = 3)
  progress = function(n) utils::setTxtProgressBar(pb, n)
  opts = list(progress = progress)

  # One parallel task per rds file: subset its parameters, relabel them, save back
  foreach::foreach(a = filesToOpen, .packages = c("foreach", "flowCore", "tcltk"), .options.snow = opts) %dopar%
  {
    currentData = readRDS(a)
    currentFilename = gsub("(.+)/(.+)\\.rds", "\\2", a)

    # Keep only the requested parameters (flowFrame column subsetting)
    currentData = currentData[, parametersToKeep]

    # Overwrite the parameter descriptions with the user-friendly names
    # NOTE(review): assumes `customNames` is ordered consistently with
    # `parametersToKeep` — confirm with callers
    currentData@parameters@data[, "desc"] = customNames

    # Overwrite the previous rds file in place
    saveRDS(currentData, file.path("rds", paste(currentFilename, ".rds", sep = "")))

    rm(currentData)
    rm(currentFilename)
    gc()
  }

  parallel::stopCluster(cl)
}

#' Compensate `rds` files
#'
#' This function applies the self-contained compensation matrix of each `rds` file to every desired parameter.
#'
#' @param parametersToCompensate A vector of all parameters to compensate in each `rds` file. Defaults to `NULL`.
#'
#' @return Generated `rds` files are saved to `rds` directory, overwriting the previous ones.
#'
#' @importFrom foreach %dopar%
#'
#' @export

compensateData = function(parametersToCompensate = NULL)
{
  # Dummy binding for the foreach iteration variable (avoids R CMD check NOTE)
  a = NULL

  filesToOpen = dir(file.path("rds"), full.names = TRUE)

  # Use all cores but one, falling back to a single core on single-core
  # machines (fix: `detectCores() - 1` alone would request a 0-worker cluster
  # on a 1-core machine; this mirrors the guard used in `convertToRDS()`)
  if (parallel::detectCores() == 1)
  {
    coresNumber = 1
  } else
  {
    coresNumber = parallel::detectCores() - 1
  }

  # PSOCK cluster registered through doSNOW so the `.options.snow` progress
  # callback works with %dopar%
  cl = parallel::makeCluster(coresNumber, type = "PSOCK")
  doSNOW::registerDoSNOW(cl)

  pb = utils::txtProgressBar(min = 0, max = length(filesToOpen), style = 3)
  progress = function(n) utils::setTxtProgressBar(pb, n)
  opts = list(progress = progress)

  # One parallel task per rds file: extract its self-contained spillover
  # matrix, restrict it to the requested parameters and apply it
  foreach::foreach(a = filesToOpen, .packages = c("foreach", "flowCore", "tcltk"), .options.snow = opts) %dopar%
  {
    currentData = readRDS(a)
    currentFilename = gsub("(.+)/(.+)\\.rds", "\\2", a)

    # Locate the keyword slot holding a non-empty spillover matrix
    # NOTE(review): `[[compensationMatricesSlot]]` assumes exactly one
    # non-empty slot — confirm for files carrying several spillover keywords
    compensationMatricesSlot = as.numeric(which(lengths(flowStats::spillover(currentData)) > 0))
    compensationMatrix = flowStats::spillover(currentData)[[compensationMatricesSlot]]

    # Restrict the matrix to the parameters to compensate
    # (fix: `drop = FALSE` keeps a 1x1 matrix — instead of a bare scalar —
    # when a single parameter is compensated, as `flowCore::compensate()`
    # expects a matrix)
    parametersToKeepID = which(colnames(compensationMatrix) %in% parametersToCompensate)
    currentSampleComp = compensationMatrix[parametersToKeepID, parametersToKeepID, drop = FALSE]

    currentData = flowCore::compensate(currentData, currentSampleComp)

    # Overwrite the previous rds file in place
    saveRDS(currentData, file.path("rds", paste(currentFilename, ".rds", sep = "")))

    rm(currentData)
    rm(currentSampleComp)
    rm(parametersToKeepID)
    rm(compensationMatrix)
    rm(compensationMatricesSlot)
    gc()
  }

  parallel::stopCluster(cl)
}
Loading

0 comments on commit b5dfe82

Please sign in to comment.