-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAdaptable-RtSNE_v2.0.0.R
361 lines (270 loc) · 18.2 KB
/
Adaptable-RtSNE_v2.0.0.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
# Adaptable RtSNE v2.0.0
# Description
# Running bh-tSNE on data from FCS or CSV files, with additional functionality including phenograph
# Read data from FCS files, run tSNE, write back to FCS file with new tSNE values
# Prints the 'verbose' from the tSNE analysis as well
# Most parameters that control the tSNE algorithm are modifiable in the Rtsne function
# Details
# Author: Thomas Myles Ashhurst
# Published: 2017-10-11
# Contact: tomashhurst@gmail.com
# Github: https://github.com/sydneycytometry/Adaptable-RtSNE
# Website: www.sydneycytometry.org.au
# Acknowledgements
# tsne algorithm developed by Lauren van der Maaten https://github.com/lvdmaaten/bhtsne
# van der Maaten, L.J.P., Hinton, G.E. (2008). Visualizing High-Dimensional Data Using t-SNE. Journal of Machine Learning Research 9(Nov):2579-2605.
# van der Maaten, L.J.P. (2014). Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research 15(Oct):3221-3245.
# tsne licence: https://github.com/lvdmaaten/bhtsne/blob/master/LICENSE.txt
# Script structure adapted from https://github.com/lmweber/Rtsne-example
# Rtsne function by Jesse Krijthe
# Version 0.12 and 0.13 of Rtsne compiled after discussion with Jesse Krijthe, regarding adding additional input arguments into Rtsne function
# FCS file reading and writing adapted from https://gist.github.com/yannabraham/c1f9de9b23fb94105ca5
# Version
# R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
# flowCore version 1.42.2
# plyr version 1.8.4
# Biobase version 2.36.2
# Rtsne version 0.13 # important, v 0.12 and 0.13 use different value systems for stop_lying_iter and mom_switch_iter
# flowViz version 1.40.0
# Summary
# 1 INSTALL AND LOAD PACKAGES
# 1.1 - Install packages
# 1.2 - Load packages
# 1.3 - Set working directory
# 2 USER INPUT -- DATA PREPARATION
# 2.1 - Define options
# 2.2 - Read files (FCS or CSV) into data
# 2.3 - Create parameter names
# 2.4 - Remove duplicates (optional, but recommended)
# 2.5 - Modifications (if required, skip if not required)
# 2.6 - Select markers for the calculation of t-SNE parameters
# 3 END USER INPUT -- RUN ENTIRE SCRIPT
# 3.1 - Create output directories
# 3.2 - Save a copy of 'data' (original data) and 'data specific' (data used to calculate tSNE)
# 3.3 - Run tSNE (if defined in 2.1)
# 3.4 - Run Phenograph (if defined in 2.1)
# 3.5 - Write data (with tSNE and phenograph parameters) to .csv
# 3.6 - Write data (with tSNE and phenograph parameters) to .fcs
# 3.7 - Create coloured tSNE plots (separate script: github.com/sydneycytometry/tSNEplots)
###################################################### 1. INSTALL AND-OR LOAD PACKAGES ######################################################
### 1.1 - Install packages (if not already installed)
if(!require('flowCore')) {install.packages('flowCore')}
if(!require('plyr')) {install.packages('plyr')}
if(!require('Biobase')) {install.packages("Biobase")}
if(!require('Rtsne')) {install.packages("Rtsne")}
if(!require('ggplot2')) {install.packages('ggplot2')}
if(!require('devtools')){install.packages("devtools")}
if(!require('Rphenograph')){devtools::install_github("JinmiaoChenLab/Rphenograph")}
if(!require('flowViz')) {source("https://bioconductor.org/biocLite.R")
biocLite('flowViz')}
### 1.2 Load packages
library('flowCore')
library('plyr')
library('Biobase')
library('Rtsne')
library('ggplot2')
library('devtools')
library("Rphenograph")
library('flowViz')
### 1.3 - Set working directory and assign as 'PrimaryDirectory'
setwd("/Volumes/GoogleDrive/My Drive/2017 Ashhurst work/1. ASHHURST/01. R&D - Technology/[Script projects]/New/Phenograph/Sample data/")
setwd("/Users/Tom/Desktop/tSNEplots-master 2/Input/") # Set your working directory here (e.g. "/Users/Tom/Desktop/") -- press tab when selected after the '/' to see options
getwd() # Check your working directory has changed correctly
PrimaryDirectory <- getwd() # Assign the working directory as 'PrimaryDirectory'
PrimaryDirectory
### Alternative working directory method, with rstudio api
#dirname(rstudioapi::getActiveDocumentContext()$path) # Finds the directory where this script is located
#setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # Sets the working directory to where the script is located
#getwd()
#PrimaryDirectory <- getwd()
#PrimaryDirectory
###################################################### 2. BEGIN USER INPUT ######################################################
### 2.1 - Define options
## Use to list the .fcs or .csv files in the working directory
list.files(path=PrimaryDirectory, pattern = ".fcs") # list of FCS files
list.files(path=PrimaryDirectory, pattern = ".csv") # list of CSV files
## Specify options
Data_name <- "Sample_Data" # Name the dataset something meaningful (e.g. "CNS_sample", "Human_PMBC")
File_Type <- ".csv" # Use ".fcs" OR ".csv", in lower case
Run_Phenograph <- 1 # Enter a 1 to run Phenograph, or 0 to skip
Run_tSNE <- 1 # Enter a 1 to run tSNE, or 0 to skip
# Important, the only FCS/CSV file in the directory should be the one desired for analysis. If more than one are found, only the first file will be used
### 2.2 - Read files into 'data' (run all at once, no modifications required) -- if both are there, CSV will take priority
if (File_Type == ".fcs"){ # if files are .fcs, will read into data
FileNames <- list.files(path=PrimaryDirectory, pattern = ".fcs")
FileNames
fcsfilename <- FileNames[[1]] # Enter name of FCS file in between the ""
data <- exprs(read.FCS(fcsfilename, transformation = FALSE))
dim(data) # check the total number of events (first entry), and number of parameters (second entry)
data_parameters <- parameters(read.FCS(fcsfilename))
data_parameters}
if (File_Type == ".csv"){ # if files are .csv, will read into data
FileNames <- list.files(path=PrimaryDirectory, pattern = ".csv")
FileNames
FileNames[[1]]
csvfilename <- FileNames[[1]]
data <- read.csv(csvfilename)
dim(data)}
### 2.3 - Create set of column (parameter) names
head(data) # show data with headings
ColumnNames <- unname(colnames(data)) # assign reporter and marker names (column names) to 'ColumnNames'
ColumnNames # Use ColumnNames (assigned above) to review the column names
### 2.4 - Remove duplicates (optional, may cause problems for tSNE if ignored)
dim(data) # note the second entry, which is number of parameters
data <- data[!duplicated(data), ] # remove rows containing duplicate values within rounding
dim(data) # check to see if the number of parameters has reduced (no change means no duplicates were present)
### 2.5 - Modifications (IF REQUIRED, SKIP IF NOT REQUIRED)
## IF REQUIRED, Arcsinh transformation (only if required, data may have already been transformed)
data # note the kinds of values in the data set (i.e. are the between 1 and 10, 100 and 1000, etc)
asinh_scale <- 5 # for CyTOF choose between 5 and 15, for flow you will need > 200.
data <- asinh(data / asinh_scale) # transforms all columns! including event number etc
data # check the data has been converted
## IF REQUIRED, subsampling (only if required)
dim(data)
nsub <- 10000 # set the desired number of cells for subsampling
set.seed(123) # set random seed
data <- data[sample(1:nrow(data), nsub), ]
dim(data)
### 2.6 - Select the markers to use in calculation of t-SNE parameters, by including below which columns in 'data' to include
as.matrix(ColumnNames) # view the column 'number' for each parameter
ColumnNames_for_tSNE <- unname(colnames(data))[c(2,3,4,5)] # e.g. [c(11, 23, 10)] to include the markers corresponding to the column numbers 11, 23, 10
ColumnNames_for_tSNE # check that the column names that appear are the ones you want to analyse
dim(data) # Review dimensionality of 'data' -- N events and N parameters
data_specific <- data[, ColumnNames_for_tSNE] # Prepare data for Rtsne -- select columns to use
dim(data_specific) # Review dimensionality of 'data' -- N events and N parameters
# Note, the final tSNE and Phenograph parameters are added 'data' (after transformation and subsampling), not 'data_specific' stage
# data_specific is used to specify which parametes are used to run tSNE and Phenograph
###################################################### 3. END USER INPUT (unless modifying tSNE defaults) ######################################################
### Run the rest of the script all at once
### 3.1 - Create output directories
setwd(PrimaryDirectory)
dir.create("Output", showWarnings = FALSE)
setwd("Output")
OutputDirectory <- getwd()
setwd(OutputDirectory)
dir.create("Output_info", showWarnings = FALSE)
setwd(OutputDirectory)
dir.create("Output_data", showWarnings = FALSE)
### 3.2 - Save a copy of 'data' and 'data specific'
setwd(OutputDirectory)
setwd("Output_info")
## Export subsampled data in CSV format (record of what was run)
subsampled_data_name <- paste0("Subsampled_data_", Data_name)
subsampled_data <- paste(subsampled_data_name, ".csv", sep = "")
write.csv(x = data, file = subsampled_data)
## Export a list of parameters used to run tSNE and Phenograph
tSNEparametersname <- paste0("Columns_used_for_tSNE_and_phenograph_", Data_name)
tSNEparameters <- paste(tSNEparametersname, ".csv", sep = "")
write.csv(x = data_specific, file = tSNEparameters)
### 3.3 - Run tSNE ('data' at this point is scaled, transformed) -- RUN ALL OF STEP 5
if (Run_tSNE == 1) {
## Set working directory to "Output_info"
setwd(OutputDirectory)
setwd("Output_info")
## Turn sink on (send the verbose text from the tSNE algorithm progress updates to a .txt file)
verbose_name <- paste0("tSNE_verbose_", Data_name, ".txt")
sink(file = verbose_name, append=TRUE, split=FALSE, type = c("output", "message"))
## Run tSNE algorithm
set.seed(42) # default = 42 -- sets seed for reproducibility, delete or comment out for random tSNE
tsne_out <- Rtsne(as.matrix(data_specific), # dataset (assigned above) read as a matrix
dims = 2, # default = 2 -- only input 2 or 3 (2 = plot in 2D, 3 = plot in 3D)
initial_dims = 50, # default = 50 -- number of dimensions retained in initial PCA step
perplexity = 30, # default = 30
theta = 0.5, # default = 0.5 -- use 0.5 for Barnes-Hut tSNE, 0.0 for exact tSNE (takes longer)
check_duplicates = TRUE, # default = TRUE, can set to FALSE if problematic
pca = TRUE, # default = TRUE -- performs a PCA prior to actual tSNE run
max_iter = 1000, # default = 1000 -- default total iterations
verbose = TRUE, # default = FALSE -- best to set as TRUE to receive feedback
is_distance = FALSE, # default = FALSE -- experimental, using X as a distance matrix
Y_init = NULL, # default = NULL -- recommend to use NULL for random initialisation
stop_lying_iter = 250, # default = 250 -- number of iterations of early exaggeration
mom_switch_iter = 250, # default = 250 -- number of iterations before increased momentum of spread
momentum = 0.5, # default = 0.5 -- initial momentum of spread
final_momentum = 0.8, # default = 0.8 -- momentum of spread at 'final_momentum'
eta = 200, # default = 200 -- learning rate
exaggeration_factor = 12.0 # default = 12.0 -- factor used during early exaggeration
)
## turn sink off
sink()
## if tSNE detected duplicates, the following errors should occur:
# "Error in Rtsne.default(as.matrix(data_specific), dims = 2, initial_dims = 50, : Remove duplicates before running TSNE."
# "Error in inherits(.data, "split") : object 'tsne_out' not found"
## Save tsne_out output info as .csv (i.e., output settings as raw results)
tsne_out.df <- ldply(tsne_out, data.frame)
output_name <- paste0("tSNE_parameter_values_", Data_name, ".csv")
write.csv(x = tsne_out.df, file = output_name) # pretty blood good -- doesn't give row number for Y, costs, or itercosts -- but easy to figure out
## Add Y (tSNE coordinate) values to starting data
tsne_out_Y <- tsne_out$Y
tsne_out_Y
data <- cbind(data, tSNE1 = tsne_out_Y[,1])
data <- cbind(data, tSNE2 = tsne_out_Y[,2])
}
### 3.4 - Run Phenograph
if (Run_Phenograph == 1) {
setwd(OutputDirectory)
setwd("Output_info")
# set sink (send the verbose text from the tSNE algorithm progress updates to a .txt file)
verbose_ph_name <- paste0("Phenograph_verbose_", Data_name, ".txt")
sink(file = verbose_ph_name, append=TRUE, split=FALSE, type = c("output", "message"))
# Run Phenograph
data_ph_matrix <- as.matrix(data_specific)
Rphenograph_out <- Rphenograph(data_ph_matrix, k = 45)
Rphenograph_out
modularity(Rphenograph_out[[2]])
membership(Rphenograph_out[[2]])
# Save phenograph cluster number results
output_name_ph <- paste0("Phenograph_cluster_values_", Data_name, ".csv")
write.csv(x = factor(membership(Rphenograph_out[[2]])), file = output_name_ph)
# Add phenograph cluster numbers to 'data'
as.numeric(as.character(factor(membership(Rphenograph_out[[2]])))) # converts factor to integer
data$phenograph_cluster <- as.numeric(as.character(factor(membership(Rphenograph_out[[2]])))) # writes integer version to data
# end sink
sink()
sink() # just in case
}
# ggplot(data, aes(x=tSNE1, y=tSNE2, col=phenograph_cluster)) + geom_point(size = 3)+theme_bw()
# ggplot(data, aes(x=X149Sm_CD19, y=X152Gd_CD3e, col=phenograph_cluster)) + geom_point(size = 3)+theme_bw()
# shape=phenograph_cluster
### 3.5 - Write data (with tSNE and phenograph parameters) to .csv
## Set working directory
setwd(OutputDirectory)
setwd("Output_data")
## Save data (with new tSNE parameters) as CSV
new_file_name_csv <- paste0("Output_", Data_name, "_with_tSNE", ".csv")
write.csv(x = data, file = new_file_name_csv)
### 3.6 - Write data (with tSNE and phenograph parameters) to .fcs
setwd(OutputDirectory)
setwd("Output_data")
## Check data and data column names
head(data)
dimnames(data)[[2]]
## Create FCS file metadata - column names with descriptions
metadata <- data.frame(name=dimnames(data)[[2]],
desc=paste('column',dimnames(data)[[2]],'from dataset')
)
## Double check metadata
metadata
## Create FCS file metadata - ranges, min, and max settings
metadata$range <- apply(apply(data,2,range),2,diff)
metadata$minRange <- apply(data,2,min)
metadata$maxRange <- apply(data,2,max)
metadata$range
metadata$minRange
metadata$maxRange
## Create flowframe with tSNE data
data.ff <- new("flowFrame",
exprs=as.matrix(data), # in order to create a flow frame, data needs to be read as matrix
parameters=AnnotatedDataFrame(metadata)
)
## Check flow frame
data.ff
## Save flowframe as .fcs file -- save data (with new tSNE parameters) as FCS
new_file_name_fcs <- paste0("Output_ ", Data_name, "_with_tSNE", ".fcs")
write.FCS(data.ff, new_file_name_fcs)
## Move back to PrimaryDirectory
setwd(PrimaryDirectory)
getwd()
# note -- in the final data output, all parameters are included, but only the subsampled and/or transformed cells
### 3.7 - Create coloured tSNE plots
## Uses a separate script
## github.com/sydneycytometry/tSNEplots