forked from ChengxiangQiu/tome_code
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Section2_trajectory_Step3_summarize_results.R
172 lines (142 loc) · 7.3 KB
/
Section2_trajectory_Step3_summarize_results.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#############################
#### Section2_trajectory ####
#############################
#### The scripts used for creating cellular trajectories
#################################
#### Step3_summarize_results ####
#################################
library(reshape2)
library(scales)
library(ggplot2)
library(dplyr)
library(gplots)
library(viridis)
work_path = ""
time_point = paste0("E", c(3.5, 4.5, 5.25, 5.5, 6.25, seq(6.5, 8.25, 0.25), "8.5a", "8.5b", seq(9.5, 13.5, 1)))
replication_times=500
res_median_umap = list()
for(time_i in 1:(length(time_point)-1)){
print(paste0(time_point[time_i], ":", time_point[time_i+1]))
dat = readRDS(paste0(work_path, "/",time_point[time_i],"_",time_point[time_i+1],"_Knn_umap.rds"))
state_1 = row.names(dat[[1]])
state_1 = paste0(time_point[time_i+1], ":", update_name(gsub(paste0(time_point[time_i+1], ":"), "", state_1)))
state_2 = names(dat[[1]])
state_2 = paste0(time_point[time_i], ":", update_name(gsub(paste0(time_point[time_i], ":"), "", state_2)))
tmp_1 = matrix(NA,nrow(dat[[1]]),ncol(dat[[1]]))
for(i in 1:nrow(dat[[1]])){
for(j in 1:ncol(dat[[1]])){
xx = NULL
for(k in 1:replication_times){
xx = c(xx, dat[[k]][i,j])
}
tmp_1[i,j] = median(xx[!is.na(xx)])
}
}
tmp_1 = data.frame(tmp_1)
row.names(tmp_1) = state_1
names(tmp_1) = state_2
res_median_umap[[time_i]] = tmp_1
}
dat = NULL
for(i in 1:length(res_median_umap)){
print(time_point[i])
dat = rbind(dat, melt(as.matrix(res_median_umap[[i]])))
}
dat = data.frame(dat)
names(dat) = c("nex", "pre", "prob")
#### As described in the manuscript, anchor-based batch correction and integration
#### of profiles of E8.5 cells from (Pijuan-Sala et al. 2019) (termed “E8.5a”)
#### and newly generated profiles of E8.5 nuclei (termed “E8.5b”) worked very
#### well with the exception of primitive erythroid cells, which we suspect may
#### be due to more extensive differences between cells vs. nuclei in this cell type.
#### we manually set E8.5b:Primitive erythroid cells as the ancestor of E8.5b:Primitive erythroid cells
dat$prob[dat$nex == "E8.5b:Primitive erythroid cells"] = 0.0
dat$prob[dat$nex == "E8.5b:Primitive erythroid cells" & dat$pre == "E8.5a:Primitive erythroid cells"] = 1.0
dat$pre_time = unlist(lapply(as.vector(dat$pre), function(x) strsplit(x,"[:]")[[1]][1]))
dat$pre_cell = unlist(lapply(as.vector(dat$pre), function(x) strsplit(x,"[:]")[[1]][2]))
dat$nex_time = unlist(lapply(as.vector(dat$nex), function(x) strsplit(x,"[:]")[[1]][1]))
dat$nex_cell = unlist(lapply(as.vector(dat$nex), function(x) strsplit(x,"[:]")[[1]][2]))
saveRDS(dat, paste0(work_path, "/edge_all.rds"))
### Here we use “cell state” to mean an annotated cluster at a given stage.
print(paste0("how many edges: ", nrow(dat)))
print(paste0("how many edges (> 0): ", nrow(dat[dat$prob>0,])))
print(paste0("how many edges (> 0.2): ", nrow(dat[dat$prob>=0.2,])))
print(paste0("how many edges (> 0.7): ", nrow(dat[dat$prob>=0.7,])))
print(paste0("how many edges (> 0.8): ", nrow(dat[dat$prob>=0.8,])))
print(paste0("how many nodes: ", length(unique(c(as.vector(dat$pre), as.vector(dat$nex))))))
print(paste0("how many cell types: ", length(unique(c(as.vector(dat$pre_cell), as.vector(dat$nex_cell))))))
#### extract edges with prob > 0.2
x = dat[dat$prob>=0.2,]
x = x[,c("pre","nex","prob")]
print(paste0("how many nodes now: ", length(unique(c(as.vector(x$pre), as.vector(x$nex))))))
#### As described in the manuscript, we introduced 4 “dummy nodes”, corresponding
#### to morula at E3.0 (as a root for trophectoderm and inner cell mass),
#### trophectoderm at E3.5 and E4.5 (which had been removed at these timepoints
#### by immunosurgery (Mohammed et al. 2017)) and parietal endoderm at E6.75
#### (undetected, likely due to undersampling). For technical reasons (see above),
#### we also introduced an edge between primitive erythroid cells at E8.5a and E8.5b.
x = x[!x$nex %in% c("E5.25:Extraembryonic ectoderm", "E7:Parietal endoderm"),]
dummy = NULL
dummy = rbind(dummy, c("E3:Morula","E3.5:Inner cell mass",1))
dummy = rbind(dummy, c("E3:Morula","E3.5:Trophectoderm",1))
dummy = rbind(dummy, c("E3.5:Trophectoderm","E4.5:Trophectoderm",1))
dummy = rbind(dummy, c("E4.5:Trophectoderm","E5.25:Extraembryonic ectoderm",1))
dummy = rbind(dummy, c("E6.5:Parietal endoderm","E6.75:Parietal endoderm",1))
dummy = rbind(dummy, c("E6.75:Parietal endoderm","E7:Parietal endoderm",1))
dummy = data.frame(dummy)
names(dummy) = c("pre","nex","prob")
res = rbind(x, dummy)
dat_sub = res
dat_sub$pre_cell = unlist(lapply(as.vector(dat_sub$pre), function(x) strsplit(x,"[:]")[[1]][2]))
dat_sub$nex_cell = unlist(lapply(as.vector(dat_sub$nex), function(x) strsplit(x,"[:]")[[1]][2]))
sum(dat_sub$pre_cell == dat_sub$nex_cell)
sum(dat_sub$pre_cell == dat_sub$nex_cell)/nrow(dat_sub)
sum(dat_sub$pre_cell != dat_sub$nex_cell)
sum(dat_sub$pre_cell != dat_sub$nex_cell)/nrow(dat_sub)
#### summary on finally used to create the tree
print(paste0("how many edges: ", nrow(res)))
print(paste0("how many nodes: ", length(unique(c(as.vector(res$pre), as.vector(res$nex))))))
nex_list = as.vector(unique(res$nex))
tree = NULL
for(i in 1:length(nex_list)){
res_sub = res[res$nex==nex_list[i],]
if(nrow(res_sub)==1){
tree = rbind(tree, res_sub)
} else {
res_sub = res_sub[order(res_sub$prob, decreasing = TRUE),]
tree = rbind(tree, res_sub[1,])
}
}
tree = data.frame(tree)
tree = tree[,c("pre","nex")]
write.table(res, paste0(work_path, "/edge_prob.txt"), row.names = F, col.names = F, quote = F, sep = "\t")
write.table(tree, paste0(work_path, "/mouse_edge.txt"), row.names = F, col.names = F, quote = F, sep = "\t")
#### create heatmap of each pair of adjacent stages
dat = readRDS(paste0(work_path, "/edge_all.rds"))
time_point = paste0("E", c(3.5, 4.5, 5.25, 5.5, 6.25, seq(6.5, 8.25, 0.25), "8.5a", "8.5b", seq(9.5, 13.5, 1)))
mm_cell_type_list = read.table(paste0("help_code/celltype_group.txt"), as.is=T, sep="\t")
mm_cell_type_list = as.vector(mm_cell_type_list$V1)
save_path = ""
kk = 1
df = data.frame(matrix(c(1,1,0,0),2,2))
rownames(df) = c("Epiblast","Hypoblast")
colnames(df) = c("Inner cell mass","Inner cell mass_NA")
pdf(paste0(save_path, "heatmap_",kk,".pdf"),10,10)
print(heatmap.2(as.matrix(df), col=viridis, scale="none", Rowv = FALSE, Colv = FALSE, key=T, density.info="none", trace="none", cexRow=0.6, cexCol = 0.6))
dev.off()
write.table(rownames(df),paste0(save_path, "rownames_",kk,".txt"), row.names=F, col.names=F, sep="\t", quote=F)
write.table(colnames(df),paste0(save_path, "colnames_",kk,".txt"), row.names=F, col.names=F, sep="\t", quote=F)
for(kk in 2:19){
print(paste0(kk,"/",19))
dat_sub = dat[dat$pre_time == time_point[kk],c("pre_cell","nex_cell","prob")]
df = dcast(dat_sub, nex_cell~pre_cell)
rownames(df) <- df[,1]; df <- df[,-1]
x = mm_cell_type_list[mm_cell_type_list %in% rownames(df)]
y = mm_cell_type_list[mm_cell_type_list %in% colnames(df)]
df = df[x,y]
pdf(paste0(save_path, "heatmap_",kk,".pdf"),10,10)
print(heatmap.2(as.matrix(df), col=viridis, scale="none", Rowv = FALSE, Colv = FALSE, key=T, density.info="none", trace="none", cexRow=0.6, cexCol = 0.6))
dev.off()
write.table(rownames(df),paste0(save_path, "rownames_",kk,".txt"), row.names=F, col.names=F, sep="\t", quote=F)
write.table(colnames(df),paste0(save_path, "colnames_",kk,".txt"), row.names=F, col.names=F, sep="\t", quote=F)
}