-
Notifications
You must be signed in to change notification settings - Fork 1
/
protein inference.Rmd
239 lines (189 loc) · 11.8 KB
/
protein inference.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
---
title: "protein inference method in maxquant"
output: html_notebook
---
```{r setup}
library(tidyverse)
```
```{r function}
protieninference <- function(pep2pr, delim = ";"){
# prepare
# find the distinct peptide ids, which there is only one protein matched to a protein
distinct_peptide_ids <- pep2pr$id[which(unlist(lapply(pep2pr$Proteins, function(x){length(unlist(strsplit(x, delim)))})) ==1)]
# break apart, convert to 1 to 1 mapping
pep2pr_1v1 <- tidytext::unnest_tokens(pep2pr,Protein, Proteins, token = "regex", pattern = delim, to_lower = FALSE)
# produce the protein list with peptide id mapping, and order
pr2pep <- aggregate(.~Protein, data = pep2pr_1v1, paste, collapse = ";")
pr2pep$counts <- unlist(lapply(pr2pep$id, function(x){length(unlist(strsplit(x, ";")))}))
pr2pep <- pr2pep[order(pr2pep$counts,pr2pep$Protein,decreasing = TRUE),]
# create a empty protein_group file
pg <- data.frame(matrix(ncol = 12, nrow = 0))
colnames(pg) <- c("Leading_Protein", # Leading_Protein
"peptide_Counts", #peptide_Count
"#_of_subs", #number_of_subs
"peptide_id_leading", #id
"peptide_counts_each", #peptide_Count_each
"peptide_counts_razor_each", #peptide_Count_razor_each
"peptide_counts_distinct_each", #peptide_Count_distinct_each
"protein_names_each", #protein_v[1]
"peptide_Ids_each",# pep_v[1],
"peptide_is_razor_leading",#peptide_is_razor_leading
"peptide_Count_razor_leading",
"peptide_IDs_razor_each") #peptide_is_razor
# using vector, instead of data.frame is much faster in R: https://stackoverflow.com/questions/2908822/speed-up-the-loop-operation-in-r
protein_v <- pr2pep[,1] # get the protein list vector
pep_v <- pr2pep[,2] # get the peptide id list vector
count_v <- pr2pep[,3] # get the peptide counts list vector
razor_peptide_ids_global <- c() # to store the razoer peptide set (combine all peptides occured)
i<-0 # mark how many protein groups inferred
# start the loop
while(length(protein_v)>1){
# initialization for each top row
Leading_Protein <-protein_v[1]
peptide_ids_string_leading_protein <- pep_v[1]
peptide_ids_leading_protein <- unlist(strsplit(pep_v[1], ";")) # split peptide_ids into a vector
#
peptide_Count_each <- length(peptide_ids_leading_protein)
peptide_Count_razor_leading <- peptide_Count_razor_each <- sum(!(peptide_ids_leading_protein %in% razor_peptide_ids_global))
peptide_IDs_razor_each <- paste(peptide_ids_leading_protein[!(peptide_ids_leading_protein %in% razor_peptide_ids_global)], collapse = ";")
peptide_is_razor_leading <- paste(!(peptide_ids_leading_protein %in% razor_peptide_ids_global), collapse = ";")
peptide_Count_distinct_each <- sum(peptide_ids_leading_protein %in% distinct_peptide_ids)
peptide_Count <- count_v[1]
number_of_subs <-1
sub_index_to_del <- c(1)
for (j in 2:length(protein_v)){ # start from the bottom(shortest ones), it will avoid missing lines
# the logic is, if find any sub, merge, otherwise, list it. Then delete this line
if( all(unlist(strsplit(pep_v[j], ";")) %in% peptide_ids_leading_protein)){
ids_this_subprotein <- unlist(strsplit(pep_v[j], ";")) # split ids of the current proteins
peptide_Count_each <- paste(peptide_Count_each,length(ids_this_subprotein),sep = ";")
peptide_Count_razor_each <- paste(peptide_Count_razor_each,sum(!(ids_this_subprotein %in% razor_peptide_ids_global)),sep = ";")
peptide_Count_distinct_each <- paste(peptide_Count_distinct_each,sum(ids_this_subprotein %in% distinct_peptide_ids),sep = ";")
peptide_IDs_razor_each <- paste(peptide_IDs_razor_each, ids_this_subprotein[!(ids_this_subprotein %in% razor_peptide_ids_global)], sep = "/")
protein_v[1] <- paste(protein_v[1], protein_v[j], sep = ";") # paste protein names
pep_v[1] <- paste(pep_v[1], pep_v[j], sep = "/") # paste peptide ids
number_of_subs <- number_of_subs+1 # count how many proteins
sub_index_to_del <- c(sub_index_to_del, j)
# remove this element, because it is a subprotein of a longer one
}
}
razor_peptide_ids_global <- unique(c(razor_peptide_ids_global, peptide_ids_leading_protein)) # store all previously used peptide id to determine if this is a razor peptide
i <- i+1
pg[i,] <- c(Leading_Protein,
peptide_Count,
number_of_subs,
peptide_ids_string_leading_protein,
peptide_Count_each,
peptide_Count_razor_each,
peptide_Count_distinct_each,
protein_v[1],
pep_v[1],
peptide_is_razor_leading,
peptide_Count_razor_leading,
peptide_IDs_razor_each)# add this to the new data.frames
cat(i," \t",Leading_Protein,"\twith",peptide_Count," peptides;\t")
cat("Found: ", number_of_subs-1, " subproteins;\t")
cat(length(protein_v)-1, " rows left\n")
# remove these rows (the leading protein and the sub proteins) from the vector
protein_v <- protein_v[-sub_index_to_del]
pep_v <- pep_v[-sub_index_to_del]
count_v <- count_v[-sub_index_to_del]
}
# if there is one last lien alone
if(length(protein_v)==1){
pg[i+1,] <- c(protein_v,
count_v,
1, # number of sub proteins
pep_v, # peptide ids
peptide_Count_each,
peptide_Count_razor_each,
peptide_Count_distinct_each,
protein_v,
pep_v, #peptide_Ids_each
peptide_is_razor_leading,
peptide_Count_razor_leading,
#peptide_IDs_razor_leading,
peptide_IDs_razor_each)
}
# re examine the group uniqueness, because the uniqueness here is group unique,
pg_mapping <- pg %>% rownames_to_column(var = "pg_id") %>%
select(pg_id,peptide_id_leading) %>%
tidytext::unnest_tokens(.,peptide_id, peptide_id_leading, token = "regex", pattern = ";", to_lower = FALSE) %>%
aggregate(.~peptide_id, data = ., paste, collapse = ";")
pg_unique_peptide_ids <- pg_mapping$peptide_id[which(unlist(lapply(pg_mapping$pg_id, function(x){length(unlist(strsplit(x, ";")))})) ==1)]
# adding columns
pg$peptide_counts_unique_each <- unlist(lapply(pg$peptide_Ids_each, function(x){
paste(unlist(lapply(unlist(strsplit(x, "/")), function(x){sum(unlist(strsplit(x, ";")) %in% pg_unique_peptide_ids)})), collapse = ";" )
}))
pg$peptide_is_unique_leading <- unlist(lapply(pg$peptide_Ids_each, function(x){
paste(unlist(strsplit(unlist(strsplit(x, "/"))[[1]] , ";")) %in% pg_unique_peptide_ids, collapse = ";")
}))
pg$peptide_counts_razor_unique_each <- unlist(lapply(1:nrow(pg), function(x){
peptide_IDs_each <- unlist(strsplit(pg$peptide_Ids_each[x], "/"))
peptide_IDs_razor_each <- unlist(strsplit(pg$peptide_IDs_razor_each[x], "/"))
if(length(peptide_IDs_each) == 1 && length(peptide_IDs_razor_each) ==0){
return(0)
}else{
return(
paste(unlist(lapply(1: length(peptide_IDs_each), function(x){
peptide_IDs_this <- unlist(strsplit(peptide_IDs_each[x], ";"))
peptide_IDs_unique_this <- peptide_IDs_this[peptide_IDs_this %in% pg_unique_peptide_ids]
peptide_IDs_razor_this <- unlist(strsplit(peptide_IDs_razor_each[x], ";")) # razor list is already generated
length(unique(c(peptide_IDs_unique_this,peptide_IDs_razor_this)))
})), collapse = ";" )
)
}
}))
pg$peptide_counts_razor_unique_leading <- unlist(lapply(pg$peptide_counts_razor_unique_each, function(x){
unlist(strsplit(x, ";"))[[1]]
}))
pg$peptide_counts_unique_leading <- unlist(lapply(pg$peptide_counts_unique_each, function(x){
unlist(strsplit(x, ";"))[[1]]
}))
# organize column nanmes for easy comparison with maxquant result
pg <- pg %>% filter(peptide_counts_razor_unique_leading >0) %>% # remove pg with 0 razor peptides
rename("Peptide counts leading"="peptide_Counts",
"Number of proteins"="#_of_subs",
"Peptide IDs leading" = "peptide_id_leading",
"Peptide counts (all)"= "peptide_counts_each",
"Peptide counts (razor)" = "peptide_counts_razor_each",
"Peptide counts (distinct)" = "peptide_counts_distinct_each",
"Protein IDs" = "protein_names_each" ,
"peptide IDs each" = "peptide_Ids_each",
"Peptide counts (unique)" = "peptide_counts_unique_each",
"Peptide counts (razor) leading" = "peptide_Count_razor_leading",
"peptide_is_razor leading" ="peptide_is_razor_leading",
"peptide_is_unique leading" ="peptide_is_unique_leading",
"Peptide counts (razor+unique)" = "peptide_counts_razor_unique_each",
"Peptide counts (unique) leading" = "peptide_counts_unique_leading",
"Peptide counts (razor+unique) leading"= "peptide_counts_razor_unique_leading"
#"peptide IDs unique each" = "pg_unique_peptide_ids"
) %>%
select("Leading_Protein",
"Peptide counts leading",
"Peptide counts (razor) leading",
"Peptide counts (unique) leading",
"Peptide counts (razor+unique) leading",
"Peptide IDs leading",
"peptide_is_razor leading",
"peptide_is_unique leading",
"Number of proteins",
"Protein IDs",
"Peptide counts (all)",
"Peptide counts (razor)",
"Peptide counts (unique)",
"Peptide counts (razor+unique)",
"Peptide counts (distinct)",
"peptide IDs each"
#"peptide IDs unique each"
)
return(pg)
}
```
```{r test}
pep2pr <- rio::import("QuantifiedPeptides.tsv", check.names = TRUE) %>% select(c("id", "Proteins")) # only use two columns
# pep2pr_sub <- pep2pr %>% sample_n(2000)
system.time(pg <- protieninference(pep2pr))
head(pg)
rio::export(pg, "proteinGroups_homebrew.txt")
################
```