-
Notifications
You must be signed in to change notification settings - Fork 0
/
Task1.R
34 lines (26 loc) · 1011 Bytes
/
Task1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
library(readr)
library(tidyr)
library(dplyr)
library(stringr)
# Load the gene table, keeping only the three identifier columns that the rest
# of the script uses. All three are read as character so that numeric-looking
# GeneIDs are not converted to numbers.
df <- read_delim(
  file = "Homo_sapiens.gene_info",
  delim = "\t",
  col_types = cols_only(
    GeneID = "c",
    Symbol = "c",
    Synonyms = "c"
  )
)
# Build a long lookup table with one row per gene name (columns: GeneID,
# Symbol), covering both official symbols and "|"-separated synonyms.
#
# Official symbols are stacked BEFORE synonyms: lookups that use match() keep
# only the first hit, so a name that is the official symbol of one gene and a
# synonym of another must resolve to the gene that officially owns it.
newdf <- bind_rows(
  df %>%
    select(GeneID, Symbol),
  df %>%
    # NCBI gene_info uses "-" to mean "no synonyms"; drop it (and NA) so the
    # placeholder never becomes a lookup key.
    filter(!is.na(Synonyms), Synonyms != "-") %>%
    mutate(Synonyms = strsplit(Synonyms, "|", fixed = TRUE)) %>%
    unnest(Synonyms) %>%
    select(GeneID, Symbol = Synonyms)
) %>%
  # An NA key could be "matched" by an empty cell in the data being translated.
  filter(!is.na(Symbol)) %>%
  # Keep the first (highest-priority) GeneID per name; this mirrors what
  # match() would pick and keeps the table small.
  distinct(Symbol, .keep_all = TRUE)
#' Read a GMT gene-set file into a rectangular character tibble.
#'
#' GMT rows are ragged: each line holds a set name, a description and then a
#' variable number of tab-separated gene names. A delimited-table reader sizes
#' the table from the start of the file, so rows wider than that are not kept
#' intact. This reader splits the raw lines itself and pads every row to the
#' width of the widest one.
#'
#' @param path Path to the GMT file.
#' @return A tibble of character columns named X1, X2, ... with one row per
#'   non-empty line; missing or empty cells are NA.
read_gmt <- function(path) {
  gmt_lines <- readLines(path, warn = FALSE)
  gmt_lines <- gmt_lines[nzchar(gmt_lines)]
  gmt_fields <- strsplit(gmt_lines, "\t", fixed = TRUE)

  # Width of the widest row; 0 when the file has no data lines.
  n_cols <- max(lengths(gmt_fields), 0L)

  # Growing a vector with `length<-` pads it with NA, which squares off the
  # ragged rows.
  padded <- lapply(gmt_fields, function(fields) {
    length(fields) <- n_cols
    fields
  })

  cells <- matrix(
    as.character(unlist(padded)),
    nrow = length(gmt_fields),
    ncol = n_cols,
    byrow = TRUE,
    dimnames = list(NULL, paste0("X", seq_len(n_cols)))
  )
  # Treat empty fields (e.g. a blank description) as missing.
  cells[cells %in% ""] <- NA_character_

  as_tibble(cells)
}

gmtdf <- read_gmt("h.all.v2023.1.Hs.symbols.gmt")
# Number of columns after padding, i.e. the width of the widest gene set row.
max_cols <- ncol(gmtdf)
# Translate gene names to GeneIDs in every column from the third onward; the
# first two columns are left untouched (in GMT layout they hold the set name
# and description). Names with no entry in the lookup table are kept as-is.
#
# setdiff() yields an empty index set when there are fewer than three columns,
# whereas 3:ncol(gmtdf) would count downwards and touch column 2.
gene_cols <- setdiff(seq_len(ncol(gmtdf)), 1:2)
for (i in gene_cols) {
  cells <- gmtdf[[i]]
  hit <- match(cells, newdf$Symbol)
  # Skip NA cells explicitly: match() would pair an NA cell with an NA key in
  # the lookup table and fill padding with an unrelated GeneID.
  found <- !is.na(cells) & !is.na(hit)
  cells[found] <- newdf$GeneID[hit[found]]
  gmtdf[[i]] <- cells
}
# Save the translated table as CSV without a row-name column.
write.csv(
  x = gmtdf,
  file = "out.csv",
  row.names = FALSE
)