-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_GEO_acess.R
83 lines (61 loc) · 2.22 KB
/
test_GEO_acess.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
library(GEOquery)
library(Biobase)
library(ChAMP)
library(yaml)
# ---- Download the GEO series and build the raw phenotype table ----
# Reads the series accession from config.yml, downloads its supplementary
# files and series matrix, unpacks the raw IDAT archive, and writes
# "pD_ChAMP_test.txt": one row per sample of the first platform in the series.

# Load the config file and extract the GEO series accession from it.
yaml.file <- yaml.load_file("config.yml")
GSE_NUM <- yaml.file$GSE_NUM
if (!is.character(GSE_NUM) || length(GSE_NUM) != 1L ||
    is.na(GSE_NUM) || !nzchar(GSE_NUM)) {
  stop("config.yml must define a single non-empty `GSE_NUM` entry.",
       call. = FALSE)
}

getGEOSuppFiles(GSE_NUM)
GSE <- getGEO(GEO = GSE_NUM)
# Only the first element of the series is used for the phenotype data.
pd <- pData(phenoData(GSE[[1]]))

# NOTE: the working directory is changed on purpose and not restored; the
# files written and read from here on use paths relative to this directory.
setwd(paste0(GSE_NUM, "/"))
untar(tarfile = paste0(GSE_NUM, "_RAW.tar"))

# Every array ships two IDAT files (..._Grn / ..._Red). Strip the channel
# suffix and de-duplicate so that each array contributes exactly one basename;
# without this each sample appears twice and data.frame() silently recycles
# the phenotype columns against the longer basename vector.
idat_names <- list.files(pattern = "idat")
sample_basenames <- unique(sub("(_Gr|_Re).*$", "", basename(idat_names)))

# Align basenames with the rows of `pd` by sample accession instead of relying
# on directory listing order.
# NOTE(review): assumes the IDAT names start with "<GSM accession>_" — confirm
# for this series.
sample_gsm <- sub("_.*$", "", sample_basenames)
gsm_ids <- as.character(pd[["geo_accession"]])
row_of_sample <- if (length(gsm_ids) == nrow(pd)) {
  match(gsm_ids, sample_gsm)
} else {
  rep(NA_integer_, nrow(pd))
}
if (all(is.na(row_of_sample)) && length(sample_basenames) == nrow(pd)) {
  # No accession matched, but there is exactly one basename per sample:
  # fall back to pairing by position.
  warning("IDAT names could not be matched to GEO accessions; ",
          "pairing samples and files by position.", call. = FALSE)
  row_of_sample <- seq_len(nrow(pd))
}
if (anyNA(row_of_sample)) {
  warning(sum(is.na(row_of_sample)), " sample(s) have no IDAT file; ",
          "their Basename is NA.", call. = FALSE)
}

# Per-sample basename table, one row per row of `pd` and in the same order.
list_files_p <- data.frame(Basename = sample_basenames[row_of_sample],
                           stringsAsFactors = FALSE)
# `list_files` is kept as a per-sample table as well, so any later code that
# indexes it stays aligned with the phenotype rows.
list_files <- list_files_p

# `[[` is used for the GEO characteristic columns because their names contain
# characters that are not valid in bare R names; a missing column yields NULL
# and is simply left out of the table.
ChAMP_txt <- data.frame(
  Sample_Name = pd[["title"]],
  Chips = pd[["channel_count"]],
  mutation = pd[["organism_ch1"]],
  Sex = pd[["Sex:ch1"]],
  Sample_status = pd[["case control status:ch1"]],
  Mutation_status = pd[["organism_ch1"]],
  Basename = list_files_p$Basename,
  tissue = pd[["source_name_ch1"]],
  stringsAsFactors = FALSE
)

# NOTE(review): fields are written unquoted, so a comma inside a title or
# tissue value would shift columns when the file is read back.
write.table(ChAMP_txt, "pD_ChAMP_test.txt",
            row.names = FALSE, quote = FALSE, sep = ",")
###############################################################################
# Test data_850K: reload the phenotype table written to "pD_ChAMP_test.txt"
# and reshape it into the sample-sheet layout saved as "pd_ChAMP.csv".
pD_EPIC <- read.csv("pD_ChAMP_test.txt", header = TRUE, sep = ",",
                    stringsAsFactors = FALSE)
if (!"Basename" %in% colnames(pD_EPIC)) {
  stop("pD_ChAMP_test.txt has no `Basename` column.", call. = FALSE)
}

# Quick inspection of what was read (printed when run interactively/Rscript).
head(pD_EPIC)
# View(pD_EPIC)
pD_EPIC$Sample_Name
colnames(pD_EPIC)

# Format the pD file for ChAMP (change column names).
# Both Sentrix fields are derived from the Basename column of the table that
# was just read, so they always have one entry per row of `pD_EPIC`, in the
# same order as the other columns.
array_basenames <- basename(as.character(pD_EPIC$Basename))

# Sentrix_ID: everything before the first "_R" (the text preceding the array
# position); a missing basename stays NA.
Sentrix_ID <- vapply(
  strsplit(gsub("_R", ":R", array_basenames), ":"),
  function(parts) parts[1],
  character(1)
)
# Sentrix_Position: third "_"-separated field of the basename.
Sentrix_Position <- vapply(
  strsplit(array_basenames, "_"),
  function(parts) parts[3],
  character(1)
)
head(pD_EPIC)

# Placeholder for the sample-sheet columns that are left empty; built with an
# explicit length so an empty input table yields an empty sheet, not an error.
blank <- rep("", nrow(pD_EPIC))

ChAMP_csv <- data.frame(
  Sample_Name = pD_EPIC$Sample_Name,
  Sample_Plate = pD_EPIC$Chips,
  Sample_Group = pD_EPIC$mutation,
  Sample_Group_2 = pD_EPIC$Sex,
  Sample_GSE_ID = blank,
  Sample_Status = pD_EPIC$Sample_status,
  Mutation_Status = pD_EPIC$Mutation_status,
  Pool_ID = blank,
  Project = blank,
  Sample_Well = blank,
  Sentrix_ID = Sentrix_ID,
  Sentrix_Position = Sentrix_Position,
  Basename = pD_EPIC$Basename,
  Tissue = pD_EPIC$tissue,
  stringsAsFactors = FALSE
)

# To be reviewed: na.omit() removes every row that has an NA in ANY column,
# not only rows lacking a Basename. The number of removed rows is reported so
# the loss is visible.
n_before <- nrow(ChAMP_csv)
ChAMP_csv <- na.omit(ChAMP_csv)
n_dropped <- n_before - nrow(ChAMP_csv)
if (n_dropped > 0L) {
  message(n_dropped, " of ", n_before,
          " sample row(s) dropped because of missing values.")
}

write.table(ChAMP_csv, "pd_ChAMP.csv",
            row.names = FALSE, quote = FALSE, sep = ",")