-
Notifications
You must be signed in to change notification settings - Fork 0
/
perso_data_process.R
57 lines (41 loc) · 1.55 KB
/
perso_data_process.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Load libraries
library(ChAMP)
library(dplyr)
library(yaml)
# Load the phenotype (pD) table for the 850K/EPIC test data.
# The file is comma-separated with a header row; text columns are kept as
# character vectors rather than being converted to factors.
pD_EPIC <- read.csv(
  "config/pD_MD.txt",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Quick visual checks of what was read (printed when run at top level).
head(pD_EPIC)
# View(pD_EPIC)
pD_EPIC$Sample_Name
names(pD_EPIC)
# Format the EPIC phenotype data for ChAMP: derive the Sentrix ID and the
# Sentrix position from the file-name part of each Basename.

# Split Basename values into the piece before "_R" and the piece after it.
#
# Every "_R" in the file name is turned into ":R" and the result is split on
# ":"; the first piece is returned as `id` and the second as `position`
# (the leading "R" stays on the position).
#
# basenames: vector of Basename values (paths or bare file prefixes).
# Returns a list with two character vectors, `id` and `position`, each the
# same length as `basenames`.
# Stops with an explicit message when the Basename column is absent or when a
# value cannot be split into two pieces (no "_R" in it, or NA), instead of
# failing later with "subscript out of bounds".
split_sentrix_basename <- function(basenames) {
  if (is.null(basenames)) {
    stop("pD_EPIC has no 'Basename' column", call. = FALSE)
  }
  file_part <- basename(as.character(basenames))
  pieces <- strsplit(
    gsub("_R", ":R", file_part, fixed = TRUE),
    ":",
    fixed = TRUE
  )
  malformed <- lengths(pieces) < 2L
  if (any(malformed)) {
    stop(
      "Cannot split Basename into Sentrix ID and position (no '_R' found): ",
      paste(unique(file_part[malformed]), collapse = ", "),
      call. = FALSE
    )
  }
  list(
    id = vapply(pieces, `[[`, character(1), 1L),
    position = vapply(pieces, `[[`, character(1), 2L)
  )
}

# Parse once and reuse the result for both vectors.
sentrix_parts <- split_sentrix_basename(pD_EPIC[["Basename"]])
Sentrix_ID <- sentrix_parts$id
Sentrix_Position <- sentrix_parts$position
head(pD_EPIC)
# Build the sample sheet for ChAMP by renaming the phenotype columns.
# Sample_GSE_ID, Pool_ID, Project and Sample_Well are left blank; Sentrix_ID
# and Sentrix_Position come from the vectors computed from Basename earlier in
# this script.

# Fail early if an input column is absent. `$` on a data frame returns NULL
# for a missing name (and partial-matches similar names), so without this
# check a misnamed column would not be reported clearly at this point.
local({
  required_cols <- c(
    "Sample_Name", "Chips", "mutation", "Sex", "Sample_status",
    "Mutation_status", "Basename", "tissue"
  )
  missing_cols <- setdiff(required_cols, colnames(pD_EPIC))
  if (length(missing_cols) > 0) {
    stop(
      "pD_EPIC is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
})

ChAMP_csv <- data.frame(
  Sample_Name = pD_EPIC[["Sample_Name"]],
  Sample_Plate = pD_EPIC[["Chips"]],
  Sample_Group = pD_EPIC[["mutation"]],
  Sample_Group_2 = pD_EPIC[["Sex"]],
  Sample_GSE_ID = "",
  Sample_Status = pD_EPIC[["Sample_status"]],
  Mutation_Status = pD_EPIC[["Mutation_status"]],
  Pool_ID = "",
  Project = "",
  Sample_Well = "",
  Sentrix_ID = Sentrix_ID,
  Sentrix_Position = Sentrix_Position,
  Basename = pD_EPIC[["Basename"]],
  Tissue = pD_EPIC[["tissue"]],
  stringsAsFactors = FALSE
)
head(ChAMP_csv)
# Filter the sample sheet.
# A row is kept only when all three conditions hold:
#   - Tissue is "LCL"
#   - Sample_Status is one of CTL, Sotos, ATRX, ORC
#   - Mutation_Status is LOF or abs
# Rows where the combined condition evaluates to NA are dropped.
# NOTE(review): the previous comment here listed CTL / Discovery / Validation
# and "Blood DNA or Cell type", which does not match the values the code
# tests - confirm which set is intended.

# Print the Sample_Status levels present before filtering.
factor(ChAMP_csv$Sample_Status)

ChAMP_f_csv <- ChAMP_csv[
  which(
    with(
      ChAMP_csv,
      Tissue == "LCL" &
        Sample_Status %in% c("CTL", "Sotos", "ATRX", "ORC") &
        Mutation_Status %in% c("LOF", "abs")
    )
  ),
  ,
  drop = FALSE
]
head(ChAMP_f_csv)
dim(ChAMP_f_csv)
# CHAMP PROCEDURES_EPIC
# Write the filtered sample sheet as a comma-separated file, without row names
# and without quoting. The path is relative, so the file lands in the current
# working directory; per the original note, setwd() to the directory holding
# all the IDAT files before running this step.
write.table(
  ChAMP_f_csv,
  file = "pD_ChAMP.csv",
  sep = ",",
  quote = FALSE,
  row.names = FALSE
)