.Rhistory

library(treeClust)
# Render the parameterised clustering report. `hcut` lives in factoextra, which
# is only attached further down, so it is namespace-qualified here to make this
# call work in a fresh session.
rmarkdown::render(
  "markdown_reports/clustering_report.Rmd",
  params = list(cl_method = factoextra::hcut, Rows_num = 4)
)
# Loading
library(factoextra)
# library("NbClust")
data(USArrests)                 # load the data set
df <- scale(x = USArrests)      # standardise every column

# Show the first `params$Rows_num` rows of the scaled data
head(df, n = params$Rows_num)

set.seed(123)

# Elbow method (within-cluster sum of squares); dashed guide line at k = 4
fviz_nbclust(x = df, FUNcluster = params$cl_method, method = "wss") +
  geom_vline(linetype = 2, xintercept = 4) +
  labs(subtitle = "Elbow method")

# Silhouette method
fviz_nbclust(x = df, FUNcluster = params$cl_method, method = "silhouette") +
  labs(subtitle = "Silhouette method")

# Gap statistic
# nboot is the number of bootstrap samples; 200 are used here.
# A larger value (e.g. nboot = 500) is recommended for a real analysis.
# Use verbose = FALSE to hide the computing progression.
fviz_nbclust(
  x = df,
  FUNcluster = params$cl_method,
  method = "gap_stat",
  nboot = 200,
  nstart = 25
) +
  labs(subtitle = "Gap statistic method")
# Render the parameterised clustering report, passing `hcut` as the clustering
# function (`cl_method`) and 4 as `Rows_num`.
rmarkdown::render("markdown_reports/clustering_report.Rmd", params = list(cl_method = hcut, Rows_num = 4))
# The two calls below repeat the same render with identical arguments.
rmarkdown::render("markdown_reports/clustering_report.Rmd", params = list(cl_method = hcut, Rows_num = 4))
rmarkdown::render("markdown_reports/clustering_report.Rmd", params = list(cl_method = hcut, Rows_num = 4))
# Loading
library(factoextra)
# Same render once more, after factoextra has been attached again.
rmarkdown::render("markdown_reports/clustering_report.Rmd", params = list(cl_method = hcut, Rows_num = 4))
# Two example vectors and their element-wise product.
s1 <- c(1, 2)
s2 <- c(3, 4)
s1 * s2

# g(): element-wise product of `lam` and `v`.
#   lam, v : numeric vectors (recycled by R's usual rules).
#   returns: the numeric vector lam * v.
g <- function(lam, v) {
  # The product is the last expression rather than an assignment, so the value
  # is returned visibly and prints when g() is called at the console.
  lam * v
}
g(s1, s2)
# g(): element-wise product of `lam` and `v`, returned visibly.
g <- function(lam, v) {
  lam * v
}
g(s1, s2)

# rnorm()'s first argument is the number of draws, so these two calls request
# zero draws from a normal with mean 1.
rnorm(n = 0, mean = 1)
rnorm(n = 0, mean = 1)
?rnorm
# Standard-normal draws: one, then ten, then one again.
rnorm(n = 1, mean = 0, sd = 1)
rnorm(n = 10, mean = 0, sd = 1)
rnorm(n = 1, mean = 0, sd = 1)
# Example vectors and their element-wise products.
s1 <- c(1, 2)
s2 <- c(3, 4)
s3 <- c(2, 2)
s1 * s2
s1 * s2 * s3

# g(): element-wise product of `lam` and `v`, multiplied by two fresh
# standard-normal draws.
#   lam, v : numeric vectors.
#   returns: the numeric vector lam * noise * v.
g <- function(lam, v) {
  noise <- rnorm(n = 2, mean = 0, sd = 1)
  lam * noise * v
}
g(s1, s2)
rnorm
rnorm(n = 2, mean = 0, sd = 1)
# g(): element-wise product of `lam` and `v`.
g <- function(lam, v) {
  # Two standard-normal draws are taken but do not enter the result; the call
  # is kept because it still advances the random-number stream.
  rnorm(n = 2, mean = 0, sd = 1)
  lam * v
}
g(s1, s2)
# g(): element-wise product of `lam` and `v`, multiplied by two standard-normal
# draws taken on every call.
g <- function(lam, v) {
  lam * rnorm(n = 2, mean = 0, sd = 1) * v
}
g(s1, s2)
# g(): sum over the element-wise product of `lam`, two standard-normal draws
# and `v`.
#   returns: a single number.
g <- function(lam, v) {
  draws <- rnorm(n = 2, mean = 0, sd = 1)
  weighted <- lam * draws * v
  sum(weighted)
}
g(s1, s2)
# Exploring seq(): help page, then two grids with step 0.5.
?seq
seq(0,1, by = 0.5)
seq(0.1,1.1, by = 0.5)
# `t` is the grid from 0.1 to 1.1 in steps of 0.5; `e` concatenates cos(t)
# followed by sin(t) into one vector. Note that `t` shares its name with the
# base transpose function.
t <- seq(0.1,1.1,0.5)
e <- c(cos(t), sin(t))
e
# Re-run of the same assignments, additionally printing `t`.
t <- seq(0.1,1.1,0.5)
e <- c(cos(t), sin(t))
e
t
# Re-run, additionally printing cos(t) on its own.
t <- seq(0.1,1.1,0.5)
e <- c(cos(t), sin(t))
e
t
cos(t)
# Re-run, then multiply `s1` by the concatenated vector `e`; a shorter `s1`
# is recycled along `e`.
t <- seq(0.1,1.1,0.5)
e <- c(cos(t), sin(t))
e
t
cos(t)
s1
s1 * e
# Re-run, multiplying `s1` by cos(t) only.
# NOTE(review): if `s1` still has length 2 here, its length does not divide
# length(cos(t)) and R warns about the recycling -- confirm.
t <- seq(0.1,1.1,0.5)
e <- c(cos(t), sin(t))
e
t
cos(t)
s1
s1 * cos(t)
# Re-run, scaling cos(t) by the first element of `s1` only.
t <- seq(0.1,1.1,0.5)
e <- c(cos(t), sin(t))
e
t
cos(t)
s1
s1[1] * cos(t)
# Load the clustering result table. The table is read before it is inspected;
# the history called head() on it before the object existed.
result_food_uft_DEL_k <- read.csv("/Users/xbasra/Documents/Data/Clustering_Food_Alergies/CsvData/result_food_uft_DEL_k.csv")
head(result_food_uft_DEL_k)
dim(result_food_uft_DEL_k)

# Compare the two candidate column windows, then keep columns 3 to 20.
head(result_food_uft_DEL_k[, 2:20])
head(result_food_uft_DEL_k[, 3:20])
result_food_uft_DEL_k <- result_food_uft_DEL_k[, 3:20]
head(result_food_uft_DEL_k)
library(igraph)

# Read the embedding table.
EDL_Embedding <- read.csv("/Users/xbasra/Documents/Data/Clustering_Food_Alergies/CsvData/EDL_Embedding.csv")
head(EDL_Embedding)

# cor() correlates columns, so the table is transposed to obtain the
# correlation between every pair of rows of the embedding.
Em_corr <- cor(t(EDL_Embedding))
dim(Em_corr)
head(Em_corr, 3)

# Number of cells in the correlation matrix, for comparison with the counts
# below.
prod(dim(Em_corr))

# Count the entries above each candidate cut-off in a single pass. Cut-offs
# closer to 1 than double precision can represent are left out: they parse to
# exactly 1 and therefore only count entries greater than 1.
cutoffs <- c(0.8, 0.9, 0.95, 0.99, 0.995, 0.999, 0.9995, 0.9999, 0.99999, 0.999999)
setNames(
  vapply(cutoffs, function(cutoff) sum(Em_corr > cutoff, na.rm = TRUE), integer(1)),
  cutoffs
)
head(Em_corr[Em_corr > 0.99999], 30)

# Zero out everything below the chosen cut-off and build the weighted graph
# from the full (square) matrix. Subsetting with a logical mask would return
# a plain vector, which graph_from_adjacency_matrix() cannot use.
Em_corr[Em_corr < 0.9995] <- 0
network <- graph_from_adjacency_matrix(
  Em_corr,
  mode = "undirected",
  weighted = TRUE,
  diag = FALSE
)
plot(network)
# Attach the packages before their functions are used: catdes() comes from
# FactoMineR and tableby() from arsenal.
library(FactoMineR)
library(arsenal)

# Start from the file on disk so the table is in a known state. Re-running the
# column subsetting on an already subset table dropped extra columns.
result_food_uft_DEL_k <- read.csv("/Users/xbasra/Documents/Data/Clustering_Food_Alergies/CsvData/result_food_uft_DEL_k.csv")
# result_food_uft_DEL_k <- result_food_uft_DEL_k[, 3:20]
head(result_food_uft_DEL_k)
dim(result_food_uft_DEL_k)

# Per-cluster summary table of every other column.
table_uft_DEL_k <- tableby(cluster ~ ., data = as.list(result_food_uft_DEL_k))
summary(table_uft_DEL_k, title = "Charachtaristcs of Clusters")

# adding the id variable
# result_food_uft_DEL_k$ID <- food_data_id$ID
# write.csv(result_food_uft_DEL_k, "/Users/xbasra/Documents/Data/Clustering_Food_Alergies/CsvData/result_food_uft_DEL_k.csv")

# Describe the clusters; the cluster label is converted to a factor first.
result_food_uft_DEL_k$cluster <- as.factor(result_food_uft_DEL_k$cluster)
# result_food_uft_DEL_k <- result_food_uft_DEL_k[-c(1, 2, 20)]
# NOTE(review): 17 is a hard-coded column position; confirm it is the
# `cluster` column of the table as read from disk.
catdes(result_food_uft_DEL_k, 17)
# Attach the packages before Rtsne() and ggplot() are called.
library(Rtsne)
library(ggplot2)

class(EDL_Embedding)
dim(EDL_Embedding)
head(EDL_Embedding)

# Run a seeded 2-D t-SNE and attach the cluster labels.
#   embedding : table passed to Rtsne() as `X`.
#   clusters  : one label per row of `embedding`; stored as factor column `cl`.
#   perplexity: t-SNE perplexity.
#   seed      : RNG seed set immediately before the run.
#   returns   : data frame with columns X, Y and cl.
tsne_2d_with_clusters <- function(embedding, clusters, perplexity, seed = 10) {
  set.seed(seed)
  fit <- Rtsne(
    X = embedding,
    perplexity = perplexity,
    is_distance = FALSE,
    check_duplicates = FALSE
  )
  coords <- setNames(data.frame(fit$Y), c("X", "Y"))
  coords$cl <- factor(clusters)
  coords
}

# Same perplexity values, in the same order, as tried by hand; the last one
# (150) is what remains in `tsne_converted_food_DEL` afterwards.
for (perplexity in c(200, 100, 50, 500, 150)) {
  tsne_converted_food_DEL <- tsne_2d_with_clusters(
    EDL_Embedding,
    result_food_uft_DEL_k$cluster,
    perplexity
  )
  # Plots are not auto-printed inside a loop, so print explicitly.
  print(
    ggplot(tsne_converted_food_DEL, aes(x = X, y = Y, color = cl)) +
      geom_point()
  )
}
?Rtsne
library(plotly)
# packageVersion() reports the installed version of a package;
# package_version() instead parses a version string and rejects "plotly".
packageVersion("plotly")

# 3-D t-SNE of the embedding. The seed is set so the run is repeatable.
set.seed(10)
tsne_converted_food_DEL <- Rtsne(
  X = EDL_Embedding,
  perplexity = 150,
  dims = 3,
  is_distance = FALSE,
  check_duplicates = FALSE
)
class(tsne_converted_food_DEL)
class(tsne_converted_food_DEL$Y)
dim(tsne_converted_food_DEL$Y)

# Keep only the coordinates, as a data frame with named axes.
tsne_converted_food_DEL <- tsne_converted_food_DEL$Y %>%
  data.frame() %>%
  setNames(c("X", "Y", "Z"))
class(tsne_converted_food_DEL)
head(tsne_converted_food_DEL)
tsne_converted_food_DEL$cl <- factor(result_food_uft_DEL_k$cluster)

# Interactive 3-D scatter coloured by cluster. The colour is mapped to `cl`,
# the column created above, and the axes are labelled by t-SNE dimension.
p <- plot_ly(
  tsne_converted_food_DEL,
  x = ~X, y = ~Y, z = ~Z,
  color = ~cl,
  colors = c("#BF382A", "#0C4B8E")
) %>%
  add_markers() %>%
  layout(scene = list(
    xaxis = list(title = "Dim1"),
    yaxis = list(title = "Dim2"),
    zaxis = list(title = "Dim3")
  ))
p
# 2-D t-SNE at perplexity 150, seeded; coordinates are stored with the cluster
# label and plotted coloured by cluster.
set.seed(10)
tsne_converted_food_DEL <- Rtsne(
  X = EDL_Embedding,
  perplexity = 150,
  is_distance = FALSE,
  check_duplicates = FALSE
)
tsne_converted_food_DEL <- setNames(
  data.frame(tsne_converted_food_DEL$Y),
  c("X", "Y")
)
tsne_converted_food_DEL$cl <- factor(result_food_uft_DEL_k$cluster)
ggplot(data = tsne_converted_food_DEL, mapping = aes(x = X, y = Y, color = cl)) +
  geom_point()

library(plotly)
# 3-D t-SNE kept in its own object so the 2-D result above is not overwritten.
tsne_converted_food_DEL_3d <- Rtsne(
  X = EDL_Embedding,
  perplexity = 150,
  dims = 3,
  is_distance = FALSE,
  check_duplicates = FALSE
)
tsne_converted_food_DEL_3d <- setNames(
  data.frame(tsne_converted_food_DEL_3d$Y),
  c("X", "Y", "Z")
)
tsne_converted_food_DEL_3d$cl <- factor(result_food_uft_DEL_k$cluster)

# Interactive 3-D scatter coloured by cluster.
p <- layout(
  add_markers(
    plot_ly(
      tsne_converted_food_DEL_3d,
      x = ~X, y = ~Y, z = ~Z,
      color = ~cl,
      colors = c('#BF382A', '#0C4B8E')
    )
  ),
  scene = list(
    xaxis = list(title = 'Dim1'),
    yaxis = list(title = 'Dim2'),
    zaxis = list(title = 'Dim3')
  )
)
p
# Earlier inputs, kept for reference:
# food_data <- read.csv('/Users/xbasra/Documents/Data/Clustering_Food_Alergies/Original_Data/Analysis_Data.csv')
# food_data <- read.csv('/Users/xbasra/Documents/Data/Clustering_Food_Alergies/Original_Data/Cluster_Analysis_Data.csv') # this is the previous version

# Working copy of the analysis data.
food_data <- read.csv(
  file = '/Users/xbasra/Documents/Data/Clustering_Food_Alergies/Original_Data/Cluster_Analysis_Data2.csv'
)
# Second, untouched copy read from the same file.
food_data_id <- read.csv(
  file = '/Users/xbasra/Documents/Data/Clustering_Food_Alergies/Original_Data/Cluster_Analysis_Data2.csv'
)
# food_data_id <- read.csv('/Users/xbasra/Documents/Data/Clustering_Food_Alergies/Cluster_Analysis_Data.csv')
library("factoextra")
library('FactoMineR')
library('mclust')
library("seriation")
library("clustertend")
library(Rtsne)
library(caret)
library(dplyr)
library(fpc)
library(NbClust)
library(clValid)
library(magrittr)
library(dplyr)
library(tibble)
library(tidyr)
library(clValid)
library(ggplot2)
library(corrplot)
library(caret)
# Possibly ordered variables are degreeurban and smoking2.
# Replacing the smoking variable with integer values (disabled):
# food_data$smoking2 <- ifelse(food_data$smoking2 == 'non-smokers',0, ifelse(food_data$smoking2 == 'ex-smokers',1, ifelse(food_data$smoking2 == 'current-smokers',2,0)))
# Converting the smoking variable to an integer variable (disabled):
# food_data$smoking2 <- as.integer(food_data$smoking2)
# Replacing the degreeurban variable with integer values (disabled):
# food_data$degreeurban <- ifelse(food_data$degreeurban == '<= 10 000 inh',0, ifelse(food_data$degreeurban == '>10 000 inh',1,2))
# food_data$degreeurban <- as.integer(food_data$degreeurban)

# Candidate sets of columns to drop; only `drops3` (the ID column) is applied.
# drops <- c("ID")
drops <- c("ID", "degreeurban", "family_allergy_hist")
drops2 <- c(
  "ID", "degreeurban", "fam_hist_allergy", "fam_hist_asthma",
  "smoking2", "farmlive", "dustexposure"
)
drops3 <- "ID"
food_data <- food_data[, !is.element(names(food_data), drops3)]

# Group the column names by the class of each column.
food_spilts <- split(
  x = names(food_data),
  f = vapply(
    food_data,
    function(column) paste(class(column), collapse = " "),
    character(1)
  )
)

# Replacing the outliers: the values in rows 1419 (sIgE_f2) and 1648 (sIgE_f13)
# are set to zero.
# food_data$sIgE_f2[food_data$sIgE_f2 == 1e+09]
# food_data[food_data$sIgE_f2 >  1000, ]$sIgE_f2 <- 0
# food_data[food_data$sIgE_f13 >  1000, ]$sIgE_f13 <- 0
food_data$sIgE_f2[1419] <- 0
food_data$sIgE_f13[1648] <- 0
# Missing values
# Disabled alternatives: treat empty farmlive / dustexposure entries as 'no'.
# food_data$farmlive[food_data$farmlive == ""] <- NA
# food_data <-  food_data %>% replace_na (list(farmlive = 'no'))
#
# food_data$dustexposure[food_data$dustexposure == ""] <- NA
# food_data <-  food_data %>% replace_na (list(dustexposure = 'no'))
# food_data <- lapply(food_data, function(x){
#      x[x == ""] <- 'no'
#      return(x)
# })
# which(is.na(food_data$bmi2))

# Number of missing values per column, largest first.
sort(
  vapply(food_data, function(column) sum(is.na(column)), integer(1)),
  decreasing = TRUE
)

# Missing bmi2 values are imputed with the median of the observed ones.
food_data <- replace_na(
  food_data,
  list(bmi2 = median(food_data$bmi2, na.rm = TRUE))
)
# scaled_food <- scale(food_data)
write.csv(
  x = food_data,
  file = '/Users/xbasra/Documents/Data/Clustering_Food_Alergies/CsvData/food_data.csv'
)
# Recode the yes/no columns and gender as 0/1 integers
# (1 = "yes", respectively "males").
food_data2 <- food_data %>%
  mutate(
    # dustexposure = if_else(dustexposure == "yes",1L,0L),
    farmlive = if_else(farmlive == "yes", 1L, 0L),
    family_allergy_hist = if_else(family_allergy_hist == "yes", 1L, 0L),
    # fam_hist_asthma = if_else(fam_hist_asthma == "yes", 1L,0L),
    # fam_hist_allergy = if_else(fam_hist_allergy== "yes",1L,0L),
    gender2 = if_else(gender2 == "males", 1L, 0L)
  )
## One should check whether values such as 999 are outliers.
# Replacing any_airw_medication with the original binary values (disabled):
# Airway2$any_airw_medication <- mydata$any_airw_medication

# Column positions by class, taken from `food_data` (before the recoding).
# NOTE(review): with R >= 4.0 read.csv() keeps strings as character, so
# `food_spilts$factor` may be NULL and the asymmetric set empty -- confirm.
ordinal_columns_food <- which(is.element(names(food_data), food_spilts$integer))
nuum_col_food <- which(is.element(names(food_data), food_spilts$numeric))
asymm_col_food <- which(is.element(names(food_data), food_spilts$factor))
symm_col_food <- which(colnames(food_data) == "gender2")
# gender2 is treated as symmetric, so it is removed from the asymmetric set.
asymm_col_food <- setdiff(asymm_col_food, symm_col_food)

# No scaling beforehand: per the daisy documentation, Gower's range-based
# standardisation is applied.
gower_food <- daisy(
  food_data2,
  metric = "gower",
  type = list(
    ordratio = ordinal_columns_food,
    symm = symm_col_food,
    asymm = asymm_col_food
  )
)
# Earlier Gower call on the Airway data (disabled):
# gower_Airway <- daisy(Airway2, metric = "gower", type = list(ordratio=c(1,14), symm=c(19), asymm=c(5,6,8,9,10,11,12,13,15,16,17,18,21)))
gower_food_mat <- as.matrix(gower_food)

# Range scaling with caret (disabled):
# ordinalCol_Airway <- c('ever_smoker20py', 'dyspneaMRC')
# pp <- preProcess(Airway2[Airway_spilts$numeric], method = "range")
# pp1 <- preProcess(Airway2[ordinalCol_Airway], method = "range")
# scaled_Airway <- Airway2
# scaled_Airway[Airway_spilts$numeric]<- predict(pp, Airway2[Airway_spilts$numeric])
# scaled_Airway[ordinalCol_Airway] <- predict(pp1, Airway2[ordinalCol_Airway])

# Saving the data
write.csv(
  x = food_data2,
  file = '/Users/xbasra/Documents/Data/Clustering_Food_Alergies/CsvData/food_data2.csv'
)
# write.csv(scaled_Airway, '/Users/xbasra/Documents/Data/Clustering/Results_Data_Reports/CsvData/scaled_Airway.csv')
# write.csv(gower_Airway_mat, '/Users/xbasra/Documents/Data/Clustering/Results_Data_Reports/CsvData/gower_Airway_mat.csv')
set.seed(22)
library(dplyr)

# Replace every value of every non-numeric-class column by a random draw from
# a normal distribution whose mean depends on the value's frequency rank and
# whose standard deviation is the value's relative frequency.

# Keep the columns whose class is not "numeric". drop = FALSE keeps a data
# frame even when a single column is left.
drops_numerical <- food_spilts$numeric
# drops_ordinal <- c("ever_smoker20py","dyspneaMRC")
food_cate <- food_data2[, !(names(food_data2) %in% drops_numerical), drop = FALSE]
# food_cate <- food_cate[ , !(names(food_cate) %in% drops_ordinal)]
food_cate_to_num <- food_cate
m <- length(food_cate_to_num)   # number of columns to convert
d <- dim(food_cate_to_num)[1]   # number of rows

for (j in seq_len(m)) {
  # Distinct values with their counts, most frequent first. The table is built
  # once so that values, counts and probabilities share the same order.
  counts <- dplyr::count(food_cate, food_cate[, j], sort = TRUE)
  S <- as.vector(as.matrix(counts[, 1]))  # distinct values
  C <- as.vector(as.matrix(counts[, 2]))  # number of occurrences
  P <- C / d                              # relative frequencies
  n <- length(unique(food_cate[, j]))
  idx <- seq_len(n)

  # Mean of the normal for every value (same formula as the scalar loop):
  #   L1[i] = sum_{k <= i} (n - k) * P[k]
  #   L2    = 1 - sum(P^3)
  #   L3[i] = sum_h P[h] * P[i] * (h - i)^2
  #   mu[i] = ((n - i) - L1[i]) * sqrt(L2 / L3[i])
  # NOTE(review): a column with a single distinct value gives 0 / 0 here and
  # therefore NaN draws -- confirm such columns cannot occur.
  L1 <- cumsum((n - idx) * P)
  L2 <- 1 - sum(P^3)
  L3 <- vapply(idx, function(i) sum(P * P[i] * (idx - i)^2), numeric(1))
  mu <- ((n - idx) - L1) * sqrt(L2 / L3)

  # Draw for all rows holding the i-th value at once. rnorm() produces its
  # draws sequentially, so under the same seed the rows receive the same
  # numbers as with one rnorm(1, ...) call per row. Matching is done against
  # the untouched column, and which() skips missing cells (they stay NA)
  # instead of failing on an NA condition. The result is always a numeric
  # column, as cor() needs later.
  original <- food_cate[, j]
  encoded <- rep(NA_real_, d)
  for (i in idx) {
    hits <- which(original == S[i])
    encoded[hits] <- rnorm(length(hits), mean = mu[i], sd = P[i])
  }
  food_cate_to_num[[j]] <- encoded
}
# famd_df_no_ordinal <- as.data.frame(scale(famd_df_no_ordinal)) # scaling the data
# numeric_variables<-AD_data[, sapply(AD_data, is.numeric)]
library(ggcorrplot)

# Standardise the numeric-class columns.
converted_scaled_num <- scale(food_data2[food_spilts$numeric])
# food_cate_to_num <- scale(food_cate_to_num)

# Combine the converted categorical columns with the scaled numeric ones.
food_converted <- cbind(food_cate_to_num, converted_scaled_num)
colnames(food_converted)
write.csv(food_converted, '/Users/xbasra/Documents/Data/Clustering_Food_Alergies/CsvData/food_converted.csv')

# Correlation matrix, rounded to one decimal. It is computed only here, after
# `food_converted` has been built; the history also ran it before the object
# existed.
corr <- round(cor(food_converted), 1)
ggcorrplot(corr, hc.order = TRUE, type = "lower", lab = TRUE)
class(corr)
dim(corr)

# Separate data set, opened in the data viewer.
mydata <- read.csv(file = "/Users/xbasra/Documents/myReproducibleresearch/CsvData/USArrests.csv")
View(mydata)