-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathABSA.r
87 lines (63 loc) · 3.32 KB
/
ABSA.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# ABSA in practice
# Call needed libraries
library(data.table)
library(nnet)
library(ROCR)
library(caret)
## Set your working directory - Should contain the 5 .r files described below and the needed input files
## Call functions built to run ABSA
# Internal functions
source("~/AB_SA/R/ReadParseScoary.r") # function called by create_input_mnl.r
source("~/AB_SA/R/DefStrainCategory.r") # function called by create_input_mnl.r
# Functions for ABSA user
source("~/AB_SA/R/CreateInputMNL.r") # function to be used to create mnl inputs (for fitting and prediction)
source("~/AB_SA/R/MNLTrainTest.r") # Training and testing the model
source("~/AB_SA/R/MNLFit.r") # Fitting (full dataset)
source("~/AB_SA/R/MNLPredict.r") # Predicting membership probabilities of strains to attribute
## Application of ABSA on a dataset (here 'Salmonella' Typhimurium COMPARE French dataset)
setwd("~/AB_SA/data")
# First create the inputs for multinomial logistic model for a chosen maximum number of enriched genes in sources (maxGenes) with CreateInputMNL function
CreateInputMNL("~/AB_SA/data/FR_scoary_trait.csv","~/AB_SA/data/gene_presence_absence.Rtab",maxGenes=4)
# Then, assess the perfomance of that genes with MNLTrainTest function
testedMNL<-MNLTrainTest("mnl_input_0.csv",0.70,100)
# Explore the outputs:
#[[1]] quantiles describing bootstrap accuracies,
testedMNL[[1]]
#[[2]] median of balanced accuracies for each source
testedMNL[[2]]
#[[3]] ksdensity of bootstrap accuracies
testedMNL[[3]]
# Finally, used the full datatset to prepare the prediction model with MNLFit function, and predict the source of strains with unknown origin with MNLPredict
final.trained<-MNLFit("mnl_input_0.csv")
predict.unknown<-MNLPredict("predict_sporadic.csv",final.trained)
write.table(file="predicted_sources.csv",predict.unknown,sep=";")
#xlegend<-c("1S","2S","3S","4W","5W","6W","7W","8W","9W","10W","11W","12C","13W","14W","15W","16W","17W","18W","19W","20W","21W","22W","23W","24W","25W","26W","27B","28B","29B")
#rownames(predict.unknown)<-xlegend
barplot(t(predict.unknown),legend=row.names(t(predict.unknown)),args.legend = list(x='right',bty='n',inset=c(-0.1,0),xpd=TRUE),xlim = c(0,45),cex.names = 0.7,xlab="Environnemental strains",ylab = "Membership probabilities")
## Optimisation for French dataset for the article describing the approach
AIC<-c()
coefnames<-c()
Accuracy<-matrix(c(0),10,3)
Balanced_accuracies<-matrix(c(0),10,3)
for (ng in 1:5)
{CreateInputMNL("~/AB_SA/data/FR_scoary_trait.csv","~/AB_SA/data/gene_presence_absence.Rtab",ng)
testedMNL<-MNLTrainTest("mnl_input_0.csv",0.70,100)
percentiles_accuracy<-testedMNL[[1]]
balanced_accuracies<-testedMNL[[2]]
Accuracy[ng,1:3]<-percentiles_accuracy[c(1,3,5)]
Balanced_accuracies[ng,1:3]<-balanced_accuracies
final.trained<-MNLFit("mnl_input_0.csv")
AIC[ng]<-final.trained$AIC
coefnames<-c(coefnames,final.trained$coefnames)
}
# Plot of coef
library(questionr)
library(GGally)
mnl_input<-c("mnl_input_0.csv")
data<-read.table(mnl_input,sep=",",header = TRUE)
data2<-subset(data,select=-1)
data2$Source<-relevel(data2$Source,ref="Ruminant_FR")
multinomModel_full <- nnet::multinom(Source ~ ., data=data2,trace=FALSE) # multinom Model
ggcoef(multinomModel_full,exponentiate=FALSE,conf.int = FALSE)+facet_grid(~y.level)
#tidy(multinomModel_full,exponentiate = TRUE,conf.int = FALSE)
odds.ratio(multinomModel_full)