Commit: ABCRF included in pdf
mnavascues committed Sep 26, 2018
1 parent 7024ed8 commit 3b734bc
Showing 3 changed files with 101 additions and 29 deletions.
52 changes: 48 additions & 4 deletions main.Rnw
@@ -15,6 +15,13 @@
\usepackage[nomarkers,nofiglist,figuresonly]{endfloat}
\usepackage[acronym]{glossaries}

\usepackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{}
\rfoot{\thepage}
\lfoot{\href{http://doi.org/10.5281/zenodo.1435503}{doi:10.5281/zenodo.1435503}}


\newif\iflatexboolean


@@ -44,6 +51,8 @@
\begin{document}

\maketitle
\newpage


%\LaTeX{} version: \LaTeXe~\fmtversion

@@ -797,7 +806,7 @@ points(ref_table$TajD[which(sim_model=="C")],
col="grey",pch=20)
partition.tree(classification_tree,
ordvars=c("TajD","FuLiD"),
add=T,cex=3,col="red")
add=T,cex=3,col=7)
@

\subsection*{Many trees: random forest}
@@ -818,21 +827,21 @@ plot(ref_table$pi,
pch=20,log="x")
partition.tree(regression_tree,
ordvars=c("pi","theta"),
add=T,cex=1.5,col="blue",lwd=2)
add=T,cex=1.5,col=6,lwd=2)
plot(ref_table$pi,
     log10(ref_table$theta),
     xlab=expression(pi),
     ylab=expression(log[10]*theta),
     pch=20,log="x")
for (i in 1:100){
  random_sample <- sample(length(theta),size=100,replace=T)
  random_sample <- sample(length(theta),size=500,replace=T)
  ref_table_random_sample <- ref_table[random_sample,]
  regression_tree_random_sample <- tree(log10(theta) ~ pi,
                                        data=ref_table_random_sample)
  partition.tree(regression_tree_random_sample,
                 ordvars=c("pi","theta"),
                 add=T,cex=1.5,col="red",lwd=1)
                 add=T,cex=1.5,col=7,lwd=1)
}
@
@@ -846,11 +855,46 @@ For model choice, random forest provides you with the number of votes supporting
<<rcode_32, cache=F, tidy=TRUE>>=
@
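A minimal sketch (ours, not part of the commit): the votes can be read directly off the underlying {\tt ranger} forest by asking every tree for its prediction, much as {\tt src/main.r} below does; {\tt model\_RF} and the observed summary statistics {\tt target1\_SS} and {\tt target2\_SS} are assumed to exist at this point.
<<votes_sketch, eval=FALSE>>=
# predict.all=TRUE returns one prediction per tree instead of the aggregate,
# so counting the trees that chose a class gives the votes for that model
pred_all <- predict(object = model_RF$model.rf,
                    data = as.data.frame(rbind(target1_SS, target2_SS)),
                    predict.all = TRUE)
# votes for the model coded 1 (the first factor level), one count per target
rowSums(pred_all$predictions == 1)
@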

The importance of each summary statistic for the classification can be visualized with a variable importance plot (figure~\ref{fig:VarImportance}):
<<VarImportance, fig.height=6, fig.width=6, echo=TRUE, fig.cap="Variable importance plot.">>=
plot(model_RF,
     training=ref_table)
@


For each simulation of the reference table we can get a prediction from the random forest, using only the trees that were grown without that simulation (its ``out-of-bag'' trees). Comparing these predictions with the true values gives us the prior error rate and the confusion matrix:

<<rcode_33, cache=F, tidy=TRUE>>=
@
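As a sketch of what the externalized chunk does (the actual code lives in {\tt src/main.r} below; the chunk name and {\tt eval=FALSE} are ours):
<<oob_sketch, eval=FALSE>>=
# out-of-bag (OOB) predictions: each simulation is predicted using only the
# trees that never saw it during training
oob_predictions <- model_RF$model.rf$predictions
# prior error rate: the proportion of misclassified simulations
mean(as.character(oob_predictions) != sim_model)
# confusion matrix: true models against OOB predictions
model_RF$model.rf$confusion.matrix
@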

Now we can grow a \textbf{second} random forest which will learn the relationship between the ``local'' error rate and the summary statistics. The probability of a correct classification (posterior probability) is one minus the probability of an incorrect classification (error rate). This second random forest allows us to estimate the posterior probability of the \textbf{chosen} model (implemented in function {\tt \Sexpr{hi_latex('predict.abcrf()')}} from {\tt abcrf}):

<<rcode_34, cache=F, tidy=TRUE>>=
@
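The call, essentially as it appears in {\tt src/main.r} below (reproduced here with {\tt eval=FALSE} for convenience):
<<predict_abcrf_sketch, eval=FALSE>>=
# for each target: the selected model, the votes of the first forest, and the
# posterior probability of the selected model estimated by the second forest
model_selection_result_RF <- predict(object = model_RF,
                                     obs = as.data.frame(rbind(target1_SS, target2_SS)),
                                     training = ref_table,
                                     ntree = 1000,
                                     paral = T)
model_selection_result_RF
@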

For the posterior probability distribution of a parameter, the key thing to remember is that prediction of quantitative variables from a random forest is based on the weight given by the trees to each simulation of the reference table (i.e. the prediction is a weighted mean). The same weights can be applied to the median and percentiles to obtain a point estimate and credibility intervals, and to plot an estimate of the distribution (e.g. a histogram):

<<rcode_35, cache=F, tidy=TRUE>>=
@
<<rcode_36, cache=F, tidy=TRUE>>=
@
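A sketch under stated assumptions: {\tt wtd.quantile()} from the {\tt Hmisc} package is one way to apply these weights (it is not used by the commit itself, and {\tt predict.regAbcrf()} already reports a weighted median and quantiles directly):
<<wtd_quantile_sketch, eval=FALSE>>=
library(Hmisc)
# the forest's weights over the reference table turn the prior draws of
# log10(theta) into a posterior sample: the weighted median gives a point
# estimate, the weighted 2.5% and 97.5% quantiles a 95% credibility interval
wtd.quantile(log10theta,
             weights = posterior_theta_RF$weights,
             probs = c(0.025, 0.5, 0.975))
@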


The weighted sample can be plotted as a histogram against the prior draws (figure~\ref{fig:hist_abcrf}):
<<hist_abcrf, fig.height=6, fig.width=6, echo=TRUE, fig.cap="Probability density estimated from ABCRF.">>=
hist(x = log10theta,
     breaks = seq(-1,2,0.05),
     col = "grey",
     freq = FALSE,
     ylim = c(0,5),
     main = "",
     xlab = expression(log(theta)),
     ylab = "probability density")
wtd.hist(x = log10theta,
         breaks = seq(-1,2,0.05),
         col = ReddishPurple_transparency, freq=FALSE, add=TRUE,
         weight = posterior_theta_RF$weights); box()
@



Binary file modified main.pdf
Binary file not shown.
78 changes: 53 additions & 25 deletions src/main.r
@@ -976,7 +976,7 @@ points(ref_table$TajD[which(sim_model=="C")],
col="grey",pch=20)
partition.tree(classification_tree,
ordvars=c("TajD","FuLiD"),
add=T,cex=3,col="red")
add=T,cex=3,col=7)



@@ -1005,23 +1005,20 @@ plot(ref_table$pi,
pch=20,log="x")
partition.tree(regression_tree,
ordvars=c("pi","theta"),
add=T,cex=1.5,col="blue",lwd=2)
add=T,cex=1.5,col=6,lwd=2)

for (i in 1:100){
  random_sample <- sample(length(theta),size=500)
  random_sample <- sample(length(theta),size=500,replace=T)
  ref_table_random_sample <- ref_table[random_sample,]
  regression_tree_random_sample <- tree(log10(theta) ~ pi,
                                        data=ref_table_random_sample)
  partition.tree(regression_tree_random_sample,
                 ordvars=c("pi","theta"),
                 add=T,cex=1.5,col="red",lwd=1)
                 add=T,cex=1.5,col=7,lwd=1)

}



## ---- rcode_32

ref_table <- data.frame(sim_model,sim_SS)
model_RF <- abcrf(formula = sim_model~.,
                  data = ref_table,
@@ -1030,38 +1027,44 @@
                  paral = T)
## ---- rcode_32end

cat(paste("Prior error rate for this random forest is",
round(model_RF$prior.err, digits=3)))


plot(model_RF,
training=ref_table)


err.abcrf(model_RF,
training=ref_table,
paral=T)


cat(paste("Prior error rate for this random forest is",
round(model_RF$prior.err, digits=3)))

## ---- rcode_33
# GLOBAL ERROR
as.factor(sim_model[1:20])
model_RF$model.rf$predictions[1:20]
sum(as.character(model_RF$model.rf$predictions)!=sim_model)/length(sim_model)
model_RF$model.rf$confusion.matrix
# LOCAL ERROR
as.numeric(as.character(model_RF$model.rf$predictions[1:20])!=sim_model[1:20])
local_error <- as.numeric(as.character(model_RF$model.rf$predictions)!=sim_model)
## ---- rcode_33end
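# Sketch (ours, not part of the commit): conceptually, predict.abcrf() feeds
# this 0/1 misclassification indicator to a second, regression forest, e.g.
#   error_RF <- ranger::ranger(local_error ~ .,
#                              data = data.frame(local_error, sim_SS))
# and the error predicted at the observed data gives 1 - posterior probability
# of the chosen model.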

predict(object=model_RF$model.rf,
        data=as.data.frame(rbind(target1_SS,target2_SS)))$predictions
rowSums(predict(object=model_RF$model.rf,predict.all = TRUE,
                data=as.data.frame(rbind(target1_SS,target2_SS)))$predictions==1)
# Probability of correct = 1 - probability of incorrect

# Now we train a second forest to predict the error and
# make classification error prediction on observed data
# implemented in predict.abcrf :

## ---- rcode_34
model_selection_result_RF <- predict(object = model_RF,
                                     obs = as.data.frame(rbind(target1_SS,target2_SS)),
                                     training = ref_table,
                                     ntree = 500,
                                     ntree = 1000,
                                     paral = T,
                                     paral.predict = T)
(model_selection_result_RF)
## ---- rcode_34end




## ---- rcode_35

log10theta <- log10(sim_theta)
sim_SS <- cbind(sim1_S,sim1_pi,sim1_NH,
@@ -1072,17 +1075,42 @@ RFmodel_theta <- regAbcrf(formula = log10theta~.,
                          data = ref_table,
                          ntree = 1000,
                          paral = T)
## ---- rcode_35end


plot(RFmodel_theta)
err.regAbcrf(object = RFmodel_theta,
             training = ref_table,
             paral = T)
#err.regAbcrf(object = RFmodel_theta,
#             training = ref_table,
#             paral = T)
## ---- rcode_36
posterior_theta_RF <- predict(object = RFmodel_theta,
                              obs = as.data.frame(target1_SS),
                              training = ref_table,
                              paral = T)
                              paral = T,
                              rf.weights = T)
(posterior_theta_RF)
## ---- rcode_36end


hist(x = log10theta,
     breaks = seq(-1,2,0.05),
     col = "grey",
     freq = FALSE,
     ylim = c(0,5),
     main = "",
     xlab = expression(log(theta)),
     ylab = "probability density")
wtd.hist(x = log10theta,
         breaks = seq(-1,2,0.05),
         col = ReddishPurple_transparency, freq=FALSE, add=TRUE,
         weight = posterior_theta_RF$weights)
wtd.hist(x = log10(sim_theta_adjusted_1),
         breaks = seq(-1.5,2,0.05),
         col = Blue_transparency, freq=FALSE, add=TRUE,
         weight = sim_weights_1)



densityPlot(object = RFmodel_theta,
            obs = as.data.frame(target1_SS),
            training = ref_table,
