-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRWeka_prediction_R_pipeline.R
35 lines (24 loc) · 2 KB
/
RWeka_prediction_R_pipeline.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# prediction with RWeka
library(RWeka)
library(rJava)
# Input files to analyse: replace each placeholder below with the path to your own file.
tree_stats_path <- "ADD PATH HERE" # Example: "~/Desktop/ML_tree_stats.txt"
nuc_diversity_path <- "ADD PATH HERE" # Example: "~/Desktop/nuc_diversity.txt"
# Once the classifiers have been downloaded, load them into the R session so they can be used for prediction of empirical data.
# The file names are relative, so they are looked up in the current working directory.
# Classifier that should be used for predictions when the DNA sequence alignment is approximately 1,000 bp
load(file = "classifier_1000bp.dat")
# Classifier that should be used for predictions when the DNA sequence alignment is approximately 10,000 bp
load(file = "classifier_10000bp.dat")
# Read the two tab-delimited input files; neither is read as having a header row.
tree_stats <- read.delim(tree_stats_path, header = FALSE)
nuc_diversity <- read.delim(nuc_diversity_path, header = FALSE)
# Column names, in the order the columns appear in each file
tree_stats_cols <- c("file_name", "Beta_Index", "maxLadder", "ILportion", "maxDepth", "maxWidth", "woverd", "maxDiffWidth", "nCherry", "prop")
nuc_diversity_cols <- c("file_name", "nuc_diversity")
# Fail early if the tree statistics file does not have the expected number of columns.
# Otherwise the names would be assigned to the wrong columns (or left as NA) without any error.
if (ncol(tree_stats) != length(tree_stats_cols)) {
  stop(
    "Expected ", length(tree_stats_cols), " columns in '", tree_stats_path,
    "' but found ", ncol(tree_stats),
    ". Check the file for missing fields or trailing tabs.",
    call. = FALSE
  )
}
# Adding names to columns
colnames(tree_stats) <- tree_stats_cols
colnames(nuc_diversity) <- nuc_diversity_cols
# Both files must describe the same number of trees/alignments: when the row counts
# differ by a whole multiple, cbind() would recycle the shorter column without complaint.
if (nrow(tree_stats) != nrow(nuc_diversity)) {
  stop(
    "Row count mismatch: '", tree_stats_path, "' has ", nrow(tree_stats),
    " rows but '", nuc_diversity_path, "' has ", nrow(nuc_diversity), " rows.",
    call. = FALSE
  )
}
# Joining data to a single data frame
# MAKE SURE THAT THE FILE NAME ORDER IN "tree_stats" IS THE SAME AS IN "nuc_diversity" (if you are analysing more than one tree/DNA sequence alignment)
# Rows are paired by position, not by file name. The new column is named directly here instead of being renamed by index afterwards.
allData <- cbind(tree_stats, nuc_diversity = nuc_diversity$nuc_diversity)
# Prediction to be used with trees reconstructed with DNA sequence alignments of approximately 1,000 bp. This will give the result in probability.
# For the example in the paper, this classifier will be used to predict data for env E and pol genes (rows 1 and 4 for the prediction results).
# The result is stored and printed explicitly so that it is also displayed when the script is run with source(),
# which does not auto-print top-level values by default.
pred_1000bp <- predict(classifier1000bp, newdata = allData, type = "probability")
print(pred_1000bp)
# Prediction to be used with trees reconstructed with DNA sequence alignments of approximately 10,000 bp. This will give the result in probability.
# For the example in the paper, this classifier will be used to predict data for all PERVs and PERV-B genomes (rows 2 and 3 for the prediction results).
pred_10000bp <- predict(classifier10000bp, newdata = allData, type = "probability")
print(pred_10000bp)