Merge branch 'master' of https://github.com/nvihrs14/tcherry

nvihrs14 · Jul 16, 2019 · e472a7a · e472a7a
2 parents a929470 + a1c1086
commit e472a7a
Show file tree

Hide file tree

Showing 13 changed files with 178 additions and 64 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -20,12 +20,12 @@ Imports:
     utils,
     gRbase,
     compare,
-    stats
+    stats,
+    Rgraphviz,
+    gRain
 RdMacros: Rdpack
 Suggests: 
     testthat,
-    Rgraphviz,
-    gRain,
     knitr,
     rmarkdown
 VignetteBuilder: knitr

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,11 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
-export(BIC_junction_tree)
 export(CPT)
 export(ChowLiu)
 export(MI2)
 export(MI3)
 export(MIk)
+export(compute_BIC_junction_tree)
 export(cond_independence_test)
 export(diff_edges_tch)
 export(increase_order1)
@@ -21,6 +21,6 @@ export(random_tcherry)
 export(tcherry_CL)
 export(tcherry_complete_search)
 export(tcherry_step)
-export(thinning_edges)
+export(thin_edges)
 export(weight_junction_tree)
 importFrom(Rdpack,reprompt)
diff --git a/R/BIC_junction_tree.R b/R/BIC_junction_tree.R
@@ -60,10 +60,10 @@
 #'                    c("var5", "var6"))
 #'
 #' # smooth is used to deal with zero probabilities.
-#' BIC_junction_tree(cliques, separators, data, smooth = 0.1)
+#' compute_BIC_junction_tree(cliques, separators, data, smooth = 0.1)
 #' @export
 
-BIC_junction_tree <- function(cliques, separators, data,
+compute_BIC_junction_tree <- function(cliques, separators, data,
                               base_log = 2, ...){
 
   if (any(is.na(data))){

diff --git a/R/thinning_edges.R b/R/thinning_edges.R
@@ -69,11 +69,11 @@
 #'                    c("var2", "var5"),
 #'                    c("var5", "var6"))
 #'
-#' thinning_edges(cliques, separators, data = data, alpha = 0.1,
+#' thin_edges(cliques, separators, data = data, alpha = 0.1,
 #'                smooth = 0.1)
 #' @export
 
-thinning_edges <- function(cliques, separators, data, alpha = 0.05, ...){
+thin_edges <- function(cliques, separators, data, alpha = 0.05, ...){
 
   if (any(is.na(data))){
     warning(paste("The data contains NA values.",
@@ -87,7 +87,7 @@ thinning_edges <- function(cliques, separators, data, alpha = 0.05, ...){
   if (! (is.data.frame(data) | is.matrix(data))) {
     stop("data must be a data frame or a matrix.")
   }
-  
+
   data <- as.data.frame(data)
 
   if (! all(sapply(data, function(x){
@@ -212,7 +212,7 @@ thinning_edges <- function(cliques, separators, data, alpha = 0.05, ...){
 
   adj_matrix <- matrix(0, nrow = n_var, ncol = n_var)
   rownames(adj_matrix) <- colnames(adj_matrix) <- nodes
-  
+
   for (i in 1:n_cliq) {
     if (length(cliques[[i]]) != 1){
     adj_matrix[cliques[[i]], cliques[[i]]] <- 1
@@ -227,4 +227,4 @@ thinning_edges <- function(cliques, separators, data, alpha = 0.05, ...){
               "separators" = separators,
               "n_edges" = n_edges_graph,
               "n_edges_removed" = n_edges_removed))
-}
+}
diff --git a/README.md b/README.md
@@ -12,8 +12,12 @@ With vignettes
 
 If there are problems with viewing documentation or vignettes, it is recommended to restart the R session.
 
-Note that the package requres the following R-packages, which are automatically installed with the package:
-    Rdpack, utils, gRbase, compare and stats.
+Note that the package requires the following R-packages, which are automatically installed with the package:
+    Rdpack, utils, gRbase, compare, gRain, Rgraphviz and stats.
+
+The package gRbase further requires the packages graph and RBGL, which may have to be installed from Bioconductor, for instance with
+
+`BiocManager::install(c("graph", "RBGL"))`
 
 ## Main functions (see vignette for more details)
 
@@ -23,18 +27,18 @@ Note that the package requres the following R-packages, which are automatically
 
 -__`k_tcherry_p_lookahead`__: Determine a k'th order t-cherry tree from data by adding p cliques at a time by a greedy search. Note that if p is the total number of cliques in a k'th order t-cherry tree with the desired number of vertices, this is a complete search.
 
--__`thinning_edges`__: Thinning of edges in an undirected graphical model with a triangulated graph.
+-__`thin_edges`__: Thinning of edges in an undirected graphical model with a triangulated graph.
 
--__`BIC_junction_tree`__: Calculates the BIC value for a graphical model from a junction tree for the graph.
+-__`compute_BIC_junction_tree`__: Calculates the BIC value for a graphical model from a junction tree for the graph.
 
 ## Example usage
 
-To demonstrate the main functions in this package consider the car evaluation data set from UCI Machine Learning Repository (Dau & Graff 2017). This data set contains 7 variables (all categorical) with 1728 observations for each and no missing values. The variables are describing different aspects of the car such as the estimated safety of the car, the number of doors etc. To find a graphical structure of a third order t-cherry tree for this data the function k_tcherry_p_lookahead is used. It is chosen to add just one clique at a time in the greedy search procedure.
+To demonstrate the main functions in this package consider the car evaluation data set from UCI Machine Learning Repository (Dau & Graff 2017). This data set contains 7 variables (all categorical) with 1728 observations for each and no missing values. The variables are describing different aspects of the car such as the estimated safety of the car, the number of doors etc. To find a graphical structure of a third order t-cherry tree for this data the function `k_tcherry_p_lookahead` is used. It is chosen to add just one clique at a time in the greedy search procedure.
 
 ``` r
 library(tcherry)
 car <- read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data",
-header = FALSE, sep = ",", dec = ".")
+          header = FALSE, sep = ",", dec = ".")
 names(car) <- c("buying", "maint", "doors", "persons", "lug_boot",
                   "safety", "class")
 tch3 <- k_tcherry_p_lookahead(data = car, k = 3, p = 1, smooth = 0.001)
@@ -52,10 +56,10 @@ tch3$adj_matrix
 
 Note that the smooth argument is added to cell counts when estimating probabilities to avoid zero probabilities, which would make some calculations invalid. 
 
-The graphical structure of af fourth order t-cherry tree for this data can be found by using the same function as above whit k = 4. However in this case, it is chosen to show how increase_order2 can be used to increase the order of the fitted third order t-cherry tree. The typical reason for this choice will be to save time, but often at the cost of a fitted structure of smaller likelihood.
+The graphical structure of a fourth order t-cherry tree for this data can be found by using the same function as above with k = 4. However, in this case, it is chosen to show how `increase_order2` can be used to increase the order of the fitted third order t-cherry tree. The typical reason for this choice will be to save time, but often at the cost of a fitted structure of smaller likelihood.
 
 ``` r
-tch4 <- increase_order2(cliques = tch3$cliques, data = car, smooth = 0.001)
+tch4 <- increase_order2(tch_cliq = tch3$cliques, data = car, smooth = 0.001)
 tch4$adj_matrix
 #>            buying maint doors persons lug_boot safety class
 #> buying        0     1     0       1        1      1     1
@@ -73,7 +77,7 @@ Note that the smooth argument is added for the same reasons as above, and the gi
 It can now be attempted to simplify this model by thinning the edges.
 
 ``` r
-tch_thinning <- thinning_edges(cliques = tch4$cliques, separators = tch4$separators, data = car,
+tch_thinning <- thin_edges(cliques = tch4$cliques, separators = tch4$separators, data = car,
 smooth = 0.001)
 tch_thinning$adj_matrix
 #>            buying class doors lug_boot maint persons safety
@@ -94,13 +98,13 @@ Notice that in this function the structure is represented by the cliques and sep
 The three fitted structures can be compared by calculating a BIC score.
 
 ``` r
-BIC_junction_tree(cliques = tch3$cliques, separators = tch3$separators, data = car, smooth = 0.001)
+compute_BIC_junction_tree(cliques = tch3$cliques, separators = tch3$separators, data = car, smooth = 0.001)
 #> -20079.89
 
-BIC_junction_tree(cliques = tch4$cliques, separators = tch4$separators, data = car, smooth = 0.001)
+compute_BIC_junction_tree(cliques = tch4$cliques, separators = tch4$separators, data = car, smooth = 0.001)
 #> -21572.4
 
-BIC_junction_tree(cliques = tch_thinning$cliques, separators = tch_thinning$separators, data = car,
+compute_BIC_junction_tree(cliques = tch_thinning$cliques, separators = tch_thinning$separators, data = car,
 smooth = 0.001)
 #> -19923.95
 ```

diff --git a/contributing.md b/contributing.md
@@ -0,0 +1,89 @@
+# Contributing
+
+When contributing to this R-package please discuss suggested changes via issues, email, or any other method with the maintainer of the package. 
+
+Please note the code of conduct below, and please follow it when contributing in any way.
+
+## Pull Request Process
+* Please make sure the package is built, checked and can still be installed after the changes. 
+* Update the file README.md with any relevant changes to the usage of the package. Make sure the installation guide in this file will successfully install the package without having preinstalled any dependencies. If new dependencies are added which require manual installation, please add the details under Install in README.md.
+* Increase the package version number.
+* If the maintainer accepts the changes, she will merge in the Pull Request. To speed up this process, you may make the maintainer aware of the pull request via email.
+
+## Code of Conduct
+
+### Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+### Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+### Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+### Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+### Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at [ninnavihrs@hotmail.dk]. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
+
diff --git a/man/BIC_junction_tree.Rd → man/compute_BIC_junction_tree.Rd b/man/BIC_junction_tree.Rd → man/compute_BIC_junction_tree.Rd
diff --git a/man/thinning_edges.Rd → man/thin_edges.Rd b/man/thinning_edges.Rd → man/thin_edges.Rd
diff --git a/paper.bib b/paper.bib
@@ -8,6 +8,19 @@ @article{EKTShyp
   DOI = {10.1007/s10479-010-0814-y}
 }
 
+@article{patternrec,
+	author = {Tamas Szantai and Edith Kovacs},
+	title = {Application Of t-Cherry Junction Trees in Pattern Recognition},
+	journal = {BRAIN. Broad Research in Artificial Intelligence and Neuroscience},
+	volume = {1},
+	number = {0},
+	year = {2010},
+	keywords = {pattern recognition, probabilistic modeling, h-uniform hyper- tree, t-cherry junction tree},
+	issn = {2067-3957},
+	url = {https://www.edusoft.ro/brain/index.php/brain/article/view/103},
+	pages = {40--45}
+}
+
 @article{gRain,
     title = {Graphical Independence Networks with the {gRain} Package for {R}},
     author = {S{\o}ren H{\o}jsgaard},
@@ -17,8 +30,9 @@ @article{gRain
     number = {10},
     pages = {1--26},
     Url = {http://www.jstatsoft.org/v46/i10/},
+    DOI={10.18637/jss.v046.i10}
   }
-  
+
   @article{ChowLiu,
   title={Approximating Discrete Probability Distributions with Dependence Trees},
   author={C. K. Chow and C. N. Liu},
@@ -44,7 +58,8 @@ @article{EKTS
   volume={633},
   pages={39-56},
   year={2010},
-  publisher={Springer}
+  publisher={Springer},
+  DOI={10.1007/978-3-642-03735-1_3}
 }
 
 
@@ -54,7 +69,8 @@ @article{EKTSdisc
   journal={Optim Eng},
   volume={14},
   pages={503-518},
-  year={2013}
+  year={2013},
+  DOI={10.1007/s11081-013-9232-8}
 }
 
 @Book{Proulx,
@@ -70,7 +86,8 @@ @article{Y-STR
   journal={Forensic Science International: Genetics},
   volume={37},
   pages={29-36},
-  year={2018}
+  year={2018},
+  DOI={10.1016/j.fsigen.2018.07.014}
   }
 
 @misc{UCI ,

diff --git a/paper.md b/paper.md
@@ -23,7 +23,11 @@ affiliations:
 
 # Summary
 
-The R [@R] package `tcherry` contains a variety of functions for learning the structure of a k'th order t-cherry tree from given data, see for instance @EKTShyp for an explanation of this concept. This is a type of graphical models extending what is known as Chow-Liu trees [@ChowLiu]. Chow-Liu trees have for instance been used to estimate population frequencies of Y-STR haplotypes in @Y-STR. The functions attempt to find a t-cherry structure of maximal likelihood. To do this exact, it is necesarry to investigate all possible t-cherry structures of the given order. This is in most cases to time-consuming and therefore most of the functions use greedy search algorithms. Some implementations are inspired by algorithms in @EKTS, @EKTSdisc and @Proulx, but the package also contains some new algorithms and extensions. The package is only for structure learning and only categorical data is supported. The learned structure may be used to investigate dependencies between the variables in data. If the structure is used in combination with packages such as `gRain` [@gRain], it may also be used to estimate probability distributions of the variables or for prediction. 
+The R [@R] package `tcherry` contains a variety of functions for learning the structure of a k'th order t-cherry tree from given categorical data, see for instance @EKTShyp for an explanation of this concept. This is a graphical model extending what is known as a Chow-Liu tree [@ChowLiu]. Chow-Liu trees have for instance been used to estimate population frequencies of Y-STR haplotypes in @Y-STR and t-cherry trees have been used to model relationships in social networks in @Proulx. The functions attempt to find a t-cherry tree structure of maximal likelihood. To do this exactly, it is necessary to investigate all possible t-cherry tree structures of the given order. This is in most cases too time-consuming and therefore most of the functions use greedy search algorithms. Some implementations are inspired by algorithms in @EKTS, @EKTSdisc and @Proulx, but the package also contains some new algorithms and extensions. The package is only for structure learning and only categorical data is supported. 
+
+The package can be used as a tool to analyse problems exploring dependencies between any kind of categorical variables. The fitted t-cherry structure can be used to make statements about conditional dependencies and independencies. The structure can also be used for pattern recognition, and independence statements can be used for variable selection for a prediction problem [@patternrec]. If the structure is used in combination with packages such as `gRain` [@gRain], it may also be used to estimate probability distributions of the variables or for prediction. This makes it possible to use the structure as an expert system.
+
+The t-cherry tree structure can be used in a variety of scientific fields such as biostatistics and artificial intelligence. The audience of the package is anyone who wants to model dependencies between categorical variables, approximate their probability distribution or solve classification problems with categorical variables.
 
 The following figure shows an example of a fourth order t-cherry tree learned from the car evaluation data set from UCI Machine Learning Repository [@UCI].