From 676f6b78eb24e191f65fd713ea739436809826e3 Mon Sep 17 00:00:00 2001
From: Nikita Titov
Date: Fri, 20 Jul 2018 10:23:42 +0300
Subject: [PATCH] refined examples and hyperparameters of FastRGF (#203)

* renamed examples folders

* refined FastRGF examples

* enhanced FastRGF parameters tuning guide and fixed typo in param name

* fixed execute script command

* made examples cross-platform
---
 FastRGF/README.md                                | 49 ++++++++++++++++---
 FastRGF/examples/README.md                       | 37 +++-----------
 .../examples/binary_classification/README.md     | 20 ++++++++
 .../inputs/madelon.test                          |  0
 .../inputs/madelon.train                         |  0
 .../outputs/.gitignore                           |  0
 .../{ex1 => binary_classification}/run.sh        | 26 ++++------
 FastRGF/examples/ex2/run.sh                      | 35 -------------
 FastRGF/examples/regression/README.md            | 20 ++++++++
 .../{ex2 => regression}/inputs/config            |  3 +-
 .../{ex2 => regression}/inputs/feature.names     |  0
 .../{ex2 => regression}/inputs/housing.test      |  0
 .../{ex2 => regression}/inputs/housing.train     |  0
 .../{ex2 => regression}/outputs/.gitignore       |  0
 FastRGF/examples/regression/run.sh               | 31 ++++++++++++
 FastRGF/include/discretization.h                 |  4 +-
 FastRGF/include/dtree.h                          |  4 +-
 python-package/Readme.rst                        |  8 +--
 python-package/rgf/fastrgf_model.py              |  4 +-
 19 files changed, 138 insertions(+), 103 deletions(-)
 create mode 100644 FastRGF/examples/binary_classification/README.md
 rename FastRGF/examples/{ex1 => binary_classification}/inputs/madelon.test (100%)
 rename FastRGF/examples/{ex1 => binary_classification}/inputs/madelon.train (100%)
 rename FastRGF/examples/{ex1 => binary_classification}/outputs/.gitignore (100%)
 rename FastRGF/examples/{ex1 => binary_classification}/run.sh (55%)
 delete mode 100644 FastRGF/examples/ex2/run.sh
 create mode 100644 FastRGF/examples/regression/README.md
 rename FastRGF/examples/{ex2 => regression}/inputs/config (96%)
 rename FastRGF/examples/{ex2 => regression}/inputs/feature.names (100%)
 rename FastRGF/examples/{ex2 => regression}/inputs/housing.test (100%)
 rename FastRGF/examples/{ex2 => regression}/inputs/housing.train (100%)
 rename FastRGF/examples/{ex2 => regression}/outputs/.gitignore (100%)
 create mode 100644 FastRGF/examples/regression/run.sh

diff --git a/FastRGF/README.md b/FastRGF/README.md
index 0a409c23..85824f49 100644
--- a/FastRGF/README.md
+++ b/FastRGF/README.md
@@ -18,11 +18,13 @@ Please see the file [`CHANGES.md`](./CHANGES.md) for the changelog of FastRGF.
 
 3. [Examples](#3-examples)
 
-4. [Contact](#4-contact)
+4. [Hyperparameters Tuning](#4-hyperparameters-tuning)
 
-5. [Copyright](#5-copyright)
+5. [Contact](#5-contact)
 
-6. [References](#6-references)
+6. [Copyright](#6-copyright)
+
+7. [References](#7-references)
 
 # 1. Introduction
@@ -85,17 +87,48 @@ make install
 
 # 3. Examples
 
- Go to the [`examples`](./examples) subdirectory and follow the instructions in the [`README.md`](./examples/README.md) file. The file also contains some tips for parameter tuning.
-
-# 4. Contact
+ Please go to the [`examples`](./examples) subdirectory.
+
+# 4. Hyperparameters Tuning
+
+## Forest-level hyperparameters
+
+- **forest.ntrees**: Controls the number of trees in the forest. Typical range is `[100, 10000]`. Default value is `500`.
+- **forest.opt**: Optimization method for training the forest. You can select `rgf` or `epsilon-greedy`. Default value is `rgf`.
+- **forest.stepsize**: Controls the step size of epsilon-greedy boosting. Intended for use with `forest.opt=epsilon-greedy`. Default value is `0.0`.
+
+## Tree-level hyperparameters
+
+- **dtree.max_level**: Controls the maximum tree depth. Default value is `6`.
+- **dtree.max_nodes**: Controls the maximum number of leaf nodes in best-first search. Default value is `50`.
+- **dtree.new_tree_gain_ratio**: Controls when to start a new tree: a new tree is created when _leaf-node gain < this value \* estimated gain of creating a new tree_. Default value is `1.0`.
+- **dtree.min_sample**: Controls the minimum number of training data points in each leaf node. Default value is `5`.
+- **dtree.loss**: Loss function: one of `LS`, `MODLS` or `LOGISTIC`. Default value is `LS`; however, for binary classification tasks `LOGISTIC` often works better.
+- **dtree.lamL1**: Controls the degree of L1 regularization. A large value induces sparsity. Typical range is `[0.0, 1000.0]`. Default value is `1.0`.
+- **dtree.lamL2**: Controls the degree of L2 regularization. The larger this value is, the larger `forest.ntrees` you need to use: the resulting accuracy is often better at the cost of a longer training time. Use a relatively large value such as `1000.0` or `10000.0`. Default value is `1000.0`.
+
+## Discretization hyperparameters
+
+- **discretize.sparse.max_buckets**: Controls the maximum number of discretized values. Typical range is `[10, 250]`. Default value is `200`. Intended for use with sparse data.
+
+    *If you want to try a larger value up to `65000`, you need to edit [include/header.h](./include/header.h) and replace `using disc_sparse_value_t=unsigned char;` with `using disc_sparse_value_t=unsigned short;`. However, this will increase the memory usage.*
+- **discretize.dense.max_buckets**: Controls the maximum number of discretized values. Typical range is `[10, 65000]`. Default value is `65000`. Intended for use with dense data.
+- **discretize.sparse.min_bucket_weights**: Controls the minimum number of effective samples for each discretized value. Default value is `5.0`. Intended for use with sparse data.
+- **discretize.dense.min_bucket_weights**: Controls the minimum number of effective samples for each discretized value. Default value is `5.0`. Intended for use with dense data.
+- **discretize.sparse.lamL2**: Controls the degree of L2 regularization for discretization. Default value is `2.0`. Intended for use with sparse data.
+- **discretize.dense.lamL2**: Controls the degree of L2 regularization for discretization. Default value is `2.0`. Intended for use with dense data.
+- **discretize.sparse.max_features**: Controls the maximum number of selected features. Typical range is `[1000, 10000000]`. Default value is `80000`. Intended for use with sparse data.
+- **discretize.sparse.min_occurrences**: Controls the minimum number of occurrences for a feature to be selected. Default value is `5`. Intended for use with sparse data.
+
+# 5. Contact
 
 Please post an [issue](https://github.com/RGF-team/rgf/issues) at GitHub repository for any errors you encounter.
 
-# 5. Copyright
+# 6. Copyright
 
 FastRGF is distributed under the **MIT license**. Please read the file [`LICENSE`](./LICENSE).
 
-# 6. References
+# 7. References
 
 [1] [Rie Johnson and Tong Zhang. Learning Nonlinear Functions Using Regularized Greedy Forest.](https://arxiv.org/abs/1109.0887) IEEE Transactions on Pattern Analysis and Machine Intelligence, 36(5):942-954, May 2014.
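All of these hyperparameters are passed to the command-line tools as plain `key=value` arguments, exactly as the example scripts in this patch do. Below is a minimal training sketch; the paths and the madelon data file are borrowed from the binary classification example, and the particular values are illustrative, not recommendations:

```
# Train 1000 trees with logistic loss and strong L2 regularization;
# every hyperparameter is an ordinary key=value argument to forest_train.
../../bin/forest_train trn.x-file=inputs/madelon.train trn.x-file_format=y.sparse trn.target=BINARY \
                       model.save=outputs/model-rgf \
                       dtree.loss=LOGISTIC dtree.lamL2=5000 forest.ntrees=1000 \
                       discretize.sparse.max_buckets=200
```

Alternatively, options can be collected in a config file and passed via `-config=<file>`, as the regression example below does.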
diff --git a/FastRGF/examples/README.md b/FastRGF/examples/README.md
index 52de28c8..8a420ef3 100644
--- a/FastRGF/examples/README.md
+++ b/FastRGF/examples/README.md
@@ -1,34 +1,9 @@
-### Examples
----
-* ex1 This is a binary classification problem, in libsvm's sparse feature format.
-Use the *shell script* [run.sh](ex1/run.sh) to perform training/test.
-The dataset is downloaded from .
-
-
-* ex2: This is a regression problem, in dense feature format. Use the *shell script* [run.sh](ex2/run.sh) to perform training/test.
-The dataset is from .
-
+# Examples
+
-Note that for these small examples, the running time with multi-threads may be slower than with single-thread due to the overhead it introduces. However, for large datasets, one can observe an almost linear speed up.
+You can learn how to use FastRGF from these examples.
+
-The program can directly handle high dimensional sparse features in the libsvm format as in ex1. This is the recommended format to use when the dataset is relatively large (although some other formats are supported).
-
----
-### Tips for Parameter Tuning
-
-There are multiple training parameters that can affect performance. The following are the more important ones:
-
-* **dtree.loss**: default is LS, but for binary classificaiton, LOGISTIC often works better.
-* **forest.ntrees**: typical range is [100,10000], and a typical value is 1000.
-* **dtree.lamL2**: use a relatively large vale such as 1000 or 10000. The larger dtree.lamL2 is, the larger forest.ntrees you need to use: the resulting accuracy is often better with a longer training time.
-* **dtree.lamL1**: try values in [0,1000], and a large value induces sparsity.
-* **dtree.max_level** and **dtree.max_nodes** and **dtree.new_tree_gain_ratio**: these parameters control the tree depth and size (and when to start a new tree). One can try different values (such as dtree.max_level=4, or dtree.max_nodes=10, or dtree.new_tree_gain_ratio=0.5) to fine tuning performance.
-
-You may also modify the discreitzation options below:
-
-* **discretize.dense.max_buckets**: try in the range of [10,65000]
-* **discretize.sparse.max_buckets**: try in the range of [10, 250]. If you want to try a larger value up to 65000, then you need to edit [../include/header.h](../include/header.h) and replace
- "*using disc_sparse_value_t=unsigned char;*"
- by "*using disc_sparse_value_t=unsigned short;*". However, this increase the memory useage.
-* **discretize.sparse.max_features**: you may try a different value in [1000,10000000].
+Note that for these small examples, the running time with multithreading may be slower than with single-threading due to the overhead it introduces.
+However, for large datasets, one can observe an almost linear speedup.
+
+FastRGF can directly handle high-dimensional sparse features in the libsvm format as in the [binary_classification example](./binary_classification).
+This is the recommended format to use when the dataset is relatively large (although some other formats are supported).
diff --git a/FastRGF/examples/binary_classification/README.md b/FastRGF/examples/binary_classification/README.md
new file mode 100644
index 00000000..7517669d
--- /dev/null
+++ b/FastRGF/examples/binary_classification/README.md
@@ -0,0 +1,20 @@
+# Binary Classification Example
+
+Here is an example of running a binary classification task with FastRGF.
+The dataset for this example is taken from [here](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#madelon) and its features are written in the libsvm sparse format.
+
+Make sure that the executable files are placed in the `../../bin` folder.
+
+Execute the shell script in this folder to run the example.
+
+For Windows:
+
+```
+run.sh
+```
+
+For Unix-like systems:
+
+```
+bash run.sh
+```
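For context, the `y.sparse` (libsvm) format used by this example stores one sample per line: a label followed by `index:value` pairs for the non-zero features. A quick way to inspect it from the example folder; the echoed row is illustrative, not an actual madelon record:

```
# Each line: <label> <index>:<value> <index>:<value> ...
head -n 1 inputs/madelon.train
# => -1 1:485 2:477 3:537 ...
```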
diff --git a/FastRGF/examples/ex1/inputs/madelon.test b/FastRGF/examples/binary_classification/inputs/madelon.test
similarity index 100%
rename from FastRGF/examples/ex1/inputs/madelon.test
rename to FastRGF/examples/binary_classification/inputs/madelon.test
diff --git a/FastRGF/examples/ex1/inputs/madelon.train b/FastRGF/examples/binary_classification/inputs/madelon.train
similarity index 100%
rename from FastRGF/examples/ex1/inputs/madelon.train
rename to FastRGF/examples/binary_classification/inputs/madelon.train
diff --git a/FastRGF/examples/ex1/outputs/.gitignore b/FastRGF/examples/binary_classification/outputs/.gitignore
similarity index 100%
rename from FastRGF/examples/ex1/outputs/.gitignore
rename to FastRGF/examples/binary_classification/outputs/.gitignore
diff --git a/FastRGF/examples/ex1/run.sh b/FastRGF/examples/binary_classification/run.sh
similarity index 55%
rename from FastRGF/examples/ex1/run.sh
rename to FastRGF/examples/binary_classification/run.sh
index 56f556e1..89979c40 100644
--- a/FastRGF/examples/ex1/run.sh
+++ b/FastRGF/examples/binary_classification/run.sh
@@ -1,5 +1,4 @@
 #!/bin/sh -f
-#
 exe_train=../../bin/forest_train
 exe_predict=../../bin/forest_predict
@@ -9,28 +8,23 @@
 trn=inputs/madelon.train
 tst=inputs/madelon.test
 
 model_rgf=outputs/model-rgf
-
+
 prediction=outputs/prediction
 
 orig_format="y.sparse"
 
 save_freq=200
 
 echo ------ training ------
-time ${exe_train} trn.x-file=${trn} trn.x-file_format=${orig_format} trn.target=BINARY tst.x-file=${tst} tst.x-file_format=${orig_format} tst.target=BINARY model.save=${model_rgf} dtree.new_tree_gain_ratio=1.0 dtree.lamL2=5000 forest.ntrees=1000 dtree.loss=LOGISTIC forest.save_frequency=${save_freq}
-
-echo " "
+time ${exe_train} trn.x-file=${trn} trn.x-file_format=${orig_format} trn.target=BINARY tst.x-file=${tst} tst.x-file_format=${orig_format} tst.target=BINARY model.save=${model_rgf} dtree.new_tree_gain_ratio=1.0 dtree.lamL2=5000 forest.ntrees=1000 dtree.loss=LOGISTIC forest.save_frequency=${save_freq}
 echo " "
 
-echo ------ testing intermediate model at ${save_freq} on ${tst} ------
-time ${exe_predict} tst.x-file=${tst} tst.x-file_format=${orig_format} tst.target=BINARY model.load=${model_rgf}-${save_freq}
+echo ------ testing intermediate model at ${save_freq} on ${tst} ------
+time ${exe_predict} tst.x-file=${tst} tst.x-file_format=${orig_format} tst.target=BINARY model.load=${model_rgf}-${save_freq}
 echo " "
-echo " "
-echo ------ testing ------
-for datafile in ${trn} ${tst}
-do
-    suffix=`echo ${datafile}|sed 's/.*\.//g'`
-    echo === $datafile ===
-    time ${exe_predict} tst.x-file=${datafile} tst.x-file_format=${orig_format} tst.target=BINARY model.load=${model_rgf} tst.output-prediction=${prediction}-${suffix}
-done
-
+echo ------ testing ------
+echo === ${trn} ===
+time ${exe_predict} tst.x-file=${trn} tst.x-file_format=${orig_format} tst.target=BINARY model.load=${model_rgf} tst.output-prediction=${prediction}-train
+echo " "
+echo === ${tst} ===
+time ${exe_predict} tst.x-file=${tst} tst.x-file_format=${orig_format} tst.target=BINARY model.load=${model_rgf} tst.output-prediction=${prediction}-test
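For orientation, with `model.save=outputs/model-rgf`, `forest.save_frequency=200`, and `forest.ntrees=1000`, the script above should leave the final model, periodic checkpoints named with the `-<iteration>` suffix (the same convention that `model.load=${model_rgf}-${save_freq}` relies on), and the two prediction files in `outputs/`. The listing below is a sketch; the exact checkpoint files depend on `forest.ntrees` and `forest.save_frequency`:

```
# Inspect the artifacts produced by run.sh.
ls outputs/
# => model-rgf  model-rgf-200  model-rgf-400  ...  prediction-train  prediction-test
```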
-exe_predict=../../bin/forest_predict
-
-trn=inputs/housing.train
-tst=inputs/housing.test
-feat_name=inputs/feature.names
-
-config=inputs/config
-
-model_rgf=outputs/model-rgf
-
-prediction=outputs/prediction
-
-orig_format="y.x"
-
-echo ------ training ------
-time ${exe_train} -config=${config} trn.x-file=${trn} trn.x-file_format=${orig_format} trn.target=REAL tst.x-file=${tst} tst.x-file_format=${orig_format} tst.target=REAL model.save=${model_rgf}
-
-echo ------ print forest ------
-${exe_predict} model.load=${model_rgf} tst.print-forest=${model_rgf}.print tst.feature-names=${feat_name}
-
-echo " "
-echo " "
-echo ------ testing ------
-for datafile in ${trn} ${tst}
-do
-    suffix=`echo ${datafile}|sed 's/.*\.//g'`
-    echo === $datafile ===
-    time ${exe_predict} tst.x-file=${datafile} tst.x-file_format=${orig_format} tst.target=REAL model.load=${model_rgf} tst.output-prediction=${prediction}-${suffix}
-done
-
-
diff --git a/FastRGF/examples/regression/README.md b/FastRGF/examples/regression/README.md
new file mode 100644
index 00000000..e5ba8e86
--- /dev/null
+++ b/FastRGF/examples/regression/README.md
@@ -0,0 +1,20 @@
+# Regression Example
+
+Here is an example of running a regression task with FastRGF.
+The dataset for this example is taken from [here](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#housing) and its features are written in the dense format.
+
+Make sure that the executable files are placed in the `../../bin` folder.
+
+Execute the shell script in this folder to run the example.
+
+For Windows:
+
+```
+run.sh
+```
+
+For Unix-like systems:
+
+```
+bash run.sh
+```
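For context, the dense `y.x` format used by this example stores one sample per line: the target value followed by the full, whitespace-separated feature vector. A quick way to inspect it from the example folder; the echoed row is illustrative, not an actual housing record:

```
# Each line: <target> <feature_1> <feature_2> ... <feature_d>
head -n 1 inputs/housing.train
# => 24.0 0.00632 18.0 2.31 ...
```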
+model_rgf=outputs/model-rgf
+
+prediction=outputs/prediction
+
+orig_format="y.x"
+
+echo ------ training ------
+time ${exe_train} -config=${config} trn.x-file=${trn} trn.x-file_format=${orig_format} trn.target=REAL tst.x-file=${tst} tst.x-file_format=${orig_format} tst.target=REAL model.save=${model_rgf}
+echo " "
+
+echo ------ printing forest ------
+${exe_predict} model.load=${model_rgf} tst.print-forest=${model_rgf}.print tst.feature-names=${feat_name}
+echo " "
+
+echo ------ testing ------
+echo === ${trn} ===
+time ${exe_predict} tst.x-file=${trn} tst.x-file_format=${orig_format} tst.target=REAL model.load=${model_rgf} tst.output-prediction=${prediction}-train
+echo " "
+echo === ${tst} ===
+time ${exe_predict} tst.x-file=${tst} tst.x-file_format=${orig_format} tst.target=REAL model.load=${model_rgf} tst.output-prediction=${prediction}-test
diff --git a/FastRGF/include/discretization.h b/FastRGF/include/discretization.h
index 54d41df5..5cef36c1 100644
--- a/FastRGF/include/discretization.h
+++ b/FastRGF/include/discretization.h
@@ -144,7 +144,7 @@ namespace rgf {
 			80000,
 			"maximum number of selected features", this);
-      min_occurrences.insert(prefix+"min_occrrences",
+      min_occurrences.insert(prefix+"min_occurrences",
 			5,
 			"minimum number of occurrences for a feature to be selected", this);
@@ -275,5 +275,3 @@ namespace rgf {
 }
 
 #endif
-
-
diff --git a/FastRGF/include/dtree.h b/FastRGF/include/dtree.h
index 78378dd1..3eafa983 100644
--- a/FastRGF/include/dtree.h
+++ b/FastRGF/include/dtree.h
@@ -203,9 +203,9 @@ namespace rgf {
       maxNodes.insert(prefix + "max_nodes", 50,
 		      "maximum number of leaf nodes in best-first search", this);
       newTreeGainRatio.insert(prefix + "new_tree_gain_ratio",1.0,
-		      "new tree is created when leaf-nodes gain < this value * estimated gain of creating new three", this);
+		      "new tree is created when leaf nodes gain < this value * estimated gain of creating new tree", this);
       min_sample.insert(prefix + "min_sample", 5,
-		      "minum sample per node", this);
+		      "minimum number of samples per node", this);
       lamL1.insert(prefix + "lamL1", 1,
 		      "L1 regularization parameter", this);
diff --git a/python-package/Readme.rst b/python-package/Readme.rst
index 701b355b..7397d083 100644
--- a/python-package/Readme.rst
+++ b/python-package/Readme.rst
@@ -120,13 +120,13 @@ We provide `docker image `__.
+Detailed instructions for tuning hyperparameters are `here `__.
 
 FastRGF
 '''''''
diff --git a/python-package/rgf/fastrgf_model.py b/python-package/rgf/fastrgf_model.py
index d9d9c56d..4e06eca8 100644
--- a/python-package/rgf/fastrgf_model.py
+++ b/python-package/rgf/fastrgf_model.py
@@ -83,7 +83,7 @@
     sparse_min_occurences : int, optional (default=5)
         Minimum number of occurrences for a feature to be selected.
         Meant for being used with sparse data.
-        (Original name: discretize.sparse.min_occrrences.)
+        (Original name: discretize.sparse.min_occurrences.)
     {%calc_prob_parameter%}
     n_jobs : int, optional (default=-1)
         The number of jobs to run in parallel for both fit and predict.
@@ -523,7 +523,7 @@ def _get_train_command(self):
             params.append("discretize.sparse.max_buckets=%s" % self.max_bin)
             params.append("discretize.sparse.lamL2=%s" % self.data_l2)
             params.append("discretize.sparse.min_bucket_weights=%s" % self.min_child_weight)
-            params.append("discretize.sparse.min_occrrences=%s" % self.sparse_min_occurences)
+            params.append("discretize.sparse.min_occurrences=%s" % self.sparse_min_occurences)
             params.append("trn.x-file_format=x.sparse")
             params.append("trn.y-file=%s" % self._train_y_loc)
             if self._use_sample_weight:
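Since the Python wrapper only forwards native `key=value` options to the command-line tool (see `_get_train_command` above), the renamed option can also be checked directly against `forest_train`. A minimal sketch, reusing the data paths from the binary classification example:

```
# After this patch, the correctly spelled native option name is accepted.
../../bin/forest_train trn.x-file=inputs/madelon.train trn.x-file_format=y.sparse trn.target=BINARY \
                       model.save=outputs/model-rgf \
                       discretize.sparse.min_occurrences=5
```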