diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 15b39063..895db0f5 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -37,6 +37,8 @@ $ rust-code-analysis-cli -p src/algorithm/neighbour/fastpair.rs --ls 22 --le 213 ``` * find more information about what happens in your binary with [`twiggy`](https://rustwasm.github.io/twiggy/install.html). This need a compiled binary so create a brief `main {}` function using `smartcore` and then point `twiggy` to that file. +* Please take a look at the output of a profiler to spot the most evident performance problems; see [this guide about using a profiler](http://www.codeofview.com/fix-rs/2017/01/24/how-to-optimize-rust-programs-on-linux/). + ## Issue Report Process 1. Go to the project's issues. diff --git a/Cargo.toml b/Cargo.toml index 4f7a6ddb..76d3e998 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "smartcore" description = "Machine Learning in Rust." homepage = "https://smartcorelib.org" -version = "0.4.0" +version = "0.3.3" authors = ["smartcore Developers"] edition = "2021" license = "Apache-2.0" @@ -48,7 +48,7 @@ getrandom = { version = "0.2.8", optional = true } wasm-bindgen-test = "0.3" [dev-dependencies] -itertools = "0.10.5" +itertools = "0.12.0" serde_json = "1.0" bincode = "1.3.1" diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 3ff6b07b..2e2aac10 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -315,8 +315,7 @@ impl, Y: Array1, D: Distance>> } } - while !neighbors.is_empty() { - let neighbor = neighbors.pop().unwrap(); + while let Some(neighbor) = neighbors.pop() { let index = neighbor.0; if y[index] == outlier { diff --git a/src/dataset/diabetes.rs b/src/dataset/diabetes.rs index faf169eb..a95b5116 100644 --- a/src/dataset/diabetes.rs +++ b/src/dataset/diabetes.rs @@ -40,7 +40,7 @@ pub fn load_dataset() -> Dataset { target: y, num_samples, num_features, - feature_names: vec![ + feature_names: [ "Age", "Sex", "BMI", "BP", "S1", 
"S2", "S3", "S4", "S5", "S6", ] .iter() diff --git a/src/dataset/digits.rs b/src/dataset/digits.rs index b3556e53..c32648cd 100644 --- a/src/dataset/digits.rs +++ b/src/dataset/digits.rs @@ -25,16 +25,14 @@ pub fn load_dataset() -> Dataset { target: y, num_samples, num_features, - feature_names: vec![ - "sepal length (cm)", + feature_names: ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", - "petal width (cm)", - ] + "petal width (cm)"] .iter() .map(|s| s.to_string()) .collect(), - target_names: vec!["setosa", "versicolor", "virginica"] + target_names: ["setosa", "versicolor", "virginica"] .iter() .map(|s| s.to_string()) .collect(), diff --git a/src/dataset/iris.rs b/src/dataset/iris.rs index fe60241a..75c58acc 100644 --- a/src/dataset/iris.rs +++ b/src/dataset/iris.rs @@ -36,7 +36,7 @@ pub fn load_dataset() -> Dataset { target: y, num_samples, num_features, - feature_names: vec![ + feature_names: [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", @@ -45,7 +45,7 @@ pub fn load_dataset() -> Dataset { .iter() .map(|s| s.to_string()) .collect(), - target_names: vec!["setosa", "versicolor", "virginica"] + target_names: ["setosa", "versicolor", "virginica"] .iter() .map(|s| s.to_string()) .collect(), diff --git a/src/linalg/basic/arrays.rs b/src/linalg/basic/arrays.rs index a3bbe038..99df2078 100644 --- a/src/linalg/basic/arrays.rs +++ b/src/linalg/basic/arrays.rs @@ -188,8 +188,7 @@ pub trait ArrayView1: Array { _ => max, } }; - self.iterator(0) - .fold(T::min_value(), |max, x| max_f(max, x)) + self.iterator(0).fold(T::min_value(), max_f) } /// return min value from the view fn min(&self) -> T @@ -202,8 +201,7 @@ pub trait ArrayView1: Array { _ => min, } }; - self.iterator(0) - .fold(T::max_value(), |max, x| min_f(max, x)) + self.iterator(0).fold(T::max_value(), min_f) } /// return the position of the max value of the view fn argmax(&self) -> usize diff --git a/src/linalg/basic/matrix.rs b/src/linalg/basic/matrix.rs index 0c2e8696..49833b70 
100644 --- a/src/linalg/basic/matrix.rs +++ b/src/linalg/basic/matrix.rs @@ -495,9 +495,9 @@ impl SVDDecomposable for DenseMatrix {} impl<'a, T: Debug + Display + Copy + Sized> Array for DenseMatrixView<'a, T> { fn get(&self, pos: (usize, usize)) -> &T { if self.column_major { - &self.values[(pos.0 + pos.1 * self.stride)] + &self.values[pos.0 + pos.1 * self.stride] } else { - &self.values[(pos.0 * self.stride + pos.1)] + &self.values[pos.0 * self.stride + pos.1] } } @@ -559,9 +559,9 @@ impl<'a, T: Debug + Display + Copy + Sized> ArrayView1 for DenseMatrixView<'a impl<'a, T: Debug + Display + Copy + Sized> Array for DenseMatrixMutView<'a, T> { fn get(&self, pos: (usize, usize)) -> &T { if self.column_major { - &self.values[(pos.0 + pos.1 * self.stride)] + &self.values[pos.0 + pos.1 * self.stride] } else { - &self.values[(pos.0 * self.stride + pos.1)] + &self.values[pos.0 * self.stride + pos.1] } } @@ -583,9 +583,9 @@ impl<'a, T: Debug + Display + Copy + Sized> MutArray { fn set(&mut self, pos: (usize, usize), x: T) { if self.column_major { - self.values[(pos.0 + pos.1 * self.stride)] = x; + self.values[pos.0 + pos.1 * self.stride] = x; } else { - self.values[(pos.0 * self.stride + pos.1)] = x; + self.values[pos.0 * self.stride + pos.1] = x; } } @@ -775,7 +775,7 @@ mod tests { #[test] fn test_from_iterator() { - let data = vec![1, 2, 3, 4, 5, 6]; + let data = [1, 2, 3, 4, 5, 6]; let m = DenseMatrix::from_iterator(data.iter(), 2, 3, 0); diff --git a/src/linalg/basic/vector.rs b/src/linalg/basic/vector.rs index 3fb25847..05c03756 100644 --- a/src/linalg/basic/vector.rs +++ b/src/linalg/basic/vector.rs @@ -15,6 +15,25 @@ pub struct VecView<'a, T: Debug + Display + Copy + Sized> { ptr: &'a [T], } +impl Array for &[T] { + fn get(&self, i: usize) -> &T { + &self[i] + } + + fn shape(&self) -> usize { + self.len() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn iterator<'b>(&'b self, axis: u8) -> Box + 'b> { + assert!(axis == 0, "For one dimensional array 
`axis` should == 0"); + Box::new(self.iter()) + } +} + impl Array for Vec { fn get(&self, i: usize) -> &T { &self[i] @@ -47,6 +66,7 @@ impl MutArray for Vec { } impl ArrayView1 for Vec {} +impl ArrayView1 for &[T] {} impl MutArrayView1 for Vec {} @@ -192,7 +212,7 @@ mod tests { #[test] fn test_len() { - let x = vec![1, 2, 3]; + let x = [1, 2, 3]; assert_eq!(3, x.len()); } diff --git a/src/linear/bg_solver.rs b/src/linear/bg_solver.rs index 6ac9b3e4..6ee4f0ec 100644 --- a/src/linear/bg_solver.rs +++ b/src/linear/bg_solver.rs @@ -162,7 +162,7 @@ mod tests { let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]) .unwrap(); let b = vec![40., 51., 28.]; - let expected = vec![1.0, 2.0, 3.0]; + let expected = [1.0, 2.0, 3.0]; let mut x = Vec::zeros(3); diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index d07b54f1..12ecf8d8 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -898,11 +898,7 @@ mod tests { let y_hat = lr.predict(&x).unwrap(); - let error: i32 = y - .into_iter() - .zip(y_hat.into_iter()) - .map(|(a, b)| (a - b).abs()) - .sum(); + let error: i32 = y.into_iter().zip(y_hat).map(|(a, b)| (a - b).abs()).sum(); assert!(error <= 1); diff --git a/src/model_selection/hyper_tuning/grid_search.rs b/src/model_selection/hyper_tuning/grid_search.rs index 3c914e48..74242c60 100644 --- a/src/model_selection/hyper_tuning/grid_search.rs +++ b/src/model_selection/hyper_tuning/grid_search.rs @@ -3,9 +3,9 @@ use crate::{ api::{Predictor, SupervisedEstimator}, error::{Failed, FailedError}, - linalg::basic::arrays::{Array2, Array1}, - numbers::realnum::RealNumber, + linalg::basic::arrays::{Array1, Array2}, numbers::basenum::Number, + numbers::realnum::RealNumber, }; use crate::model_selection::{cross_validate, BaseKFold, CrossValidationResult}; diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs index 760881b7..d7ad22d2 100644 --- 
a/src/model_selection/kfold.rs +++ b/src/model_selection/kfold.rs @@ -283,9 +283,7 @@ mod tests { (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { + for ((train, test), (expected_train, expected_test)) in k.split(&x).zip(expected) { assert_eq!(test, expected_test); assert_eq!(train, expected_train); } @@ -307,9 +305,7 @@ mod tests { (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { + for ((train, test), (expected_train, expected_test)) in k.split(&x).zip(expected) { assert_eq!(test.len(), expected_test.len()); assert_eq!(train.len(), expected_train.len()); } diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index e7ab7f6d..11614d14 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -40,7 +40,7 @@ use crate::linalg::basic::arrays::{Array1, Array2, ArrayView1}; use crate::numbers::basenum::Number; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use std::marker::PhantomData; +use std::{cmp::Ordering, marker::PhantomData}; /// Distribution used in the Naive Bayes classifier. 
pub(crate) trait NBDistribution: Clone { @@ -92,11 +92,10 @@ impl, Y: Array1, D: NBDistribution Result { let y_classes = self.distribution.classes(); - let (rows, _) = x.shape(); - let predictions = (0..rows) - .map(|row_index| { - let row = x.get_row(row_index); - let (prediction, _probability) = y_classes + let predictions = x + .row_iter() + .map(|row| { + y_classes .iter() .enumerate() .map(|(class_index, class)| { @@ -106,11 +105,26 @@ impl, Y: Array1, D: NBDistribution ordering, + None => { + if p1.is_nan() { + Ordering::Less + } else if p2.is_nan() { + Ordering::Greater + } else { + Ordering::Equal + } + } + }) + .map(|(prediction, _probability)| *prediction) + .ok_or_else(|| Failed::predict("Failed to predict, there is no result")) }) - .collect::>(); + .collect::, Failed>>()?; let y_hat = Y::from_vec_slice(&predictions); Ok(y_hat) } @@ -119,3 +133,63 @@ pub mod bernoulli; pub mod categorical; pub mod gaussian; pub mod multinomial; + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::basic::arrays::Array; + use crate::linalg::basic::matrix::DenseMatrix; + use num_traits::float::Float; + + type Model<'d> = BaseNaiveBayes, Vec, TestDistribution<'d>>; + + #[derive(Debug, PartialEq, Clone)] + struct TestDistribution<'d>(&'d Vec); + + impl<'d> NBDistribution for TestDistribution<'d> { + fn prior(&self, _class_index: usize) -> f64 { + 1. 
+ } + + fn log_likelihood<'a>( + &'a self, + class_index: usize, + _j: &'a Box + 'a>, + ) -> f64 { + match self.0.get(class_index) { + &v @ 2 | &v @ 10 | &v @ 20 => v as f64, + _ => f64::nan(), + } + } + + fn classes(&self) -> &Vec { + &self.0 + } + } + + #[test] + fn test_predict() { + let matrix = DenseMatrix::from_2d_array(&[&[1, 2, 3], &[4, 5, 6], &[7, 8, 9]]); + + let val = vec![]; + match Model::fit(TestDistribution(&val)).unwrap().predict(&matrix) { + Ok(_) => panic!("Should return error in case of empty classes"), + Err(err) => assert_eq!( + err.to_string(), + "Predict failed: Failed to predict, there is no result" + ), + } + + let val = vec![1, 2, 3]; + match Model::fit(TestDistribution(&val)).unwrap().predict(&matrix) { + Ok(r) => assert_eq!(r, vec![2, 2, 2]), + Err(_) => panic!("Should success in normal case with NaNs"), + } + + let val = vec![20, 2, 10]; + match Model::fit(TestDistribution(&val)).unwrap().predict(&matrix) { + Ok(r) => assert_eq!(r, vec![20, 20, 20]), + Err(_) => panic!("Should success in normal case without NaNs"), + } + } +} diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 5b38ddd9..e4efe48a 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -298,7 +298,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]) .unwrap(); let y: Vec = vec![1., 2., 3., 4., 5.]; - let y_exp = vec![1., 2., 3., 4., 5.]; + let y_exp = [1., 2., 3., 4., 5.]; let knn = KNNRegressor::fit( &x, &y, @@ -326,7 +326,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]) .unwrap(); let y: Vec = vec![1., 2., 3., 4., 5.]; - let y_exp = vec![2., 2., 3., 4., 4.]; + let y_exp = [2., 2., 3., 4., 4.]; let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); diff --git a/src/preprocessing/categorical.rs b/src/preprocessing/categorical.rs 
index feafd2d9..e17dea78 100644 --- a/src/preprocessing/categorical.rs +++ b/src/preprocessing/categorical.rs @@ -281,7 +281,7 @@ mod tests { )] #[test] fn hash_encode_f64_series() { - let series = vec![3.0, 1.0, 2.0, 1.0]; + let series = [3.0, 1.0, 2.0, 1.0]; let hashable_series: Vec = series.iter().map(|v| v.to_category()).collect(); let enc = CategoryMapper::from_positional_category_vec(hashable_series); diff --git a/src/readers/csv.rs b/src/readers/csv.rs index 7f902ff1..f8a03ebd 100644 --- a/src/readers/csv.rs +++ b/src/readers/csv.rs @@ -83,7 +83,7 @@ where Matrix: Array2, { let csv_text = read_string_from_source(source)?; - let rows: Vec> = extract_row_vectors_from_csv_text::( + let rows: Vec> = extract_row_vectors_from_csv_text( &csv_text, &definition, detect_row_format(&csv_text, &definition)?, @@ -103,12 +103,7 @@ where /// Given a string containing the contents of a csv file, extract its value /// into row-vectors. -fn extract_row_vectors_from_csv_text< - 'a, - T: Number + RealNumber + std::str::FromStr, - RowVector: Array1, - Matrix: Array2, ->( +fn extract_row_vectors_from_csv_text<'a, T: Number + RealNumber + std::str::FromStr>( csv_text: &'a str, definition: &'a CSVDefinition<'_>, row_format: CSVRowFormat<'_>, @@ -306,12 +301,11 @@ mod tests { } mod extract_row_vectors_from_csv_text { use super::super::{extract_row_vectors_from_csv_text, CSVDefinition, CSVRowFormat}; - use crate::linalg::basic::matrix::DenseMatrix; #[test] fn read_default_csv() { assert_eq!( - extract_row_vectors_from_csv_text::, DenseMatrix<_>>( + extract_row_vectors_from_csv_text::( "column 1, column 2, column3\n1.0,2.0,3.0\n4.0,5.0,6.0", &CSVDefinition::default(), CSVRowFormat { diff --git a/src/svm/mod.rs b/src/svm/mod.rs index b2bd79cb..0792fdb8 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -56,7 +56,7 @@ pub struct Kernels; impl Kernels { /// Return a default linear pub fn linear() -> LinearKernel { - LinearKernel::default() + LinearKernel } /// Return a default RBF pub 
fn rbf() -> RBFKernel { diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9833ac82..6477778b 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -322,19 +322,26 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2 + 'a, Y: Array let (n, _) = x.shape(); let mut y_hat: Vec = Array1::zeros(n); + let mut row = Vec::with_capacity(n); for i in 0..n { - let row_pred: TX = - self.predict_for_row(Vec::from_iterator(x.get_row(i).iterator(0).copied(), n)); + row.clear(); + row.extend(x.get_row(i).iterator(0).copied()); + let row_pred: TX = self.predict_for_row(&row); y_hat.set(i, row_pred); } Ok(y_hat) } - fn predict_for_row(&self, x: Vec) -> TX { + fn predict_for_row(&self, x: &[TX]) -> TX { let mut f = self.b.unwrap(); + let xi: Vec<_> = x.iter().map(|e| e.to_f64().unwrap()).collect(); for i in 0..self.instances.as_ref().unwrap().len() { + let xj: Vec<_> = self.instances.as_ref().unwrap()[i] + .iter() + .map(|e| e.to_f64().unwrap()) + .collect(); f += self.w.as_ref().unwrap()[i] * TX::from( self.parameters @@ -343,13 +350,7 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2 + 'a, Y: Array .kernel .as_ref() .unwrap() - .apply( - &x.iter().map(|e| e.to_f64().unwrap()).collect(), - &self.instances.as_ref().unwrap()[i] - .iter() - .map(|e| e.to_f64().unwrap()) - .collect(), - ) + .apply(&xi, &xj) .unwrap(), ) .unwrap(); @@ -472,14 +473,12 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 let tol = self.parameters.tol; let good_enough = TX::from_i32(1000).unwrap(); + let mut x = Vec::with_capacity(n); for _ in 0..self.parameters.epoch { for i in self.permutate(n) { - self.process( - i, - Vec::from_iterator(self.x.get_row(i).iterator(0).copied(), n), - *self.y.get(i), - &mut cache, - ); + x.clear(); + x.extend(self.x.get_row(i).iterator(0).take(n).copied()); + self.process(i, &x, *self.y.get(i), &mut cache); loop { self.reprocess(tol, &mut cache); self.find_min_max_gradient(); @@ -511,24 +510,17 @@ impl<'a, TX: Number + 
RealNumber, TY: Number + Ord, X: Array2, Y: Array1 let mut cp = 0; let mut cn = 0; + let mut x = Vec::with_capacity(n); for i in self.permutate(n) { + x.clear(); + x.extend(self.x.get_row(i).iterator(0).take(n).copied()); if *self.y.get(i) == TY::one() && cp < few { - if self.process( - i, - Vec::from_iterator(self.x.get_row(i).iterator(0).copied(), n), - *self.y.get(i), - cache, - ) { + if self.process(i, &x, *self.y.get(i), cache) { cp += 1; } } else if *self.y.get(i) == TY::from(-1).unwrap() && cn < few - && self.process( - i, - Vec::from_iterator(self.x.get_row(i).iterator(0).copied(), n), - *self.y.get(i), - cache, - ) + && self.process(i, &x, *self.y.get(i), cache) { cn += 1; } @@ -539,7 +531,7 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 } } - fn process(&mut self, i: usize, x: Vec, y: TY, cache: &mut Cache) -> bool { + fn process(&mut self, i: usize, x: &[TX], y: TY, cache: &mut Cache) -> bool { for j in 0..self.sv.len() { if self.sv[j].index == i { return true; @@ -551,15 +543,14 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 let mut cache_values: Vec<((usize, usize), TX)> = Vec::new(); for v in self.sv.iter() { + let xi: Vec<_> = v.x.iter().map(|e| e.to_f64().unwrap()).collect(); + let xj: Vec<_> = x.iter().map(|e| e.to_f64().unwrap()).collect(); let k = self .parameters .kernel .as_ref() .unwrap() - .apply( - &v.x.iter().map(|e| e.to_f64().unwrap()).collect(), - &x.iter().map(|e| e.to_f64().unwrap()).collect(), - ) + .apply(&xi, &xj) .unwrap(); cache_values.push(((i, v.index), TX::from(k).unwrap())); g -= v.alpha * k; @@ -578,7 +569,7 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 cache.insert(v.0, v.1.to_f64().unwrap()); } - let x_f64 = x.iter().map(|e| e.to_f64().unwrap()).collect(); + let x_f64: Vec<_> = x.iter().map(|e| e.to_f64().unwrap()).collect(); let k_v = self .parameters .kernel @@ -701,8 +692,10 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: 
Array2, Y: Array1 let km = sv1.k; let gm = sv1.grad; let mut best = 0f64; + let xi: Vec<_> = sv1.x.iter().map(|e| e.to_f64().unwrap()).collect(); for i in 0..self.sv.len() { let v = &self.sv[i]; + let xj: Vec<_> = v.x.iter().map(|e| e.to_f64().unwrap()).collect(); let z = v.grad - gm; let k = cache.get( sv1, @@ -711,10 +704,7 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 .kernel .as_ref() .unwrap() - .apply( - &sv1.x.iter().map(|e| e.to_f64().unwrap()).collect(), - &v.x.iter().map(|e| e.to_f64().unwrap()).collect(), - ) + .apply(&xi, &xj) .unwrap(), ); let mut curv = km + v.k - 2f64 * k; @@ -732,6 +722,12 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 } } + let xi: Vec<_> = self.sv[idx_1] + .x + .iter() + .map(|e| e.to_f64().unwrap()) + .collect::>(); + idx_2.map(|idx_2| { ( idx_1, @@ -742,16 +738,12 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 .as_ref() .unwrap() .apply( - &self.sv[idx_1] - .x - .iter() - .map(|e| e.to_f64().unwrap()) - .collect(), + &xi, &self.sv[idx_2] .x .iter() .map(|e| e.to_f64().unwrap()) - .collect(), + .collect::>(), ) .unwrap() }), @@ -765,8 +757,11 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 let km = sv2.k; let gm = sv2.grad; let mut best = 0f64; + + let xi: Vec<_> = sv2.x.iter().map(|e| e.to_f64().unwrap()).collect(); for i in 0..self.sv.len() { let v = &self.sv[i]; + let xj: Vec<_> = v.x.iter().map(|e| e.to_f64().unwrap()).collect(); let z = gm - v.grad; let k = cache.get( sv2, @@ -775,10 +770,7 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 .kernel .as_ref() .unwrap() - .apply( - &sv2.x.iter().map(|e| e.to_f64().unwrap()).collect(), - &v.x.iter().map(|e| e.to_f64().unwrap()).collect(), - ) + .apply(&xi, &xj) .unwrap(), ); let mut curv = km + v.k - 2f64 * k; @@ -797,6 +789,12 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 } } + let xj: Vec<_> = self.sv[idx_2] 
+ .x + .iter() + .map(|e| e.to_f64().unwrap()) + .collect(); + idx_1.map(|idx_1| { ( idx_1, @@ -811,12 +809,8 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 .x .iter() .map(|e| e.to_f64().unwrap()) - .collect(), - &self.sv[idx_2] - .x - .iter() - .map(|e| e.to_f64().unwrap()) - .collect(), + .collect::>(), + &xj, ) .unwrap() }), @@ -835,12 +829,12 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 .x .iter() .map(|e| e.to_f64().unwrap()) - .collect(), + .collect::>(), &self.sv[idx_2] .x .iter() .map(|e| e.to_f64().unwrap()) - .collect(), + .collect::>(), ) .unwrap(), )), @@ -895,7 +889,10 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 self.sv[v1].alpha -= step.to_f64().unwrap(); self.sv[v2].alpha += step.to_f64().unwrap(); + let xi_v1: Vec<_> = self.sv[v1].x.iter().map(|e| e.to_f64().unwrap()).collect(); + let xi_v2: Vec<_> = self.sv[v2].x.iter().map(|e| e.to_f64().unwrap()).collect(); for i in 0..self.sv.len() { + let xj: Vec<_> = self.sv[i].x.iter().map(|e| e.to_f64().unwrap()).collect(); let k2 = cache.get( &self.sv[v2], &self.sv[i], @@ -903,10 +900,7 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 .kernel .as_ref() .unwrap() - .apply( - &self.sv[v2].x.iter().map(|e| e.to_f64().unwrap()).collect(), - &self.sv[i].x.iter().map(|e| e.to_f64().unwrap()).collect(), - ) + .apply(&xi_v2, &xj) .unwrap(), ); let k1 = cache.get( @@ -916,10 +910,7 @@ impl<'a, TX: Number + RealNumber, TY: Number + Ord, X: Array2, Y: Array1 .kernel .as_ref() .unwrap() - .apply( - &self.sv[v1].x.iter().map(|e| e.to_f64().unwrap()).collect(), - &self.sv[i].x.iter().map(|e| e.to_f64().unwrap()).collect(), - ) + .apply(&xi_v1, &xj) .unwrap(), ); self.sv[i].grad -= step.to_f64().unwrap() * (k2 - k1); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index fb2d8aa0..e68ebf85 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -248,19 +248,20 @@ impl<'a, T: Number + FloatNumber + PartialOrd, X: 
Array2, Y: Array1> SVR<' let mut y_hat: Vec = Vec::::zeros(n); + let mut x_i = Vec::with_capacity(n); for i in 0..n { - y_hat.set( - i, - self.predict_for_row(Vec::from_iterator(x.get_row(i).iterator(0).copied(), n)), - ); + x_i.clear(); + x_i.extend(x.get_row(i).iterator(0).copied()); + y_hat.set(i, self.predict_for_row(&x_i)); } Ok(y_hat) } - pub(crate) fn predict_for_row(&self, x: Vec) -> T { + pub(crate) fn predict_for_row(&self, x: &[T]) -> T { let mut f = self.b; + let xi: Vec<_> = x.iter().map(|e| e.to_f64().unwrap()).collect(); for i in 0..self.instances.as_ref().unwrap().len() { f += self.w.as_ref().unwrap()[i] * T::from( @@ -270,10 +271,7 @@ impl<'a, T: Number + FloatNumber + PartialOrd, X: Array2, Y: Array1> SVR<' .kernel .as_ref() .unwrap() - .apply( - &x.iter().map(|e| e.to_f64().unwrap()).collect(), - &self.instances.as_ref().unwrap()[i], - ) + .apply(&xi, &self.instances.as_ref().unwrap()[i]) .unwrap(), ) .unwrap() diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index abda96ed..495e07dc 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -116,6 +116,7 @@ pub struct DecisionTreeClassifier< num_classes: usize, classes: Vec, depth: u16, + num_features: usize, _phantom_tx: PhantomData, _phantom_x: PhantomData, _phantom_y: PhantomData, @@ -159,11 +160,13 @@ pub enum SplitCriterion { #[derive(Debug, Clone)] struct Node { output: usize, + n_node_samples: usize, split_feature: usize, split_value: Option, split_score: Option, true_child: Option, false_child: Option, + impurity: Option, } impl, Y: Array1> PartialEq @@ -400,14 +403,16 @@ impl Default for DecisionTreeClassifierSearchParameters { } impl Node { - fn new(output: usize) -> Self { + fn new(output: usize, n_node_samples: usize) -> Self { Node { output, + n_node_samples, split_feature: 0, split_value: Option::None, split_score: Option::None, true_child: Option::None, false_child: Option::None, + impurity: Option::None, 
} } } @@ -507,6 +512,7 @@ impl, Y: Array1> num_classes: 0usize, classes: vec![], depth: 0u16, + num_features: 0usize, _phantom_tx: PhantomData, _phantom_x: PhantomData, _phantom_y: PhantomData, @@ -578,7 +584,7 @@ impl, Y: Array1> count[yi[i]] += samples[i]; } - let root = Node::new(which_max(&count)); + let root = Node::new(which_max(&count), y_ncols); change_nodes.push(root); let mut order: Vec> = Vec::new(); @@ -593,6 +599,7 @@ impl, Y: Array1> num_classes: k, classes, depth: 0u16, + num_features: num_attributes, _phantom_tx: PhantomData, _phantom_x: PhantomData, _phantom_y: PhantomData, @@ -678,16 +685,7 @@ impl, Y: Array1> } } - if is_pure { - return false; - } - let n = visitor.samples.iter().sum(); - - if n <= self.parameters().min_samples_split { - return false; - } - let mut count = vec![0; self.num_classes]; let mut false_count = vec![0; self.num_classes]; for i in 0..n_rows { @@ -696,7 +694,15 @@ impl, Y: Array1> } } - let parent_impurity = impurity(&self.parameters().criterion, &count, n); + self.nodes[visitor.node].impurity = Some(impurity(&self.parameters().criterion, &count, n)); + + if is_pure { + return false; + } + + if n <= self.parameters().min_samples_split { + return false; + } let mut variables = (0..n_attr).collect::>(); @@ -705,14 +711,7 @@ impl, Y: Array1> } for variable in variables.iter().take(mtry) { - self.find_best_split( - visitor, - n, - &count, - &mut false_count, - parent_impurity, - *variable, - ); + self.find_best_split(visitor, n, &count, &mut false_count, *variable); } self.nodes()[visitor.node].split_score.is_some() @@ -724,7 +723,6 @@ impl, Y: Array1> n: usize, count: &[usize], false_count: &mut [usize], - parent_impurity: f64, j: usize, ) { let mut true_count = vec![0; self.num_classes]; @@ -760,6 +758,7 @@ impl, Y: Array1> let true_label = which_max(&true_count); let false_label = which_max(false_count); + let parent_impurity = self.nodes()[visitor.node].impurity.unwrap(); let gain = parent_impurity - tc as f64 / n as f64 
* impurity(&self.parameters().criterion, &true_count, tc) @@ -827,9 +826,9 @@ impl, Y: Array1> let true_child_idx = self.nodes().len(); - self.nodes.push(Node::new(visitor.true_child_output)); + self.nodes.push(Node::new(visitor.true_child_output, tc)); let false_child_idx = self.nodes().len(); - self.nodes.push(Node::new(visitor.false_child_output)); + self.nodes.push(Node::new(visitor.false_child_output, fc)); self.nodes[visitor.node].true_child = Some(true_child_idx); self.nodes[visitor.node].false_child = Some(false_child_idx); @@ -863,6 +862,33 @@ impl, Y: Array1> true } + + /// Compute feature importances for the fitted tree. + pub fn compute_feature_importances(&self, normalize: bool) -> Vec { + let mut importances = vec![0f64; self.num_features]; + + for node in self.nodes().iter() { + if node.true_child.is_none() && node.false_child.is_none() { + continue; + } + let left = &self.nodes()[node.true_child.unwrap()]; + let right = &self.nodes()[node.false_child.unwrap()]; + + importances[node.split_feature] += node.n_node_samples as f64 * node.impurity.unwrap() + - left.n_node_samples as f64 * left.impurity.unwrap() + - right.n_node_samples as f64 * right.impurity.unwrap(); + } + for item in importances.iter_mut() { + *item /= self.nodes()[0].n_node_samples as f64; + } + if normalize { + let sum = importances.iter().sum::(); + for importance in importances.iter_mut() { + *importance /= sum; + } + } + importances + } } #[cfg(test)] @@ -1018,6 +1044,42 @@ mod tests { ); } + #[test] + fn test_compute_feature_importances() { + let x: DenseMatrix = DenseMatrix::from_2d_array(&[ + &[1., 1., 1., 0.], + &[1., 1., 1., 0.], + &[1., 1., 1., 1.], + &[1., 1., 0., 0.], + &[1., 1., 0., 1.], + &[1., 0., 1., 0.], + &[1., 0., 1., 0.], + &[1., 0., 1., 1.], + &[1., 0., 0., 0.], + &[1., 0., 0., 1.], + &[0., 1., 1., 0.], + &[0., 1., 1., 0.], + &[0., 1., 1., 1.], + &[0., 1., 0., 0.], + &[0., 1., 0., 1.], + &[0., 0., 1., 0.], + &[0., 0., 1., 0.], + &[0., 0., 1., 1.], + &[0., 0., 0., 
0.], + &[0., 0., 0., 1.], + ]); + let y: Vec = vec![1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0]; + let tree = DecisionTreeClassifier::fit(&x, &y, Default::default()).unwrap(); + assert_eq!( + tree.compute_feature_importances(false), + vec![0., 0., 0.21333333333333332, 0.26666666666666666] + ); + assert_eq!( + tree.compute_feature_importances(true), + vec![0., 0., 0.4444444444444444, 0.5555555555555556] + ); + } + #[cfg_attr( all(target_arch = "wasm32", not(target_os = "wasi")), wasm_bindgen_test::wasm_bindgen_test diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index dca4acaf..1569af2e 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -768,7 +768,7 @@ mod tests { assert!((y_hat[i] - y[i]).abs() < 0.1); } - let expected_y = vec![ + let expected_y = [ 87.3, 87.3, 87.3, 87.3, 98.9, 98.9, 98.9, 98.9, 98.9, 107.9, 107.9, 107.9, 114.85, 114.85, 114.85, 114.85, ]; @@ -789,7 +789,7 @@ mod tests { assert!((y_hat[i] - expected_y[i]).abs() < 0.1); } - let expected_y = vec![ + let expected_y = [ 83.0, 88.35, 88.35, 89.5, 97.15, 97.15, 99.5, 99.5, 101.2, 104.6, 109.6, 109.6, 113.4, 113.4, 116.30, 116.30, ];