Skip to content

Commit 102de2d

Browse files
authored
feat: regularization for decision trees and random forests (#730)
Closes #700. Summary of changes: add regularization options for decision trees and random forests — maximum depth, and minimum number of samples in leaves.
1 parent 1cc14b1 commit 102de2d

File tree

8 files changed

+420
-51
lines changed

8 files changed

+420
-51
lines changed

src/safeds/ml/classical/classification/_decision_tree.py

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import TYPE_CHECKING
44

55
from safeds._utils import _structural_hash
6+
from safeds.exceptions import ClosedBound, OutOfBoundsError
67
from safeds.ml.classical._util_sklearn import fit, predict
78

89
from ._classifier import Classifier
@@ -16,17 +17,66 @@
1617

1718

1819
class DecisionTreeClassifier(Classifier):
19-
"""Decision tree classification."""
20+
"""
21+
Decision tree classification.
22+
23+
Parameters
24+
----------
25+
maximum_depth:
26+
The maximum depth of the tree. If None, the depth is not limited. Has to be greater than 0.
27+
minimum_number_of_samples_in_leaves:
28+
The minimum number of samples that must remain in the leaves of the tree. Has to be greater than 0.
29+
30+
Raises
31+
------
32+
OutOfBoundsError
33+
If `maximum_depth` is less than 1.
34+
OutOfBoundsError
35+
If `minimum_number_of_samples_in_leaves` is less than 1.
36+
"""
37+
38+
def __init__(
39+
self,
40+
*,
41+
maximum_depth: int | None = None,
42+
minimum_number_of_samples_in_leaves: int = 1,
43+
) -> None:
44+
# Validation
45+
if maximum_depth is not None and maximum_depth < 1:
46+
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
47+
if minimum_number_of_samples_in_leaves < 1:
48+
raise OutOfBoundsError(
49+
minimum_number_of_samples_in_leaves,
50+
name="minimum_number_of_samples_in_leaves",
51+
lower_bound=ClosedBound(1),
52+
)
53+
54+
# Hyperparameters
55+
self._maximum_depth: int | None = maximum_depth
56+
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves
2057

21-
def __hash__(self) -> int:
22-
return _structural_hash(Classifier.__hash__(self), self._target_name, self._feature_names)
23-
24-
def __init__(self) -> None:
2558
# Internal state
2659
self._wrapped_classifier: sk_DecisionTreeClassifier | None = None
2760
self._feature_names: list[str] | None = None
2861
self._target_name: str | None = None
2962

63+
def __hash__(self) -> int:
64+
return _structural_hash(
65+
Classifier.__hash__(self),
66+
self._feature_names,
67+
self._target_name,
68+
)
69+
70+
@property
71+
def maximum_depth(self) -> int | None:
72+
"""The maximum depth of the tree."""
73+
return self._maximum_depth
74+
75+
@property
76+
def minimum_number_of_samples_in_leaves(self) -> int:
77+
"""The minimum number of samples that must remain in the leaves of the tree."""
78+
return self._minimum_number_of_samples_in_leaves
79+
3080
def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier:
3181
"""
3282
Create a copy of this classifier and fit it with the given training data.
@@ -59,7 +109,10 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier:
59109
wrapped_classifier = self._get_sklearn_classifier()
60110
fit(wrapped_classifier, training_set)
61111

62-
result = DecisionTreeClassifier()
112+
result = DecisionTreeClassifier(
113+
maximum_depth=self._maximum_depth,
114+
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
115+
)
63116
result._wrapped_classifier = wrapped_classifier
64117
result._feature_names = training_set.features.column_names
65118
result._target_name = training_set.target.name
@@ -105,4 +158,7 @@ def is_fitted(self) -> bool:
105158
def _get_sklearn_classifier(self) -> ClassifierMixin:
106159
from sklearn.tree import DecisionTreeClassifier as sk_DecisionTreeClassifier
107160

108-
return sk_DecisionTreeClassifier()
161+
return sk_DecisionTreeClassifier(
162+
max_depth=self._maximum_depth,
163+
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
164+
)

src/safeds/ml/classical/classification/_random_forest.py

Lines changed: 60 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,52 +17,82 @@
1717

1818

1919
class RandomForestClassifier(Classifier):
20-
"""Random forest classification.
20+
"""
21+
Random forest classification.
2122
2223
Parameters
2324
----------
2425
number_of_trees:
2526
The number of trees to be used in the random forest. Has to be greater than 0.
27+
maximum_depth:
28+
The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0.
29+
minimum_number_of_samples_in_leaves:
30+
The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0.
2631
2732
Raises
2833
------
2934
OutOfBoundsError
3035
If `number_of_trees` is less than 1.
36+
OutOfBoundsError
37+
If `maximum_depth` is less than 1.
38+
OutOfBoundsError
39+
If `minimum_number_of_samples_in_leaves` is less than 1.
3140
"""
3241

33-
def __hash__(self) -> int:
34-
return _structural_hash(
35-
Classifier.__hash__(self),
36-
self._target_name,
37-
self._feature_names,
38-
self._number_of_trees,
39-
)
40-
41-
def __init__(self, *, number_of_trees: int = 100) -> None:
42+
def __init__(
43+
self,
44+
*,
45+
number_of_trees: int = 100,
46+
maximum_depth: int | None = None,
47+
minimum_number_of_samples_in_leaves: int = 1,
48+
) -> None:
4249
# Validation
4350
if number_of_trees < 1:
4451
raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1))
52+
if maximum_depth is not None and maximum_depth < 1:
53+
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
54+
if minimum_number_of_samples_in_leaves < 1:
55+
raise OutOfBoundsError(
56+
minimum_number_of_samples_in_leaves,
57+
name="minimum_number_of_samples_in_leaves",
58+
lower_bound=ClosedBound(1),
59+
)
4560

4661
# Hyperparameters
47-
self._number_of_trees = number_of_trees
62+
self._number_of_trees: int = number_of_trees
63+
self._maximum_depth: int | None = maximum_depth
64+
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves
4865

4966
# Internal state
5067
self._wrapped_classifier: sk_RandomForestClassifier | None = None
5168
self._feature_names: list[str] | None = None
5269
self._target_name: str | None = None
5370

71+
def __hash__(self) -> int:
72+
return _structural_hash(
73+
Classifier.__hash__(self),
74+
self._feature_names,
75+
self._target_name,
76+
self._number_of_trees,
77+
self._maximum_depth,
78+
self._minimum_number_of_samples_in_leaves,
79+
)
80+
5481
@property
5582
def number_of_trees(self) -> int:
56-
"""
57-
Get the number of trees used in the random forest.
58-
59-
Returns
60-
-------
61-
result:
62-
The number of trees.
63-
"""
83+
"""The number of trees used in the random forest."""
6484
return self._number_of_trees
6585

86+
@property
87+
def maximum_depth(self) -> int | None:
88+
"""The maximum depth of each tree."""
89+
return self._maximum_depth
90+
91+
@property
92+
def minimum_number_of_samples_in_leaves(self) -> int:
93+
"""The minimum number of samples that must remain in the leaves of each tree."""
94+
return self._minimum_number_of_samples_in_leaves
95+
6696
def fit(self, training_set: TabularDataset) -> RandomForestClassifier:
6797
"""
6898
Create a copy of this classifier and fit it with the given training data.
@@ -95,7 +125,11 @@ def fit(self, training_set: TabularDataset) -> RandomForestClassifier:
95125
wrapped_classifier = self._get_sklearn_classifier()
96126
fit(wrapped_classifier, training_set)
97127

98-
result = RandomForestClassifier(number_of_trees=self._number_of_trees)
128+
result = RandomForestClassifier(
129+
number_of_trees=self._number_of_trees,
130+
maximum_depth=self._maximum_depth,
131+
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
132+
)
99133
result._wrapped_classifier = wrapped_classifier
100134
result._feature_names = training_set.features.column_names
101135
result._target_name = training_set.target.name
@@ -149,4 +183,9 @@ def _get_sklearn_classifier(self) -> ClassifierMixin:
149183
"""
150184
from sklearn.ensemble import RandomForestClassifier as sk_RandomForestClassifier
151185

152-
return sk_RandomForestClassifier(self._number_of_trees, n_jobs=-1)
186+
return sk_RandomForestClassifier(
187+
n_estimators=self._number_of_trees,
188+
max_depth=self._maximum_depth,
189+
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
190+
n_jobs=-1,
191+
)

src/safeds/ml/classical/regression/_decision_tree.py

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import TYPE_CHECKING
44

55
from safeds._utils import _structural_hash
6+
from safeds.exceptions import ClosedBound, OutOfBoundsError
67
from safeds.ml.classical._util_sklearn import fit, predict
78

89
from ._regressor import Regressor
@@ -16,17 +17,66 @@
1617

1718

1819
class DecisionTreeRegressor(Regressor):
19-
"""Decision tree regression."""
20+
"""
21+
Decision tree regression.
22+
23+
Parameters
24+
----------
25+
maximum_depth:
26+
The maximum depth of the tree. If None, the depth is not limited. Has to be greater than 0.
27+
minimum_number_of_samples_in_leaves:
28+
The minimum number of samples that must remain in the leaves of the tree. Has to be greater than 0.
29+
30+
Raises
31+
------
32+
OutOfBoundsError
33+
If `maximum_depth` is less than 1.
34+
OutOfBoundsError
35+
If `minimum_number_of_samples_in_leaves` is less than 1.
36+
"""
37+
38+
def __init__(
39+
self,
40+
*,
41+
maximum_depth: int | None = None,
42+
minimum_number_of_samples_in_leaves: int = 5,
43+
) -> None:
44+
# Validation
45+
if maximum_depth is not None and maximum_depth < 1:
46+
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
47+
if minimum_number_of_samples_in_leaves < 1:
48+
raise OutOfBoundsError(
49+
minimum_number_of_samples_in_leaves,
50+
name="minimum_number_of_samples_in_leaves",
51+
lower_bound=ClosedBound(1),
52+
)
53+
54+
# Hyperparameters
55+
self._maximum_depth: int | None = maximum_depth
56+
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves
2057

21-
def __hash__(self) -> int:
22-
return _structural_hash(Regressor.__hash__(self), self._target_name, self._feature_names)
23-
24-
def __init__(self) -> None:
2558
# Internal state
2659
self._wrapped_regressor: sk_DecisionTreeRegressor | None = None
2760
self._feature_names: list[str] | None = None
2861
self._target_name: str | None = None
2962

63+
def __hash__(self) -> int:
64+
return _structural_hash(
65+
Regressor.__hash__(self),
66+
self._feature_names,
67+
self._target_name,
68+
)
69+
70+
@property
71+
def maximum_depth(self) -> int | None:
72+
"""The maximum depth of the tree."""
73+
return self._maximum_depth
74+
75+
@property
76+
def minimum_number_of_samples_in_leaves(self) -> int:
77+
"""The minimum number of samples that must remain in the leaves of the tree."""
78+
return self._minimum_number_of_samples_in_leaves
79+
3080
def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor:
3181
"""
3282
Create a copy of this regressor and fit it with the given training data.
@@ -59,7 +109,10 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor:
59109
wrapped_regressor = self._get_sklearn_regressor()
60110
fit(wrapped_regressor, training_set)
61111

62-
result = DecisionTreeRegressor()
112+
result = DecisionTreeRegressor(
113+
maximum_depth=self._maximum_depth,
114+
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
115+
)
63116
result._wrapped_regressor = wrapped_regressor
64117
result._feature_names = training_set.features.column_names
65118
result._target_name = training_set.target.name
@@ -113,4 +166,7 @@ def _get_sklearn_regressor(self) -> RegressorMixin:
113166
"""
114167
from sklearn.tree import DecisionTreeRegressor as sk_DecisionTreeRegressor
115168

116-
return sk_DecisionTreeRegressor()
169+
return sk_DecisionTreeRegressor(
170+
max_depth=self._maximum_depth,
171+
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
172+
)

0 commit comments

Comments
 (0)