diff --git a/src/pysubgroup/algorithms.py b/src/pysubgroup/algorithms.py
index 41c297f..ac81791 100644
--- a/src/pysubgroup/algorithms.py
+++ b/src/pysubgroup/algorithms.py
@@ -132,6 +132,9 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
         promising_candidates = []
         statistics = []
         optimistic_estimate_function = getattr(task.qf, self.optimistic_estimate_name)
+        next_level_candidates = list(next_level_candidates)
+        if len(next_level_candidates) == 0:
+            return []
         for sg in next_level_candidates:
             statistics.append(task.qf.calculate_statistics(sg, task.target, task.data))
         tpl_class = statistics[0].__class__
@@ -152,7 +155,7 @@
                 promising_candidates.append(sg.selectors)
         return promising_candidates
 
-    def get_next_level_numba(self, promising_candidates): # pragma: no cover
+    def get_next_level_numba(self, promising_candidates):  # pragma: no cover
         if not hasattr(self, "compiled_func") or self.compiled_func is None:
             self.compiled_func = getNewCandidates
 
@@ -162,18 +165,25 @@ def get_next_level_numba(self, promising_candidates): # pragma: no cover
         promising_candidates_selector_ids = [
             tuple(all_selectors_ids[sel] for sel in selectors)
             for selectors in promising_candidates
         ]
-        arr = np.array(promising_candidates_selector_ids, dtype=int)
+        shape1 = len(promising_candidates_selector_ids)
+        if shape1 == 0:
+            return []
+        shape2 = len(promising_candidates_selector_ids[0])
+        arr = np.array(promising_candidates_selector_ids, dtype=np.int32).reshape(
+            shape1, shape2
+        )
         print(len(arr))
         hashes = np.array(
             [hash(tuple(x[:-1])) for x in promising_candidates_selector_ids],
             dtype=np.int64,
         )
+        print(len(arr), arr.dtype, hashes.dtype)
         candidates_int = self.compiled_func(arr, hashes)
-        return list(
+        return [
             (*promising_candidates[i], promising_candidates[j][-1])
             for i, j in candidates_int
-        )
+        ]
 
     def get_next_level(self, promising_candidates):
         by_prefix_dict = defaultdict(list)
@@ -219,6 +229,8 @@ def execute(self, task):
             promising_candidates = self.get_next_level_candidates(
                 task, result, next_level_candidates
             )
+            if len(promising_candidates) == 0:
+                break
 
             if depth == task.depth:
                 break
@@ -228,15 +240,17 @@
 
             # select those selectors and build a subgroup from them
             # for which all subsets of length depth (=candidate length -1)
            # are in the set of promising candidates
+            curr_depth = depth  # WARNING: need copy of depth for lazy eval
             set_promising_candidates = set(tuple(p) for p in promising_candidates)
             next_level_candidates = (
                 combine_selectors(selectors)
                 for selectors in next_level_candidates_no_pruning
                 if all(
                     (subset in set_promising_candidates)
-                    for subset in combinations(selectors, depth)
+                    for subset in combinations(selectors, curr_depth)
                 )
             )
+            depth = depth + 1
 
         result = ps.prepare_subgroup_discovery_result(result, task)
diff --git a/src/pysubgroup/subgroup_description.py b/src/pysubgroup/subgroup_description.py
index e46c392..072319d 100644
--- a/src/pysubgroup/subgroup_description.py
+++ b/src/pysubgroup/subgroup_description.py
@@ -10,7 +10,6 @@
 from itertools import chain
 
 import numpy as np
-import pandas as pd
 
 import pysubgroup as ps
 
@@ -134,6 +133,24 @@ def get_size(subgroup, data_len=None, data=None):
     return size
 
 
+def pandas_sparse_eq(col, value):
+    import pandas as pd  # pylint: disable=import-outside-toplevel
+    from pandas._libs.sparse import (
+        IntIndex,  # pylint: disable=import-outside-toplevel, no-name-in-module
+    )
+
+    col_arr = col.array
+    is_same_value = col_arr.sp_values == value
+    new_index_arr = col_arr.sp_index.indices[is_same_value]
+    index = IntIndex(len(col), new_index_arr)
+    return pd.arrays.SparseArray(
+        np.ones(len(new_index_arr), dtype=bool),
+        index,
+        col_arr.fill_value == value,
+        dtype=bool,
+    )
+
+
 class EqualitySelector(SelectorBase):
     def __init__(self, attribute_name, attribute_value, selector_name=None):
         if attribute_name is None:
@@ -188,10 +205,14 @@ def __repr__(self):
     def covers(self, data):
         import pandas as pd  # pylint: disable=import-outside-toplevel
-        if isinstance(data[self.attribute_name].dtype, pd.SparseDtype):
-            row = data[self.attribute_name]
+
+        column = data[self.attribute_name]
+        if isinstance(column.dtype, pd.SparseDtype):
+            row = column
+            if not pd.isnull(self.attribute_value):
+                return pandas_sparse_eq(column, self.attribute_value)
         else:
-            row = data[self.attribute_name].to_numpy()
+            row = column.to_numpy()
         if pd.isnull(self.attribute_value):
             return pd.isnull(row)
         return row == self.attribute_value
@@ -437,9 +458,13 @@ def create_numeric_selectors(
 def create_numeric_selectors_for_attribute(
     data, attr_name, nbins=5, intervals_only=True, weighting_attribute=None
 ):
+    import pandas as pd  # pylint: disable=import-outside-toplevel
+
     numeric_selectors = []
     if isinstance(data[attr_name].dtype, pd.SparseDtype):
-        numeric_selectors.append(EqualitySelector(attr_name, data[attr_name].sparse.fill_value))
+        numeric_selectors.append(
+            EqualitySelector(attr_name, data[attr_name].sparse.fill_value)
+        )
         dense_data = data[attr_name].sparse.sp_values
         data_not_null = dense_data[pd.notnull(dense_data)]
         uniqueValues = np.unique(data_not_null)
diff --git a/src/pysubgroup/utils.py b/src/pysubgroup/utils.py
index 9649303..8420a41 100644
--- a/src/pysubgroup/utils.py
+++ b/src/pysubgroup/utils.py
@@ -9,7 +9,6 @@
 from heapq import heappop, heappush
 
 import numpy as np
-import pandas as pd
 
 import pysubgroup as ps
 
@@ -34,6 +33,8 @@ def prepare_subgroup_discovery_result(result, task):
 def equal_frequency_discretization(
     data, attribute_name, nbins=5, weighting_attribute=None
 ):
+    import pandas as pd  # pylint: disable=import-outside-toplevel
+
     cutpoints = []
     if weighting_attribute is None:
         cleaned_data = data[attribute_name]
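---

Review notes (not part of the patch):

1.  The `get_next_level_numba` hunk pairs candidates that share a common
    prefix: `hashes` is computed over `x[:-1]` (all selectors but the last),
    and each returned pair `(i, j)` is merged by appending `j`'s last selector
    to candidate `i`. A plain-Python sketch of that prefix join (illustrative
    names, not pysubgroup API; the compiled `getNewCandidates` presumably does
    the equivalent over the id and hash arrays):

        from collections import defaultdict

        def prefix_join(promising_candidates):
            # Group length-k candidates by their first k-1 selectors, then
            # combine the last selectors within each group pairwise.
            by_prefix = defaultdict(list)
            for cand in promising_candidates:
                by_prefix[tuple(cand[:-1])].append(cand[-1])
            next_level = []
            for prefix, lasts in by_prefix.items():
                for i, sel_i in enumerate(lasts):
                    for sel_j in lasts[i + 1:]:
                        next_level.append((*prefix, sel_i, sel_j))
            return next_level

        print(prefix_join([("a", "b"), ("a", "c"), ("a", "d"), ("x", "y")]))
        # [('a', 'b', 'c'), ('a', 'b', 'd'), ('a', 'c', 'd')]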
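2.  The `curr_depth = depth` copy in `execute` matters because
    `next_level_candidates` is a generator expression: it reads the variables
    it closes over when it is consumed (on the next loop iteration), not when
    it is defined, and `depth` is incremented in between. A minimal
    standalone sketch of the pitfall:

        depth = 2
        gen = (depth for _ in range(3))  # closes over `depth`, evaluated lazily

        depth = depth + 1                # mutated before the generator runs
        print(list(gen))                 # [3, 3, 3], not [2, 2, 2]

        depth = 2
        curr_depth = depth               # snapshot, as in the patch
        gen = (curr_depth for _ in range(3))
        depth = depth + 1
        print(list(gen))                 # [2, 2, 2]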
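3.  `pandas_sparse_eq` builds the boolean cover mask directly from the stored
    `sp_values` and `sp_index` instead of densifying the column first, so
    `covers` stays cheap on sparse columns. A minimal sanity check, assuming
    the patched pysubgroup is importable and a pandas 2.x version whose
    `SparseArray(data, sparse_index, fill_value, ...)` constructor matches the
    positional call in the helper:

        import numpy as np
        import pandas as pd

        from pysubgroup.subgroup_description import pandas_sparse_eq

        # Sparse column with fill_value 0: only the two 1-entries are stored.
        col = pd.Series(pd.arrays.SparseArray([0, 1, 0, 1, 0], fill_value=0))

        mask = pandas_sparse_eq(col, 1)  # result is itself a sparse array

        assert isinstance(mask, pd.arrays.SparseArray)
        assert (np.asarray(mask) == (col.to_numpy() == 1)).all()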