Skip to content

Commit

Permalink
made apriori evaluate lazily
Browse files Browse the repository at this point in the history
  • Loading branch information
Feelx234 committed Nov 14, 2023
1 parent ce8a6ca commit 78f42a3
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 11 deletions.
24 changes: 19 additions & 5 deletions src/pysubgroup/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
promising_candidates = []
statistics = []
optimistic_estimate_function = getattr(task.qf, self.optimistic_estimate_name)
next_level_candidates = list(next_level_candidates)
if len(next_level_candidates) == 0:
return []
for sg in next_level_candidates:
statistics.append(task.qf.calculate_statistics(sg, task.target, task.data))
tpl_class = statistics[0].__class__
Expand All @@ -152,7 +155,7 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
promising_candidates.append(sg.selectors)
return promising_candidates

def get_next_level_numba(self, promising_candidates): # pragma: no cover
def get_next_level_numba(self, promising_candidates): # pragma: no cover
if not hasattr(self, "compiled_func") or self.compiled_func is None:
self.compiled_func = getNewCandidates

Expand All @@ -162,18 +165,25 @@ def get_next_level_numba(self, promising_candidates): # pragma: no cover
tuple(all_selectors_ids[sel] for sel in selectors)
for selectors in promising_candidates
]
arr = np.array(promising_candidates_selector_ids, dtype=int)
shape1 = len(promising_candidates_selector_ids)
if shape1 == 0:
return []
shape2 = len(promising_candidates_selector_ids[0])
arr = np.array(promising_candidates_selector_ids, dtype=np.int32).reshape(
shape1, shape2
)

print(len(arr))
hashes = np.array(
[hash(tuple(x[:-1])) for x in promising_candidates_selector_ids],
dtype=np.int64,
)
print(len(arr), arr.dtype, hashes.dtype)
candidates_int = self.compiled_func(arr, hashes)
return list(
return [
(*promising_candidates[i], promising_candidates[j][-1])
for i, j in candidates_int
)
]

def get_next_level(self, promising_candidates):
by_prefix_dict = defaultdict(list)
Expand Down Expand Up @@ -219,6 +229,8 @@ def execute(self, task):
promising_candidates = self.get_next_level_candidates(
task, result, next_level_candidates
)
if len(promising_candidates) == 0:
break

if depth == task.depth:
break
Expand All @@ -228,15 +240,17 @@ def execute(self, task):
# select those selectors and build a subgroup from them
# for which all subsets of length depth (=candidate length -1)
# are in the set of promising candidates
curr_depth = depth # WARNING: need copy of depth for lazy eval
set_promising_candidates = set(tuple(p) for p in promising_candidates)
next_level_candidates = (
combine_selectors(selectors)
for selectors in next_level_candidates_no_pruning
if all(
(subset in set_promising_candidates)
for subset in combinations(selectors, depth)
for subset in combinations(selectors, curr_depth)
)
)

depth = depth + 1

result = ps.prepare_subgroup_discovery_result(result, task)
Expand Down
35 changes: 30 additions & 5 deletions src/pysubgroup/subgroup_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from itertools import chain

import numpy as np
import pandas as pd

import pysubgroup as ps

Expand Down Expand Up @@ -134,6 +133,24 @@ def get_size(subgroup, data_len=None, data=None):
return size


def pandas_sparse_eq(col, value):
    """Compare a sparse pandas column to ``value`` without densifying it.

    Only the explicitly stored values of ``col`` are compared element-wise.
    Every position that is not stored takes the result of comparing the
    column's fill value to ``value``.

    Parameters
    ----------
    col
        Column whose ``.array`` exposes ``sp_values``, ``sp_index`` and
        ``fill_value`` (i.e. is backed by a pandas ``SparseArray``).
    value
        Scalar to compare against.

    Returns
    -------
    pd.arrays.SparseArray
        Boolean sparse array of length ``len(col)`` that is True exactly
        where the column equals ``value``.
    """
    import pandas as pd  # pylint: disable=import-outside-toplevel
    from pandas._libs.sparse import (
        IntIndex,  # pylint: disable=import-outside-toplevel, no-name-in-module
    )

    col_arr = col.array
    # Result for every position that is NOT explicitly stored.
    fill_matches = bool(col_arr.fill_value == value)
    stored_matches = col_arr.sp_values == value

    # Store only the positions whose result differs from the fill result.
    # Keeping just the matching positions would be wrong when the fill value
    # itself matches: stored non-matching entries would be dropped from the
    # index and silently inherit the (True) fill value.
    differs_from_fill = stored_matches != fill_matches
    new_index_arr = col_arr.sp_index.indices[differs_from_fill]
    index = IntIndex(len(col), new_index_arr)

    # Keyword arguments keep the call independent of SparseArray's positional
    # parameter order.
    return pd.arrays.SparseArray(
        np.full(len(new_index_arr), not fill_matches, dtype=bool),
        sparse_index=index,
        fill_value=fill_matches,
        dtype=bool,
    )


class EqualitySelector(SelectorBase):
def __init__(self, attribute_name, attribute_value, selector_name=None):
if attribute_name is None:
Expand Down Expand Up @@ -188,10 +205,14 @@ def __repr__(self):

def covers(self, data):
import pandas as pd # pylint: disable=import-outside-toplevel
if isinstance(data[self.attribute_name].dtype, pd.SparseDtype):
row = data[self.attribute_name]

column = data[self.attribute_name]
if isinstance(column.dtype, pd.SparseDtype):
row = column
if not pd.isnull(self.attribute_value):
return pandas_sparse_eq(column, self.attribute_value)
else:
row = data[self.attribute_name].to_numpy()
row = column.to_numpy()
if pd.isnull(self.attribute_value):
return pd.isnull(row)
return row == self.attribute_value
Expand Down Expand Up @@ -437,9 +458,13 @@ def create_numeric_selectors(
def create_numeric_selectors_for_attribute(
data, attr_name, nbins=5, intervals_only=True, weighting_attribute=None
):
import pandas as pd # pylint: disable=import-outside-toplevel

numeric_selectors = []
if isinstance(data[attr_name].dtype, pd.SparseDtype):
numeric_selectors.append(EqualitySelector(attr_name, data[attr_name].sparse.fill_value))
numeric_selectors.append(
EqualitySelector(attr_name, data[attr_name].sparse.fill_value)
)
dense_data = data[attr_name].sparse.sp_values
data_not_null = dense_data[pd.notnull(dense_data)]
uniqueValues = np.unique(data_not_null)
Expand Down
3 changes: 2 additions & 1 deletion src/pysubgroup/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from heapq import heappop, heappush

import numpy as np
import pandas as pd

import pysubgroup as ps

Expand All @@ -34,6 +33,8 @@ def prepare_subgroup_discovery_result(result, task):
def equal_frequency_discretization(
data, attribute_name, nbins=5, weighting_attribute=None
):
import pandas as pd # pylint: disable=import-outside-toplevel

cutpoints = []
if weighting_attribute is None:
cleaned_data = data[attribute_name]
Expand Down

0 comments on commit 78f42a3

Please sign in to comment.