Skip to content

Commit 6634864

Browse files
fix(types): fix histogram bin allocation (#9711)
## Description of changes

```
import ibis

ibis.options.interactive = True
ibis.options.repr.interactive.max_rows = 20

t = ibis.range(1000).unnest().name("index").as_table()
t.select(hist=t["index"].histogram(nbins=10)).value_counts()
```

```
┏━━━━━━━┳━━━━━━━━━━━━┓
┃ hist  ┃ hist_count ┃
┡━━━━━━━╇━━━━━━━━━━━━┩
│ int64 │ int64      │
├───────┼────────────┤
│     5 │        100 │
│     9 │        100 │
│     0 │        100 │
│     3 │        100 │
│     6 │        100 │
│     2 │        100 │
│     7 │        100 │
│     8 │        100 │
│     1 │        100 │
│     4 │        100 │
└───────┴────────────┘
```

## Issues closed

* Resolves #9687.

I had to make a slight change to ``histogram`` to account for an edge case that was tested for Impala. It would fail if ``nbins`` was not passed, which is a rather niche use case because ``np.histogram``, for example, requires the number of bins to be passed either explicitly or implicitly.

While fixing this, I also found a slight quirk in the current design: if a ``base`` is passed that is not the minimum value, the out-of-bound values smaller than the base were assigned negative bin indices. Those out-of-bound values are now clipped to a bin index of -1 so that they are grouped together, rather than potentially being spread across bin indices such as -1 and -2. This now aligns with how ``np.histogram`` assigns a bin index of 0 for out-of-bound values.
1 parent 537fc87 commit 6634864

File tree

2 files changed

+18
-6
lines changed

2 files changed

+18
-6
lines changed

ibis/backends/tests/test_numeric.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1370,7 +1370,7 @@ def test_clip(backend, alltypes, df, ibis_func, pandas_func):
13701370
backend.assert_series_equal(result, expected, check_names=False)
13711371

13721372

1373-
@pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError)
1373+
@pytest.mark.notimpl(["datafusion", "polars"], raises=com.OperationNotDefinedError)
13741374
@pytest.mark.notyet(
13751375
["druid"],
13761376
raises=PyDruidProgrammingError,
@@ -1382,6 +1382,15 @@ def test_histogram(con, alltypes):
13821382
vc = hist.value_counts().sort_index()
13831383
vc_np, _bin_edges = np.histogram(alltypes.int_col.execute(), bins=n)
13841384
assert vc.tolist() == vc_np.tolist()
1385+
assert (
1386+
con.execute(
1387+
ibis.memtable({"value": range(100)})
1388+
.select(bin=_.value.histogram(10))
1389+
.value_counts()
1390+
.bin_count.nunique()
1391+
)
1392+
== 1
1393+
)
13851394

13861395

13871396
@pytest.mark.parametrize("const", ["pi", "e"])

ibis/expr/types/numeric.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -996,16 +996,19 @@ def histogram(
996996
f"Cannot pass both `nbins` (got {nbins}) and `binwidth` (got {binwidth})"
997997
)
998998

999-
if binwidth is None or base is None:
999+
if base is None:
1000+
base = self.min() - eps
1001+
1002+
if binwidth is None:
10001003
if nbins is None:
10011004
raise ValueError("`nbins` is required if `binwidth` is not provided")
10021005

1003-
if base is None:
1004-
base = self.min() - eps
1005-
10061006
binwidth = (self.max() - base) / nbins
10071007

1008-
return ((self - base) / binwidth).floor()
1008+
if nbins is None:
1009+
nbins = ((self.max() - base) / binwidth).ceil()
1010+
1011+
return ((self - base) / binwidth).floor().clip(-1, nbins - 1)
10091012

10101013

10111014
@public

0 commit comments

Comments
 (0)