Skip to content

Commit

Permalink
Merge branch 'master' into release
Browse files Browse the repository at this point in the history
  • Loading branch information
venaturum committed May 4, 2022
2 parents d8b9b44 + ff8c88f commit f89d0b5
Show file tree
Hide file tree
Showing 13 changed files with 515 additions and 33 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
repos:
- repo: https://github.com/pycqa/isort
rev: 5.8.0
rev: 5.10.1
hooks:
- id: isort
args: ["--profile", "black", "--filter-files"]
name: isort
- repo: https://github.com/psf/black
rev: 21.6b0
rev: 22.3.0
hooks:
- id: black
exclude: ^piso/test_data*
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
rev: 4.0.1
hooks:
- id: flake8
1 change: 1 addition & 0 deletions docs/reference/accessors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ Accessors
ArrayAccessor.complement
ArrayAccessor.contains
ArrayAccessor.split
ArrayAccessor.bridge
ArrayAccessor.adjacency_matrix
1 change: 1 addition & 0 deletions docs/reference/package.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Top level functions
complement
contains
split
bridge
lookup
join
adjacency_matrix
10 changes: 10 additions & 0 deletions docs/release_notes/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ Release notes
========================


**v0.9.0 2022-05-04**

- Extended :func:`piso.adjacency_matrix` and :meth:`ArrayAccessor.adjacency_matrix() <piso.accessor.ArrayAccessor.adjacency_matrix>` to take multiple interval array parameters

Added the following methods

- :func:`piso.bridge`
- :meth:`ArrayAccessor.bridge() <piso.accessor.ArrayAccessor.bridge>`


**v0.8.0 2022-01-29**

- Added `bins` parameter to :func:`piso.coverage` and :meth:`ArrayAccessor.coverage() <piso.accessor.ArrayAccessor.coverage>`
Expand Down
1 change: 1 addition & 0 deletions piso/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from piso.graph import adjacency_matrix
from piso.intervalarray import (
bridge,
complement,
contains,
coverage,
Expand Down
10 changes: 9 additions & 1 deletion piso/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,13 +175,21 @@ def split(self, x):
)

@Appender(docstrings.adjacency_matrix_docstring, join="\n", indents=1)
def adjacency_matrix(self, edges="intersect", include_index=True):
def adjacency_matrix(self, *interval_arrays, edges="intersect", include_index=True):
return graph.adjacency_matrix(
self._interval_array,
*interval_arrays,
edges=edges,
include_index=include_index,
)

@Appender(docstrings.bridge_docstring, join="\n", indents=1)
def bridge(self, threshold):
    # Accessor entry point: hand the wrapped interval array to the module-level
    # implementation. The user-facing documentation is supplied by the Appender
    # decorator, so no docstring is declared here.
    intervals = self._interval_array
    return intervalarray.bridge(intervals, threshold)


def _register_accessors():
_register_accessor("piso", pd.IntervalIndex)(ArrayAccessor)
Expand Down
47 changes: 47 additions & 0 deletions piso/docstrings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,3 +948,50 @@ def join_params(list_of_param_strings):
[8, 9] True True True False False
[9, 10] True True True False False
"""


# Documentation text for the accessor flavour of ``bridge`` (the intervals come
# from the object the accessor is attached to, so no interval_array parameter
# is listed).
# NOTE(review): presumably attached to ArrayAccessor.bridge through the
# Appender decorator in piso/accessor.py -- confirm the import there.
bridge_docstring = """
Given a set of intervals, and a threshold, merges intervals which are separated by a gap less than
or equal to the threshold. Overlapping intervals will be merged, regardless of threshold value.
The intervals are contained in the object the accessor belongs to. They may be left-closed,
right-closed, both, or neither, and contain overlapping intervals.
Parameters
----------
threshold : scalar
The value should belong to the domain that arises from a subtraction over the domain of the intervals.
For instance, if intervals are timestamp data, then *threshold* should be timedelta.
Returns
----------
:class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`
Return type will be the same type as the object the accessor belongs to.
Examples
-----------
>>> import pandas as pd
>>> import piso
>>> piso.register_accessors()
>>> arr = pd.arrays.IntervalArray.from_tuples(
... [(0, 4), (3, 5), (7, 8), (11, 12)],
... )
>>> arr.piso.bridge(threshold=1)
<IntervalArray>
[(0.0, 5.0], (7.0, 8.0], (11.0, 12.0]]
Length: 3, closed: right, dtype: interval[float64]
>>> arr.piso.bridge(threshold=2)
<IntervalArray>
[(0.0, 8.0], (11.0, 12.0]]
Length: 2, closed: right, dtype: interval[float64]
>>> arr.piso.bridge(threshold=3)
<IntervalArray>
[(0.0, 12.0]]
Length: 1, closed: right, dtype: interval[float64]
"""
48 changes: 48 additions & 0 deletions piso/docstrings/intervalarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -893,3 +893,51 @@ def join_params(list_of_param_strings):
closed='neither',
dtype='interval[float64]')
"""


# Documentation text for the top-level ``bridge`` function, which receives the
# intervals explicitly through its *interval_array* parameter. The wording
# therefore refers to that parameter rather than to an accessor.
bridge_docstring = """
Given a set of intervals, and a threshold, merges intervals which are separated by a gap less than
or equal to the threshold. Overlapping intervals will be merged, regardless of threshold value.
The intervals are contained in *interval_array*. They may be left-closed,
right-closed, both, or neither, and contain overlapping intervals.
Parameters
----------
interval_array : :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`
Contains the (possibly overlapping) intervals. May be left-closed or right-closed.
threshold : scalar
The value should belong to the domain that arises from a subtraction over the domain of the intervals.
For instance, if intervals are timestamp data, then *threshold* should be timedelta.
Returns
----------
:class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`
Return type will be the same type as *interval_array*.
Examples
-----------
>>> import pandas as pd
>>> import piso
>>> arr = pd.arrays.IntervalArray.from_tuples(
... [(0, 4), (3, 5), (7, 8), (11, 12)],
... )
>>> piso.bridge(arr, threshold=1)
<IntervalArray>
[(0.0, 5.0], (7.0, 8.0], (11.0, 12.0]]
Length: 3, closed: right, dtype: interval[float64]
>>> piso.bridge(arr, threshold=2)
<IntervalArray>
[(0.0, 8.0], (11.0, 12.0]]
Length: 2, closed: right, dtype: interval[float64]
>>> piso.bridge(arr, threshold=3)
<IntervalArray>
[(0.0, 12.0]]
Length: 1, closed: right, dtype: interval[float64]
"""
134 changes: 106 additions & 28 deletions piso/graph.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,100 @@
import itertools

import numpy as np
import pandas as pd
from pandas.core.indexes import interval

from piso.intervalarray import _validate_array_of_intervals_arrays


def _adj_mat_intersection(lefts, rights, closed, fill_diagonal=True):
result = np.greater.outer(rights, lefts) & np.less.outer(lefts, rights)
if closed == "both":
result = result | np.equal.outer(rights, lefts) | np.equal.outer(lefts, rights)
if fill_diagonal:
np.fill_diagonal(result, False)
return result


def _adjacency_matrix_set_of_intervals(interval_array, edges, include_index):
    """
    Adjacency matrix in which every interval of *interval_array* is a node.

    Parameters
    ----------
    interval_array : interval array-like
        Object exposing ``left``, ``right`` and ``closed``.
    edges : {"intersect", "disjoint"}
        Relationship an edge represents.
    include_index : bool
        If True the result is wrapped in a :class:`pandas.DataFrame` indexed
        (rows and columns) by *interval_array*; otherwise the raw boolean
        array is returned.

    Raises
    ------
    ValueError
        If *edges* is neither "intersect" nor "disjoint".
    """
    if edges == "intersect":
        negate, clear_diagonal = False, True
    elif edges == "disjoint":
        # The diagonal is left set before negation, so it is False afterwards.
        negate, clear_diagonal = True, False
    else:
        raise ValueError(f"Invalid value for edges parameter: {edges}")

    matrix = _adj_mat_intersection(
        interval_array.left,
        interval_array.right,
        interval_array.closed,
        fill_diagonal=clear_diagonal,
    )
    if negate:
        matrix = ~matrix

    if not include_index:
        return matrix
    return pd.DataFrame(matrix, index=interval_array, columns=interval_array)


def _adjacency_matrix_set_of_sets(*interval_arrays, edges, include_index):
    """
    Adjacency matrix in which every interval array is a node (a set).

    All intervals from all arrays are compared pairwise, then the pairwise
    result is collapsed block-by-block so that entry ``[i, j]`` describes the
    relationship between array ``i`` and array ``j``.

    Parameters
    ----------
    *interval_arrays : interval array-likes
        Each must expose ``left``, ``right``, ``closed`` and ``len``. The
        ``closed`` value of the first array is used for every comparison.
    edges : {"intersect", "disjoint"}
        "intersect": two sets are adjacent if any pair of their intervals
        intersects. "disjoint": two sets are adjacent if no pair intersects.
    include_index : bool
        If True the result is a :class:`pandas.DataFrame` labelled
        ``0..n-1`` on both axes, otherwise a boolean numpy array.

    Raises
    ------
    ValueError
        If *edges* is neither "intersect" nor "disjoint".

    Notes
    -----
    An array with no intervals is treated as the empty set: it intersects
    nothing and is disjoint from every other set.
    NOTE(review): whether ``_validate_array_of_intervals_arrays`` accepts
    empty arrays is not visible here -- confirm.
    """
    _validate_array_of_intervals_arrays(*interval_arrays, validate_intervals=False)
    lefts = list(itertools.chain.from_iterable(ia.left for ia in interval_arrays))
    rights = list(itertools.chain.from_iterable(ia.right for ia in interval_arrays))
    closed = interval_arrays[0].closed

    if edges == "intersect":
        component_result = _adj_mat_intersection(lefts, rights, closed)
        numpy_logical = np.logical_or
        empty_set_value = False
    elif edges == "disjoint":
        component_result = ~_adj_mat_intersection(
            lefts, rights, closed, fill_diagonal=False
        )
        numpy_logical = np.logical_and
        empty_set_value = True
    else:
        raise ValueError(f"Invalid value for edges parameter: {edges}")

    # ufunc.reduceat needs strictly increasing, in-range offsets to reduce
    # whole blocks: a repeated offset returns a single element of the *next*
    # block and an offset equal to the axis length raises. Empty arrays would
    # produce exactly those offsets, so only non-empty arrays are reduced and
    # the entries belonging to empty arrays keep their fill value.
    sizes = np.array([len(ia) for ia in interval_arrays])
    occupied = np.flatnonzero(sizes > 0)
    starts = (np.cumsum(sizes) - sizes)[occupied]

    n_sets = len(interval_arrays)
    result = np.full((n_sets, n_sets), empty_set_value, dtype=bool)
    if occupied.size:
        result[np.ix_(occupied, occupied)] = numpy_logical.reduceat(
            numpy_logical.reduceat(component_result, starts, axis=0), starts, axis=1
        )
    np.fill_diagonal(result, False)

    if include_index:
        result = pd.DataFrame(
            result,
            index=range(n_sets),
            columns=range(n_sets),
        )

    return result

def adjacency_matrix(interval_array, edges="intersect", include_index=True):

def adjacency_matrix(
interval_array, *interval_arrays, edges="intersect", include_index=True
):
"""
Returns a 2D array (or dataframe) of boolean values indicating edges between nodes in a graph.
The set of nodes correspond to intervals and the edges are defined by the relationship
The nodes correspond to sets and the edges are defined by the relationship
defined by the *edges* parameter.
What is considered a set is determined by the number of positional arguments used, that is, determined by the
size of *interval_arrays*.
If *interval_arrays* is empty then the sets are considered to be the intervals contained in *interval_array*.
If *interval_arrays* is not empty then the sets are considered to be *interval_array* and the elements in *interval_arrays*.
Each of these arrays is assumed to contain disjoint intervals (and satisfy the definition of a set). Any array containing
overlaps between intervals will be mapped to one with disjoint intervals via a union operation.
Note that the diagonal is defined with False values by default.
Parameters
----------
interval_array : :class:`pandas.arrays.IntervalArray` or :class:`pandas.IntervalIndex`
Contains the intervals.
The first (and possibly only) operand.
*interval_arrays : argument list of :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`
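May be empty. If empty, the intervals contained in *interval_array* are treated as the sets; otherwise *interval_array* and each element of *interval_arrays* is treated as a set.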
edges : {"intersect", "disjoint"}, default "intersect"
Defines the relationship that edges between nodes represent.
include_index : bool, default True
Expand Down Expand Up @@ -60,30 +139,29 @@ def adjacency_matrix(interval_array, edges="intersect", include_index=True):
[5, 7] True False False True True
[8, 9] True True True False False
[9, 10] True True True False False
"""
if edges == "intersect":
result = _adj_mat_intersection(interval_array)
elif edges == "disjoint":
result = ~_adj_mat_intersection(interval_array, fill_diagonal=False)
else:
raise ValueError(f"Invalid value for edges parameter: {edges}")

if include_index:
result = pd.DataFrame(result, index=interval_array, columns=interval_array)

return result
>>> ii1 = pd.IntervalIndex.from_tuples([(0,3), (2,8), (11,15)], closed="left")
>>> ii2 = pd.IntervalIndex.from_tuples([(3,5), (7,12), (16,20)], closed="left")
>>> ii3 = pd.IntervalIndex.from_tuples([(9,11), (25,26)], closed="left")
>>> ii4 = pd.IntervalIndex.from_tuples([(23,24)], closed="left")
>>> piso.adjacency_matrix(ii1,ii2,ii3,ii4)
0 1 2 3
0 False True False False
1 True False True False
2 False True False False
3 False False False False
>>> piso.adjacency_matrix(ii1,ii2,ii3,ii4, edges="disjoint", include_index=False)
array([[False, False, True, True],
[False, False, False, True],
[ True, False, False, True],
[ True, True, True, False]])
"""

def _adj_mat_intersection(interval_array, fill_diagonal=True):
result = np.greater.outer(
interval_array.right, interval_array.left
) & np.less.outer(interval_array.left, interval_array.right)
if interval_array.closed == "both":
result = (
result
| np.equal.outer(interval_array.right, interval_array.left)
| np.equal.outer(interval_array.left, interval_array.right)
if len(interval_arrays) == 0:
return _adjacency_matrix_set_of_intervals(interval_array, edges, include_index)
else:
return _adjacency_matrix_set_of_sets(
interval_array, *interval_arrays, edges=edges, include_index=include_index
)
if fill_diagonal:
np.fill_diagonal(result, False)
return result
10 changes: 10 additions & 0 deletions piso/intervalarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,3 +262,13 @@ def split(interval_array, x):
return interval_array.from_arrays(
lefts[~np.isnan(lefts)], rights[~np.isnan(rights)], closed=interval_array.closed
)


@Appender(docstrings.bridge_docstring, join="\n", indents=1)
def bridge(interval_array, threshold):
    # No explicit validation here: it happens inside union and complement.
    # The user-facing documentation is supplied by the Appender decorator.

    # Complement of the merged intervals, i.e. the gaps between them.
    gaps = complement(union(interval_array))
    # Only gaps strictly longer than the threshold are kept; the rest get bridged.
    wide_gaps = gaps[gaps.length > threshold]
    # Overall extent of the input, passed as the second argument to complement
    # (presumably the domain to complement over -- confirm against complement).
    extent = (interval_array.left.min(), interval_array.right.max())
    return complement(wide_gaps, extent)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "piso"
version = "0.8.0"
version = "0.9.0"
description = "Pandas Interval Set Operations: methods for set operations, analytics, lookups and joins on pandas' Interval, IntervalArray and IntervalIndex"
readme = "README.md"
authors = ["Riley Clement <venaturum@gmail.com>"]
Expand Down
Loading

0 comments on commit f89d0b5

Please sign in to comment.