Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filters #48

Merged
merged 22 commits into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 45 additions & 9 deletions phenex/filters/categorical_filter.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,56 @@
from phenex.filters.filter import Filter
from typing import List, Optional, Union
from ibis.expr.types.relations import Table
from typing import List, Optional, Union, Dict


class CategoricalFilter(Filter):
"""
This class filters events in an EventTable based on specified categorical values
This class filters events in an EventTable based on specified categorical values.

Attributes:
category (Optional[str]): The category to filter events by.
column_name (str): The name of the column to filter by.
allowed_values (List[Union[str, int]]): The list of allowed values for the column.
domain (Optional[str]): The domain to which the filter applies.

Methods:
_filter(table: MeasurementTable) -> MeasurementTable:
Filters the given MeasurementTable based on the specified category.
filter(table: PhenexTable) -> PhenexTable:
Filters the given PhenexTable based on the specified column and allowed values.
Parameters:
table (Measurement): The table containing events to be filtered.
table (PhenexTable): The table containing events to be filtered.
Returns:
MeasurementTable: The filtered MeasurementTable with events matching the category.
PhenexTable: The filtered PhenexTable with events matching the allowed values.

autojoin_filter(table: PhenexTable, tables: dict = None) -> PhenexTable:
Automatically joins the necessary tables and applies the filter. Use when the input table does not contain the column that defines the filter. For this to work, the tables must specify all required join keys. See DomainsDictionary for details.
Parameters:
table (PhenexTable): The table containing events to be filtered.
tables (dict): A dictionary of tables for joining.
Returns:
PhenexTable: The filtered PhenexTable with events matching the allowed values.

Examples:
# Example 1: Filter for SEX = 'Female'
sex_filter = CategoricalFilter(
column_name="SEX",
allowed_values=["Female"],
domain="PERSON"
)

# Example 2: Filter for inpatient (domain = encounter)
inpatient_filter = CategoricalFilter(
column_name="ENCOUNTER_TYPE",
allowed_values=["INPATIENT"],
domain="ENCOUNTER"
)

# Example 3: Filter for primary diagnosis position
primary_diagnosis_filter = CategoricalFilter(
column_name="DIAGNOSIS_POSITION",
allowed_values=[1],
domain="DIAGNOSIS"
)

# Example 4: Applying multiple filters in combination
inpatient_primary_position = inpatient_filter & primary_diagnosis_filter
"""

def __init__(
Expand All @@ -33,7 +67,9 @@ def __init__(
def _filter(self, table: "PhenexTable"):
return table.filter(table[self.column_name].isin(self.allowed_values))

def autojoin_filter(self, table: "PhenexTable", tables: dict = None):
def autojoin_filter(
self, table: "PhenexTable", tables: Optional[Dict[str, "PhenexTable"]] = None
) -> "PhenexTable":
if self.column_name not in table.columns:
if self.domain not in tables.keys():
raise ValueError(
Expand Down
55 changes: 24 additions & 31 deletions phenex/filters/date_range_filter.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,34 @@
from typing import Optional
from datetime import date
from typing import Optional, Union
from datetime import date, datetime

from phenex.tables import EventTable, is_phenex_event_table
from phenex.filters.value import Value
from phenex.filters.value_filter import ValueFilter


class DateRangeFilter(ValueFilter):
    """
    DateRangeFilter is a ValueFilter applied to dates.

    The bounds are inclusive: they are handed to ValueFilter as
    Value(">=", min_date) and Value("<=", max_date).

    Parameters:
        min_date (Optional[Union[date, str]]): The minimum date for the filter. If a string is provided, it will be converted to a date according to date_format.
        max_date (Optional[Union[date, str]]): The maximum date for the filter. If a string is provided, it will be converted to a date according to date_format.
        column_name (Optional[str]): The name of the column to apply the filter on. Defaults to "EVENT_DATE".
        date_format (str): The format to use for parsing date strings. Either a token-style pattern built from YYYY, MM and DD (e.g. the default "YYYY-MM-DD") or a datetime.strptime pattern containing % directives (e.g. "%Y-%m-%d").

    Raises:
        ValueError: If a date string does not match date_format.
    """

    def __init__(
        self,
        min_date: Optional[Union[date, str]] = None,
        max_date: Optional[Union[date, str]] = None,
        column_name: Optional[str] = "EVENT_DATE",
        date_format="YYYY-MM-DD",
    ):
        if isinstance(min_date, str):
            min_date = self._parse_date(min_date, date_format)
        if isinstance(max_date, str):
            max_date = self._parse_date(max_date, date_format)
        # NOTE(review): when a bound is None this still builds Value(op, None)
        # rather than passing None to ValueFilter -- confirm that ValueFilter
        # treats a Value whose .value is None as "no bound".
        super(DateRangeFilter, self).__init__(
            min=Value(">=", min_date),
            max=Value("<=", max_date),
            column_name=column_name,
        )

    @staticmethod
    def _parse_date(text: str, date_format: str) -> date:
        """
        Parse text into a date using date_format.
        """
        strptime_format = date_format
        # datetime.strptime only understands %-directives; a pattern such as the
        # default "YYYY-MM-DD" would be matched as literal characters and reject
        # every real date. Translate the tokens unless the caller already
        # supplied a strptime pattern.
        if "%" not in strptime_format:
            for token, directive in (("YYYY", "%Y"), ("MM", "%m"), ("DD", "%d")):
                strptime_format = strptime_format.replace(token, directive)
        return datetime.strptime(text, strptime_format).date()
93 changes: 87 additions & 6 deletions phenex/filters/filter.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,29 @@
from ibis.expr.types.relations import Table
from phenex.tables import PhenexTable
from typing import Optional, Dict


class Filter:
"""
Filters operate on single tables and return these tables with rows removed. Filters are
generally used within a Phenotype as a subquery. Filters know about their dependencies
but cannot trigger recursive execution. Fitlers can add columns but may not remove columns.
All classes in the filters module should subclass this class. Subclasses must implement
the _filter method.
Filters operate on single tables and return these tables with rows removed. Filters are generally used within a Phenotype as a subquery. Filters know about their dependencies but cannot trigger recursive execution. Filters can add columns but may not remove columns. All classes in the filters module should subclass this class. Subclasses must implement the _filter method.

Methods:
filter(table: PhenexTable) -> PhenexTable: Filters the given table.
"""

def __init__(self):
pass

def filter(self, table: Table) -> Table:
def filter(self, table: PhenexTable) -> PhenexTable:
"""
Filters the given table according to the rules of the Filter.

Args:
table (PhenexTable): The table to be filtered.

Returns:
PhenexTable: The filtered table. The returned table has the exact same schema as the input table but has rows removed.
"""
input_columns = table.columns
filtered_table = self._filter(table)
if not set(input_columns) <= set(filtered_table.columns):
Expand All @@ -23,4 +32,76 @@ def filter(self, table: Table) -> Table:
return type(table)(filtered_table.select(input_columns))

def _filter(self, table: Table) -> Table:
"""
Performs the operations required to filter the table.
"""
raise NotImplementedError()

def autojoin_filter(
self, table: "PhenexTable", tables: Optional[Dict[str, "PhenexTable"]] = None
) -> "PhenexTable":
raise NotImplementedError()

def __and__(self, other):
return AndFilter(self, other)

def __or__(self, other):
return OrFilter(self, other)

def __invert__(self):
return NotFilter(self)


class AndFilter(Filter):
    """
    Combines two filters using logical AND by applying them one after the other.

    Attributes:
        filter1 (Filter): The filter applied first.
        filter2 (Filter): The filter applied to the output of filter1.
    """

    def __init__(self, filter1, filter2):
        self.filter1, self.filter2 = filter1, filter2

    def filter(self, table: Table) -> Table:
        """
        Run filter1 on the table, then run filter2 on what filter1 returned.

        Args:
            table (PhenexTable): The table to be filtered.

        Returns:
            PhenexTable: The output of filter2.
        """
        # filter2 only ever sees the rows filter1 let through.
        return self.filter2.filter(self.filter1.filter(table))

    def autojoin_filter(self, table: "PhenexTable", tables: dict = None):
        """
        Same chaining as filter(), but through each filter's autojoin_filter.

        Args:
            table (PhenexTable): The table to be filtered.
            tables (dict): Passed unchanged to both filters' autojoin_filter.

        Returns:
            PhenexTable: The output of filter2.autojoin_filter.
        """
        after_first = self.filter1.autojoin_filter(table, tables)
        return self.filter2.autojoin_filter(after_first, tables)


class OrFilter(Filter):
    """
    Combines two filters using logical OR: both filters are applied to the same
    input table and their results are unioned with duplicates removed.

    Attributes:
        filter1 (Filter): The first filter.
        filter2 (Filter): The second filter.
    """

    def __init__(self, filter1, filter2):
        self.filter1, self.filter2 = filter1, filter2

    def filter(self, table: Table) -> Table:
        """
        Return the distinct union of filter1's and filter2's results on table.

        Args:
            table (PhenexTable): The table to be filtered.

        Returns:
            PhenexTable: A new instance of type(table) wrapping the union.
        """
        # Unwrap to the underlying tables (.table) before combining them.
        left = self.filter1.filter(table).table
        right = self.filter2.filter(table).table
        # distinct() drops the rows that passed both filters and so appear twice.
        combined = left.union(right).distinct()
        return type(table)(combined)

    def autojoin_filter(self, table: "PhenexTable", tables: dict = None):
        """
        Same as filter(), but through each filter's autojoin_filter.

        Args:
            table (PhenexTable): The table to be filtered.
            tables (dict): Passed unchanged to both filters' autojoin_filter.

        Returns:
            PhenexTable: A new instance of type(table) wrapping the union.
        """
        left = self.filter1.autojoin_filter(table, tables).table
        right = self.filter2.autojoin_filter(table, tables).table
        combined = left.union(right, distinct=True)
        return type(table)(combined)


class NotFilter(Filter):
    """
    Negates a filter: returns the rows of the input table that the wrapped
    filter does not return.

    Attributes:
        negated_filter (Filter): The filter whose result is subtracted from the input table.
    """

    def __init__(self, filter):
        # Deliberately not stored as `self.filter`: an instance attribute with
        # that name shadows the filter() method defined below, so
        # NotFilter(f).filter(table) would resolve to the wrapped filter object
        # instead of this class's method and negation would never run.
        self.negated_filter = filter

    def filter(self, table: Table) -> Table:
        """
        Return table minus the rows kept by the wrapped filter.

        Args:
            table (PhenexTable): The table to be filtered.

        Returns:
            PhenexTable: A new instance of type(table) wrapping the difference.
        """
        kept_by_inner = self.negated_filter.filter(table).table
        # Take the difference between the two underlying tables (.table), the
        # same unwrap-then-rewrap pattern OrFilter uses for its union.
        return type(table)(table.table.difference(kept_by_inner))

    def autojoin_filter(self, table: "PhenexTable", tables: dict = None):
        """
        Same as filter(), but through the wrapped filter's autojoin_filter.

        Args:
            table (PhenexTable): The table to be filtered.
            tables (dict): Passed unchanged to the wrapped filter's autojoin_filter.

        Returns:
            PhenexTable: A new instance of type(table) wrapping the difference.
        """
        kept_by_inner = self.negated_filter.autojoin_filter(table, tables).table
        return type(table)(table.table.difference(kept_by_inner))
11 changes: 0 additions & 11 deletions phenex/filters/relative_time_range_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ class RelativeTimeRangeFilter(Filter):
option is mutually exclusive with min_years.
max_days (Optional[int]): Maximum number of days from the anchor date to filter events. This
option is mutually exclusive with max_years.
min_years (Optional[Value]): Minimum number of years from the anchor date to filter events. This
option is mutually exclusive with min_days.
max_years (Optional[Value]): Maximum number of years from the anchor date to filter events.
This option is mutually exclusive with max_days.
anchor_phenotype (Phenotype): A phenotype providing the anchor date for filtering.
when (Optional[str]): when can be "before" or "after"; if "before", days prior to anchor
event_date are positive, and days after are negative; using after, days before the
Expand All @@ -42,17 +38,10 @@ class RelativeTimeRangeFilter(Filter):
Filters an EventTable relative to some reference date.
"""

# FIXME this will become a problem when modern medicine allows people to live more
# than 365*4 years (so they accumulate enough leap days to get an extra year)
# ibis.delta counts leap days
DAYS_IN_YEAR = 365

def __init__(
self,
min_days: Optional[Value] = GreaterThanOrEqualTo(0),
max_days: Optional[Value] = None,
min_years: Optional[Value] = None,
max_years: Optional[Value] = None,
when: Optional[str] = "before",
anchor_phenotype: "Phenotype" = None,
):
Expand Down
32 changes: 12 additions & 20 deletions phenex/filters/value.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
from typing import Union
from datetime import date
from phenex.filters.filter import Filter


class Value(Filter):
class Value:
"""
The Value class is used to define thresholds on values in databases. Importantly, Values define not just the numeric value but also the boundary (including or excluding the endpoint).

Attributes:
operator (str): The comparison operator, one of '>', '>=', '<', '<=', '='.
value (Union[int, float, date]): The threshold value.

Examples:
greater_than_zero = Value('>', 0)
"""

def __init__(self, operator: str, value: Union[int, float, date]):
self.operator = operator
self.value = value
Expand All @@ -14,24 +24,6 @@ def __init__(self, operator: str, value: Union[int, float, date]):
"<=",
"=",
], "Operator must be >, >=, <, <=, or ="
super(Value, self).__init__()

def _filter(self, table, column):

if self.operator == ">":
table = getattr(table, column) > self.value
elif self.operator == ">=":
table = getattr(table, column) >= self.value
elif self.operator == "<":
table = getattr(table, column) < self.value
elif self.operator == "<=":
table = getattr(table, column) <= self.value
elif self.operator == "=":
table = getattr(table, column) == self.value
else:
raise ValueError("Operator must be >, >=, <, <=, or =")

return table


class GreaterThan(Value):
Expand Down
Loading