Skip to content

Commit

Permalink
logical combinations of filters
Browse files Browse the repository at this point in the history
  • Loading branch information
sprivite committed Feb 18, 2025
1 parent b075071 commit 723bb59
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 127 deletions.
4 changes: 2 additions & 2 deletions phenex/filters/categorical_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ def __init__(
self.domain = domain
super(CategoricalFilter, self).__init__()

def _get_predicate(self, table: "PhenexTable"):
return table[self.column_name].isin(self.allowed_values)
def _filter(self, table: "PhenexTable"):
return table.filter(table[self.column_name].isin(self.allowed_values))

def autojoin_filter(self, table: "PhenexTable", tables: dict = None):
if self.column_name not in table.columns:
Expand Down
49 changes: 27 additions & 22 deletions phenex/filters/codelist_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,16 @@ def _convert_codelist_to_tuples(self) -> List[Tuple[str, str]]:
]
return []

def _get_predicate(self, code_table: CodeTable) -> CodeTable:
def _filter(self, code_table: CodeTable) -> CodeTable:

assert is_phenex_code_table(code_table)

if self.codelist.fuzzy_match:
return self._get_predicate_fuzzy_codelist(code_table)
return self._filter_fuzzy_codelist(code_table)
else:
return self._get_predicate_literal_codelist(code_table)

def _get_predicate_fuzzy_codelist(self, code_table):
return self._filter_literal_codelist(code_table)

def _filter_fuzzy_codelist(self, code_table):
filter_condition = False
for code_type, codelist in self.codelist.codelist.items():
codelist = [str(code) for code in codelist]
Expand All @@ -56,23 +55,29 @@ def _get_predicate_fuzzy_codelist(self, code_table):
codelist
)

return filter_condition
filtered_table = code_table.filter(filter_condition)
return filtered_table

def _get_predicate_literal_codelist(self, code_table):
def _filter_literal_codelist(self, code_table):

# IN and JOIN have similar / identical performance?
# https://stackoverflow.com/questions/1200295/sql-join-vs-in-performance
filter_condition = False
for code_type, codelist in self.codelist.codelist.items():
codelist = [str(code) for code in codelist]
if self.codelist.use_code_type:
filter_condition = filter_condition | (
(code_table.CODE_TYPE == code_type)
& (code_table.CODE.isin(codelist))
)
else:
filter_condition = filter_condition | code_table.CODE.isin(
codelist
)
# Generate the codelist table as an Ibis literal set
codelist_df = pd.DataFrame(
self.codelist_as_tuples, columns=["code_type", "code"]
).fillna("")
codelist_table = ibis.memtable(codelist_df)

# Create a join condition based on code and possibly code_type
code_column = code_table.CODE
if self.codelist.use_code_type:
code_type_column = code_table.CODE_TYPE
join_condition = (code_column == codelist_table.code) & (
code_type_column == codelist_table.code_type
)
else:
join_condition = code_column == codelist_table.code

return filter_condition
# return table with downselected columns, of same type as input table
filtered_table = code_table.inner_join(codelist_table, join_condition).select(
code_table.columns
)
return filtered_table
9 changes: 7 additions & 2 deletions phenex/filters/date_range_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(
self.max_date = max_date
super(DateRangeFilter, self).__init__()

def _get_predicate(self, table: EventTable):
def filter(self, table: EventTable):

assert is_phenex_event_table(table)

Expand All @@ -33,4 +33,9 @@ def _get_predicate(self, table: EventTable):
if self.max_date is not None:
conditions.append(table.EVENT_DATE <= self.max_date)

return conditions or True
if conditions:
output_table = table.filter(conditions)
else:
output_table = table

return output_table
34 changes: 31 additions & 3 deletions phenex/filters/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ def filter(self, table: Table) -> Table:
Table: The filtered table.
"""
input_columns = table.columns
filtered_table = table.filter(self._get_predicate(table))
filtered_table = self._filter(table)
if not set(input_columns) <= set(filtered_table.columns):
raise ValueError(f"Filter must not remove columns.")

return type(table)(filtered_table.select(input_columns))

def _get_predicate(self, table: Table) -> Table:
def _filter(self, table: Table) -> Table:
"""
Returns the logical condition that the filter is applying.
"""
Expand All @@ -41,7 +41,7 @@ def __and__(self, other):
def __or__(self, other):
    """
    Support ``self | other`` by wrapping both operands in an ``OrFilter``.
    """
    either = OrFilter(self, other)
    return either

def __neg__(self):
def __invert__(self):
return NotFilter(self)


Expand All @@ -56,6 +56,15 @@ def __init__(self, filter1, filter2):

def _get_predicate(self, table: Table) -> Table:
return self.filter1._get_predicate(table) & self.filter2._get_predicate(table)

def filter(self, table: Table) -> Table:
    """
    Apply both wrapped filters in sequence (logical AND).

    The output of ``filter1`` is fed to ``filter2``; the result of
    ``filter2`` is returned unchanged.
    """
    narrowed = self.filter1.filter(table)
    return self.filter2.filter(narrowed)

def autojoin_filter(self, table: "PhenexTable", tables: dict = None):
    """
    Run ``autojoin_filter`` of both wrapped filters in sequence (logical AND).

    ``tables`` is forwarded untouched to each wrapped filter; the output of
    ``filter1`` becomes the input of ``filter2``.
    """
    narrowed = self.filter1.autojoin_filter(table, tables)
    return self.filter2.autojoin_filter(narrowed, tables)


class OrFilter(Filter):
"""
Expand All @@ -68,6 +77,17 @@ def __init__(self, filter1, filter2):

def _get_predicate(self, table: Table) -> Table:
return self.filter1._get_predicate(table) | self.filter2._get_predicate(table)

def filter(self, table: Table) -> Table:
    """
    Keep rows accepted by either wrapped filter (logical OR).

    Each wrapped filter is applied to the same input; the ``.table``
    attributes of the two results are unioned, ``distinct()`` is applied to
    the union, and the outcome is re-wrapped in the input's own type.
    """
    wrapper = type(table)
    first_hits = self.filter1.filter(table).table
    second_hits = self.filter2.filter(table).table
    combined = first_hits.union(second_hits)
    return wrapper(combined.distinct())

def autojoin_filter(self, table: "PhenexTable", tables: dict = None):
    """
    Keep rows accepted by either wrapped filter's ``autojoin_filter`` (logical OR).

    Both wrapped filters receive the same ``table`` and ``tables``; the
    ``.table`` attributes of their results are unioned with
    ``distinct=True`` and re-wrapped in the input's own type.
    """
    wrapper = type(table)
    first_hits = self.filter1.autojoin_filter(table, tables).table
    second_hits = self.filter2.autojoin_filter(table, tables).table
    combined = first_hits.union(second_hits, distinct=True)
    return wrapper(combined)


class NotFilter(Filter):
"""
Expand All @@ -79,3 +99,11 @@ def __init__(self, filter):

def _get_predicate(self, table: Table) -> Table:
return ~self.filter._get_predicate(table)

def filter(self, table: Table) -> Table:
    """
    Keep rows rejected by the wrapped filter (logical NOT).

    The wrapped filter is applied, the ``.table`` of its result is removed
    from the input via ``table.difference(...)``, and the remainder is
    re-wrapped in the input's own type.
    """
    # NOTE(review): a set-style difference may collapse duplicate input
    # rows -- confirm against the table backend's ``difference`` defaults.
    wrapper = type(table)
    accepted = self.filter.filter(table).table
    rejected = table.difference(accepted)
    return wrapper(rejected)

def autojoin_filter(self, table: "PhenexTable", tables: dict = None):
    """
    Keep rows rejected by the wrapped filter's ``autojoin_filter`` (logical NOT).

    ``tables`` is forwarded to the wrapped filter; the ``.table`` of its
    result is removed from the input via ``table.difference(...)`` and the
    remainder is re-wrapped in the input's own type.
    """
    # NOTE(review): a set-style difference may collapse duplicate input
    # rows -- confirm against the table backend's ``difference`` defaults.
    wrapper = type(table)
    accepted = self.filter.autojoin_filter(table, tables).table
    rejected = table.difference(accepted)
    return wrapper(rejected)
20 changes: 19 additions & 1 deletion phenex/filters/value.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import Union
from datetime import date
from phenex.filters.filter import Filter


class Value:
class Value(Filter):
def __init__(self, operator: str, value: Union[int, float, date]):
self.operator = operator
self.value = value
Expand All @@ -15,6 +16,23 @@ def __init__(self, operator: str, value: Union[int, float, date]):
], "Operator must be >, >=, <, <=, or ="
super(Value, self).__init__()

def _filter(self, table, column):

if self.operator == ">":
table = getattr(table, column) > self.value
elif self.operator == ">=":
table = getattr(table, column) >= self.value
elif self.operator == "<":
table = getattr(table, column) < self.value
elif self.operator == "<=":
table = getattr(table, column) <= self.value
elif self.operator == "=":
table = getattr(table, column) == self.value
else:
raise ValueError("Operator must be >, >=, <, <=, or =")

return table


class GreaterThan(Value):
def __init__(self, value: int):
Expand Down
4 changes: 2 additions & 2 deletions phenex/filters/value_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
self.max = max
super(ValueFilter, self).__init__()

def _get_predicate(self, table: MeasurementTable):
def _filter(self, table: MeasurementTable):
# TODO assert that value column is in table
# assert (
# "INDEX_DATE" in table.columns
Expand All @@ -68,4 +68,4 @@ def _get_predicate(self, table: MeasurementTable):
raise ValueError("Operator for max days be < or <=")
if conditions:
table = table.filter(conditions)
return conditions or True
return table
2 changes: 1 addition & 1 deletion phenex/phenotypes/cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,6 @@ def table1(self):
if self._table1 is None:
logger.debug("Generating Table1 report ...")
reporter = Table1()
self._table1 = reporter.execute(self).to_pandas()
self._table1 = reporter.execute(self)
logger.debug("Table1 report generated.")
return self._table1
8 changes: 7 additions & 1 deletion phenex/phenotypes/phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
PHENOTYPE_TABLE_COLUMNS,
is_phenex_phenotype_table,
)
from phenex.util import create_logger

logger = create_logger(__name__)

class Phenotype:
"""
Expand Down Expand Up @@ -50,9 +52,13 @@ def execute(self, tables: Dict[str, Table]) -> PhenotypeTable:
ValueError: If the table returned by _execute() does not contain the required phenotype
columns.
"""
logger.info(f"Phenotype '{self.name}': executing...")
for child in self.children:
if child.table is None:
logger.debug(f"Phenotype {self.name}: executing child phenotype '{child.name}'...")
child.execute(tables)
else:
logger.debug(f"Phenotype {self.name}: skipping already computed child phenotype '{child.name}'.")

table = self._execute(tables).mutate(BOOLEAN=True)

Expand All @@ -63,7 +69,7 @@ def execute(self, tables: Dict[str, Table]) -> PhenotypeTable:

self.table = table.select(PHENOTYPE_TABLE_COLUMNS)
assert is_phenex_phenotype_table(self.table)

logger.info(f"Phenotype '{self.name}': execution completed.")
return self.table

@property
Expand Down
Loading

0 comments on commit 723bb59

Please sign in to comment.