diff --git a/.gitignore b/.gitignore index f657a50d6..cda4fe016 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ multirun/ _autosummary *cyclops_reports* *dummy_reports* +.mypy_cache diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d76e45455..1ac557aea 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,7 @@ repos: types_or: [python, jupyter] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.5.1 + rev: v1.6.1 hooks: - id: mypy entry: python3 -m mypy --config-file pyproject.toml diff --git a/cyclops/query/__init__.py b/cyclops/query/__init__.py deleted file mode 100644 index 5f29c469b..000000000 --- a/cyclops/query/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -"""The ``query`` API provides classes to query EHR databases.""" - -try: - import sqlalchemy -except ImportError: - raise ImportError( - "CyclOps is not installed with query API support! Please install using 'pip install cyclops[query]'.", # noqa: E501 - ) from None - - -from cyclops.query.base import DatasetQuerier -from cyclops.query.eicu import EICUQuerier -from cyclops.query.gemini import GEMINIQuerier -from cyclops.query.mimiciii import MIMICIIIQuerier -from cyclops.query.mimiciv import MIMICIVQuerier -from cyclops.query.omop import OMOPQuerier diff --git a/cyclops/query/base.py b/cyclops/query/base.py deleted file mode 100644 index 83dd0b9ae..000000000 --- a/cyclops/query/base.py +++ /dev/null @@ -1,290 +0,0 @@ -"""Base querier class.""" - -import logging -from functools import partial -from typing import Any, Callable, Dict, List, Optional - -from sqlalchemy import MetaData -from sqlalchemy.sql.selectable import Subquery - -from cyclops.query import ops as qo -from cyclops.query.interface import QueryInterface -from cyclops.query.orm import Database, DatasetQuerierConfig -from cyclops.query.util import ( - DBSchema, - _to_subquery, - get_attr_name, -) -from cyclops.utils.log import setup_logging - - -# Logging. -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -def _create_get_table_lambdafn(schema_name: str, table_name: str) -> Callable[..., Any]: - """Create a lambda function to access a table. - - Parameters - ---------- - schema_name - The schema name. - table_name - The table name. - - Returns - ------- - Callable - The lambda function. - - """ - return lambda db: getattr(getattr(db, schema_name), table_name) - - -def _cast_timestamp_cols(table: Subquery) -> Subquery: - """Cast timestamp columns to datetime. - - Parameters - ---------- - table - Table to cast. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Table with cast columns. - - """ - cols_to_cast = [] - for col in table.columns: - if str(col.type) == "TIMESTAMP": - cols_to_cast.append(col.name) - if cols_to_cast: - table = qo.Cast(cols_to_cast, "timestamp")(table) - - return table - - -class DatasetQuerier: - """Base class to query EHR datasets. - - Attributes - ---------- - db - ORM Database used to run queries. - - Parameters - ---------- - database - Name of database. - user - Username for database. - password - Password for database. - dbms - Database management system. - host - Hostname of database. - port - Port of database. - - Notes - ----- - This class is intended to be subclassed to provide methods for querying tables in - the database. This class automatically creates methods for querying tables in the - database. The methods are named after the schema and table name, i.e. - `self.schema_name.table_name()`. 
The methods are created when the class is - instantiated. The subclass can provide custom methods for querying tables in the - database which can build on the methods created by this class. - - """ - - def __init__( - self, - database: str, - user: str, - password: str, - dbms: str = "postgresql", - host: str = "localhost", - port: int = 5432, - ) -> None: - config = DatasetQuerierConfig( - database=database, - user=user, - password=password, - dbms=dbms, - host=host, - port=port, - ) - self.db = Database(config) - if not self.db.is_connected: - LOGGER.error("Database is not connected, cannot run queries.") - return - self._setup_table_methods() - - def list_schemas(self) -> List[str]: - """List schemas in the database to query. - - Returns - ------- - List[str] - List of schema names. - - """ - return list(self.db.inspector.get_schema_names()) - - def list_tables(self, schema_name: Optional[str] = None) -> List[str]: - """List table methods that can be queried using the database. - - Parameters - ---------- - schema_name - Name of schema in the database. - - Returns - ------- - List[str] - List of table names. - - """ - if schema_name: - table_names = [] - for table in self.db.list_tables(): - schema_name_, _ = table.split(".") - if schema_name_ == schema_name: - table_names.append(table) - else: - table_names = self.db.list_tables() - - return table_names - - def list_columns(self, schema_name: str, table_name: str) -> List[str]: - """List columns in a table. - - Parameters - ---------- - schema_name - Name of schema in the database. - table_name - Name of GEMINI table. - - Returns - ------- - List[str] - List of column names. - - """ - return list( - getattr(getattr(self.db, schema_name), table_name).data.columns.keys(), - ) - - def list_custom_tables(self) -> List[str]: - """List custom tables methods provided by the dataset API. - - Returns - ------- - List[str] - List of custom table names. - - """ - method_list = dir(self) - custom_tables = [] - for method in method_list: - if ( - not method.startswith( - "__", - ) - and not method.startswith("_") - and method not in self.list_schemas() - and not method.startswith("list_") - and not method.startswith("get_") - and method not in ["db"] - ): - custom_tables.append(method) - - return custom_tables - - def get_table( - self, - schema_name: str, - table_name: str, - cast_timestamp_cols: bool = True, - ) -> Subquery: - """Get a table and possibly map columns to have standard names. - - Standardizing column names allows for columns to be - recognized in downstream processing. - - Parameters - ---------- - schema_name - Name of schema in the database. - table_name - Name of GEMINI table. - cast_timestamp_cols - Whether to cast timestamp columns to datetime. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Table with mapped columns. - - """ - table = _create_get_table_lambdafn(schema_name, table_name)(self.db).data - - if cast_timestamp_cols: - table = _cast_timestamp_cols(table) - - return _to_subquery(table) - - def _template_table_method( - self, - schema_name: str, - table_name: str, - ) -> QueryInterface: - """Template method for table methods. - - Parameters - ---------- - schema_name - Name of schema in the database. - table_name - Name of table in the database. - - Returns - ------- - cyclops.query.interface.QueryInterface - A query interface object. 
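Examples
--------
A minimal sketch of the generated table methods; the database credentials,
the ``public`` schema, and the ``my_table`` table below are placeholders.

>>> querier = DatasetQuerier("mydb", "user", "password")
>>> query = querier.public.my_table()  # auto-generated table method
>>> df = query.run(limit=10)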
- - """ - table = getattr(getattr(self.db, schema_name), table_name).data - table = _to_subquery(table) - - return QueryInterface(self.db, table) - - def _setup_table_methods(self) -> None: - """Add table methods. - - This method adds methods to the querier class that allow querying of tables in - the database. The methods are named after the table names. - - """ - schemas = self.list_schemas() - meta: Dict[str, MetaData] = {} - for schema_name in schemas: - metadata = MetaData(schema=schema_name) - metadata.reflect(bind=self.db.engine) - meta[schema_name] = metadata - schema = DBSchema(schema_name, meta[schema_name]) - for table_name in meta[schema_name].tables: - setattr( - schema, - get_attr_name(table_name), - partial( - self._template_table_method, - schema_name=schema_name, - table_name=get_attr_name(table_name), - ), - ) - setattr(self, schema_name, schema) diff --git a/cyclops/query/eicu.py b/cyclops/query/eicu.py deleted file mode 100644 index 6db1cc2b3..000000000 --- a/cyclops/query/eicu.py +++ /dev/null @@ -1,19 +0,0 @@ -"""EICU-CRD query API. - -Supports querying of eICU. - -""" - -import logging - -from cyclops.query.base import DatasetQuerier -from cyclops.utils.log import setup_logging - - -# Logging. -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -class EICUQuerier(DatasetQuerier): - """EICU dataset querier.""" diff --git a/cyclops/query/gemini.py b/cyclops/query/gemini.py deleted file mode 100644 index 47e5632b4..000000000 --- a/cyclops/query/gemini.py +++ /dev/null @@ -1,221 +0,0 @@ -"""GEMINI query API.""" - -import logging - -from sqlalchemy import select -from sqlalchemy.sql.expression import union_all - -import cyclops.query.ops as qo -from cyclops.query.base import DatasetQuerier -from cyclops.query.interface import QueryInterface -from cyclops.utils.log import setup_logging - - -# Logging. -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -# Custm column names. -CARE_UNIT = "care_unit" - - -class GEMINIQuerier(DatasetQuerier): - """GEMINI dataset querier.""" - - def ip_admin( - self, - ) -> QueryInterface: - """Query GEMINI patient encounters. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table("public", "ip_administrative") - - # Possibly cast string representations to timestamps - table = qo.Cast(["admit_date_time", "discharge_date_time"], "timestamp")(table) - - # Get the discharge disposition code descriptions - lookup_table = self.get_table("public", "lookup_ip_administrative") - lookup_table = qo.ConditionEquals("variable", "discharge_disposition")( - lookup_table, - ) - table = qo.Join( - lookup_table, - on=("discharge_disposition", "value"), - on_to_type="int", - join_table_cols="description", - isouter=True, - )(table) - table = qo.Rename({"description": "discharge_description"})(table) - table = qo.Drop("value")(table) - - return QueryInterface(self.db, table) - - def diagnoses( - self, - ) -> QueryInterface: - """Query diagnosis data. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed table, wrapped in an interface object. 
- - """ - table = self.get_table("public", "diagnosis") - - lookup_table = self.get_table("public", "lookup_diagnosis") - lookup_table = qo.ConditionEquals("variable", "diagnosis_type")(lookup_table) - table = qo.Join( - lookup_table, - on=("diagnosis_type", "value"), - join_table_cols="description", - isouter=True, - )(table) - table = qo.Drop("value")(table) - table = qo.Rename({"description": "diagnosis_type_description"})(table) - table = qo.ReorderAfter("diagnosis_type_description", "diagnosis_type")(table) - - # Trim whitespace from ICD codes. - table = qo.Trim("diagnosis_code")(table) - - return QueryInterface(self.db, table) - - def room_transfer( - self, - ) -> QueryInterface: - """Query room transfer data. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed table, wrapped in an interface object. - - """ - table = self.get_table("public", "room_transfer") - - # Join with lookup to get transfer description. - lookup_table = self.get_table("public", "lookup_room_transfer") - lookup_table = qo.ConditionEquals("variable", "medical_service")(lookup_table) - - table = qo.Join( - lookup_table, - on=("medical_service", "value"), - join_table_cols="description", - isouter=True, - )(table) - table = qo.Rename({"description": "transfer_description"})(table) - - return QueryInterface(self.db, table) - - def care_units( - self, - ) -> QueryInterface: - """Query care unit data, fetches transfer info from multiple tables. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed table, wrapped in an interface object. - - """ - filter_care_unit_cols = qo.Keep( - [ - "genc_id", - "admit", - "discharge", - CARE_UNIT, - ], - ) - - # In-patient table. - ip_table = self.get_table("public", "ip_administrative") - ip_table = qo.Rename( - { - "admit_date_time": "admit", - "discharge_date_time": "discharge", - }, - )(ip_table) - ip_table = qo.Literal("IP", CARE_UNIT)(ip_table) - ip_table = filter_care_unit_cols(ip_table) - - # Special care unit table. - scu_table = self.get_table("public", "ip_scu") - scu_table = qo.Rename( - { - "scu_admit_date_time": "admit", - "scu_discharge_date_time": "discharge", - }, - )(scu_table) - scu_table = qo.Literal("SCU", CARE_UNIT)(scu_table) - scu_table = filter_care_unit_cols(scu_table) - - # Emergency room/department table. - er_table = self.get_table("public", "er_administrative") - er_table = qo.Rename( - { - "er_admit_timestamp": "admit", - "er_discharge_timestamp": "discharge", - }, - )(er_table) - er_table = qo.Literal("ER", CARE_UNIT)(er_table) - er_table = filter_care_unit_cols(er_table) - - # Room transfer table. - rt_table = self.get_table("public", "room_transfer") - rt_table = qo.Rename( - { - "checkin_date_time": "admit", - "checkout_date_time": "discharge", - }, - )(rt_table) - rt_table = qo.Rename({"transfer_description": CARE_UNIT})(rt_table) - rt_table = filter_care_unit_cols(rt_table) - - # Combine. - table = union_all( - select(er_table), - select(scu_table), - select(ip_table), - select(rt_table), - ).subquery() - - return QueryInterface(self.db, table) - - def imaging( - self, - ) -> QueryInterface: - """Query imaging reports data. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed table, wrapped in an interface object. 
- - """ - table = self.get_table("public", "imaging") - - # Get imaging test description - lookup_table = self.get_table("public", "lookup_imaging") - lookup_table = qo.ConditionEquals("variable", "imaging_test_name_mapped")( - lookup_table, - ) - - table = qo.Join( - lookup_table, - on=("imaging_test_name_mapped", "value"), - on_to_type="str", - join_table_cols="description", - )(table) - table = qo.Drop("value")(table) - table = qo.Rename({"description": "imaging_test_description"})(table) - table = qo.ReorderAfter("imaging_test_description", "imaging_test_name_mapped")( - table, - ) - - return QueryInterface(self.db, table) diff --git a/cyclops/query/interface.py b/cyclops/query/interface.py deleted file mode 100644 index 4f38b4166..000000000 --- a/cyclops/query/interface.py +++ /dev/null @@ -1,253 +0,0 @@ -"""A query interface class to wrap database objects and queries.""" - -import logging -from typing import List, Literal, Optional, Tuple, Union - -import dask.dataframe as dd -import pandas as pd -from sqlalchemy.sql.elements import BinaryExpression - -import cyclops.query.ops as qo -from cyclops.query.orm import Database -from cyclops.query.util import TableTypes -from cyclops.utils.common import to_list_optional -from cyclops.utils.file import save_dataframe -from cyclops.utils.log import setup_logging - - -# Logging. -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -class QueryInterface: - """An interface dataclass to wrap queries, and run them. - - Parameters - ---------- - database - Database object to create ORM, and query data. - query: cyclops.query.util.TableTypes - The query. - - """ - - def __init__( - self, - database: Database, - query: Union[TableTypes, "QueryInterface"], - ) -> None: - """Initialize the QueryInterface object, join and chain operations.""" - self.database = database - if isinstance(query, QueryInterface): - self.query = query.query # type: ignore - else: - self.query = query - self._data = None - - @property - def data(self) -> Optional[Union[pd.DataFrame, dd.core.DataFrame]]: - """Get data.""" - return self._data - - def join( - self, - join_table: Union[TableTypes, "QueryInterface"], - on: Optional[ - Union[ - str, - List[str], - Tuple[str], - List[Tuple[str, str]], - ] - ] = None, - on_to_type: Optional[Union[type, List[type]]] = None, - cond: Optional[BinaryExpression] = None, - table_cols: Optional[Union[str, List[str]]] = None, - join_table_cols: Optional[Union[str, List[str]]] = None, - isouter: Optional[bool] = False, - ) -> "QueryInterface": - """Join the query with another table. - - Parameters - ---------- - join_table - Table to join with. - on - Column(s) to join on. - on_to_type - Type(s) to cast the column(s) to join on. - cond - Condition to join on. - table_cols - Columns to select from the original table. - join_table_cols - Columns to select from the joined table. - isouter - Whether to perform an outer join. - - Returns - ------- - QueryInterface - QueryInterface object with the join operation added. 
- - """ - on = to_list_optional(on) - on_to_type = to_list_optional(on_to_type) - table_cols = to_list_optional(table_cols) - join_table_cols = to_list_optional(join_table_cols) - if isinstance(join_table, QueryInterface): - join_table = join_table.query - query = qo.Join( - join_table=join_table, - on=on, - on_to_type=on_to_type, - cond=cond, - table_cols=table_cols, - join_table_cols=join_table_cols, - isouter=isouter, - )(self.query) - - return QueryInterface(self.database, query) - - def ops( - self, - ops: Union[qo.QueryOp, qo.Sequential], - ) -> "QueryInterface": - """Chain operations with the query. - - Parameters - ---------- - ops - Operations to perform on the query. - - Returns - ------- - QueryInterface - QueryInterface object with the operations added. - - """ - query = ops(self.query) - - return QueryInterface(self.database, query) - - def union( - self, - other: "QueryInterface", - ) -> "QueryInterface": - """Union the query with another query. - - Parameters - ---------- - other - The other query to union with. - - Returns - ------- - QueryInterface - QueryInterface object with the union operation added. - - """ - query = qo.Union(other.query)(self.query) - - return QueryInterface(self.database, query) - - def union_all( - self, - other: "QueryInterface", - ) -> "QueryInterface": - """Union all the query with another query. - - Parameters - ---------- - other - The other query to union all with. - - Returns - ------- - QueryInterface - QueryInterface object with the union all operation added. - - """ - query = qo.Union(other.query, union_all=True)(self.query) - - return QueryInterface(self.database, query) - - def run( - self, - limit: Optional[int] = None, - backend: Literal["pandas", "dask", "datasets"] = "pandas", - index_col: Optional[str] = None, - n_partitions: Optional[int] = None, - ) -> Union[pd.DataFrame, dd.core.DataFrame]: - """Run the query, and fetch data. - - Parameters - ---------- - limit - No. of rows to limit the query return. - backend - Backend computing framework to use, pandas or dask or datasets. - index_col - Column which becomes the index, and defines the partitioning. - Should be a indexed column in the SQL server, and any orderable type. - n_partitions - Number of partitions. Check dask documentation for additional details. - - Returns - ------- - pandas.DataFrame or dask.DataFrame or datasets.Dataset - Query result. - - """ - self._data = self.database.run_query( - self.query, - limit=limit, - backend=backend, - index_col=index_col, - n_partitions=n_partitions, - ) - - return self._data - - def save( - self, - path: str, - file_format: Literal["parquet", "csv"] = "parquet", - ) -> str: - """Save the query. - - Parameters - ---------- - path - Path where the file will be saved. - file_format - File format of the file to save. - - Returns - ------- - str - Processed save path for upstream use. - - """ - # If the query was already run. - if self._data is not None: - return save_dataframe(self._data, path, file_format=file_format) - - # Save without running. - if file_format == "csv": - path = self.database.save_query_to_csv(self.query, path) - elif file_format == "parquet": - path = self.database.save_query_to_parquet(self.query, path) - else: - raise ValueError("Invalid file format specified.") - - return path - - def clear_data(self) -> None: - """Clear data container. - - Sets the data attribute to None, thus clearing the dataframe contained. 
- - """ - self._data = None diff --git a/cyclops/query/mimiciii.py b/cyclops/query/mimiciii.py deleted file mode 100644 index 4152468b9..000000000 --- a/cyclops/query/mimiciii.py +++ /dev/null @@ -1,87 +0,0 @@ -"""MIMIC-III query API. - -Supports querying of MIMIC-III. - -""" - -import logging - -import cyclops.query.ops as qo -from cyclops.query.base import DatasetQuerier -from cyclops.query.interface import QueryInterface -from cyclops.utils.log import setup_logging - - -# Logging. -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -class MIMICIIIQuerier(DatasetQuerier): - """MIMIC-III dataset querier.""" - - def diagnoses( - self, - ) -> QueryInterface: - """Query MIMICIII diagnosis data. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table("mimiciii", "diagnoses_icd") - - # Join with diagnoses dimension table. - table = qo.Join( - join_table=self.get_table("mimiciii", "d_icd_diagnoses"), - on=["icd9_code"], - on_to_type=["str"], - )(table) - - return QueryInterface(self.db, table) - - def labevents( - self, - ) -> QueryInterface: - """Query MIMICIII labevents data. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table("mimiciii", "labevents") - - # Join with lab dimension table. - table = qo.Join( - join_table=self.get_table("mimiciii", "d_labitems"), - on=["itemid"], - on_to_type=["str"], - )(table) - - return QueryInterface(self.db, table) - - def chartevents( - self, - ) -> QueryInterface: - """Query MIMICIII chartevents data. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table("mimiciii", "chartevents") - - # Join with dimension table. - table = qo.Join( - join_table=self.get_table("mimiciii", "d_items"), - on=["itemid"], - on_to_type=["str"], - )(table) - - return QueryInterface(self.db, table) diff --git a/cyclops/query/mimiciv.py b/cyclops/query/mimiciv.py deleted file mode 100644 index 4fa0293e6..000000000 --- a/cyclops/query/mimiciv.py +++ /dev/null @@ -1,161 +0,0 @@ -"""MIMIC-IV query API. - -Supports querying of MIMICIV-2.0. - -""" - -import logging - -from sqlalchemy import Integer, func, select - -import cyclops.query.ops as qo -from cyclops.query.base import DatasetQuerier -from cyclops.query.interface import QueryInterface -from cyclops.query.util import get_column -from cyclops.utils.log import setup_logging - - -# Logging. -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -class MIMICIVQuerier(DatasetQuerier): - """MIMICIV dataset querier.""" - - def patients( - self, - ) -> QueryInterface: - """Query MIMIC patient data. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - Notes - ----- - The function infers the approximate year a patient received care, using the - `anchor_year` and `anchor_year_group` columns. The `join` and `ops` supplied - are applied after the approximate year is inferred. `dod` is - adjusted based on the inferred approximate year of care. - - """ - table = self.get_table("mimiciv_hosp", "patients") - - # Process and include patient's anchor year. 
- table = select( - table, - ( - func.substr(get_column(table, "anchor_year_group"), 1, 4).cast(Integer) - ).label("anchor_year_group_start"), - ( - func.substr(get_column(table, "anchor_year_group"), 8, 12).cast(Integer) - ).label("anchor_year_group_end"), - ).subquery() - - # Select the middle of the anchor year group as the anchor year - table = select( - table, - ( - get_column(table, "anchor_year_group_start") - + ( - get_column(table, "anchor_year_group_end") - - get_column(table, "anchor_year_group_start") - ) - / 2 - ).label("anchor_year_group_middle"), - ).subquery() - - table = select( - table, - ( - get_column(table, "anchor_year_group_middle") - - get_column(table, "anchor_year") - ).label("anchor_year_difference"), - ).subquery() - - # Shift relevant columns by anchor year difference - table = qo.AddDeltaColumn("dod", years="anchor_year_difference")(table) - table = qo.Drop( - [ - "anchor_year_group_start", - "anchor_year_group_end", - "anchor_year_group_middle", - ], - )(table) - - return QueryInterface(self.db, table) - - def diagnoses( - self, - ) -> QueryInterface: - """Query MIMIC diagnosis data. - - Parameters - ---------- - join - Join arguments. - ops - Additional operations to apply to the query. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table("mimiciv_hosp", "diagnoses_icd") - - # Join with diagnoses dimension table. - table = qo.Join( - join_table=self.get_table("mimiciv_hosp", "d_icd_diagnoses"), - on=["icd_code", "icd_version"], - on_to_type=["str", "int"], - )(table) - - return QueryInterface(self.db, table) - - def labevents( - self, - ) -> QueryInterface: - """Query lab events from the hospital module. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table("mimiciv_hosp", "labevents") - dim_items_table = self.get_table("mimiciv_hosp", "d_labitems") - - # Join with lab items dimension table. - table = qo.Join( - join_table=dim_items_table, - on=["itemid"], - )(table) - - return QueryInterface(self.db, table) - - def chartevents( - self, - ) -> QueryInterface: - """Query ICU chart events from the ICU module. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed table, wrapped in an interface object. - - """ - table = self.get_table("mimiciv_icu", "chartevents") - dim_items_table = self.get_table("mimiciv_icu", "d_items") - - # Join with items dimension table. - table = qo.Join( - dim_items_table, - on="itemid", - )(table) - - return QueryInterface(self.db, table) diff --git a/cyclops/query/omop.py b/cyclops/query/omop.py deleted file mode 100644 index b3209ce8d..000000000 --- a/cyclops/query/omop.py +++ /dev/null @@ -1,258 +0,0 @@ -"""OMOP query API.""" - -import logging -from typing import List, Optional, Union - -from sqlalchemy.sql.selectable import Subquery - -import cyclops.query.ops as qo -from cyclops.query.base import DatasetQuerier -from cyclops.query.interface import QueryInterface -from cyclops.utils.common import to_list -from cyclops.utils.log import setup_logging - - -# Logging. -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -# OMOP column names. 
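# These names are expected to match the standard OMOP CDM fields (e.g., in
# the visit_occurrence, measurement, and observation tables); adjust them
# if the target database deviates from the standard schema.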
-VISIT_OCCURRENCE_ID = "visit_occurrence_id" -PERSON_ID = "person_id" -VISIT_START_DATETIME = "visit_start_datetime" -VISIT_END_DATETIME = "visit_end_datetime" -VISIT_DETAIL_START_DATETIME = "visit_detail_start_datetime" -VISIT_DETAIL_END_DATETIME = "visit_detail_end_datetime" -VISIT_CONCEPT_ID = "visit_concept_id" -VISIT_TYPE_CONCEPT_ID = "visit_type_concept_id" -VISIT_DETAIL_CONCEPT_ID = "visit_detail_concept_id" -VISIT_DETAIL_TYPE_CONCEPT_ID = "visit_detail_type_concept_id" -CARE_SITE_ID = "care_site_id" -CONCEPT_NAME = "concept_name" -CONCEPT_ID = "concept_id" -CARE_SITE_SOURCE_VALUE = "care_site_source_value" -OBSERVATION_CONCEPT_ID = "observation_concept_id" -OBSERVATION_TYPE_CONCEPT_ID = "observation_type_concept_id" -OBSERVATION_DATETIME = "observation_datetime" -MEASUREMENT_CONCEPT_ID = "measurement_concept_id" -MEASUREMENT_TYPE_CONCEPT_ID = "measurement_type_concept_id" -MEASUREMENT_DATETIME = "measurement_datetime" -UNIT_CONCEPT_ID = "unit_concept_id" -VALUE_AS_CONCEPT_ID = "value_as_concept_id" - -# Created columns. -VISIT_DETAIL_CONCEPT_NAME = "visit_detail_concept_name" -CARE_SITE_NAME = "care_site_name" -GENDER_CONCEPT_NAME = "gender_concept_name" -RACE_CONCEPT_NAME = "race_concept_name" -ETHNICITY_CONCEPT_NAME = "ethnicity_concept_name" - -# Other constants. -ID = "id" -NAME = "name" - - -class OMOPQuerier(DatasetQuerier): - """OMOP querier.""" - - def __init__( - self, - database: str, - user: str, - password: str, - dbms: str = "postgresql", - host: str = "localhost", - port: int = 5432, - schema_name: str = "omop", - ) -> None: - super().__init__( - database=database, - user=user, - password=password, - dbms=dbms, - host=host, - port=port, - ) - self.schema_name = schema_name - - def map_concept_ids_to_name( - self, - src_table: Union[Subquery, QueryInterface], - src_cols: Union[str, List[str]], - dst_cols: Optional[Union[str, List[str]]] = None, - ) -> QueryInterface: - """Map concept IDs in a source table to concept names from concept table. - - For each concept ID column with a name like `somecol_concept_ID`, the mapped - concept name column will be named `somecol_concept_name`. If `dst_cols` is - specified, the mapped concept name column will be named according to the - corresponding name in `dst_cols`. - - Parameters - ---------- - src_table - Source table with concept IDs. - src_cols - Column name(s) to consider as concept IDs for mapping. - dst_cols - Column name(s) to assign for the mapped concept name columns. - - Returns - ------- - cyclops.query.interface.QueryInterface - Query with mapped columns from concept table. - - """ - if isinstance(src_table, QueryInterface): - src_table = src_table.query - concept_table = self.get_table(self.schema_name, "concept") - src_cols = to_list(src_cols) - if dst_cols: - dst_cols = to_list(dst_cols) - if len(dst_cols) != len(src_cols): - raise ValueError("dst_cols must be same length as src_cols") - - for i, col in enumerate(src_cols): - if ID not in col: - raise ValueError("Specified column not a concept ID column!") - src_table = qo.Join( - concept_table, - on=(col, CONCEPT_ID), - join_table_cols=[CONCEPT_NAME], - isouter=True, - )(src_table) - dst_col_name = dst_cols[i] if dst_cols else col.replace(ID, NAME) - src_table = qo.Rename({CONCEPT_NAME: dst_col_name})(src_table) - - return QueryInterface(self.db, src_table) - - def _map_care_site_id( - self, - source_table: Union[Subquery, QueryInterface], - ) -> QueryInterface: - """Map care_site_id in a source table to care_site table. 
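Performs a left outer join with the ``care_site`` table on ``care_site_id``,
adding the ``care_site_name`` and ``care_site_source_value`` columns to the
source table.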
- - Parameters - ---------- - source_table - Source table with care_site_id. - - Returns - ------- - cyclops.query.interface.QueryInterface - Query with mapped columns from care_site table. - - """ - if isinstance(source_table, QueryInterface): - source_table = source_table.query - care_site_table = self.get_table(self.schema_name, "care_site") - table = qo.Join( - care_site_table, - on=CARE_SITE_ID, - join_table_cols=[CARE_SITE_NAME, CARE_SITE_SOURCE_VALUE], - isouter=True, - )(source_table) - - return QueryInterface(self.db, table) - - def visit_occurrence( - self, - ) -> QueryInterface: - """Query OMOP visit_occurrence table. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table(self.schema_name, "visit_occurrence") - table = self.map_concept_ids_to_name( - table, - [ - "visit_concept_id", - "visit_type_concept_id", - ], - ) - table = self._map_care_site_id(table) - - return QueryInterface(self.db, table) - - def visit_detail( - self, - ) -> QueryInterface: - """Query OMOP visit_detail table. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table(self.schema_name, "visit_detail") - table = self.map_concept_ids_to_name( - table, - ["visit_detail_concept_id", "visit_detail_type_concept_id"], - ) - - return QueryInterface(self.db, table) - - def person( - self, - ) -> QueryInterface: - """Query OMOP person table. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table(self.schema_name, "person") - table = self.map_concept_ids_to_name( - table, - ["gender_concept_id", "race_concept_id", "ethnicity_concept_id"], - ) - - return QueryInterface(self.db, table) - - def observation( - self, - ) -> QueryInterface: - """Query OMOP observation table. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table(self.schema_name, "observation") - table = self.map_concept_ids_to_name( - table, - [OBSERVATION_CONCEPT_ID, OBSERVATION_TYPE_CONCEPT_ID], - ) - - return QueryInterface(self.db, table) - - def measurement( - self, - ) -> QueryInterface: - """Query OMOP measurement table. - - Returns - ------- - cyclops.query.interface.QueryInterface - Constructed query, wrapped in an interface object. - - """ - table = self.get_table(self.schema_name, "measurement") - # Cast value_as_concept_id to int. - table = qo.Cast([VALUE_AS_CONCEPT_ID], "int")(table) - table = self.map_concept_ids_to_name( - table, - [MEASUREMENT_CONCEPT_ID, MEASUREMENT_TYPE_CONCEPT_ID, UNIT_CONCEPT_ID], - ) - - return QueryInterface(self.db, table) diff --git a/cyclops/query/ops.py b/cyclops/query/ops.py deleted file mode 100644 index 626a6d253..000000000 --- a/cyclops/query/ops.py +++ /dev/null @@ -1,3262 +0,0 @@ -"""Query operations.""" - -from __future__ import annotations - -import logging -import operator -import typing -from abc import abstractmethod -from collections import OrderedDict -from datetime import datetime, timedelta -from itertools import islice - -import sqlalchemy -from sqlalchemy import and_, cast, extract, func, literal_column, or_, select -from sqlalchemy.sql.elements import BinaryExpression -from sqlalchemy.sql.expression import literal -from sqlalchemy.sql.selectable import Select, Subquery -from sqlalchemy.types import Boolean - -# Logging. 
-from cyclops.query.util import ( - TableTypes, - apply_to_columns, - check_timestamp_columns, - drop_columns, - ends_with, - equals, - filter_columns, - get_column, - get_column_names, - get_columns, - get_delta_column, - greater_than, - has_columns, - has_substring, - in_, - less_than, - not_equals, - process_column, - rename_columns, - reorder_columns, - starts_with, - table_params_to_type, - trim_columns, -) -from cyclops.utils.common import to_datetime_format, to_list, to_list_optional -from cyclops.utils.log import setup_logging - - -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -# ruff: noqa: W505 - - -def _addindent(s_: str, num_spaces: int = 4) -> str: - """Add spaces to a string except the first line. - - Parameters - ---------- - s_ - String to add spaces to. - num_spaces - Number of spaces to add. - - Returns - ------- - str - String with spaces added. - - """ - s = s_.split("\n") - if len(s) == 1: - return s_ - first = s.pop(0) - s = [(num_spaces * " ") + line for line in s] - s = "\n".join(s) # type: ignore - - return first + "\n" + s # type: ignore - - -class QueryOp: - """Base class for query operations.""" - - _ops: typing.Dict[str, "QueryOp"] - - def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: - super().__setattr__("_ops", OrderedDict()) - - @abstractmethod - def __call__(self, *args: typing.Any, **kwargs: typing.Any) -> Subquery: - """Implement a calling function.""" - pass - - def _add_op(self, name: str, op_: "QueryOp") -> None: - """Add a child operation to the current query operation. - - The query op can be accessed as an attribute using the given name. - - Parameters - ---------- - name - Name of the child op. The child op can be accessed from this op using - the given name - op_ - Child op to be added to the parent query op. - - """ - if not isinstance(op_, QueryOp) and op_ is not None: - raise TypeError("{} is not a QueryOp subclass".format(str(op_))) - if not isinstance(name, str): - raise TypeError("Query op name should be a string") - if hasattr(self, name) and name not in self._ops: - raise KeyError("Attribute '{}' already exists".format(name)) - if "." in name: - raise KeyError('Query op name can\'t contain ".", got: {}'.format(name)) - if name == "": - raise KeyError('Query op name can\'t be empty string ""') - self._ops[name] = op_ - - def _get_ops(self) -> typing.Iterator["QueryOp"]: - """Return an iterator over the child operations. - - Returns - ------- - typing.Iterator[QueryOp] - Iterator over the child operations. - - """ - for _, op_ in self._ops.items(): - yield op_ - - def _get_name(self) -> str: - """Get the name of the query op. - - Returns - ------- - str - Name of the query op. - - """ - return self.__class__.__name__ - - def __setattr__(self, name: str, value: "QueryOp") -> None: - """Set an attribute. - - Parameters - ---------- - name - Name of the attribute. - value - Value of the attribute. - - """ - ops = self.__dict__.get("_ops") - if isinstance(value, QueryOp): - if ops is None: - raise AttributeError("Can't assign op before QueryOp.__init__() call") - ops[name] = value - elif ops is not None and name in ops: - if value is not None: - raise TypeError( - "Cannot assign '{}' as child op '{}' " "(QueryOp or None expected)", - ) - ops[name] = value - else: - super().__setattr__(name, value) - - def _extra_repr(self) -> str: - """Set the extra representation of the query op. 
- - To print customized extra information, you should re-implement - this method in your own query ops. Both single-line and multi-line - strings are acceptable. - - Returns - ------- - str - Extra representation of the query op. - - """ - return "" - - def __repr__(self) -> str: - """Return the string representation of the query op. - - Returns - ------- - str - String representation of the query op. - - """ - extra_lines = [] - extra_repr = self._extra_repr() - if extra_repr: - extra_lines = extra_repr.split("\n") - child_lines = [] - for key, op_ in self._ops.items(): - mod_str = repr(op_) - mod_str = _addindent(mod_str, 2) - child_lines.append("(" + key + "): " + mod_str) - lines = extra_lines + child_lines - main_str = self._get_name() + "(" - if lines: - if len(extra_lines) == 1 and not child_lines: - main_str += extra_lines[0] - else: - main_str += "\n " + "\n ".join(lines) + "\n" - main_str += ")" - - return main_str - - def __getattr__(self, name: str) -> "QueryOp": - """Get an attribute. - - Parameters - ---------- - name - Name of the attribute. - - Returns - ------- - QueryOp - The child operation with the given name. - - """ - if name in self._ops: - return self._ops[name] - raise AttributeError( - f"'{self.__class__.__name__}' object has no attribute '{name}'", - ) - - -def _chain_ops( - query: Subquery, - ops: typing.Iterator[QueryOp], -) -> Subquery: - """Chain query ops. - - Parameters - ---------- - query - Query to chain the ops to. - ops - Query ops to chain. - - Returns - ------- - Subquery - Query with the ops chained. - - """ - for op_ in ops: - if isinstance(op_, Sequential): - query = _chain_ops(query, op_._get_ops()) - elif isinstance(op_, QueryOp): - query = op_(query) - - return query - - -class Sequential(QueryOp): - """Sequential query operations class. - - Chains a sequence of query operations and executes the final query on a table. - - Examples - -------- - >>> Sequential(Drop(["col1", "col2"]), ...) - >>> Sequential([Drop(["col1", "col2"]), ...]) - - """ - - @typing.overload - def __init__(self, *ops: QueryOp) -> None: - ... - - @typing.overload - def __init__(self, ops: typing.List[QueryOp]) -> None: - ... - - @typing.overload - def __init__(self, op: OrderedDict[str, QueryOp]) -> None: - ... - - def __init__(self, *args: QueryOp) -> None: # type: ignore - """Initialize the class. - - Parameters - ---------- - args - Query operations to be chained sequentially. - - """ - super().__init__() - if len(args) == 1 and isinstance(args[0], OrderedDict): - for key, op_ in args[0].items(): - self._add_op(key, op_) - elif len(args) == 1 and isinstance(args[0], list): - for idx, op_ in enumerate(args[0]): - self._add_op(str(idx), op_) - else: - for idx, op_ in enumerate(args): - self._add_op(str(idx), op_) - - def __len__(self) -> int: - """Return the number of query ops in the Sequential. - - Returns - ------- - int - Number of query ops in the Sequential. - - """ - return len(self._ops) - - def __iter__(self) -> typing.Iterator[QueryOp]: - """Return an iterator over the query ops. - - Returns - ------- - typing.Iterator[QueryOp] - Iterator over the query ops. - - """ - return iter(self._ops.values()) - - def __add__(self, other: "Sequential") -> "Sequential": - """Add two Sequential objects. - - Parameters - ---------- - other - Sequential object to be added. - - Returns - ------- - Sequential - Sequential object with the two Sequential objects chained. 
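Examples
--------
A sketch with placeholder ops:

>>> combined = Sequential(Drop("col1")) + Sequential(Rename({"col2": "c2"}))
>>> len(combined)
2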
- - """ - if isinstance(other, Sequential): - ret = Sequential() - for op_ in self: - ret.append(op_) - for op_ in other: - ret.append(op_) - return ret - raise ValueError( - "Add operator supports only objects " - "of Sequential class, but {} is given.".format(str(type(other))), - ) - - def __iadd__(self, other: "Sequential") -> "Sequential": - """Add two Sequential objects inplace. - - Parameters - ---------- - other - Sequential object to be added. - - Returns - ------- - Sequential - Sequential object with the two Sequential objects chained. - - """ - if isinstance(other, Sequential): - offset = len(self) - for i, op_ in enumerate(other): - self._add_op(str(i + offset), op_) - return self - raise ValueError( - "Add operator supports only objects " - "of Sequential class, but {} is given.".format(str(type(other))), - ) - - def _get_item_by_idx( - self, - iterator: typing.Iterator[typing.Any], - idx: int, - ) -> typing.Any: - """Get the idx-th item of the iterator. - - Parameters - ---------- - iterator - Iterator to get the item from. - idx - Index of the item to get. - - Returns - ------- - QueryOp - The idx-th item of the iterator. - - """ - size = len(self) - idx = operator.index(idx) - if not -size <= idx < size: - raise IndexError("index {} is out of range".format(idx)) - idx %= size - - return next(islice(iterator, idx, None)) - - def __getitem__( - self, - idx: typing.Union[slice, int], - ) -> typing.Any: - """Get the idx-th item of the sequential query op. - - Parameters - ---------- - idx - Index of the item to get. - - Returns - ------- - Sequential or QueryOp - The idx-th item of the sequential query op. - - """ - if isinstance(idx, slice): - return self.__class__(OrderedDict(list(self._ops.items())[idx])) - - return self._get_item_by_idx(self._ops.values(), idx) # type: ignore - - def __setitem__(self, idx: int, op_: QueryOp) -> None: - """Set the idx-th item of the sequential query op. - - Parameters - ---------- - idx - Index of the item to set. - op_ - Query op to set. - - """ - key: str = self._get_item_by_idx(self._ops.keys(), idx) # type: ignore - return setattr(self, key, op_) - - def __delitem__(self, idx: typing.Union[slice, int]) -> None: - """Delete the idx-th item of the sequential query op. - - Parameters - ---------- - idx - Index of the item to delete. - - """ - if isinstance(idx, slice): - for key in list(self._ops.keys())[idx]: - delattr(self, key) - else: - key = self._get_item_by_idx(self._ops.keys(), idx) # type: ignore - delattr(self, key) - str_indices = [str(i) for i in range(len(self._ops))] - self._ops = OrderedDict(list(zip(str_indices, self._ops.values()))) - - def append(self, op_: QueryOp) -> "Sequential": - """Append a given query op to the end. - - Parameters - ---------- - op_ - Query op to append. - - Returns - ------- - Sequential - Sequential object with the query op appended. - - """ - self._add_op(str(len(self)), op_) - return self - - def pop(self, key: typing.Union[int, slice]) -> QueryOp: - """Pop the query op at the given index. - - Parameters - ---------- - key - Index of the query op to pop. - - Returns - ------- - QueryOp - Popped query op. - - """ - v = self[key] - del self[key] - - return v # type: ignore - - def insert(self, index: int, op_: QueryOp) -> "Sequential": - """Insert a given query op at the given index. - - Parameters - ---------- - index - Index to insert the query op at. - op_ - Query op to insert. - - Returns - ------- - Sequential - Sequential object with the query op inserted. 
- - """ - if not isinstance(op_, QueryOp): - raise AssertionError("Module should be of type: {}".format(QueryOp)) - n = len(self._ops) - if not (-n <= index <= n): - raise IndexError("Index out of range: {}".format(index)) - if index < 0: - index += n - for i in range(n, index, -1): - self._ops[str(i)] = self._ops[str(i - 1)] - self._ops[str(index)] = op_ - - return self - - def extend(self, sequential: "Sequential") -> "Sequential": - """Extend the sequential query op with another sequential query op. - - Parameters - ---------- - sequential - Sequential object to extend with. - - Returns - ------- - Sequential - Sequential object with the other sequential query op extended. - - """ - for op_ in sequential: - self.append(op_) - - return self - - @table_params_to_type(Subquery) - def __call__(self, table: TableTypes) -> Subquery: - """Execute the query operations on the table. - - Parameters - ---------- - table - Table to be queried. - - Returns - ------- - Subquery - Query result after chaining the query operations. - - """ - return _chain_ops(table, self._get_ops()) - - -def _append_if_missing( - table: TableTypes, - keep_cols: typing.Optional[typing.Union[str, typing.List[str]]] = None, - force_include_cols: typing.Optional[typing.Union[str, typing.List[str]]] = None, -) -> Subquery: - """Keep only certain columns in a table, but must include certain columns. - - Parameters - ---------- - table - Table on which to perform the operation. - keep_cols - Columns to keep. - force_include_cols - Columns to include (forcefully). - - """ - if keep_cols is None: - return table - keep_cols = to_list(keep_cols) - force_include_cols = to_list(force_include_cols) - extend_cols = [col for col in force_include_cols if col not in keep_cols] - keep_cols = extend_cols + keep_cols - - return Keep(keep_cols)(table) - - -def _none_add(obj1: typing.Any, obj2: typing.Any) -> typing.Any: - """Add two objects together while ignoring None values. - - If both objects are None, returns None. - - Parameters - ---------- - obj1 - First object to add. - obj2 - Second object to add. - - Returns - ------- - typing.Any - Result of adding the two objects. - - """ - if obj1 is None: - return obj2 - if obj2 is None: - return obj1 - return obj1 + obj2 - - -def _process_checks( - table: TableTypes, - cols: typing.Optional[typing.Union[str, typing.List[str]]] = None, - cols_not_in: typing.Optional[typing.Union[str, typing.List[str]]] = None, - timestamp_cols: typing.Optional[typing.Union[str, typing.List[str]]] = None, -) -> Subquery: - """Perform checks, and possibly alterations, on a table. - - Parameters - ---------- - table - Table on which to perform the operation. - cols - Columns to check. - timestamp_cols - Timestamp columns to check. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Checked and possibly altered table. - - """ - if cols is not None: - cols = to_list(cols) - has_columns(table, cols, raise_error=True) - if cols_not_in is not None: - cols_not_in = to_list(cols_not_in) - if has_columns(table, cols_not_in, raise_error=False): - raise ValueError(f"Cannot specify columns {cols_not_in}.") - if timestamp_cols is not None: - timestamp_cols = to_list(timestamp_cols) - has_columns(table, timestamp_cols, raise_error=True) - check_timestamp_columns(table, timestamp_cols, raise_error=True) - - return table - - -class FillNull(QueryOp): - """Fill NULL values with a given value. - - Parameters - ---------- - cols - Columns to fill. - fill_values - Value(s) to fill with. 
- new_col_names - New column name(s) for the filled columns. If not provided, - - Examples - -------- - >>> FillNull("col1", 0)(table) - >>> FillNull(["col1", "col2"], [0, 1])(table) - >>> FillNull(["col1", "col2"], [0, 1], ["col1_new", "col2_new"])(table) - - """ - - def __init__( - self, - cols: typing.Union[str, typing.List[str]], - fill_values: typing.Union[typing.Any, typing.List[typing.Any]], - new_col_names: typing.Optional[typing.Union[str, typing.List[str]]] = None, - ) -> None: - super().__init__() - self.cols = cols - self.fill_values = fill_values - self.new_col_names = new_col_names - - def __call__(self, table: TableTypes) -> Subquery: - """Fill NULL values with a given value. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - Subquery - Table with NULL values filled. - - """ - cols = to_list(self.cols) - fill_values = to_list(self.fill_values) - new_col_names = to_list_optional(self.new_col_names) - if new_col_names and len(cols) != len(new_col_names): - raise ValueError( - """Number of columns to fill and number of new column names - must match.""", - ) - table = _process_checks(table, cols=self.cols) - if len(fill_values) == 1: - fill_values = fill_values * len(cols) - for col, fill in zip(cols, fill_values): - coalesced_col = func.coalesce(table.c[col], fill).label( - f"coalesced_col_{col}", - ) - table = select([table, coalesced_col]).subquery() - if new_col_names: - for col, new_col in zip(cols, new_col_names): - table = Rename({f"coalesced_col_{col}": new_col})(table) - else: - for col in cols: - table = drop_columns(table, col) - table = Rename({f"coalesced_col_{col}": col})(table) - - return table - - -class Drop(QueryOp): - """Drop some columns. - - Parameters - ---------- - cols - Columns to drop. - - Examples - -------- - >>> Drop("col1")(table) - >>> Drop(["col1", "col2"])(table) - - """ - - def __init__(self, cols: typing.Union[str, typing.List[str]]) -> None: - super().__init__() - self.cols = cols - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks(table, cols=self.cols) - - return drop_columns(table, self.cols) - - -class Rename(QueryOp): - """Rename some columns. - - Parameters - ---------- - rename_map - Map from an existing column name to another name. - check_exists - Whether to check if all of the keys in the map exist as columns. - - Examples - -------- - >>> Rename({"col1": "col1_new"})(table) - - """ - - def __init__(self, rename_map: typing.Dict[str, str], check_exists: bool = True): - super().__init__() - self.rename_map = rename_map - self.check_exists = check_exists - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if self.check_exists: - table = _process_checks(table, cols=list(self.rename_map.keys())) - - return rename_columns(table, self.rename_map) - - -class Substring(QueryOp): - """Get substring of a string column. - - Parameters - ---------- - col - Name of column which has string, where substring needs - to be extracted. - start_index - Start index of substring. - stop_index - Stop index of substring. - new_col_name - Name of the new column with extracted substring. 
- - Examples - -------- - >>> Substring("col1", 0, 2, "col1_substring")(table) - - """ - - def __init__( - self, - col: str, - start_index: int, - stop_index: int, - new_col_label: typing.Optional[str] = None, - ): - super().__init__() - self.col = col - self.start_index = start_index - self.stop_index = stop_index - self.new_col_label = new_col_label - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks(table, cols=self.col, cols_not_in=self.new_col_label) - - return apply_to_columns( - table, - self.col, - lambda x: func.substr( - process_column(x, to_str=True), - self.start_index, - self.stop_index, - ), - new_col_labels=self.new_col_label, - ) - - -class Reorder(QueryOp): - """Reorder the columns in a table. - - Parameters - ---------- - cols - Complete list of table column names in the new order. - - Examples - -------- - >>> Reorder(["col2", "col1"])(table) - - """ - - def __init__(self, cols: typing.List[str]): - super().__init__() - self.cols = cols - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks(table, cols=self.cols) - - return reorder_columns(table, self.cols) - - -class ReorderAfter(QueryOp): - """Reorder a number of columns to come after a specified column. - - Parameters - ---------- - cols - Ordered list of column names which will come after a specified column. - after - Column name for the column after which the other columns will follow. - - Examples - -------- - >>> ReorderAfter(["col2", "col1"], "col3")(table) - - """ - - def __init__(self, cols: typing.Union[str, typing.List[str]], after: str): - super().__init__() - self.cols = cols - self.after = after - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - self.cols = to_list(self.cols) - table = _process_checks(table, cols=self.cols + [self.after]) - names = get_column_names(table) - names = [name for name in names if name not in self.cols] - name_after_ind = names.index(self.after) + 1 - new_order = names[:name_after_ind] + self.cols + names[name_after_ind:] - - return Reorder(new_order)(table) - - -class Keep(QueryOp): - """Keep only the specified columns in a table. - - Parameters - ---------- - cols - The columns to keep. - - Examples - -------- - >>> Keep("col1")(table) - >>> Keep(["col1", "col2"])(table) - - """ - - def __init__(self, cols: typing.Union[str, typing.List[str]]): - super().__init__() - self.cols = cols - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks(table, cols=self.cols) - - return filter_columns(table, self.cols) - - -class Trim(QueryOp): - """Trim the whitespace from some string columns. - - Parameters - ---------- - cols - Columns to trim. - new_col_labels - If specified, create new columns with these labels. 
Otherwise, - apply the function to the existing columns. - - Examples - -------- - >>> Trim("col1")(table) - >>> Trim(["col1", "col2"])(table) - >>> Trim("col1", "col1_trimmed")(table) - >>> Trim(["col1", "col2"], ["col1_trimmed", "col2_trimmed"])(table) - - """ - - def __init__( - self, - cols: typing.Union[str, typing.List[str]], - new_col_labels: typing.Optional[typing.Union[str, typing.List[str]]] = None, - ): - super().__init__() - self.cols = cols - self.new_col_labels = new_col_labels - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks(table, cols=self.cols) - - return trim_columns(table, self.cols, new_col_labels=self.new_col_labels) - - -class Literal(QueryOp): - """Add a literal column to a table. - - Parameters - ---------- - value - Value of the literal, e.g., a string or integer. - col - Label of the new literal column. - - Examples - -------- - >>> Literal(1, "col1")(table) - - """ - - def __init__(self, value: typing.Any, col: str): - super().__init__() - self.value = value - self.col = col - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks(table, cols_not_in=self.col) - - return select(table, literal(self.value).label(self.col)).subquery() - - -class ExtractTimestampComponent(QueryOp): - """Extract a component such as year or month from a timestamp column. - - Parameters - ---------- - timestamp_col - Timestamp column from which to extract the time component. - extract_str - Information to extract, e.g., "year", "month" - label - Column label for the extracted column. - - Examples - -------- - >>> ExtractTimestampComponent("col1", "year", "year")(table) - >>> ExtractTimestampComponent("col1", "month", "month")(table) - - """ - - def __init__(self, timestamp_col: str, extract_str: str, label: str): - super().__init__() - self.timestamp_col = timestamp_col - self.extract_str = extract_str - self.label = label - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks( - table, - timestamp_cols=self.timestamp_col, - cols_not_in=self.label, - ) - table = select( - table, - extract(self.extract_str, get_column(table, self.timestamp_col)).label( - self.label, - ), - ) - - return Cast(self.label, "int")(table) - - -class AddNumeric(QueryOp): - """Add a numeric value to some columns. - - Parameters - ---------- - add_to - Column names specifying to which columns is being added. - add - Adds this value to the add_to columns. - new_col_labels - If specified, create new columns with these labels. Otherwise, - apply the function to the existing columns. 
- - Examples - -------- - >>> AddNumeric("col1", 1)(table) - >>> AddNumeric(["col1", "col2"], 1)(table) - >>> AddNumeric("col1", 1, "col1_plus_1")(table) - >>> AddNumeric(["col1", "col2"], 1, ["col1_plus_1", "col2_plus_1"])(table) - >>> AddNumeric(["col1", "col2"], [1, 2.2])(table) - - """ - - def __init__( - self, - add_to: typing.Union[str, typing.List[str]], - add: typing.Union[int, float, typing.List[int], typing.List[float]], - new_col_labels: typing.Optional[typing.Union[str, typing.List[str]]] = None, - ): - super().__init__() - self.add_to = add_to - self.add = add - self.new_col_labels = new_col_labels - - def _gen_lambda( - self, - add: typing.Union[int, float], - ) -> typing.Callable[[sqlalchemy.sql.schema.Column], sqlalchemy.sql.schema.Column]: - """Generate the lambda function.""" - return lambda x: x + add - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks( - table, - cols=self.add_to, - cols_not_in=self.new_col_labels, - ) - self.add_to = to_list(self.add_to) - if isinstance(self.add, (int, float)) and len(self.add_to) > 1: - add = [self.add] * len(self.add_to) - elif isinstance(self.add, (int, float)) and len(self.add_to) == 1: - add = [self.add] - elif isinstance(self.add, list) and len(self.add_to) != len(self.add): - raise ValueError( - "Length of add_to and add must be the same if add is a list.", - ) - else: - # A list of addends with length matching add_to is used as-is. - add = self.add - - return apply_to_columns( - table, - self.add_to, - [self._gen_lambda(add_num) for add_num in add], - new_col_labels=self.new_col_labels, - ) - - class AddDeltaConstant(QueryOp): - """Construct and add a datetime.timedelta object to some columns. - - Parameters - ---------- - add_to - Column names specifying the columns to which the timedelta is added. - delta - A timedelta object. - new_col_labels - If specified, create new columns with these labels. Otherwise, - apply the function to the existing columns. - - Examples - -------- - >>> AddDeltaConstant("col1", datetime.timedelta(days=1))(table) - >>> AddDeltaConstant(["col1", "col2"], datetime.timedelta(days=1))(table) - >>> AddDeltaConstant("col1", datetime.timedelta(days=1), "col1_plus_1")(table) - - """ - - def __init__( - self, - add_to: typing.Union[str, typing.List[str]], - delta: timedelta, - new_col_labels: typing.Optional[typing.Union[str, typing.List[str]]] = None, - ): - super().__init__() - self.add_to = add_to - self.delta = delta - self.new_col_labels = new_col_labels - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks( - table, - timestamp_cols=self.add_to, - cols_not_in=self.new_col_labels, - ) - - return apply_to_columns( - table, - self.add_to, - lambda x: x + self.delta, - new_col_labels=self.new_col_labels, - ) - - class AddColumn(QueryOp): - """Add a column to some columns. - - Parameters - ---------- - add_to - Column names specifying the columns to which the column is added. - col - Name of the column to add to the add_to columns. - negative - Subtract the column rather than adding it. - new_col_labels - If specified, create new columns with these labels. Otherwise, - apply the function to the existing columns.
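# Sketch of the numeric/timedelta ops, assuming a subquery `table` with a
# numeric column "dose" and a timestamp column "admit" (hypothetical names):
from datetime import timedelta

from cyclops.query import ops as qo

table = qo.AddNumeric("dose", 0.5, "dose_adjusted")(table)
table = qo.AddDeltaConstant("admit", timedelta(days=1), "admit_plus_1d")(table)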
- - Examples - -------- - >>> AddColumn("col1", "col2")(table) - >>> AddColumn(["col1", "col2"], "col3")(table) - >>> AddColumn("col1", "col2", negative=True)(table) - >>> AddColumn("col1", "col2", new_col_labels="col1_plus_col2")(table) - >>> AddColumn(["col1", "col2"], "col3", new_col_labels=["col1_plus_col3", "col2_plus_col3"])(table) - - Warning - ------- - Pay attention to column types. Some combinations will work, - whereas others will not. - - """ - - def __init__( - self, - add_to: typing.Union[str, typing.List[str]], - col: str, - negative: typing.Optional[bool] = False, - new_col_labels: typing.Optional[typing.Union[str, typing.List[str]]] = None, - ): - super().__init__() - self.add_to = add_to - self.col = col - self.negative = negative - self.new_col_labels = new_col_labels - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - # If the column being added is a timestamp column, ensure the others are too - if check_timestamp_columns(table, self.col): - table = _process_checks( - table, - timestamp_cols=self.add_to, - cols_not_in=self.new_col_labels, - ) - else: - table = _process_checks( - table, - cols=self.add_to, - cols_not_in=self.new_col_labels, - ) - col = get_column(table, self.col) - if self.negative: - return apply_to_columns( - table, - self.add_to, - lambda x: x - col, - new_col_labels=self.new_col_labels, - ) - - return apply_to_columns( - table, - self.add_to, - lambda x: x + col, - new_col_labels=self.new_col_labels, - ) - - class AddDeltaColumn(QueryOp): - """Construct and add an interval column to some columns. - - Parameters - ---------- - add_to - Column names specifying the columns to which the interval is added. - negative - Subtract the interval rather than adding it. - new_col_labels - If specified, create new columns with these labels. Otherwise, - apply the function to the existing columns. - **delta_kwargs - The arguments used to create the Interval column. - - Examples - -------- - >>> AddDeltaColumn("col1", days="col2")(table) - >>> AddDeltaColumn(["col1", "col2"], days="col3")(table) - >>> AddDeltaColumn("col1", negative=True, days="col2")(table) - >>> AddDeltaColumn("col1", new_col_labels="col1_plus_col2", days="col2")(table) - - """ - - def __init__( - self, - add_to: typing.Union[str, typing.List[str]], - negative: typing.Optional[bool] = False, - new_col_labels: typing.Optional[typing.Union[str, typing.List[str]]] = None, - **delta_kwargs: typing.Any, - ) -> None: - super().__init__() - self.add_to = add_to - self.negative = negative - self.new_col_labels = new_col_labels - self.delta_kwargs = delta_kwargs - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks( - table, - timestamp_cols=self.add_to, - cols_not_in=self.new_col_labels, - ) - delta = get_delta_column(table, **self.delta_kwargs) - if self.negative: - return apply_to_columns( - table, - self.add_to, - lambda x: x - delta, - new_col_labels=self.new_col_labels, - ) - - return apply_to_columns( - table, - self.add_to, - lambda x: x + delta, - new_col_labels=self.new_col_labels, - ) - - class Cast(QueryOp): - """Cast columns to a specified type. - - Currently supporting conversions to str, int, float, date, bool and timestamp.
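# Sketch of column arithmetic, assuming `table` has timestamp columns "admit"
# and "discharge" (hypothetical names). Length of stay is computed as
# discharge - admit and written to a new column:
from cyclops.query import ops as qo

table = qo.AddColumn("discharge", "admit", negative=True, new_col_labels="los")(table)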
- - Parameters - ---------- - cols - Columns to cast. - type_ - Name of type to which to convert. Must be supported. - - Examples - -------- - >>> Cast("col1", "str")(table) - >>> Cast(["col1", "col2"], "int")(table) - >>> Cast("col1", "float")(table) - >>> Cast("col1", "date")(table) - >>> Cast("col1", "bool")(table) - >>> Cast("col1", "timestamp")(table) - - """ - - def __init__(self, cols: typing.Union[str, typing.List[str]], type_: str): - super().__init__() - self.cols = cols - self.type_ = type_ - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks(table, cols=self.cols) - cast_type_map = { - "str": "to_str", - "int": "to_int", - "float": "to_float", - "date": "to_date", - "bool": "to_bool", - "timestamp": "to_timestamp", - } - # Check that the given type is supported - if self.type_ not in cast_type_map: - supported_str = ", ".join(list(cast_type_map.keys())) - raise ValueError( - f"Conversion to type {self.type_} not supported. " - f"Supported types are: {supported_str}.", - ) - # Cast - kwargs = {cast_type_map[self.type_]: True} - - return apply_to_columns( - table, - self.cols, - lambda x: process_column(x, **kwargs), - ) - - class Union(QueryOp): - """Union two tables. - - Parameters - ---------- - union_table - Table to union with the first table. - union_all - Whether to use the ALL keyword in the union, keeping duplicate rows. - - Examples - -------- - >>> Union(table2)(table1) - >>> Union(table2, union_all=True)(table1) - - """ - - def __init__( - self, - union_table: TableTypes, - union_all: typing.Optional[bool] = False, - ): - super().__init__() - self.union_table = union_table - self.union_all = union_all - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - table = _process_checks(table) - union_table = _process_checks(self.union_table) - if self.union_all: - return select(table).union_all(select(union_table)).subquery() - - return select(table).union(select(union_table)).subquery() - - class Join(QueryOp): - """Join a table with another table. - - Parameters - ---------- - join_table - Table on which to join. - on - A list of strings or tuples representing columns on which to join. - Strings represent columns of the same name in both tables. A tuple of - style (table_col, join_table_col) is used to join on columns of - different names. It is suggested to specify this parameter as opposed to - cond. - on_to_type - A list of types to which to convert the on columns before joining. Useful when - two columns have the same values but in different formats, e.g., strings of ints. - cond - Condition on which to join the tables. - table_cols - Filters to keep only these columns from the table. - join_table_cols - Filters to keep only these columns from the join_table. - isouter - Whether to perform a left outer join. - - Examples - -------- - >>> Join(table2, on=["col1", ("col2", "col3")], on_to_type=[int, str])(table1) - >>> Join(table2, table_cols=["col1", "col2"])(table1) - >>> Join(table2, join_table_cols=["col1", "col2"])(table1) - - Warnings - -------- - If neither the on nor the cond parameter is specified, an - expensive Cartesian product is performed.
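# Join usage sketch, assuming two subqueries `visits` and `labs` that share a
# "visit_id" key (hypothetical tables and columns):
from cyclops.query import ops as qo

joined = qo.Join(
    labs,
    on="visit_id",  # equality join on the shared key
    table_cols=["visit_id", "admit"],  # keep only these columns from `visits`
    isouter=True,  # left outer join
)(visits)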
- - """ - - @table_params_to_type(Subquery) - def __init__( - self, - join_table: TableTypes, - on: typing.Optional[ - typing.Union[ - str, - typing.List[str], - typing.Tuple[str], - typing.List[typing.Tuple[str, str]], - ] - ] = None, - on_to_type: typing.Optional[typing.Union[type, typing.List[type]]] = None, - cond: typing.Optional[BinaryExpression] = None, - table_cols: typing.Optional[typing.Union[str, typing.List[str]]] = None, - join_table_cols: typing.Optional[typing.Union[str, typing.List[str]]] = None, - isouter: typing.Optional[bool] = False, - ) -> None: - super().__init__() - if on is not None and cond is not None: - raise ValueError("Cannot specify both the 'on' and 'cond' arguments.") - - self.join_table = join_table - self.cond = cond - self.on_ = to_list_optional(on) - self.on_to_type = to_list_optional(on_to_type) - self.table_cols = to_list_optional(table_cols) - self.join_table_cols = to_list_optional(join_table_cols) - self.isouter = isouter - - @table_params_to_type(Subquery) - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - # Join on the equality of values in columns of same name in both tables - if self.on_ is not None: - # Process on columns - on_table_cols = [ - col_obj if isinstance(col_obj, str) else col_obj[0] - for col_obj in self.on_ - ] - on_join_table_cols = [ - col_obj if isinstance(col_obj, str) else col_obj[1] - for col_obj in self.on_ - ] - table = _process_checks( - table, - cols=_none_add(self.table_cols, on_table_cols), - ) - self.join_table = _process_checks( - self.join_table, - cols=_none_add(self.join_table_cols, on_join_table_cols), - ) - # Filter columns, keeping those being joined on - table = _append_if_missing(table, self.table_cols, on_table_cols) - self.join_table = _append_if_missing( - self.join_table, - self.join_table_cols, - on_join_table_cols, - ) - # Perform type conversions if given - if self.on_to_type is not None: - for i, type_ in enumerate(self.on_to_type): - table = Cast(on_table_cols[i], type_)(table) - self.join_table = Cast(on_join_table_cols[i], type_)( - self.join_table, - ) - cond = and_( - *[ - get_column(table, on_table_cols[i]) - == get_column(self.join_table, on_join_table_cols[i]) - for i in range(len(on_table_cols)) - ], - ) - table = select(table.join(self.join_table, cond, isouter=self.isouter)) - else: - # Filter columns - if self.table_cols is not None: - table = Keep(self.table_cols)(table) - if self.join_table_cols is not None: - self.join_table = Keep(self.table_cols)(self.join_table) # type: ignore - - # Join on a specified condition - if self.cond is not None: - table = select( - table.join( # type: ignore - self.join_table, - self.cond, - isouter=self.isouter, - ), - ) - # Join on no condition, i.e., a Cartesian product - else: - LOGGER.warning("A Cartesian product has been queried.") - table = select(table, self.join_table) - - # Filter to include no duplicate columns - return select( - *[col for col in table.subquery().columns if "%(" not in col.name], - ).subquery() - - -class ConditionEquals(QueryOp): - """Filter rows based on being equal, or not equal, to some value. - - Parameters - ---------- - col - Column name on which to condition. - value - Value to equal. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. 
- **cond_kwargs - Optional keyword arguments for processing the condition. - - Examples - -------- - >>> ConditionEquals("col1", 1)(table) - >>> ConditionEquals("col1", 1, binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - col: str, - value: typing.Any, - not_: bool = False, - binarize_col: typing.Optional[str] = None, - **cond_kwargs: typing.Any, - ) -> None: - super().__init__() - self.col = col - self.value = value - self.not_ = not_ - self.binarize_col = binarize_col - self.cond_kwargs = cond_kwargs - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col, cols_not_in=self.binarize_col) - cond = equals( - get_column(table, self.col), - self.value, - True, - True, - **self.cond_kwargs, - ) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - class ConditionGreaterThan(QueryOp): - """Filter rows based on being greater than (or equal to) some value. - - Parameters - ---------- - col - Column name on which to condition. - value - Value which the column values must be greater than. - equal - Include equality to the value. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - **cond_kwargs - Optional keyword arguments for processing the condition. - - Examples - -------- - >>> ConditionGreaterThan("col1", 1)(table) - >>> ConditionGreaterThan("col1", 1, binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - col: str, - value: typing.Any, - equal: bool = False, - not_: bool = False, - binarize_col: typing.Optional[str] = None, - **cond_kwargs: typing.Any, - ) -> None: - super().__init__() - self.col = col - self.value = value - self.equal = equal - self.not_ = not_ - self.binarize_col = binarize_col - self.cond_kwargs = cond_kwargs - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col, cols_not_in=self.binarize_col) - cond = greater_than( - get_column(table, self.col), - self.value, - True, - True, - self.equal, - **self.cond_kwargs, - ) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - class ConditionLessThan(QueryOp): - """Filter rows based on being less than (or equal to) some value. - - Parameters - ---------- - col - Column name on which to condition. - value - Value which the column values must be less than. - equal - Include equality to the value.
- not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - **cond_kwargs - Optional keyword arguments for processing the condition. - - Examples - -------- - >>> ConditionLessThan("col1", 1)(table) - >>> ConditionLessThan("col1", 1, binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - col: str, - value: typing.Any, - equal: bool = False, - not_: bool = False, - binarize_col: typing.Optional[str] = None, - **cond_kwargs: typing.Any, - ) -> None: - super().__init__() - self.col = col - self.value = value - self.equal = equal - self.not_ = not_ - self.binarize_col = binarize_col - self.cond_kwargs = cond_kwargs - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col, cols_not_in=self.binarize_col) - cond = less_than( - get_column(table, self.col), - self.value, - True, - True, - self.equal, - **self.cond_kwargs, - ) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - -class ConditionRegexMatch(QueryOp): - """Filter rows based on matching a regular expression. - - Parameters - ---------- - col - Column name on which to condition. - regex - Regular expression to match. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - - Examples - -------- - >>> ConditionRegexMatch("col1", ".*")(table) - >>> ConditionRegexMatch("col1", ".*", binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - col: str, - regex: str, - not_: bool = False, - binarize_col: typing.Optional[str] = None, - ): - super().__init__() - self.col = col - self.regex = regex - self.not_ = not_ - self.binarize_col = binarize_col - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col, cols_not_in=self.binarize_col) - cond = get_column(table, self.col).regexp_match(self.regex) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - -class ConditionIn(QueryOp): - """Filter rows based on having a value in list of values. - - Parameters - ---------- - col - Column name on which to condition. - values - Values in which the column value must be. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. 
- **cond_kwargs - Optional keyword arguments for processing the condition. - - Examples - -------- - >>> ConditionIn("col1", [1, 2])(table) - >>> ConditionIn("col1", [1, 2], binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - col: str, - values: typing.Union[typing.Any, typing.List[typing.Any]], - not_: bool = False, - binarize_col: typing.Optional[str] = None, - **cond_kwargs: typing.Any, - ) -> None: - super().__init__() - self.col = col - self.values = values - self.not_ = not_ - self.binarize_col = binarize_col - self.cond_kwargs = cond_kwargs - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col, cols_not_in=self.binarize_col) - cond = in_( - get_column(table, self.col), - to_list(self.values), - True, - True, - **self.cond_kwargs, - ) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - class ConditionSubstring(QueryOp): - """Filter rows based on having substrings. - - It can be specified whether the value must have any or all of the specified - substrings. This makes no difference when only one substring is provided. - - Parameters - ---------- - col - Column name on which to condition. - substrings - Substrings. - any_ - If True, the row must have at least one of the substrings. If False, it must - have all of the substrings. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - **cond_kwargs - Optional keyword arguments for processing the condition. - - Examples - -------- - >>> ConditionSubstring("col1", ["a", "b"])(table) - >>> ConditionSubstring("col1", ["a", "b"], any_=False)(table) - >>> ConditionSubstring("col1", ["a", "b"], binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - col: str, - substrings: typing.Union[str, typing.List[str]], - any_: bool = True, - not_: bool = False, - binarize_col: typing.Optional[str] = None, - **cond_kwargs: typing.Any, - ) -> None: - super().__init__() - self.col = col - self.substrings = to_list(substrings) - self.any_ = any_ - self.not_ = not_ - self.binarize_col = binarize_col - self.cond_kwargs = cond_kwargs - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table.
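# Filter-op sketch, assuming `table` has a string column "lab_name" and a
# numeric column "result" (hypothetical names):
from cyclops.query import ops as qo

table = qo.ConditionIn("result", [1, 2, 3])(table)
table = qo.ConditionSubstring("lab_name", ["glucose", "hba1c"], any_=True)(table)
# Or keep all rows and record the match as a Boolean column instead:
flagged = qo.ConditionSubstring("lab_name", "glucose", binarize_col="is_glucose")(table)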
- - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col, cols_not_in=self.binarize_col) - conds = [ - has_substring(get_column(table, self.col), sub, True, **self.cond_kwargs) - for sub in self.substrings - ] - cond = or_(*conds) if self.any_ else and_(*conds) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - -class ConditionStartsWith(QueryOp): - """Filter rows based on starting with some string. - - Parameters - ---------- - col: str - Column name on which to condition. - string: any - String. - not_: bool, default=False - Take negation of condition. - binarize_col: str, optional - If specified, create a Boolean column of name binarize_col instead of filtering. - **cond_kwargs - Optional keyword arguments for processing the condition. - - Examples - -------- - >>> ConditionStartsWith("col1", "a")(table) - >>> ConditionStartsWith("col1", "a", binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - col: str, - string: str, - not_: bool = False, - binarize_col: typing.Optional[str] = None, - **cond_kwargs: typing.Any, - ) -> None: - super().__init__() - self.col = col - self.string = string - self.not_ = not_ - self.binarize_col = binarize_col - self.cond_kwargs = cond_kwargs - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col, cols_not_in=self.binarize_col) - cond = starts_with( - get_column(table, self.col), - self.string, - True, - True, - **self.cond_kwargs, - ) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - -class ConditionEndsWith(QueryOp): - """Filter rows based on ending with some string. - - Parameters - ---------- - col - Column name on which to condition. - string - String to end with. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - **cond_kwargs - Optional keyword arguments for processing the condition. - - Examples - -------- - >>> ConditionEndsWith("col1", "a")(table) - >>> ConditionEndsWith("col1", "a", binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - col: str, - string: str, - not_: bool = False, - binarize_col: typing.Optional[str] = None, - **cond_kwargs: typing.Any, - ) -> None: - super().__init__() - self.col = col - self.string = string - self.not_ = not_ - self.binarize_col = binarize_col - self.cond_kwargs = cond_kwargs - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. 
- - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col, cols_not_in=self.binarize_col) - cond = ends_with( - get_column(table, self.col), - self.string, - True, - True, - **self.cond_kwargs, - ) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - class ConditionInYears(QueryOp): - """Filter rows based on a timestamp column being in a list of years. - - Parameters - ---------- - timestamp_col - Timestamp column name. - years - Years in which the timestamps must be. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - - Examples - -------- - >>> ConditionInYears("col1", [2019, 2020])(table) - >>> ConditionInYears("col1", 2019)(table) - >>> ConditionInYears("col1", 2019, binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - timestamp_col: str, - years: typing.Union[int, typing.List[int]], - not_: bool = False, - binarize_col: typing.Optional[str] = None, - ): - super().__init__() - self.timestamp_col = timestamp_col - self.years = years - self.not_ = not_ - self.binarize_col = binarize_col - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks( - table, - cols=self.timestamp_col, - cols_not_in=self.binarize_col, - ) - cond = in_( - extract("year", get_column(table, self.timestamp_col)), - to_list(self.years), - ) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - class ConditionInMonths(QueryOp): - """Filter rows based on a timestamp being in a list of months. - - Parameters - ---------- - timestamp_col - Timestamp column name. - months - Months in which the timestamps must be. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - - Examples - -------- - >>> ConditionInMonths("col1", [1, 2])(table) - >>> ConditionInMonths("col1", 1)(table) - >>> ConditionInMonths("col1", 1, binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - timestamp_col: str, - months: typing.Union[int, typing.List[int]], - not_: bool = False, - binarize_col: typing.Optional[str] = None, - ): - super().__init__() - self.timestamp_col = timestamp_col - self.months = months - self.not_ = not_ - self.binarize_col = binarize_col - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering.
- - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks( - table, - cols=self.timestamp_col, - cols_not_in=self.binarize_col, - ) - cond = in_( - extract("month", get_column(table, self.timestamp_col)), - to_list(self.months), - ) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - -class ConditionBeforeDate(QueryOp): - """Filter rows based on a timestamp being before some date. - - Parameters - ---------- - timestamp_col - Timestamp column name. - timestamp - A datetime object or str in YYYY-MM-DD format. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - - Examples - -------- - >>> ConditionBeforeDate("col1", "2020-01-01")(table) - >>> ConditionBeforeDate("col1", datetime.datetime(2020, 1, 1))(table) - >>> ConditionBeforeDate("col1", "2020-01-01", binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - timestamp_col: str, - timestamp: typing.Union[str, datetime], - not_: bool = False, - binarize_col: typing.Optional[str] = None, - ): - super().__init__() - self.timestamp_col = timestamp_col - self.timestamp = timestamp - self.not_ = not_ - self.binarize_col = binarize_col - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, timestamp_cols=self.timestamp_col) - if isinstance(self.timestamp, str): - timestamp = to_datetime_format(self.timestamp) - else: - timestamp = self.timestamp - cond = get_column(table, self.timestamp_col) <= timestamp - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - -class ConditionAfterDate(QueryOp): - """Filter rows based on a timestamp being after some date. - - Parameters - ---------- - timestamp_col - Timestamp column name. - timestamp - A datetime object or str in YYYY-MM-DD format. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - - Examples - -------- - >>> ConditionAfterDate("col1", "2020-01-01")(table) - >>> ConditionAfterDate("col1", datetime.datetime(2020, 1, 1))(table) - >>> ConditionAfterDate("col1", "2020-01-01", binarize_col="col1_bool")(table) - - """ - - def __init__( - self, - timestamp_col: str, - timestamp: typing.Union[str, datetime], - not_: bool = False, - binarize_col: typing.Optional[str] = None, - ): - super().__init__() - self.timestamp_col = timestamp_col - self.timestamp = timestamp - self.not_ = not_ - self.binarize_col = binarize_col - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. 
- - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, timestamp_cols=self.timestamp_col) - if isinstance(self.timestamp, str): - timestamp = to_datetime_format(self.timestamp) - else: - timestamp = self.timestamp - cond = get_column(table, self.timestamp_col) >= timestamp - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - -class ConditionLike(QueryOp): - """Filter rows by a LIKE condition. - - Parameters - ---------- - col - Column to filter on. - pattern - Pattern to filter on. - not_ - Take negation of condition. - binarize_col - If specified, create a Boolean column of name binarize_col instead of filtering. - - Examples - -------- - >>> ConditionLike("lab_name", "HbA1c")(table) - >>> ConditionLike("lab_name", "HbA1c", not_=True)(table) - >>> ConditionLike("lab_name", "HbA1c", binarize_col="lab_name_bool")(table) - - """ - - def __init__( - self, - col: str, - pattern: str, - not_: bool = False, - binarize_col: typing.Optional[str] = None, - ): - super().__init__() - self.col = col - self.pattern = pattern - self.not_ = not_ - self.binarize_col = binarize_col - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - if return_cond and self.binarize_col: - raise ValueError( - "Cannot return condition and binarize column simultaneously.", - ) - table = _process_checks(table, cols=self.col) - cond = get_column(table, self.col).like(self.pattern) - if self.not_: - cond = cond._negate() - if return_cond: - return cond - if self.binarize_col is not None: - return select( - table, - cast(cond, Boolean).label(self.binarize_col), - ).subquery() - - return select(table).where(cond).subquery() - - -class Or(QueryOp): - """Combine multiple condition query ops using an OR. - - Parameters - ---------- - cond_ops - Condition Query ops to combine. - - Examples - -------- - >>> Or(ConditionLike("lab_name", "HbA1c"), ConditionIn("name", ["John", "Jane"])) - >>> Or([ConditionLike("lab_name", "HbA1c"), ConditionIn("name", ["John", "Jane"])]) - - """ - - def __init__(self, *cond_ops: typing.Union[QueryOp, typing.List[QueryOp]]): - super().__init__() - self.cond_ops = cond_ops - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. 
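# Sketch of composing conditions, assuming `table` has "lab_name" and "name"
# columns (hypothetical). A single Or combines the filters into one WHERE clause:
from cyclops.query import ops as qo

table = qo.Or(
    qo.ConditionLike("lab_name", "%HbA1c%"),
    qo.ConditionIn("name", ["John", "Jane"]),
)(table)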
- - """ - ops = [] - for cond_op in self.cond_ops: - if isinstance(cond_op, list): - if len(self.cond_ops) != 1: - raise ValueError("Cannot combine multiple lists of conditions.") - ops = [op(table, return_cond=True) for op in cond_op] - if isinstance(cond_op, QueryOp): - if len(self.cond_ops) == 1: - return cond_op(table, return_cond=return_cond) - ops.append(cond_op(table, return_cond=True)) - cond = or_(*ops) - if return_cond: - return cond - - return select(table).where(cond).subquery() - - -class And(QueryOp): - """Combine multiple condition query ops using an And. - - Parameters - ---------- - ops - Query ops to combine. - - Examples - -------- - >>> And([ConditionLike("lab_name", "HbA1c"), ConditionIn("name", ["John", "Jane"])]) - >>> And(ConditionLike("lab_name", "HbA1c"), ConditionIn("name", ["John", "Jane"])) - - """ - - def __init__(self, *cond_ops: typing.Union[QueryOp, typing.List[QueryOp]]): - super().__init__() - self.cond_ops = cond_ops - - def __call__(self, table: TableTypes, return_cond: bool = False) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - return_cond - Return the condition instead of filtering. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - ops = [] - for cond_op in self.cond_ops: - if isinstance(cond_op, list): - if len(self.cond_ops) != 1: - raise ValueError("Cannot combine multiple lists of conditions.") - ops = [op(table, return_cond=True) for op in cond_op] - if isinstance(cond_op, QueryOp): - if len(self.cond_ops) == 1: - return cond_op(table, return_cond=return_cond) - ops.append(cond_op(table, return_cond=True)) - cond = and_(*ops) - if return_cond: - return cond - - return select(table).where(cond).subquery() - - -class Limit(QueryOp): - """Limit the number of rows returned in a query. - - Parameters - ---------- - number - Number of rows to return in the limit. - - Examples - -------- - >>> Limit(10)(table) - - """ - - def __init__(self, number: int): - super().__init__() - self.number = number - - @table_params_to_type(Select) - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - return table.limit(self.number).subquery() # type: ignore - - -class RandomizeOrder(QueryOp): - """Randomize order of table rows. - - Useful when the data is ordered, so certain rows cannot - be seen or analyzed when limited. - - Examples - -------- - >>> RandomizeOrder()(table) - - Warnings - -------- - Becomes quite slow on large tables. - - """ - - @table_params_to_type(Subquery) - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - return select(table).order_by(func.random()).subquery() - - -class DropNulls(QueryOp): - """Remove rows with null values in some specified columns. - - Parameters - ---------- - cols - Columns in which, if a value is null, the corresponding row is removed. - - Examples - -------- - >>> DropNulls("col1")(table) - >>> DropNulls(["col1", "col2"])(table) - - """ - - def __init__(self, cols: typing.Union[str, typing.List[str]]): - super().__init__() - self.cols = cols - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. 
- - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - self.cols = to_list(self.cols) - table = _process_checks(table, cols=self.cols) - - cond = and_(*[not_equals(get_column(table, col), None) for col in self.cols]) - return select(table).where(cond).subquery() - - -class DropEmpty(QueryOp): - """Remove rows with empty values in some specified columns. - - Parameters - ---------- - cols - Columns in which, if a value is empty, the corresponding row is removed. - - Examples - -------- - >>> DropEmpty("col1")(table) - >>> DropEmpty(["col1", "col2"])(table) - - """ - - def __init__(self, cols: typing.Union[str, typing.List[str]]): - super().__init__() - self.cols = cols - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - self.cols = to_list(self.cols) - table = _process_checks(table, cols=self.cols) - - cond = and_(*[not_equals(get_column(table, col), "") for col in self.cols]) - return select(table).where(cond).subquery() - - -class Apply(QueryOp): - """Apply function(s) to column(s). - - The function can take a sqlalchemy column object and also return a column object. - It can also take multiple columns and return a single column or multiple columns. - If multiple functions are provided, it is assumed that each function is applied to - each input column. - - Parameters - ---------- - cols - Column(s) to apply the function to. - funcs - Function(s) that takes in sqlalchemy column(s) object and returns column(s) - after applying the function or list of functions to apply to each column. - new_cols - New column name(s) after function is applied to the specified column(s). - - Examples - -------- - >>> Apply("col1", lambda x: x + 1)(table) - >>> Apply(["col1", "col2"], [lambda x: x + 1, lambda x: x + 2])(table) - >>> Apply("col1", lambda x: x + 1, new_cols="col1_new")(table) - >>> Apply(["col1", "col2"], lambda x, y: x + y, new_cols="col1_new")(table) - >>> Apply(["col1", "col2"], lambda x, y: (x + y, x - y), new_cols=["col1_new", "col2_new"])(table) # noqa: E501, pylint: disable=line-too-long - - """ - - def __init__( - self, - cols: typing.Union[str, typing.List[str]], - funcs: typing.Union[ - typing.Callable[ - [sqlalchemy.sql.schema.Column], - sqlalchemy.sql.schema.Column, - ], - typing.List[ - typing.Callable[ - [sqlalchemy.sql.schema.Column], - sqlalchemy.sql.schema.Column, - ] - ], - ], - new_cols: typing.Optional[typing.Union[str, typing.List[str]]] = None, - ): - super().__init__() - self.cols = cols - self.funcs = funcs - self.new_cols = new_cols - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. 
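# Apply sketch, assuming `table` has numeric columns "col1" and "col2"
# (hypothetical). A multi-column lambda produces a derived column:
from cyclops.query import ops as qo

table = qo.Apply("col1", lambda x: x * 2, new_cols="col1_doubled")(table)
table = qo.Apply(["col1", "col2"], lambda x, y: x + y, new_cols="col_sum")(table)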
- - """ - self.new_cols = to_list(self.new_cols) - if isinstance(self.funcs, list): - if len(self.funcs) != len(self.cols): - raise ValueError( - "Number of functions must be equal to number of columns.", - ) - if len(self.new_cols) != len(self.cols): - raise ValueError( - "Number of new columns must be equal to number of columns.", - ) - if callable(self.funcs): - cols = get_columns(table, self.cols) - result_cols = [ - self.funcs(*cols).label(new_col) for new_col in self.new_cols - ] # noqa: E501 - - return select(table).add_columns(*result_cols).subquery() - - return apply_to_columns(table, self.cols, self.funcs, self.new_cols) - - -class OrderBy(QueryOp): - """Order, or sort, the rows of a table by some columns. - - Parameters - ---------- - cols - Columns by which to order. - ascending - Whether to order each columns by ascending (True) or descending (False). - If not provided, orders all by ascending. - - Examples - -------- - >>> OrderBy("col1")(table) - >>> OrderBy(["col1", "col2"])(table) - >>> OrderBy(["col1", "col2"], [True, False])(table) - >>> OrderBy(["col1", "col2"], True)(table) - - """ - - def __init__( - self, - cols: typing.Union[str, typing.List[str]], - ascending: typing.Optional[typing.Union[bool, typing.List[bool]]] = None, - ): - super().__init__() - self.cols = cols - self.ascending = ascending - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - self.cols = to_list(self.cols) - ascending = to_list_optional(self.ascending) - table = _process_checks(table, cols=self.cols) - if ascending is None: - ascending = [True] * len(self.cols) - elif len(ascending) != len(self.cols): - raise ValueError( - "If ascending is specified. Must specify for all columns.", - ) - order_cols = [ - col if ascending[i] else col.desc() - for i, col in enumerate(get_columns(table, self.cols)) - ] - - return select(table).order_by(*order_cols).subquery() - - -class GroupByAggregate(QueryOp): - """Aggregate over a group by object. - - Parameters - ---------- - groupby_cols - Columns by which to group. - aggfuncs - Specify a dictionary of key-value pairs: - column name: aggfunc string or - column name: (aggfunc string, new column label) - This labelling prevents the aggregation of the same column using multiple - aggregation functions. - aggseps - Specify a dictionary of key-value pairs: - column name: string_aggfunc separator - If string_agg used as aggfunc for a column, then a separator must be provided - for the same column. - - Examples - -------- - >>> GroupByAggregate("person_id", {"person_id": "count"})(table) - >>> GroupByAggregate("person_id", {"person_id": ("count", "visit_count")})(table) - >>> GroupByAggregate("person_id", {"lab_name": "string_agg"}, {"lab_name": ", "})(table) - >>> GroupByAggregate("person_id", {"lab_name": ("string_agg", "lab_name_agg"}, {"lab_name": ", "})(table) - - """ - - def __init__( - self, - groupby_cols: typing.Union[str, typing.List[str]], - aggfuncs: typing.Union[ - typing.Dict[str, typing.Sequence[str]], - typing.Dict[str, str], - ], - aggseps: typing.Optional[typing.Dict[str, str]] = None, - ): - super().__init__() - self.groupby_cols = groupby_cols - self.aggfuncs = aggfuncs - if aggseps is None: - aggseps = {} - self.aggseps = aggseps - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. 
- - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. - - """ - str_to_aggfunc = { - "sum": func.sum, - "average": func.avg, - "min": func.min, - "max": func.max, - "count": func.count, - "median": func.percentile_cont(0.5).within_group, - "string_agg": func.string_agg, - } - - aggfunc_tuples = list(self.aggfuncs.items()) - aggfunc_cols = [item[0] for item in aggfunc_tuples] - aggfunc_strs = [ - item[1] if isinstance(item[1], str) else item[1][0] - for item in aggfunc_tuples - ] - - # If not specified, aggregate column names default to that of - # the column being aggregated over - aggfunc_names = [ - aggfunc_cols[i] if isinstance(item[1], str) else item[1][1] - for i, item in enumerate(aggfunc_tuples) - ] - - groupby_names = to_list(self.groupby_cols) - table = _process_checks(table, cols=groupby_names + aggfunc_cols) - - # Error checking - for i, aggfunc_str in enumerate(aggfunc_strs): - if aggfunc_str not in str_to_aggfunc: - allowed_strs = ", ".join(list(str_to_aggfunc.keys())) - raise ValueError( - f"Invalid aggfuncs specified. Allowed values are {allowed_strs}.", - ) - if aggfunc_str == "string_agg" and ( - not bool(self.aggseps) or aggfunc_cols[i] not in self.aggseps - ): - raise ValueError( - f"""Column {aggfunc_cols[i]} needs to be aggregated as string, must specify a separator!""", # noqa: E501 - ) - - all_names = groupby_names + aggfunc_names - if len(all_names) != len(set(all_names)): - raise ValueError( - """Duplicate column names were found. Try naming aggregated columns - to avoid this issue.""", - ) - - # Perform group by - groupby_cols = get_columns(table, groupby_names) - to_agg_cols = get_columns(table, aggfunc_cols) - agg_cols = [] - for i, to_agg_col in enumerate(to_agg_cols): - if aggfunc_strs[i] == "string_agg": - agg_col = str_to_aggfunc[aggfunc_strs[i]]( - to_agg_col, - literal_column(f"'{self.aggseps[aggfunc_cols[i]]}'"), - ) - else: - agg_col = str_to_aggfunc[aggfunc_strs[i]](to_agg_col) - agg_cols.append(agg_col.label(aggfunc_names[i])) - - return select(*groupby_cols, *agg_cols).group_by(*groupby_cols).subquery() - - -class Distinct(QueryOp): - """Get distinct rows. - - Parameters - ---------- - cols - Columns to use for distinct. - - Examples - -------- - >>> Distinct("person_id")(table) - >>> Distinct(["person_id", "visit_id"])(table) - - """ - - def __init__(self, cols: typing.Union[str, typing.List[str]]): - super().__init__() - self.cols = cols - - def __call__(self, table: TableTypes) -> Subquery: - """Process the table. - - Parameters - ---------- - table - Table on which to perform the operation. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Processed table. 
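# Aggregation sketch, assuming `table` has "person_id" and "lab_name" columns
# (hypothetical). string_agg requires a separator via aggseps:
from cyclops.query import ops as qo

table = qo.Distinct(["person_id", "lab_name"])(table)
table = qo.GroupByAggregate(
    "person_id",
    {"lab_name": ("string_agg", "lab_names")},
    aggseps={"lab_name": ", "},
)(table)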
- - """ - cols = to_list(self.cols) - table = _process_checks(table, cols=cols) - - return select(table).distinct(*get_columns(table, cols)).subquery() diff --git a/cyclops/query/orm.py b/cyclops/query/orm.py deleted file mode 100644 index 28ad26781..000000000 --- a/cyclops/query/orm.py +++ /dev/null @@ -1,316 +0,0 @@ -"""Object Relational Mapper (ORM) using sqlalchemy.""" - -import csv -import logging -import os -import socket -from dataclasses import dataclass -from typing import Dict, List, Literal, Optional, Union -from urllib.parse import quote_plus - -import dask.dataframe as dd -import pandas as pd -import pyarrow.csv as pv -import pyarrow.parquet as pq -from datasets import Dataset -from sqlalchemy import MetaData, create_engine, inspect -from sqlalchemy.engine.base import Engine -from sqlalchemy.orm import sessionmaker -from sqlalchemy.orm.session import Session -from sqlalchemy.sql.selectable import Select - -from cyclops.query.util import ( - DBSchema, - DBTable, - TableTypes, - get_attr_name, - table_params_to_type, -) -from cyclops.utils.file import exchange_extension, process_file_save_path -from cyclops.utils.log import setup_logging -from cyclops.utils.profile import time_function - - -# Logging. -LOGGER = logging.getLogger(__name__) -setup_logging(print_level="INFO", logger=LOGGER) - - -SOCKET_CONNECTION_TIMEOUT = 5 - - -def _get_db_url( - dbms: str, - user: str, - pwd: str, - host: str, - port: int, - database: str, -) -> str: - """Combine to make Database URL string.""" - return f"{dbms}://{user}:{quote_plus(pwd)}@{host}:{str(port)}/{database}" - - -@dataclass -class DatasetQuerierConfig: - """Configuration for the dataset querier. - - Attributes - ---------- - dbms - Database management system. - host - Hostname of database. - port - Port of database. - database - Name of database. - user - Username for database. - password - Password for database. - - """ - - database: str - user: str - password: str - dbms: str = "postgresql" - host: str = "localhost" - port: int = 5432 - - -class Database: - """Database class. - - Attributes - ---------- - config - Configuration stored in a dataclass. - engine - SQL extraction engine. - inspector - Module for schema inspection. - session - Session for ORM. - is_connected - Whether the database is setup, connected and ready to run queries. - - """ - - def __init__(self, config: DatasetQuerierConfig) -> None: - """Instantiate. - - Parameters - ---------- - config - Path to directory with config file, for overrides. 
- - """ - self.config = config - self.is_connected = False - - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(SOCKET_CONNECTION_TIMEOUT) - try: - is_port_open = sock.connect_ex((self.config.host, self.config.port)) - except socket.gaierror: - LOGGER.error("""Server name not known, cannot establish connection!""") - return - if is_port_open: - LOGGER.error( - """Valid server host but port seems open, check if server is up!""", - ) - return - - self.engine = self._create_engine() - self.session = self._create_session() - self._tables: List[str] = [] - self._setup() - self.is_connected = True - LOGGER.info("Database setup, ready to run queries!") - - def _create_engine(self) -> Engine: - """Create an engine.""" - self.conn = _get_db_url( - self.config.dbms, - self.config.user, - self.config.password, - self.config.host, - self.config.port, - self.config.database, - ) - return create_engine( - _get_db_url( - self.config.dbms, - self.config.user, - self.config.password, - self.config.host, - self.config.port, - self.config.database, - ), - ) - - def _create_session(self) -> Session: - """Create session.""" - self.inspector = inspect(self.engine) - - # Create a session for using ORM. - session = sessionmaker(self.engine) - session.configure(bind=self.engine) - - return session() - - def list_tables(self) -> List[str]: - """List tables in a schema. - - Returns - ------- - List[str] - List of table names. - - """ - return self._tables - - def _setup(self) -> None: - """Prepare ORM DB.""" - meta: Dict[str, MetaData] = {} - schemas = self.inspector.get_schema_names() - for schema_name in schemas: - metadata = MetaData(schema=schema_name) - metadata.reflect(bind=self.engine) - meta[schema_name] = metadata - schema = DBSchema(schema_name, meta[schema_name]) - for table_name in meta[schema_name].tables: - table = DBTable(table_name, meta[schema_name].tables[table_name]) - for column in meta[schema_name].tables[table_name].columns: - setattr(table, column.name, column) - if not isinstance(table.name, str): - table.name = str(table.name) - self._tables.append(table.name) - setattr(schema, get_attr_name(table.name), table) - setattr(self, schema_name, schema) - - @time_function - @table_params_to_type(Select) - def run_query( - self, - query: Union[TableTypes, str], - limit: Optional[int] = None, - backend: Literal["pandas", "dask", "datasets"] = "pandas", - index_col: Optional[str] = None, - n_partitions: Optional[int] = None, - ) -> Union[pd.DataFrame, dd.core.DataFrame, Dataset]: - """Run query. - - Parameters - ---------- - query - Query to run. - limit - Limit query result to limit. - backend - Backend library to use, Pandas or Dask or HF datasets. - index_col - Column which becomes the index, and defines the partitioning. - Should be a indexed column in the SQL server, and any orderable type. - n_partitions - Number of partitions. Check dask documentation for additional details. - - Returns - ------- - pandas.DataFrame or dask.DataFrame or datasets.Dataset - Extracted data from query. - - """ - if isinstance(query, str) and limit is not None: - raise ValueError( - "Cannot use limit argument when running raw SQL string query!", - ) - if backend in ["pandas", "datasets"] and n_partitions is not None: - raise ValueError( - "Partitions not applicable with pandas or datasets backend, use dask!", - ) - # Limit the results returned. - if limit is not None: - query = query.limit(limit) # type: ignore - - # Run the query and return the results. 
- with self.session.connection(): - if backend == "pandas": - data = pd.read_sql_query(query, self.engine, index_col=index_col) - elif backend == "datasets": - data = Dataset.from_sql(query, self.conn) - elif backend == "dask": - data = dd.read_sql_query( # type: ignore - query, - self.conn, - index_col=index_col, - npartitions=n_partitions, - ) - data = data.reset_index(drop=False) - else: - raise ValueError( - "Invalid backend, must be pandas, dask, or datasets!", - ) - LOGGER.info("Query returned successfully!") - - return data - - @time_function - @table_params_to_type(Select) - def save_query_to_csv(self, query: TableTypes, path: str) -> str: - """Save query in a .csv format. - - Parameters - ---------- - query - Query to save. - path - Save path. - - Returns - ------- - str - Processed save path for upstream use. - - """ - path = process_file_save_path(path, "csv") - - with self.session.connection(): - result = self.engine.execute(query) - with open(path, "w", encoding="utf-8") as file_descriptor: - outcsv = csv.writer(file_descriptor) - outcsv.writerow(result.keys()) - outcsv.writerows(result) - - return path - - @time_function - @table_params_to_type(Select) - def save_query_to_parquet(self, query: TableTypes, path: str) -> str: - """Save query in a .parquet format. - - Parameters - ---------- - query - Query to save. - path - Save path. - - Returns - ------- - str - Processed save path for upstream use. - - """ - path = process_file_save_path(path, "parquet") - - # Save to CSV, load with pyarrow, save to Parquet - csv_path = exchange_extension(path, "csv") - self.save_query_to_csv(query, csv_path) - table = pv.read_csv(csv_path) - os.remove(csv_path) - pq.write_table(table, path) - - return path diff --git a/cyclops/query/post_process/__init__.py b/cyclops/query/post_process/__init__.py deleted file mode 100644 index 958c2f82a..000000000 --- a/cyclops/query/post_process/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Query post-process functions for datasets.""" diff --git a/cyclops/query/post_process/gemini.py b/cyclops/query/post_process/gemini.py deleted file mode 100644 index 026e1ec9e..000000000 --- a/cyclops/query/post_process/gemini.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Post-processing functions applied to queried GEMINI data (Pandas DataFrames).""" - - import pandas as pd - - from cyclops.query.post_process.util import process_care_unit_changepoints - - CARE_UNIT_HIERARCHY = [ - "ER", - "Emergency", - "ICU", - "SCU", - "Peri-op", - "Palliative", - "Step-down", - "Rehab", - "Other ward", - "GIM ward", - "IP", - ] - - def process_gemini_care_unit_changepoints(data: pd.DataFrame) -> pd.DataFrame: - """Process GEMINI changepoint care unit information in a hierarchical fashion. - - Using the admit, discharge, and care unit information, create a - changepoint DataFrame usable for aggregation labelling purposes. - If a patient is in multiple care units at a changepoint, the care - unit highest in the module-level CARE_UNIT_HIERARCHY is selected. - - Parameters - ---------- - data: pandas.DataFrame - The admit, discharge, and care unit information for a single encounter. - Expects columns "admit", "discharge", and CARE_UNIT. - - Returns - ------- - pandas.DataFrame - Changepoint information with associated care unit.
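# The hierarchy rule described above reduces to an index lookup: among the
# units present at a changepoint, pick the one appearing earliest in
# CARE_UNIT_HIERARCHY. A standalone sketch of that selection rule
# (select_care_unit is a hypothetical helper, not part of the module):
def select_care_unit(units, hierarchy=CARE_UNIT_HIERARCHY):
    """Pick the care unit highest (earliest) in the hierarchy."""
    return min(units, key=hierarchy.index)

# e.g., select_care_unit(["GIM ward", "ICU"]) returns "ICU"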
- - """ - return process_care_unit_changepoints(data, CARE_UNIT_HIERARCHY) diff --git a/cyclops/query/post_process/mimiciv.py b/cyclops/query/post_process/mimiciv.py deleted file mode 100644 index 2f4b454f9..000000000 --- a/cyclops/query/post_process/mimiciv.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Post-processing functions applied to queried MIMIC data (Pandas DataFrames).""" - -import pandas as pd - -from cyclops.process.column_names import CARE_UNIT -from cyclops.process.constants import ER, ICU, IP, SCU -from cyclops.query.post_process.util import process_care_unit_changepoints -from cyclops.utils.profile import time_function - - -CARE_UNIT_MAP = { - IP: { - "observation": ["Observation", "Psychiatry"], - "medicine": ["Medicine", "Medical/Surgical (Gynecology)"], - }, - ER: { - "er": ["Emergency Department", "Emergency Department Observation"], - }, - ICU: { - "icu": [ - "Surgical Intensive Care Unit (SICU)", - "Medical/Surgical Intensive Care Unit (MICU/SICU)", - "Medical Intensive Care Unit (MICU)", - "Trauma SICU (TSICU)", - "Neuro Surgical Intensive Care Unit (Neuro SICU)", - "Cardiac Vascular Intensive Care Unit (CVICU)", - ], - }, - SCU: { - "surgery": [ - "Med/Surg", - "Surgery", - "Surgery/Trauma", - "Med/Surg/Trauma", - "Med/Surg/GYN", - "Surgery/Vascular/Intermediate", - "Thoracic Surgery", - "Transplant", - "Cardiac Surgery", - "PACU", - "Surgery/Pancreatic/Biliary/Bariatric", - ], - "cardiology": [ - "Cardiology", - "Coronary Care Unit (CCU)", - "Cardiology Surgery Intermediate", - "Medicine/Cardiology", - "Medicine/Cardiology Intermediate", - ], - "vascular": [ - "Vascular", - "Hematology/Oncology", - "Hematology/Oncology Intermediate", - ], - "neuro": ["Neurology", "Neuro Intermediate", "Neuro Stepdown"], - "neonatal": [ - "Obstetrics (Postpartum & Antepartum)", - "Neonatal Intensive Care Unit (NICU)", - "Special Care Nursery (SCN)", - "Nursery - Well Babies", - "Obstetrics Antepartum", - "Obstetrics Postpartum", - "Labor & Delivery", - ], - }, -} -NONSPECIFIC_CARE_UNIT_MAP = { - "medicine": IP, - "observation": IP, - "er": ER, - "icu": ICU, - "cardiology": SCU, - "neuro": SCU, - "neonatal": SCU, - "surgery": SCU, - "vascular": SCU, -} -CARE_UNIT_HIERARCHY = [ER, ICU, SCU, IP] - - -def process_mimic_care_unit_changepoints(data: pd.DataFrame) -> pd.DataFrame: - """Process MIMIC changepoint care unit information in a hierarchical fashion. - - Using the admit, discharge, and care unit information, create a - changepoint DataFrame usable for aggregation labelling purposes. - If a patient is in multiple care units at a changepoint, the care - unit highest in the hierarchy is selected. - - Parameters - ---------- - data: pandas.DataFrame - The admit, discharge, and care unit information for a single encounter. - Expects columns "admit", "discharge", and CARE_UNIT. - - Returns - ------- - pandas.DataFrame - Changepoint information with associated care unit. - - """ - return process_care_unit_changepoints(data, CARE_UNIT_HIERARCHY) - - -@time_function -def process_mimic_care_units( - transfers: pd.DataFrame, - specific: bool = False, -) -> pd.DataFrame: - """Process care unit data. - - Processes the MIMIC Transfers table into a cleaned and simplified care - units DataFrame. - - Parameters - ---------- - transfers : pandas.DataFrame - MIMIC transfers table as a DataFrame. - specific : bool, optional - Whether care_unit_name column has specific or non-specific care units. - - Returns - ------- - pandas.DataFrame - Processed care units for MIMIC encounters. 
- - """ - transfers.rename( - columns={ - "intime": "admit", - "outtime": "discharge", - "careunit": CARE_UNIT, - }, - inplace=True, - ) - - # Drop rows with eventtype discharge. - # Its admit timestamp is the discharge timestamp of eventtype admit. - transfers = transfers[transfers["eventtype"] != "discharge"] - transfers = transfers.drop("eventtype", axis=1) - transfers = transfers[transfers[CARE_UNIT] != "Unknown"] - - # Create replacement dictionary for care unit categories depending on specificity. - replace_dict = {} - for unit, unit_dict in CARE_UNIT_MAP.items(): - for specific_unit, unit_list in unit_dict.items(): - value = specific_unit if specific else unit - replace_dict.update({elem: value for elem in unit_list}) - transfers[CARE_UNIT].replace(replace_dict, inplace=True) - - transfers.dropna(inplace=True) - - return transfers diff --git a/cyclops/query/post_process/util.py b/cyclops/query/post_process/util.py deleted file mode 100644 index 4ba766b81..000000000 --- a/cyclops/query/post_process/util.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Post-processing functions applied to queried data (Pandas DataFrames).""" - -from typing import List - -import pandas as pd -from pandas import Timestamp - -from cyclops.process.column_names import CARE_UNIT - - -def event_time_between( - event: Timestamp, - admit: pd.Series, - discharge: pd.Series, - admit_inclusive: bool = True, - discharge_inclusive: bool = False, -) -> pd.Series: - """Return whether an event time is between some start and end time. - - May also specify whether the comparison operators are inclusive or not.. - - Parameters - ---------- - event: pandas.Timestamp - Event time. - admit: pandas.Series - A series of timestamps. - discharge: pandas.Series - A series of timestamps. - admit_inclusive: bool - Whether to have an inclusive inequality for the admit condition. - discharge_inclusive: bool - Whether to have an inclusive inequality for the discharge condition. - - Returns - ------- - pandas.Series - A boolean Series representing whether the event is between - the start and end timestamps. - - """ - admit_cond = event >= admit if admit_inclusive else event > admit - - discharge_cond = event <= discharge if discharge_inclusive else event < discharge - - return admit_cond & discharge_cond - - -def process_care_unit_changepoints( - data: pd.DataFrame, - care_unit_hierarchy: List[str], -) -> pd.DataFrame: - """Process changepoint care unit information in a hierarchical fashion. - - Using the admit, discharge, and care unit information, create a - changepoint DataFrame usable for aggregation labelling purposes. - If a patient is in multiple care units at a changepoint, the care - unit highest in the hierarchy is selected. - - Parameters - ---------- - data: pandas.DataFrame - The admit, discharge, and care unit information for a single encounter. - Expects columns "admit", "discharge", and CARE_UNIT. - care_unit_hierarchy: list - Ordered list of care units from most relevant to to least. - - Returns - ------- - pandas.DataFrame - Changepoint information with associated care unit. 
The care unit
-        information is relevant up until the next changepoint.
-
-    """
-    # Define mapping dictionaries.
-    hierarchy = {care_unit_hierarchy[i]: i for i in range(len(care_unit_hierarchy))}
-    hierarchy_inv = {i: care_unit_hierarchy[i] for i in range(len(care_unit_hierarchy))}
-
-    # Create changepoints.
-    changepoints = pd.concat([data["admit"], data["discharge"]])
-    changepoints.sort_values(inplace=True)
-    changepoints = changepoints.unique()
-
-    # Remove the final changepoint, which is the final discharge (has no care unit).
-    changepoints = changepoints[:-1]
-
-    # Select the most relevant care unit for each changepoint.
-    changepoint_data = []
-    for changepoint in changepoints:
-        is_between = event_time_between(
-            changepoint,
-            data["admit"],
-            data["discharge"],
-            admit_inclusive=True,
-            discharge_inclusive=False,
-        )
-        care_units = data[is_between][CARE_UNIT].unique()
-        if len(care_units) > 0:
-            care_unit_inds = [hierarchy[x] for x in care_units]
-            care_unit_selected = hierarchy_inv[min(care_unit_inds)]
-        else:
-            care_unit_selected = "unknown"
-        changepoint_data.append([changepoint, care_unit_selected])
-
-    # Use a list (not a set) for columns so the column order is deterministic.
-    changepoint_df = pd.DataFrame(changepoint_data, columns=["changepoint", "care_unit"])
-
-    # Remove consecutive duplicates, i.e., remove a changepoint if the
-    # previous changepoint has the same care unit.
-    change_mask = changepoint_df["care_unit"] != changepoint_df["care_unit"].shift(1)
-
-    return changepoint_df[change_mask]
diff --git a/cyclops/query/util.py b/cyclops/query/util.py
deleted file mode 100644
index e35b05bfe..000000000
--- a/cyclops/query/util.py
+++ /dev/null
@@ -1,1229 +0,0 @@
-"""Utility functions for querying."""
-
-import logging
-from dataclasses import dataclass
-from functools import wraps
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import sqlalchemy
-from sqlalchemy import cast, func, select
-from sqlalchemy.sql.elements import BinaryExpression
-from sqlalchemy.sql.expression import ColumnClause
-from sqlalchemy.sql.schema import Column, Table
-from sqlalchemy.sql.selectable import Select, Subquery
-from sqlalchemy.types import Boolean, Date, DateTime, Float, Integer, Interval, String
-
-from cyclops.utils.common import to_list, to_list_optional
-from cyclops.utils.log import setup_logging
-
-
-# Logging.
-LOGGER = logging.getLogger(__name__)
-setup_logging(print_level="INFO", logger=LOGGER)
-
-COLUMN_OBJECTS = [Column, ColumnClause]
-
-
-def get_attr_name(name: str) -> str:
-    """Get attribute name (second part of first.second)."""
-    return name.split(".")[-1]
-
-
-@dataclass
-class DBSchema:
-    """Database schema wrapper.
-
-    Parameters
-    ----------
-    name: str
-        Name of schema.
-    data: sqlalchemy.sql.schema.MetaData
-        Metadata for schema.
-
-    """
-
-    name: str
-    data: sqlalchemy.sql.schema.MetaData
-
-
-@dataclass
-class DBTable:
-    """Database table wrapper.
-
-    Parameters
-    ----------
-    name: str
-        Name of table.
-    data: sqlalchemy.sql.schema.Table
-        The table object.
-
-    """
-
-    name: str
-    data: sqlalchemy.sql.schema.Table
-
-
-TABLE_OBJECTS = [Table, Select, Subquery, DBTable]
-TableTypes = Union[Select, Subquery, Table, DBTable]
-
-
-def _to_subquery(table: TableTypes) -> Subquery:
-    """Convert a table from a table type object to the Subquery type.
-
-    Parameters
-    ----------
-    table: cyclops.query.util.TableTypes
-        Table to convert.
-
-    Returns
-    -------
-    sqlalchemy.sql.selectable.Subquery
-        The converted table.
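A worked toy run of `process_care_unit_changepoints` (column names assumed), using a two-unit hierarchy in which ER outranks ICU:

import pandas as pd

from cyclops.query.post_process.util import process_care_unit_changepoints

data = pd.DataFrame(
    {
        "admit": pd.to_datetime(["2015-01-01 00:00", "2015-01-01 04:00"]),
        "discharge": pd.to_datetime(["2015-01-01 06:00", "2015-01-02 00:00"]),
        "care_unit": ["ER", "ICU"],
    },
)
# Raw changepoints 00:00/04:00/06:00 resolve to ER/ER/ICU; after dropping
# consecutive duplicates only 00:00 -> ER and 06:00 -> ICU remain.
result = process_care_unit_changepoints(data, ["ER", "ICU"])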
- - """ - if isinstance(table, Subquery): - return table - - if isinstance(table, Select): - return table.subquery() - - if isinstance(table, Table): - return select(table).subquery() - - if isinstance(table, DBTable): - return select(table.data).subquery() - - raise ValueError( - f"""Table has type {type(table)}, but must have one of the - following types: {", ".join(TABLE_OBJECTS)}""", - ) - - -def _to_select(table: TableTypes) -> Select: - """Convert a table from a table type object to the Select type. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - Table to convert. - - Returns - ------- - sqlalchemy.sql.selectable.Select - The converted table. - - """ - if isinstance(table, Select): - return table - - if isinstance(table, Subquery): - return select(table) - - if isinstance(table, Table): - return select(table) - - if isinstance(table, DBTable): - return select(table.data) - - raise ValueError( - f"""Table has type {type(table)}, but must have one of the - following types: {", ".join(TABLE_OBJECTS)}""", - ) - - -def param_types_to_type( - relevant_types: List[Any], - to_type_fn: Callable[..., Any], -) -> Callable[..., Any]: - """Convert TableTypes parameters to a specified type. - - A decorator which processes a function's arguments by taking all - parameters with type in relevant_types and converting them using - some to_type_fn function. Non-relevant types are left alone. - - Parameters - ---------- - relevant_types : list - Types to process. - to_type_fn : Callable - Function to process the relevant types - - Returns - ------- - Callable - The processed function. - - """ - - def decorator(func_: Callable[..., Any]) -> Callable[..., Any]: - """Decorate function to convert TableTypes parameters to a specified type.""" - - @wraps(func_) - def wrapper_func(*args: Any, **kwargs: Any) -> Any: - # Convert relevant arguments. - args_list = list(args) - for i, arg in enumerate(args_list): - if type(arg) in relevant_types: - args_list[i] = to_type_fn(arg) - - # Convert relevant keyword arguments. - kwargs = dict(kwargs) - for key, kwarg in kwargs.items(): - if type(kwarg) in relevant_types: - kwargs[key] = to_type_fn(kwarg) - - return func_(*tuple(args_list), **kwargs) - - return wrapper_func - - return decorator - - -def table_params_to_type(to_type: TableTypes) -> Callable[..., Any]: - """Decorate to convert TableTypes params to a specified type. - - Parameters - ---------- - to_type: cyclops.query.util.TableTypes - The type to which to convert. - - Returns - ------- - Callable - The processed function. - - """ - # Dictionary mapping query type -> query type conversion function. - table_to_type_fn_map = { - Subquery: _to_subquery, - Select: _to_select, - Table: lambda x: x, - DBTable: lambda x: x, - } - if to_type not in TABLE_OBJECTS: - raise ValueError(f"to_type must be in {TABLE_OBJECTS}") - - to_type_fn = table_to_type_fn_map[to_type] - - return param_types_to_type(TABLE_OBJECTS, to_type_fn) - - -@table_params_to_type(Subquery) -def get_column( - table: TableTypes, - col: str, -) -> Column: - """Extract a column object from a table by name. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table with the column. - col: str - Name of column to extract. - - Returns - ------- - sqlalchemy.sql.schema.Column - The corresponding column in the table. 
- - """ - col_names = get_column_names(table) - if col not in col_names: - raise ValueError(f"Table does not contain column {col}") - - return table.c[col_names.index(col)] # type: ignore - - -@table_params_to_type(Subquery) -def filter_columns( - table: TableTypes, - cols: Union[str, List[str]], -) -> Subquery: - """Filter a table, keeping only the specified columns. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table with the column. - cols: str or list of str - Name of columns to keep. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - Table with only the specified columns. - - """ - cols = to_list(cols) - col_names = get_column_names(table) - filtered = [] - for col in cols: - if col not in col_names: - continue - filtered.append(table.c[col_names.index(col)]) # type: ignore - - return select(filtered).subquery() - - -@table_params_to_type(Subquery) -def get_columns( - table: TableTypes, - cols: Union[str, List[str]], -) -> List[Column]: - """Extract a number of columns from the table. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - cols: str or list of str - Names of columns to extract. - - Returns - ------- - list of sqlalchemy.sql.schema.Column - The corresponding columns in the table. - - """ - return [get_column(table, col) for col in to_list(cols)] - - -@table_params_to_type(Subquery) -def get_column_names(table: TableTypes) -> List[str]: - """Extract column names from a table. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - - Returns - ------- - list of str - The table column names. - - """ - return [c.name for c in table.columns] # type: ignore - - -@table_params_to_type(Subquery) -def has_columns( - table: TableTypes, - cols: Union[str, List[str]], - raise_error: bool = False, -) -> bool: - """Check whether a table has all of the specified columns. - - Parameters - ---------- - table : cyclops.query.util.TableTypes - Table to check. - cols: str or list of str - Required columns. - raise_error: bool - Whether to raise an error if the required columns are not found. - - Returns - ------- - bool - True if all required columns are present, otherwise False. - - """ - cols = to_list(cols) - required_set = set(cols) - columns = set(get_column_names(table)) - present = required_set.issubset(columns) - - if raise_error and not present: - missing = required_set - columns - raise ValueError(f"Missing required columns {', '.join(missing)}.") - - return present - - -@table_params_to_type(Subquery) -def assert_table_has_columns( - *args: Any, - **kwargs: Any, -) -> Callable[[TableTypes], TableTypes]: - """Assert that TableTypes params have the necessary columns. - - assert_table_has_columns(["A", "B"], None) is equivalent to - assert_table_has_columns(["A", "B"]) but may be necessary when - wanting to check, assert_table_has_columns(["A"], None, ["C"]) - - Can also check keyword arguments, e.g., optional queries, - assert_table_has_columns(["A"], kwarg_table=["D"]) - - Parameters - ---------- - *args - Ordered arguments corresponding to the function's table-type args. - **kwargs - Keyword arguments corresponding to the function's table-type kwargs. - - Returns - ------- - Callable - Decorator function. 
- - """ - - def decorator( - func_: Callable[..., Any], - ) -> Callable[..., Any]: - @wraps(func_) - def wrapper_func(*fn_args: Any, **fn_kwargs: Any) -> Any: - # Check only the table arguments - table_args = [i for i in fn_args if isinstance(i, Subquery)] - - assert len(args) <= len(table_args) - - for i, arg in enumerate(args): - if arg is None: # Can specify None to skip over checking a query - continue - has_columns(table_args[i], arg, raise_error=True) - - for key, required_cols in kwargs.items(): - # If an optional table is not provided, or is None, - # it is skipped - if key not in fn_kwargs: - continue - - if fn_kwargs[key] is None: - continue - - assert isinstance(fn_kwargs[key], Subquery) - has_columns(fn_kwargs[key], required_cols, raise_error=True) - - return func_(*fn_args, **fn_kwargs) - - return wrapper_func - - return decorator # type: ignore - - -@table_params_to_type(Subquery) -def drop_columns( - table: TableTypes, - drop_cols: Union[str, List[str]], -) -> Subquery: - """Drop, or remove, some columns from a table. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - col : str or list of str - Names of columns to drop. - drop_cols: str or list of str - Names of columns to drop. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - The corresponding table with columns dropped. - - """ - drop_cols = get_columns(table, drop_cols) - - return select(*[c for c in table.c if c not in drop_cols]).subquery() # type: ignore - - -@table_params_to_type(Subquery) -def rename_columns(table: TableTypes, rename_map: Dict[str, str]) -> Subquery: - """Rename a table's columns. - - Rename the table's columns according to a dictionary of strings, - where the key is the current name, and the value is the replacement. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - rename_map : dict - Dictionary mapping current column names (key) to new ones (value). - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - The corresponding table with columns renamed. - - """ - return select( - *[ - c.label(rename_map[c.name]) if c.name in rename_map else c - for c in table.columns # type: ignore - ], - ).subquery() - - -@table_params_to_type(Subquery) -def reorder_columns(table: TableTypes, cols: List[str]) -> Subquery: - """Reorder a table's columns. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table to reorder. - cols : list of str - New order of columns, which must include all existing columns. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - The reordered table. - - """ - # Get the old/new column names. - old_order = get_column_names(table) - new_order = [c.name for c in get_columns(table, cols)] - - # Make sure we have exactly the same set of old/new column names. - if set(old_order) != set(new_order): - old_order_print = ", ".join(old_order) - new_order_print = ", ".join(new_order) - raise ValueError( - f"""Must specify all columns {old_order_print} - to re-order, not {new_order_print}.""", - ) - - # Reorder the columns. 
- new_cols = [] - for col in new_order: - new_cols.append(table.c[old_order.index(col)]) # type: ignore - - return select(*new_cols).subquery() - - -@table_params_to_type(Subquery) -def apply_to_columns( - table: TableTypes, - col_names: Union[str, List[str]], - funcs: Union[ - Callable[[sqlalchemy.sql.schema.Column], sqlalchemy.sql.schema.Column], - List[Callable[[sqlalchemy.sql.schema.Column], sqlalchemy.sql.schema.Column]], - ], - new_col_labels: Optional[Union[str, List[str]]] = None, -) -> Subquery: - """Apply a function to some columns. - - This function can change existing columns or create new - columns depending on whether new_col_labels is specified. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - col_names: str or list of str - Columns to which to apply the function. - funcs: callable or list of callable - Function(s) to apply to the columns, where the function takes an column - as its only parameter and returns another column object. - new_col_labels: str or list of str, optional - If specified, create new columns with these labels. Otherwise, - apply the function to the existing columns. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - The table with function applied. - - """ - col_names = to_list(col_names) - new_col_labels = to_list_optional(new_col_labels) - cols = get_columns(table, col_names) - if isinstance(funcs, list): - if len(funcs) != len(cols): - raise ValueError( - f"Must specify a function for each column, not {len(funcs)} functions.", - ) - else: - funcs = [funcs] * len(cols) - if new_col_labels is None: - # Apply to existing columns - prev_order = get_column_names(table) - table = select(table).add_columns( - *[ - funcs[i](col).label("__" + col_names[i] + "__") - for i, col in enumerate(cols) - ], - ) - rename = {"__" + name + "__": name for name in col_names} - table = drop_columns(table, col_names) - table = rename_columns(table, rename) - table = reorder_columns(table, prev_order) - else: - # Apply to new columns - new_cols = [ - funcs[i](col).label(new_col_labels[i]) for i, col in enumerate(cols) - ] - table = select(table).add_columns(*new_cols) - - return _to_subquery(table) - - -def trim_columns( - table: TableTypes, - cols: Union[str, List[str]], - new_col_labels: Optional[Union[str, List[str]]] = None, -) -> Subquery: - """Trim, or strip, specified columns. - - Trimming refers to the removal of leading/trailing whitespace. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - cols: str or list of str - Names of columns to trim. - new_col_labels: str or list of str, optional - If specified, create new columns with these labels. Otherwise, - apply the function to the existing columns. - - Returns - ------- - sqlalchemy.sql.selectable.Subquery - The table with the specified columns trimmed. - - """ - return apply_to_columns( - table, - cols, - lambda x: process_column(x, to_str=True, trim=True), - new_col_labels=new_col_labels, - ) - - -def process_elem(elem: Any, **kwargs: bool) -> Any: - """Preprocess some basic object such as an integer, float, or string. - - Parameters - ---------- - elem: any - An element such as an integer, float, or string. - **kwargs : dict, optional - Preprocessing keyword arguments. - - Returns - ------- - Any - The preprocessed element. - - """ - # Extract kwargs. 
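`apply_to_columns` either rewrites columns in place or, when `new_col_labels` is given, appends derived columns; a hedged sketch:

from sqlalchemy import Column, MetaData, String, Table, func

from cyclops.query.util import apply_to_columns

labs = Table("labs", MetaData(), Column("label", String))

# Add a lowercased copy of `label`, keeping the original column.
result = apply_to_columns(
    labs,
    "label",
    lambda col: func.lower(col),
    new_col_labels="label_lower",
)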
- lower = kwargs.get("lower", False) - trim = kwargs.get("trim", False) - to_str = kwargs.get("to_str", False) - to_int = kwargs.get("to_int", False) - to_float = kwargs.get("to_float", False) - to_bool = kwargs.get("to_bool", False) - - # Convert to string. - if to_str: - elem = str(elem) - - # If a string. - if isinstance(elem, str): - if lower: - elem = elem.lower() - - if trim: - elem = elem.strip() - - if to_int: - elem = int(elem) - - if to_float: - elem = float(elem) - - if to_bool: - elem = bool(elem) - - return elem - - -def process_list(lst: Union[Any, List[Any]], **kwargs: bool) -> List[Any]: - """Preprocess a list of elements. - - Parameters - ---------- - lst : any or list of any - A list of elements such as integers, floats, or strings. - **kwargs : dict, optional - Preprocessing keyword arguments. - - Returns - ------- - Any - The preprocessed element. - - """ - # Convert potentially non-list variable to list. - lst = to_list(lst) - - # Process elements. - return [process_elem(i, **kwargs) for i in lst] - - -def process_column(col: Column, **kwargs: bool) -> Column: - """Preprocess a Column object. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - A column to preprocess. - **kwargs : dict, optional - Preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.schema.Column - The processed column. - - """ - # Extract kwargs. - lower = kwargs.get("lower", False) - trim = kwargs.get("trim", False) - to_str = kwargs.get("to_str", False) - to_int = kwargs.get("to_int", False) - to_float = kwargs.get("to_float", False) - to_bool = kwargs.get("to_bool", False) - to_date = kwargs.get("to_date", False) - to_timestamp = kwargs.get("to_timestamp", False) - - # Convert to string. - if to_str: - col = cast(col, String) - - # If a string column. - if "VARCHAR" in str(col.type): - # Lower column. - if lower: - col = func.lower(col) - - # Trim whitespace. - if trim: - col = func.trim(col) - - if to_int: - col = cast(col, Integer) - - if to_float: - col = cast(col, Float) - - if to_bool: - col = cast(col, Boolean) - - if to_date: - col = cast(col, Date) - - if to_timestamp: - col = cast(col, DateTime) - - return col - - -def equals( - col: Column, - value: Any, - lower: bool = True, - trim: bool = True, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column has some value. - - Assumes that if searching for a string, both the value and column values - should be converted to lowercase and trimmed of leading/trailing whitespace. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - value : Any - The value to match in the column. - lower : bool, default=True - Whether to convert the value and column to lowercase. - This is only relevant when the column/value are strings. - trim : bool, default=True - Whether to trim (strip) whitespace on the value and column. - This is only relevant when the column/value are strings. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. - - """ - return process_column(col, lower=lower, trim=trim, **kwargs) == process_elem( - value, - lower=lower, - trim=trim, - **kwargs, - ) - - -def greater_than( - col: Column, - value: Any, - lower: bool = True, - trim: bool = True, - equal: bool = False, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column is greater than some value. 
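The condition helpers return SQLAlchemy expressions, so they drop straight into `.where()` clauses; by default, string comparisons are lowercased and trimmed on both sides:

from sqlalchemy import Column, Integer, MetaData, String, Table, select

from cyclops.query.util import equals, greater_than

labs = Table(
    "labs",
    MetaData(),
    Column("label", String),
    Column("valuenum", Integer),
)

stmt = select(labs).where(
    equals(labs.c.label, "  Potassium "),  # matches 'potassium'
    greater_than(labs.c.valuenum, 5, equal=True),
)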
- - Assumes that if searching for a string, both the value and column values - should be converted to lowercase and trimmed of leading/trailing whitespace. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - value : Any - The value to match in the column. - lower : bool, default=True - Whether to convert the value and column to lowercase. - This is only relevant when the column/value are strings. - trim : bool, default=True - Whether to trim (strip) whitespace on the value and column. - This is only relevant when the column/value are strings. - equal: bool, default=False - Whether to also include equal to the value. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. - - """ - if equal: - return process_column(col, lower=lower, trim=trim, **kwargs) >= process_elem( - value, - lower=lower, - trim=trim, - **kwargs, - ) - return process_column(col, lower=lower, trim=trim, **kwargs) > process_elem( - value, - lower=lower, - trim=trim, - **kwargs, - ) - - -def less_than( - col: Column, - value: Any, - lower: bool = True, - trim: bool = True, - equal: bool = False, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column is less than some value. - - Assumes that if searching for a string, both the value and column values - should be converted to lowercase and trimmed of leading/trailing whitespace. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - value : Any - The value to match in the column. - lower : bool, default=True - Whether to convert the value and column to lowercase. - This is only relevant when the column/value are strings. - trim : bool, default=True - Whether to trim (strip) whitespace on the value and column. - This is only relevant when the column/value are strings. - equal: bool, default=False - Whether to also include equal to the value. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. - - """ - if equal: - return process_column(col, lower=lower, trim=trim, **kwargs) <= process_elem( - value, - lower=lower, - trim=trim, - **kwargs, - ) - return process_column(col, lower=lower, trim=trim, **kwargs) < process_elem( - value, - lower=lower, - trim=trim, - **kwargs, - ) - - -def not_equals( - col: Column, - value: Any, - lower: bool = True, - trim: bool = True, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column is not equal to some value. - - Assumes that if searching for a string, both the value and column values - should be converted to lowercase and trimmed of leading/trailing whitespace. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - value : Any - The value to match in the column. - lower : bool, default=True - Whether to convert the value and column to lowercase. - This is only relevant when the column/value are strings. - trim : bool, default=True - Whether to trim (strip) whitespace on the value and column. - This is only relevant when the column/value are strings. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. 
- - """ - return process_column(col, lower=lower, trim=trim, **kwargs) != process_elem( - value, - lower=lower, - trim=trim, - **kwargs, - ) - - -def has_string_format( - col: Column, - value: Any, - fmt: str, - to_str: bool = True, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column has some string formatting. - - Assumes that we're searching for a string, performing - the relevant conversion. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - value: Any - A value to be implanted in the string formatting. - fmt : str - The string format to match in the column. - to_str : bool, default=True - Whether to convert the value/column to string type. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. - - """ - return process_column(col, to_str=to_str, **kwargs).like( - fmt.format(process_elem(value, to_str=to_str, **kwargs)), - ) - - -def has_substring( - col: Column, - substring: Any, - lower: bool = True, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column has some substring. - - Assumes that we're searching for a string, where both the value and - column values should be converted to strings and made lowercase. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - substring : Any - The substring to match in the column. - lower : bool, default=True - Whether to convert the value and column to lowercase. - This is only relevant when the column/value are strings. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. - - """ - return has_string_format(col, substring, "%%{}%%", lower=lower, **kwargs) - - -def starts_with( - col: Column, - value: Any, - lower: bool = True, - trim: bool = True, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column starts with some value/string. - - Assumes that we're searching for a string, where both the value and - column values should be converted to strings, made lowercase, and - trimmed of leading/trailing whitespace. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - value : Any - The value to match at the start. - lower : bool, default=True - Whether to convert the value and column to lowercase. - This is only relevant when the column/value are strings. - trim : bool, default=True - Whether to trim (strip) whitespace on the value and column. - This is only relevant when the column/value are strings. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. - - """ - return has_string_format(col, value, "{}%%", lower=lower, trim=trim, **kwargs) - - -def ends_with( - col: Column, - value: Any, - lower: bool = True, - trim: bool = True, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column ends with some value/string. - - Assumes that we're searching for a string, where both the value and - column values should be converted to strings, made lowercase, and - trimmed of leading/trailing whitespace. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - value : Any - The value to match at the end. 
- lower : bool, default=True - Whether to convert the value and column to lowercase. - This is only relevant when the column/value are strings. - trim : bool, default=True - Whether to trim (strip) whitespace on the value and column. - This is only relevant when the column/value are strings. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. - - """ - return has_string_format(col, value, "%%{}", lower=lower, trim=trim, **kwargs) - - -def in_( - col: Column, - lst: List[Any], - lower: bool = True, - trim: bool = True, - **kwargs: bool, -) -> BinaryExpression: - """Condition that a column value is in a list of values. - - Assumes that if searching for a string, both the value and column values - should be converted to lowercase and trimmed of leading/trailing whitespace. - - Parameters - ---------- - col : sqlalchemy.sql.schema.Column - The column to condition. - lst : list of any - The value to match at the start. - lower : bool, default=True - Whether to convert the value and column to lowercase. - This is only relevant when the column/value are strings. - trim : bool, default=True - Whether to trim (strip) whitespace on the value and column. - This is only relevant when the column/value are strings. - **kwargs : dict, optional - Remaining preprocessing keyword arguments. - - Returns - ------- - sqlalchemy.sql.elements.BinaryExpression - An expression representing where the condition was satisfied. - - """ - return process_column(col, lower=lower, trim=trim, **kwargs).in_( - process_list(lst, lower=lower, trim=trim, **kwargs), - ) - - -def _check_column_type( - table: TableTypes, - cols: Union[str, List[str]], - types: Union[Any, List[Any]], - raise_error: bool = False, -) -> bool: - """Check whether some columns are each one of a number of types. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - cols: str or list of str - Column names to check. - types: any - The allowed types for each column. - raise_error: bool - Whether to raise an error if one of the columns are none of the types. - - Returns - ------- - bool - Whether all of the columns are one of the types. - - """ - cols = to_list(cols) - types = to_list(types) - is_type = [ - any(isinstance(get_column(table, col).type, type_) for type_ in types) - for col in cols - ] - if raise_error and not all(is_type): - incorrect_type = list( - set(cols) - {col for i, col in enumerate(cols) if is_type[i]}, - ) - types_str = ", ".join([type_.__name__ for type_ in types]) - actual_types_str = [type(col).__name__ for col in incorrect_type] - raise ValueError( - f"""{incorrect_type} columns are not one of types {types_str}. - They have types {actual_types_str}.""", - ) - - return all(is_type) - - -def check_timestamp_columns( - table: TableTypes, - cols: Union[str, List[str]], - raise_error: bool = False, -) -> bool: - """Check whether some columns are Date or DateTime columns. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - cols: str or list of str - Column names to check. - raise_error: bool - Whether to raise an error if one of the columns are none of the types. - - Returns - ------- - bool - Whether all of the columns are one of the types. 
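A sketch of `in_`, which preprocesses the column and every list element before the membership test:

from sqlalchemy import Column, MetaData, String, Table, select

from cyclops.query.util import in_

labs = Table("labs", MetaData(), Column("label", String))

stmt = select(labs).where(in_(labs.c.label, ["Potassium ", "SODIUM"]))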
- - """ - return _check_column_type(table, cols, [Date, DateTime], raise_error=raise_error) - - -@table_params_to_type(Subquery) -def get_delta_column( - table: TableTypes, - years: Optional[str] = None, - months: Optional[str] = None, - days: Optional[str] = None, - hours: Optional[str] = None, -) -> Column: - """Create a time delta column. - - Create a time delta (interval) column from a number of - numeric timestamp columns. - - Warning: Null values in each specified numeric time column are coalesced to 0. - - Parameters - ---------- - table: cyclops.query.util.TableTypes - The table. - years: None or str - Years column. - months: None or str - Months column. - days: None or str - Days column. - hours: None or str - Hours column. - - Returns - ------- - sqlalchemy.sql.schema.Column - Combined delta/interval column. - - """ - - def get_col_or_none(col: Optional[str] = None) -> Optional[Column]: - """If col is not None, get interval column from names.""" - return None if col is None else get_column(table, col) - - years = get_col_or_none(years) - months = get_col_or_none(months) - days = get_col_or_none(days) - hours = get_col_or_none(hours) - - time_cols = [years, months, days, hours] - names = ["YEARS", "MONTHS", "DAYS", "HOURS"] - - # Consider only the non-null columns. - names = [names[i] for i in range(len(names)) if time_cols[i] is not None] - time_cols = [col for col in time_cols if col is not None] - - if len(time_cols) == 0: - raise ValueError("One or more time interval columns must be specified.") - - # Create interval columns. - interval_cols = [] - for i, col in enumerate(time_cols): - interval_cols.append( - func.cast(func.concat(func.coalesce(col, 0), " " + names[i]), Interval), - ) - - # Create combined interval column. - combined_interval_col = interval_cols[0] - for i in range(1, len(interval_cols)): - combined_interval_col = combined_interval_col + interval_cols[i] - - return combined_interval_col diff --git a/docs/source/api.rst b/docs/source/api.rst index aa6cfd3a7..caf347cc6 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -3,7 +3,6 @@ API Reference .. toctree:: - reference/api/cyclops.query.rst reference/api/cyclops.data.rst reference/api/cyclops.tasks.rst reference/api/cyclops.evaluate.rst diff --git a/docs/source/conf.py b/docs/source/conf.py index 534be55a1..288beebd9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,7 +22,7 @@ # -- Project information ----------------------------------------------------- project = "cyclops" -copyright = "2022, Vector AI Engineering" # noqa: A001 +copyright = "2023, Vector AI Engineering" # noqa: A001 author = "Vector AI Engineering" diff --git a/docs/source/reference/api/cyclops.query.rst b/docs/source/reference/api/cyclops.query.rst deleted file mode 100644 index 6a38b33f2..000000000 --- a/docs/source/reference/api/cyclops.query.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. role:: hidden - :class: hidden-section - -cyclops.query -============= - -.. automodule:: cyclops.query - -.. autosummary:: - :toctree: _autosummary - :nosignatures: - :template: custom-module-template.rst - - interface - ops - base - -dataset APIs ------------- - -.. autosummary:: - :toctree: _autosummary - :nosignatures: - :template: custom-module-template.rst - - mimiciii - mimiciv - eicu - omop - gemini diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst index 788462a2a..490aaaf11 100644 --- a/docs/source/tutorials.rst +++ b/docs/source/tutorials.rst @@ -3,6 +3,5 @@ Tutorials .. 
toctree:: - tutorials_query tutorials_monitor tutorials_use_cases diff --git a/docs/source/tutorials/eicu/query_api.ipynb b/docs/source/tutorials/eicu/query_api.ipynb deleted file mode 100644 index 84e348c6f..000000000 --- a/docs/source/tutorials/eicu/query_api.ipynb +++ /dev/null @@ -1,203 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "28288104-b8ce-43fa-a22b-8f02239551a5", - "metadata": {}, - "source": [ - "# eICU-CRD query API tutorial" - ] - }, - { - "cell_type": "markdown", - "id": "74b155b9-9ba6-455c-b987-4c261c339364", - "metadata": {}, - "source": [ - "This notebook shows examples of how to use the cyclops.query API on [eicu-CRD](https://eicu-crd.mit.edu/).\n", - "\n", - "Each query is limit to 100 rows (for quick results).\n", - "\n", - "* First, setup the eICU database according to the instructions in [eicu-code](https://github.com/MIT-LCP/eicu-code/tree/master/build-db/postgres).\n", - "* The database is assumed to be hosted using postgres. Update the config parameters such as username and password, passed to `EICUQuerier` accordingly." - ] - }, - { - "cell_type": "markdown", - "id": "0c7c1635-3a70-42d1-99a6-c7f7a9cf9d21", - "metadata": {}, - "source": [ - "## Imports and instantiate `EICUQuerier`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75a140e0-fb27-4319-862f-be54397abe5c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "\"\"\"eICU-CRD query API tutorial.\"\"\"\n", - "\n", - "import cyclops.query.ops as qo\n", - "from cyclops.query import EICUQuerier\n", - "\n", - "\n", - "querier = EICUQuerier(\n", - " dbms=\"postgresql\",\n", - " port=5432,\n", - " host=\"localhost\",\n", - " database=\"eicu\",\n", - " user=\"postgres\",\n", - " password=\"pwd\",\n", - ")\n", - "# List all tables.\n", - "querier.list_tables(\"eicu_crd\")" - ] - }, - { - "cell_type": "markdown", - "id": "ea04cc11-e947-4097-91df-729f28b3732c", - "metadata": {}, - "source": [ - "## Example 1. Get all female patients discharged in 2014 (limit to 100 rows)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1efa964-8978-4a0e-9892-5ea4ce9953a3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ops = qo.Sequential(\n", - " qo.ConditionEquals(\"hospitaldischargeyear\", 2014),\n", - " qo.ConditionEquals(\"gender\", \"Female\"),\n", - ")\n", - "patients = querier.eicu_crd.patient()\n", - "patients = patients.ops(ops).run(limit=100)\n", - "print(f\"{len(patients)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "cd43679b-3d33-4135-b97c-373447289a9c", - "metadata": { - "tags": [] - }, - "source": [ - "## Example 2. Get all patient encounters with diagnoses (`schizophrenia` in `diagnosisstring`), discharged in the year 2015." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7ab5fa3-e26b-47a7-818f-1bf367a55760", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "patients = querier.eicu_crd.patient()\n", - "diagnoses = querier.eicu_crd.diagnosis()\n", - "diagnoses = diagnoses.ops(qo.ConditionSubstring(\"diagnosisstring\", \"schizophrenia\"))\n", - "patient_diagnoses = patients.join(\n", - " join_table=diagnoses,\n", - " on=\"patientunitstayid\",\n", - ")\n", - "patient_diagnoses = patient_diagnoses.run(limit=100)\n", - "print(f\"{len(patient_diagnoses)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "241f7d10-9e04-44ae-b325-87f5a4046df2", - "metadata": {}, - "source": [ - "## Example 3. 
Get potassium lab tests for patients discharged in the year 2014, for all teaching hospitals." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24043abc-1878-4e00-8229-36d4a0368b98", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "hospitals = querier.eicu_crd.hospital()\n", - "hospitals = hospitals.ops(qo.ConditionEquals(\"teachingstatus\", True))\n", - "patients = querier.eicu_crd.patient()\n", - "patients = patients.ops(qo.ConditionEquals(\"hospitaldischargeyear\", 2015))\n", - "patients = patients.join(\n", - " join_table=hospitals,\n", - " on=\"hospitalid\",\n", - ")\n", - "labs = querier.eicu_crd.lab()\n", - "labs = labs.ops(qo.ConditionEquals(\"labname\", \"potassium\"))\n", - "patient_labs = patients.join(\n", - " join_table=labs,\n", - " on=\"patientunitstayid\",\n", - ").run(limit=100)\n", - "print(f\"{len(patient_labs)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "2dbf0fdc-8d99-4fde-ab3e-7c8a67b72f8b", - "metadata": {}, - "source": [ - "## Example 4. Get glucose medications (substring search) for female patients discharged in 2014." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6142f27-e8d1-453c-bfe2-2265d9ff1914", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ops = qo.Sequential(\n", - " qo.ConditionEquals(\"hospitaldischargeyear\", 2014),\n", - " qo.ConditionEquals(\"gender\", \"Female\"),\n", - ")\n", - "patients = querier.eicu_crd.patient()\n", - "patients = patients.ops(ops)\n", - "medications = querier.eicu_crd.medication()\n", - "medications = medications.ops(qo.ConditionSubstring(\"drugname\", \"glucose\"))\n", - "patient_medications = patients.join(\n", - " join_table=medications,\n", - " on=\"patientunitstayid\",\n", - ").run(limit=100)\n", - "print(f\"{len(patient_medications)} rows extracted!\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "cyclops", - "language": "python", - "name": "cyclops" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/tutorials/gemini/query_api.ipynb b/docs/source/tutorials/gemini/query_api.ipynb deleted file mode 100644 index ce7c159aa..000000000 --- a/docs/source/tutorials/gemini/query_api.ipynb +++ /dev/null @@ -1,289 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "6b8f5515-0d77-4989-8e08-33872eafb790", - "metadata": {}, - "source": [ - "# GEMINI query API tutorial" - ] - }, - { - "cell_type": "markdown", - "id": "633e60f1-d4cc-4d4a-87de-db14c4152ac6", - "metadata": {}, - "source": [ - "## Imports and instantiate `GEMINIQuerier`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "00c4a25b-1f3d-4613-b20f-469e7b1043b7", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-09-21 09:17:45,449 \u001b[1;37mINFO\u001b[0m cyclops.query.orm - Database setup, ready to run queries!\n" - ] - }, - { - "data": { - "text/plain": [ - "['public.lookup_icd10_ca_description',\n", - " 'public.lookup_statcan',\n", - " 'public.lookup_cci',\n", - " 'public.lookup_icd10_ca_to_ccsr',\n", - " 'public.lookup_ip_administrative',\n", - " 'public.lookup_lab_concept',\n", - " 'public.lookup_vitals_concept',\n", - " 'public.lookup_pharmacy_concept',\n", - " 'public.lookup_diagnosis',\n", - " 'public.locality_variables',\n", - " 'public.admdad',\n", - " 'public.derived_variables',\n", - " 'public.ipscu',\n", - " 'public.lookup_phy_characteristics',\n", - " 'public.lab',\n", - " 'public.ipintervention',\n", - " 'public.lookup_ccsr',\n", - " 'public.lookup_pharmacy_route',\n", - " 'public.lookup_transfusion_concept',\n", - " 'public.lookup_ip_scu',\n", - " 'public.lookup_er_administrative',\n", - " 'public.lookup_imaging',\n", - " 'public.pharmacy',\n", - " 'public.radiology',\n", - " 'public.lookup_transfer',\n", - " 'public.ipdiagnosis',\n", - " 'public.lookup_room_transfer',\n", - " 'public.er',\n", - " 'public.erdiagnosis',\n", - " 'public.erintervention',\n", - " 'public.roomtransfer',\n", - " 'public.transfusion',\n", - " 'public.vitals',\n", - " 'public.lookup_hospital_num']" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"\"\"GEMINI query API tutorial.\"\"\"\n", - "\n", - "import cyclops.query.ops as qo\n", - "from cyclops.query import GEMINIQuerier\n", - "\n", - "\n", - "querier = GEMINIQuerier(\n", - " host=\"db.gemini-hpc.ca\",\n", - " database=\"delirium_v4_0_1\",\n", - " user=\"username\",\n", - " password=\"password\",\n", - ")\n", - "# List all tables.\n", - "querier.list_tables(\"public\")" - ] - }, - { - "cell_type": "markdown", - "id": "7980e403-1f99-4886-8856-d6d6b11ec15d", - "metadata": {}, - "source": [ - "## Example 1a. Create a table with only one hospitalization per patient, keeping the most recent encounter for each patient. Sort the dataset by `patient_id_hashed` and `discharge_date_time`, and then keep the recent record." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2f3e9b5f-3156-4414-a296-1e45bde9d147", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-09-21 09:17:51,902 \u001b[1;37mINFO\u001b[0m cyclops.query.orm - Query returned successfully!\n", - "2023-09-21 09:17:51,903 \u001b[1;37mINFO\u001b[0m cyclops.utils.profile - Finished executing function run_query in 6.093352 s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "189734 rows extracted!\n" - ] - } - ], - "source": [ - "ops = qo.Sequential(\n", - " qo.OrderBy(\n", - " [\"patient_id_hashed\", \"discharge_date_time\"],\n", - " ascending=[True, False],\n", - " ),\n", - " qo.Distinct(\"patient_id_hashed\"),\n", - ")\n", - "encounters = querier.public.admdad()\n", - "encounters = encounters.ops(ops).run()\n", - "print(f\"{len(encounters)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "9040fd47-79b7-4d1d-9ca4-2ea09efea267", - "metadata": {}, - "source": [ - "## Example 1b. From the above set of encounters, take a subset of patients who were discharged between April 1, 2015 and March 31, 2016." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "105c0824-b735-4f6b-a008-7ddf8279eb20", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-09-21 09:17:52,591 \u001b[1;37mINFO\u001b[0m cyclops.query.orm - Query returned successfully!\n", - "2023-09-21 09:17:52,592 \u001b[1;37mINFO\u001b[0m cyclops.utils.profile - Finished executing function run_query in 0.675141 s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "32567 rows extracted!\n" - ] - } - ], - "source": [ - "ops = qo.Sequential(\n", - " qo.Cast(\"discharge_date_time\", \"timestamp\"),\n", - " qo.ConditionAfterDate(\"discharge_date_time\", \"2015-04-01\"),\n", - " qo.ConditionBeforeDate(\"discharge_date_time\", \"2016-03-31\"),\n", - ")\n", - "encounters_query = querier.public.admdad()\n", - "encounters_query = encounters_query.ops(ops)\n", - "encounters = encounters_query.run()\n", - "print(f\"{len(encounters)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "f4d8b0bc-4fe1-4b07-86ad-ce1032708d78", - "metadata": {}, - "source": [ - "## Example 1c. From the above set of encounters, get the total number of admissions for each hospital." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "65d92139-c1d5-45ce-908f-9052c9131ed3", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-09-21 09:17:52,856 \u001b[1;37mINFO\u001b[0m cyclops.query.orm - Query returned successfully!\n", - "2023-09-21 09:17:52,857 \u001b[1;37mINFO\u001b[0m cyclops.utils.profile - Finished executing function run_query in 0.145693 s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7 rows extracted!\n" - ] - } - ], - "source": [ - "ops = qo.GroupByAggregate(\"hospital_num\", {\"hospital_num\": (\"count\", \"count\")})\n", - "encounters_per_site = encounters_query.ops(ops).run()\n", - "print(f\"{len(encounters_per_site)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "848c5b90-1e66-4a5a-9ebb-9c0caa105c3f", - "metadata": {}, - "source": [ - "## Example 2a. How many sodium tests were placed between Apr 1, 2015 and May 31, 2015 at hospital 101?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "33bd3450-985f-4cc5-9da9-09846ad297e7", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-09-21 09:26:19,814 \u001b[1;37mINFO\u001b[0m cyclops.query.orm - Query returned successfully!\n", - "2023-09-21 09:26:19,815 \u001b[1;37mINFO\u001b[0m cyclops.utils.profile - Finished executing function run_query in 506.939296 s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows extracted!\n", - " hospital_num count\n", - "0 101 9305\n" - ] - } - ], - "source": [ - "encounters = querier.public.admdad()\n", - "encounters = encounters.ops(qo.ConditionEquals(\"hospital_num\", 101))\n", - "lab_ops = qo.Sequential(\n", - " qo.DropEmpty(\"collection_date_time\"),\n", - " qo.Cast(\"collection_date_time\", \"timestamp\"),\n", - " qo.ConditionAfterDate(\"collection_date_time\", \"2015-04-01\"),\n", - " qo.ConditionBeforeDate(\"collection_date_time\", \"2015-05-31\"),\n", - " qo.ConditionSubstring(\"test_type_mapped\", \"sodium\"),\n", - ")\n", - "labs = querier.public.lab()\n", - "labs = labs.ops(lab_ops)\n", - "encounters_labs = encounters.join(labs, on=\"genc_id\")\n", - "encounters_labs = encounters_labs.ops(\n", - " qo.GroupByAggregate(\"hospital_num\", {\"hospital_num\": (\"count\", \"count\")}),\n", - ")\n", - "sodium_tests = encounters_labs.run()\n", - "print(f\"{len(sodium_tests)} rows extracted!\")\n", - "print(sodium_tests)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "cyclops", - "language": "python", - "name": "cyclops" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/tutorials/mimiciii/query_api.ipynb b/docs/source/tutorials/mimiciii/query_api.ipynb deleted file mode 100644 index a4239dfa2..000000000 --- a/docs/source/tutorials/mimiciii/query_api.ipynb +++ /dev/null @@ -1,197 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "28288104-b8ce-43fa-a22b-8f02239551a5", - "metadata": {}, - "source": [ - "# MIMIC-III query API tutorial" - ] - }, - { - "cell_type": "markdown", - "id": "74b155b9-9ba6-455c-b987-4c261c339364", - "metadata": {}, - "source": [ - "This notebook shows examples of how to use the cyclops.query API on [MIMIC-III v1.4](https://physionet.org/content/mimiciii/1.4/).\n", - "\n", - "Each query is limit to 100 rows (for quick results).\n", - "\n", - "* First, setup the MIMIC-III database according to the instructions in [mimic-code](https://github.com/MIT-LCP/mimic-code/tree/main/mimic-iii/buildmimic/postgres).\n", - "* The database is assumed to be hosted using postgres. Update the config parameters such as username and password, passed to `MIMICIIIQuerier` accordingly." 
- ] - }, - { - "cell_type": "markdown", - "id": "0c7c1635-3a70-42d1-99a6-c7f7a9cf9d21", - "metadata": {}, - "source": [ - "## Imports and instantiate `MIMICIIIQuerier`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75a140e0-fb27-4319-862f-be54397abe5c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "\"\"\"MIMICIII query API tutorial.\"\"\"\n", - "\n", - "import cyclops.query.ops as qo\n", - "from cyclops.query import MIMICIIIQuerier\n", - "\n", - "\n", - "querier = MIMICIIIQuerier(\n", - " dbms=\"postgresql\",\n", - " port=5432,\n", - " host=\"localhost\",\n", - " database=\"mimiciii\",\n", - " user=\"postgres\",\n", - " password=\"pwd\",\n", - ")\n", - "# List all custom table methods.\n", - "querier.list_custom_tables()" - ] - }, - { - "cell_type": "markdown", - "id": "ea04cc11-e947-4097-91df-729f28b3732c", - "metadata": {}, - "source": [ - "## Example 1. Get all male patients with a mortality outcome." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1efa964-8978-4a0e-9892-5ea4ce9953a3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ops = qo.Sequential(\n", - " qo.ConditionEquals(\"expire_flag\", 1),\n", - " qo.ConditionEquals(\"gender\", \"M\"),\n", - ")\n", - "patients = querier.mimiciii.patients()\n", - "patients = patients.ops(ops).run(limit=100)\n", - "print(f\"{len(patients)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "cd43679b-3d33-4135-b97c-373447289a9c", - "metadata": { - "tags": [] - }, - "source": [ - "## Example 2. Get all female patient encounters with diagnoses (`gastroenteritis` in ICD-9 long title)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7ab5fa3-e26b-47a7-818f-1bf367a55760", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "patients = querier.mimiciii.patients()\n", - "patients = patients.ops(qo.ConditionEquals(\"gender\", \"F\"))\n", - "admissions = querier.mimiciii.admissions()\n", - "patient_admissions = patients.join(\n", - " join_table=admissions,\n", - " on=\"subject_id\",\n", - ")\n", - "diagnoses = querier.diagnoses()\n", - "diagnoses = diagnoses.ops(qo.ConditionSubstring(\"long_title\", \"gastroenteritis\"))\n", - "patient_admissions_diagnoses = patient_admissions.join(\n", - " join_table=diagnoses,\n", - " on=[\"subject_id\", \"hadm_id\"],\n", - ").run(limit=100)\n", - "print(f\"{len(patient_admissions_diagnoses)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "241f7d10-9e04-44ae-b325-87f5a4046df2", - "metadata": {}, - "source": [ - "## Example 3. Get potassium lab tests for female patients." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24043abc-1878-4e00-8229-36d4a0368b98", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "patients = querier.mimiciii.patients()\n", - "patients = patients.ops(qo.ConditionEquals(\"gender\", \"F\"))\n", - "labs = querier.labevents()\n", - "labs = labs.ops(qo.ConditionEquals(\"label\", \"potassium\"))\n", - "patient_labs = patients.join(labs, on=\"subject_id\").run(limit=100)\n", - "print(f\"{len(patient_labs)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "2dbf0fdc-8d99-4fde-ab3e-7c8a67b72f8b", - "metadata": {}, - "source": [ - "## Example 4. Get AaDO2 carevue chart events for male patients that have a `valuenum` of less than 20." 
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f6142f27-e8d1-453c-bfe2-2265d9ff1914",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "chartevents_ops = qo.Sequential(\n",
- " qo.ConditionEquals(\"dbsource\", \"carevue\"),\n",
- " qo.ConditionEquals(\"label\", \"AaDO2\"),\n",
- " qo.ConditionLessThan(\"valuenum\", 20),\n",
- ")\n",
- "patients = querier.mimiciii.patients()\n",
- "patients = patients.ops(qo.ConditionEquals(\"gender\", \"M\"))\n",
- "chart_events = querier.chartevents()\n",
- "chart_events = chart_events.ops(chartevents_ops)\n",
- "patient_chart_events = patients.join(chart_events, on=\"subject_id\").run(limit=100)\n",
- "print(f\"{len(patient_chart_events)} rows extracted!\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "cyclops",
- "language": "python",
- "name": "cyclops"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/source/tutorials/mimiciv/query_api.ipynb b/docs/source/tutorials/mimiciv/query_api.ipynb
deleted file mode 100644
index 98bce4897..000000000
--- a/docs/source/tutorials/mimiciv/query_api.ipynb
+++ /dev/null
@@ -1,341 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "e337389b-1cfe-4796-a846-b4e1ba5690d6",
- "metadata": {},
- "source": [
- "# MIMIC-IV query API tutorial"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1efffc64",
- "metadata": {},
- "source": [
- "This notebook shows examples of how to use the cyclops.query API on [MIMIC-IV v2.0](https://physionet.org/content/mimiciv/2.0/).\n",
- "\n",
- "Each query is limited to 100 rows (for quick results).\n",
- "\n",
- "* First, set up the MIMIC-IV database according to the instructions in [mimic-code](https://github.com/MIT-LCP/mimic-code/tree/main/mimic-iv/buildmimic/postgres).\n",
- "* The database is assumed to be hosted using postgres. Update the config parameters, such as username and password, passed to `MIMICIVQuerier` accordingly."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "12c18656-7f16-4230-85d0-944563d6a13e",
- "metadata": {},
- "source": [
- "## Imports and instantiate `MIMICIVQuerier`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "53009e6b",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "\"\"\"MIMICIV query API tutorial.\"\"\"\n",
- "\n",
- "import cyclops.query.ops as qo\n",
- "from cyclops.query import MIMICIVQuerier\n",
- "\n",
- "\n",
- "querier = MIMICIVQuerier(\n",
- " dbms=\"postgresql\",\n",
- " port=5432,\n",
- " host=\"localhost\",\n",
- " database=\"mimiciv-2.0\",\n",
- " user=\"postgres\",\n",
- " password=\"pwd\",\n",
- ")\n",
- "# List all schemas.\n",
- "querier.list_schemas()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7b6214f8",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Example 1. 
Get all patient admissions from 2021 or later (approximate year of admission)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cdfadaa4-6fd6-4fd7-85cf-e012aa0799e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "patients = querier.patients()\n",
- "admissions = querier.mimiciv_hosp.admissions()\n",
- "patient_admissions = patients.join(admissions, on=\"subject_id\")\n",
- "ops = qo.Sequential(\n",
- " qo.AddDeltaColumn([\"admittime\", \"dischtime\"], years=\"anchor_year_difference\"),\n",
- " qo.ConditionAfterDate(\"admittime\", \"2021-01-01\"),\n",
- ")\n",
- "patient_admissions = patient_admissions.ops(ops).run(limit=100)\n",
- "print(f\"{len(patient_admissions)} rows extracted!\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "80d9f06e",
- "metadata": {},
- "source": [
- "## Example 2. Get all patient encounters with diagnoses (`schizophrenia` in ICD-10 long title), in the year 2015."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a89a9cf0",
- "metadata": {},
- "outputs": [],
- "source": [
- "patients = querier.patients()\n",
- "admissions = querier.mimiciv_hosp.admissions()\n",
- "patient_admissions = patients.join(admissions, on=\"subject_id\")\n",
- "ops = qo.Sequential(\n",
- " qo.AddDeltaColumn([\"admittime\", \"dischtime\"], years=\"anchor_year_difference\"),\n",
- " qo.ConditionInYears(\"admittime\", \"2015\"),\n",
- ")\n",
- "patient_admissions = patient_admissions.ops(ops)\n",
- "diagnoses = querier.diagnoses()\n",
- "diagnoses_ops = qo.Sequential(\n",
- " qo.ConditionEquals(\"icd_version\", 10),\n",
- " qo.ConditionSubstring(\"long_title\", \"schizophrenia\"),\n",
- ")\n",
- "diagnoses = diagnoses.ops(diagnoses_ops)\n",
- "patient_admissions_diagnoses = patient_admissions.join(\n",
- " join_table=diagnoses,\n",
- " on=[\"subject_id\", \"hadm_id\"],\n",
- ").run(limit=100)\n",
- "print(f\"{len(patient_admissions_diagnoses)} rows extracted!\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e2baea54",
- "metadata": {},
- "source": [
- "## Example 3. Advanced - uses `ConditionRegexMatch` from `cyclops.query.ops`. Get all patient encounters with diagnoses (ICD-9 long title contains `schizophrenia` and `chronic`), in the year 2015."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "03936cee",
- "metadata": {},
- "outputs": [],
- "source": [
- "patients = querier.patients()\n",
- "admissions = querier.mimiciv_hosp.admissions()\n",
- "patient_admissions = patients.join(admissions, on=\"subject_id\")\n",
- "ops = qo.Sequential(\n",
- " qo.AddDeltaColumn([\"admittime\", \"dischtime\"], years=\"anchor_year_difference\"),\n",
- " qo.ConditionInYears(\"admittime\", \"2015\"),\n",
- ")\n",
- "patient_admissions = patient_admissions.ops(ops)\n",
- "diagnoses = querier.diagnoses()\n",
- "diagnoses_ops = qo.Sequential(\n",
- " qo.ConditionEquals(\"icd_version\", 9),\n",
- " qo.ConditionRegexMatch(\"long_title\", r\"(?=.*schizophrenia)(?=.*chronic)\"),\n",
- ")\n",
- "diagnoses = diagnoses.ops(diagnoses_ops)\n",
- "patient_admissions_diagnoses = patient_admissions.join(\n",
- " join_table=diagnoses,\n",
- " on=[\"subject_id\", \"hadm_id\"],\n",
- ").run(limit=100)\n",
- "print(f\"{len(patient_admissions_diagnoses)} rows extracted!\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "30b0d604",
- "metadata": {},
- "source": [
- "## Example 4. Get routine vital signs for patients from year 2015."
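The `AddDeltaColumn` op recurring in these examples compensates for MIMIC-IV's date de-identification: patient timestamps are shifted, and the `anchor_year_difference` column (exposed, as we read it, by `MIMICIVQuerier.patients()`) gives the number of years to add back to recover approximate real-world dates. A rough pandas rendering of the same arithmetic on an already-fetched dataframe (a sketch of the idea, not the op's actual implementation):

    import pandas as pd

    def deshift_dates(df: pd.DataFrame) -> pd.DataFrame:
        """Add anchor_year_difference (in years) to the shifted timestamps."""
        for col in ("admittime", "dischtime"):
            df[col] = [
                ts + pd.DateOffset(years=int(delta))
                for ts, delta in zip(df[col], df["anchor_year_difference"])
            ]
        return df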
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56a72377", - "metadata": {}, - "outputs": [], - "source": [ - "patients = querier.patients()\n", - "admissions = querier.mimiciv_hosp.admissions()\n", - "patient_admissions = patients.join(admissions, on=\"subject_id\")\n", - "ops = qo.Sequential(\n", - " qo.AddDeltaColumn([\"admittime\", \"dischtime\"], years=\"anchor_year_difference\"),\n", - " qo.ConditionInYears(\"admittime\", \"2015\"),\n", - ")\n", - "patient_admissions = patient_admissions.ops(ops)\n", - "chart_events = querier.chartevents()\n", - "vitals = chart_events.ops(qo.ConditionEquals(\"category\", \"Routine Vital Signs\"))\n", - "patient_admissions_vitals = patient_admissions.join(\n", - " join_table=vitals,\n", - " on=[\"subject_id\", \"hadm_id\"],\n", - ").run(limit=100)\n", - "print(f\"{len(patient_admissions_vitals)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "621479f0", - "metadata": {}, - "source": [ - "## Example 5. Get hemoglobin lab tests for patients from year 2009." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bce11f81", - "metadata": {}, - "outputs": [], - "source": [ - "patients = querier.patients()\n", - "admissions = querier.mimiciv_hosp.admissions()\n", - "patient_admissions = patients.join(admissions, on=\"subject_id\")\n", - "ops = qo.Sequential(\n", - " qo.AddDeltaColumn([\"admittime\", \"dischtime\"], years=\"anchor_year_difference\"),\n", - " qo.ConditionInYears(\"admittime\", \"2009\"),\n", - ")\n", - "patient_admissions = patient_admissions.ops(ops)\n", - "chart_events = querier.chartevents()\n", - "labs = chart_events.ops(qo.ConditionEquals(\"label\", \"hemoglobin\"))\n", - "patient_admissions_labs = patient_admissions.join(\n", - " join_table=labs,\n", - " on=[\"subject_id\", \"hadm_id\"],\n", - ").run(limit=100)\n", - "print(f\"{len(patient_admissions_labs)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "b0a9bc12-dda3-4445-9156-52d295a1c48f", - "metadata": {}, - "source": [ - "## Example 6. Get radiology reports and filter on keywords `lymphadenopathy` and `infectious` occurring together from year 2009." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f00d270c-d78f-4dc0-8dae-ff4d52958c8b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "patients = querier.patients()\n", - "admissions = querier.mimiciv_hosp.admissions()\n", - "patient_admissions = patients.join(admissions, on=\"subject_id\")\n", - "ops = qo.Sequential(\n", - " qo.AddDeltaColumn([\"admittime\", \"dischtime\"], years=\"anchor_year_difference\"),\n", - " qo.ConditionInYears(\"admittime\", \"2009\"),\n", - ")\n", - "patient_admissions = patient_admissions.ops(ops)\n", - "radiology_notes = querier.mimiciv_note.radiology()\n", - "radiology_notes_ops = qo.Sequential(\n", - " qo.And(\n", - " qo.ConditionLike(\"text\", \"% lymphadenopathy %\"),\n", - " qo.ConditionLike(\"text\", \"% infectious %\"),\n", - " ),\n", - ")\n", - "radiology_notes = radiology_notes.ops(radiology_notes_ops)\n", - "patient_admissions_radiology_notes = patient_admissions.join(\n", - " join_table=radiology_notes,\n", - " on=[\"subject_id\", \"hadm_id\"],\n", - ").run(limit=100)\n", - "print(f\"{len(patient_admissions_radiology_notes)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "9550ba62-c128-4259-a075-0cbd9c70b662", - "metadata": {}, - "source": [ - "## Example 7. 
Get all female patient encounters from year 2015, and return the result as a dask dataframe (lazy evaluation) with 4 partitions (batches) split on `subject_id`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "28683d70-376e-4d9b-883d-1a7de634e455",
- "metadata": {},
- "outputs": [],
- "source": [
- "patients = querier.patients()\n",
- "admissions = querier.mimiciv_hosp.admissions()\n",
- "patient_admissions = patients.join(admissions, on=\"subject_id\")\n",
- "ops = qo.Sequential(\n",
- " qo.AddDeltaColumn([\"admittime\", \"dischtime\"], years=\"anchor_year_difference\"),\n",
- " qo.ConditionInYears(\"admittime\", \"2015\"),\n",
- " qo.Cast(\"gender\", \"str\"),\n",
- " qo.ConditionEquals(\"gender\", \"F\"),\n",
- ")\n",
- "patient_admissions = patient_admissions.ops(ops)\n",
- "patient_admissions = patient_admissions.run(\n",
- " backend=\"dask\",\n",
- " index_col=\"subject_id\",\n",
- " n_partitions=4,\n",
- ")\n",
- "print(f\"{len(patient_admissions)} rows extracted!\")\n",
- "print(f\"Return type: {type(patient_admissions)}\")\n",
- "print(f\"Number of partitions: {patient_admissions.npartitions}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e1ed2708",
- "metadata": {},
- "source": [
- "## Example 8. Running a raw SQL string."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a853deec",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "data = querier.db.run_query(\"SELECT * FROM mimiciv_hosp.admissions LIMIT 100\")\n",
- "print(f\"{len(data)} rows extracted!\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "cyclops",
- "language": "python",
- "name": "cyclops"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/source/tutorials/omop/query_api.ipynb b/docs/source/tutorials/omop/query_api.ipynb
deleted file mode 100644
index 23fb06cac..000000000
--- a/docs/source/tutorials/omop/query_api.ipynb
+++ /dev/null
@@ -1,288 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "e337389b-1cfe-4796-a846-b4e1ba5690d6",
- "metadata": {},
- "source": [
- "# OMOP query API tutorial"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1efffc64",
- "metadata": {},
- "source": [
- "This notebook shows examples of how to use the cyclops.query API to query EHR databases that follow the OMOP common data model. Each query is limited to 100 rows (for quick results).\n",
- "\n",
- "We showcase the examples on:\n",
- "\n",
- "1. [Synthea](https://github.com/synthetichealth/synthea) in OMOP format.\n",
- "\n",
- " * First, generate Synthea data using their releases. We used [v2.7.0](https://github.com/synthetichealth/synthea/releases/tag/v2.7.0) to generate the data.\n",
- " * Follow the instructions provided in [ETL-Synthea](https://github.com/OHDSI/ETL-Synthea) to load the CSV data into a postgres database, and perform the ETL to load the data into OMOP format."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "12c18656-7f16-4230-85d0-944563d6a13e",
- "metadata": {},
- "source": [
- "## Imports and instantiate `OMOPQuerier`."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c4cfec2d-af3f-431d-a851-c8853e31df08",
- "metadata": {},
- "source": [
- "Pass in the `schema_name`, which is the name of the postgres schema that houses all the OMOP tables."
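As the MIMIC-III tutorial's `querier.mimiciii.patients()` calls suggest, the schema name also becomes an attribute on the querier, so OMOP tables should be reachable either through the convenience methods used below or fully qualified. A brief sketch under that assumption:

    # Assuming a querier created with schema_name="cdm_synthea10", as in the
    # next cell, these two forms should resolve to the same underlying table:
    visits = querier.visit_occurrence()                 # convenience method
    visits_qualified = querier.cdm_synthea10.visit_occurrence()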
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "53009e6b",
- "metadata": {},
- "outputs": [],
- "source": [
- "\"\"\"OMOP query API tutorial.\"\"\"\n",
- "\n",
- "import pandas as pd\n",
- "\n",
- "import cyclops.query.ops as qo\n",
- "from cyclops.query import OMOPQuerier\n",
- "\n",
- "\n",
- "querier = OMOPQuerier(\n",
- " dbms=\"postgresql\",\n",
- " port=5432,\n",
- " host=\"localhost\",\n",
- " database=\"synthea_integration_test\",\n",
- " user=\"postgres\",\n",
- " password=\"pwd\",\n",
- " schema_name=\"cdm_synthea10\",\n",
- ")\n",
- "# List all tables.\n",
- "querier.list_tables(\"cdm_synthea10\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7b6214f8",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Example 1. Get all patient visits in or after 2010."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3a3d9cb9-fe40-45b8-ba2f-8de52a3b7f4f",
- "metadata": {},
- "outputs": [],
- "source": [
- "visits = querier.visit_occurrence()\n",
- "visits = visits.ops(qo.ConditionAfterDate(\"visit_start_date\", \"2010-01-01\"))\n",
- "visits = visits.run(limit=100)\n",
- "print(f\"{len(visits)} rows extracted!\")\n",
- "pd.to_datetime(visits[\"visit_start_date\"]).dt.year.value_counts().sort_index()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fcaea674-b967-4fbc-a7be-4d8b4492ef56",
- "metadata": {},
- "source": [
- "## Example 2. Get measurements for all visits in or after 2020."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "030e2491-a7cc-42f3-a1ca-618212b3524c",
- "metadata": {},
- "outputs": [],
- "source": [
- "visits = querier.visit_occurrence()\n",
- "visits = visits.ops(qo.ConditionAfterDate(\"visit_start_date\", \"2020-01-01\"))\n",
- "measurements = querier.measurement()\n",
- "visits_measurements = visits.join(\n",
- " join_table=measurements,\n",
- " on=\"visit_occurrence_id\",\n",
- ").run(limit=100)\n",
- "print(f\"{len(visits_measurements)} rows extracted!\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "602273d9-8d78-4e69-8566-7f9d04553d3a",
- "metadata": {
- "tags": []
- },
- "source": [
- "2. [MIMIC-III v1.4](https://physionet.org/content/mimiciii/1.4/) in OMOP format.\n",
- "\n",
- "* First, set up the MIMIC-III database according to the instructions in [mimic-code](https://github.com/MIT-LCP/mimic-code/tree/main/mimic-iii/buildmimic/postgres).\n",
- "* Perform the ETL in the [mimic-omop](https://github.com/MIT-LCP/mimic-omop) repo.\n",
- "* The database is assumed to be hosted using postgres. Update the config parameters, such as username and password, passed to `OMOPQuerier` accordingly."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "10da36a2-28e2-4975-83c7-68d0d366e459",
- "metadata": {},
- "source": [
- "## Imports and instantiate `OMOPQuerier`."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4a050fa9-442b-44dc-b241-86e73516e693",
- "metadata": {},
- "source": [
- "Pass in the `schema_name`, which is the name of the postgres schema that houses all the OMOP tables."
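The MIMIC-III examples that follow lean on `map_concept_ids_to_name`, which, as far as these tutorials show, resolves the listed `*_concept_id` columns against the OMOP `concept` table and attaches matching `*_concept_name` columns that string ops can then filter on. A condensed sketch of that pattern (column names taken from Example 1 below):

    visits = querier.visit_occurrence()
    visits = querier.map_concept_ids_to_name(visits, ["discharge_to_concept_id"])
    # The mapped table now carries a "discharge_to_concept_name" column, so
    # substring conditions can filter on human-readable concept names:
    died = visits.ops(qo.ConditionSubstring("discharge_to_concept_name", "died"))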
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0622b3df-2864-4f32-bd98-806019f59c50", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "querier = OMOPQuerier(\n", - " dbms=\"postgresql\",\n", - " port=5432,\n", - " host=\"localhost\",\n", - " database=\"mimiciii\",\n", - " user=\"postgres\",\n", - " password=\"pwd\",\n", - " schema_name=\"omop\",\n", - ")\n", - "# List all schemas.\n", - "querier.list_schemas()" - ] - }, - { - "cell_type": "markdown", - "id": "c4967160-bd45-4ce0-8b00-27f4f9742c68", - "metadata": {}, - "source": [ - "## Example 1. Get all patient visits that ended in a mortality outcome in or after 2010." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40ff2e83-75e4-4119-aa33-26f95e63ddaa", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "visits = querier.visit_occurrence()\n", - "visits = visits.ops(qo.ConditionAfterDate(\"visit_start_date\", \"2010-01-01\"))\n", - "visits_concept_mapped = querier.map_concept_ids_to_name(\n", - " visits,\n", - " [\n", - " \"discharge_to_concept_id\",\n", - " \"admitting_concept_id\",\n", - " ],\n", - ")\n", - "visits_concept_mapped_died = visits_concept_mapped.ops(\n", - " qo.ConditionSubstring(\"discharge_to_concept_name\", \"died\"),\n", - ").run()\n", - "print(f\"{len(visits_concept_mapped_died)} rows extracted!\")" - ] - }, - { - "cell_type": "markdown", - "id": "996ad6ff-8213-43ea-b701-e52a1d0b01bf", - "metadata": { - "tags": [] - }, - "source": [ - "## Example 2. Get all measurements for female patient visits with `sepsis` diagnoses, that ended in a mortality outcome." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46fd771c-5da7-4bce-aec7-08a5210a069b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "persons = querier.person()\n", - "persons = persons.ops(qo.ConditionSubstring(\"gender_concept_name\", \"FEMALE\"))\n", - "visits = querier.visit_occurrence()\n", - "person_visits = persons.join(visits, on=\"person_id\")\n", - "conditions = querier.omop.condition_occurrence()\n", - "person_visits_conditions = person_visits.join(\n", - " conditions,\n", - " on=\"visit_occurrence_id\",\n", - " isouter=True,\n", - ")\n", - "measurement = querier.measurement()\n", - "person_visits_conditions_measurements = person_visits_conditions.join(\n", - " measurement,\n", - " on=\"visit_occurrence_id\",\n", - " isouter=True,\n", - ")\n", - "person_visits_conditions_measurements = querier.map_concept_ids_to_name(\n", - " person_visits_conditions_measurements,\n", - " [\n", - " \"discharge_to_concept_id\",\n", - " \"admitting_concept_id\",\n", - " \"condition_concept_id\",\n", - " ],\n", - ")\n", - "ops = qo.Sequential(\n", - " qo.ConditionSubstring(\"discharge_to_concept_name\", \"died\"),\n", - " qo.ConditionSubstring(\"condition_concept_name\", \"sepsis\"),\n", - ")\n", - "cohort = person_visits_conditions_measurements.ops(ops).run(limit=100)\n", - "print(f\"{len(cohort)} rows extracted!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d20a2581-f613-4ab8-9feb-3e84b8835db1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cohort[\"measurement_concept_name\"].value_counts()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "cyclops", - "language": "python", - "name": "cyclops" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - 
"nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/tutorials_query.rst b/docs/source/tutorials_query.rst deleted file mode 100644 index c540bc4c0..000000000 --- a/docs/source/tutorials_query.rst +++ /dev/null @@ -1,16 +0,0 @@ -query API -========= - -The query API allows you to query EHR databases. It is a Python API that -that communicates with postgresql databases. It is a wrapper around the -SQLAlchemy ORM and uses SQLAlchemy query objects and functions to build -queries. - - -.. toctree:: - - tutorials/mimiciii/query_api.ipynb - tutorials/mimiciv/query_api.ipynb - tutorials/eicu/query_api.ipynb - tutorials/omop/query_api.ipynb - tutorials/gemini/query_api.ipynb diff --git a/poetry.lock b/poetry.lock index 653c9e709..6025091fd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -440,13 +440,13 @@ tomli = {version = "*", markers = "python_version < \"3.11\""} [[package]] name = "babel" -version = "2.13.0" +version = "2.13.1" description = "Internationalization utilities" optional = false python-versions = ">=3.7" files = [ - {file = "Babel-2.13.0-py3-none-any.whl", hash = "sha256:fbfcae1575ff78e26c7449136f1abbefc3c13ce542eeb13d43d50d8b047216ec"}, - {file = "Babel-2.13.0.tar.gz", hash = "sha256:04c3e2d28d2b7681644508f836be388ae49e0cfe91465095340395b60d00f210"}, + {file = "Babel-2.13.1-py3-none-any.whl", hash = "sha256:7077a4984b02b6727ac10f1f7294484f737443d7e2e66c5e4380e41a3ae0b4ed"}, + {file = "Babel-2.13.1.tar.gz", hash = "sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900"}, ] [package.extras] @@ -1121,33 +1121,34 @@ files = [ [[package]] name = "dask" -version = "2022.12.1" +version = "2023.10.0" description = "Parallel PyData with Task Scheduling" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "dask-2022.12.1-py3-none-any.whl", hash = "sha256:a833ee774bf702c08d22f31412358d12b007df36c6e8c107f32f17a4b20f1f68"}, - {file = "dask-2022.12.1.tar.gz", hash = "sha256:ef12c98a6681964494ddfee4ba8071ebc8895d3c4ea27f5c5160a14e29f01d92"}, + {file = "dask-2023.10.0-py3-none-any.whl", hash = "sha256:cb133919ff6f1fc021efe1eef24df0e4acecf33a7708e7b04d2dea6b45e166bb"}, + {file = "dask-2023.10.0.tar.gz", hash = "sha256:3fdfdbdb5f9f3a556487bf37142e5a730dab2f2c8eca0b6c79d11199c30220e3"}, ] [package.dependencies] -click = ">=7.0" -cloudpickle = ">=1.1.1" -fsspec = ">=0.6.0" -numpy = {version = ">=1.18", optional = true, markers = "extra == \"dataframe\""} +click = ">=8.0" +cloudpickle = ">=1.5.0" +fsspec = ">=2021.09.0" +importlib-metadata = ">=4.13.0" +numpy = {version = ">=1.21", optional = true, markers = "extra == \"array\""} packaging = ">=20.0" -pandas = {version = ">=1.0", optional = true, markers = "extra == \"dataframe\""} -partd = ">=0.3.10" +pandas = {version = ">=1.3", optional = true, markers = "extra == \"dataframe\""} +partd = ">=1.2.0" pyyaml = ">=5.3.1" -toolz = ">=0.8.2" +toolz = ">=0.10.0" [package.extras] -array = ["numpy (>=1.18)"] -complete = ["bokeh (>=2.4.2,<3)", "distributed (==2022.12.1)", "jinja2", "numpy (>=1.18)", "pandas (>=1.0)"] -dataframe = ["numpy (>=1.18)", "pandas (>=1.0)"] -diagnostics = ["bokeh (>=2.4.2,<3)", "jinja2"] -distributed = ["distributed (==2022.12.1)"] -test = ["pandas[test]", "pre-commit", "pytest", "pytest-rerunfailures", "pytest-xdist"] +array = ["numpy (>=1.21)"] +complete = ["dask[array,dataframe,diagnostics,distributed]", "lz4 (>=4.3.2)", "pyarrow (>=7.0)"] +dataframe = ["dask[array]", 
"pandas (>=1.3)"] +diagnostics = ["bokeh (>=2.4.2)", "jinja2 (>=2.10.3)"] +distributed = ["distributed (==2023.10.0)"] +test = ["pandas[test]", "pre-commit", "pytest", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist"] [[package]] name = "datasets" @@ -1535,81 +1536,6 @@ smb = ["smbprotocol"] ssh = ["paramiko"] tqdm = ["tqdm"] -[[package]] -name = "greenlet" -version = "3.0.0" -description = "Lightweight in-process concurrent programming" -optional = false -python-versions = ">=3.7" -files = [ - {file = "greenlet-3.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e09dea87cc91aea5500262993cbd484b41edf8af74f976719dd83fe724644cd6"}, - {file = "greenlet-3.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f47932c434a3c8d3c86d865443fadc1fbf574e9b11d6650b656e602b1797908a"}, - {file = "greenlet-3.0.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bdfaeecf8cc705d35d8e6de324bf58427d7eafb55f67050d8f28053a3d57118c"}, - {file = "greenlet-3.0.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a68d670c8f89ff65c82b936275369e532772eebc027c3be68c6b87ad05ca695"}, - {file = "greenlet-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ad562a104cd41e9d4644f46ea37167b93190c6d5e4048fcc4b80d34ecb278f"}, - {file = "greenlet-3.0.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02a807b2a58d5cdebb07050efe3d7deaf915468d112dfcf5e426d0564aa3aa4a"}, - {file = "greenlet-3.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b1660a15a446206c8545edc292ab5c48b91ff732f91b3d3b30d9a915d5ec4779"}, - {file = "greenlet-3.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:813720bd57e193391dfe26f4871186cf460848b83df7e23e6bef698a7624b4c9"}, - {file = "greenlet-3.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:aa15a2ec737cb609ed48902b45c5e4ff6044feb5dcdfcf6fa8482379190330d7"}, - {file = "greenlet-3.0.0-cp310-universal2-macosx_11_0_x86_64.whl", hash = "sha256:7709fd7bb02b31908dc8fd35bfd0a29fc24681d5cc9ac1d64ad07f8d2b7db62f"}, - {file = "greenlet-3.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:211ef8d174601b80e01436f4e6905aca341b15a566f35a10dd8d1e93f5dbb3b7"}, - {file = "greenlet-3.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6512592cc49b2c6d9b19fbaa0312124cd4c4c8a90d28473f86f92685cc5fef8e"}, - {file = "greenlet-3.0.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:871b0a8835f9e9d461b7fdaa1b57e3492dd45398e87324c047469ce2fc9f516c"}, - {file = "greenlet-3.0.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b505fcfc26f4148551826a96f7317e02c400665fa0883fe505d4fcaab1dabfdd"}, - {file = "greenlet-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:123910c58234a8d40eaab595bc56a5ae49bdd90122dde5bdc012c20595a94c14"}, - {file = "greenlet-3.0.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:96d9ea57292f636ec851a9bb961a5cc0f9976900e16e5d5647f19aa36ba6366b"}, - {file = "greenlet-3.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0b72b802496cccbd9b31acea72b6f87e7771ccfd7f7927437d592e5c92ed703c"}, - {file = "greenlet-3.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:527cd90ba3d8d7ae7dceb06fda619895768a46a1b4e423bdb24c1969823b8362"}, - {file = "greenlet-3.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:37f60b3a42d8b5499be910d1267b24355c495064f271cfe74bf28b17b099133c"}, - {file = 
"greenlet-3.0.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1482fba7fbed96ea7842b5a7fc11d61727e8be75a077e603e8ab49d24e234383"}, - {file = "greenlet-3.0.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:be557119bf467d37a8099d91fbf11b2de5eb1fd5fc5b91598407574848dc910f"}, - {file = "greenlet-3.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:73b2f1922a39d5d59cc0e597987300df3396b148a9bd10b76a058a2f2772fc04"}, - {file = "greenlet-3.0.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1e22c22f7826096ad503e9bb681b05b8c1f5a8138469b255eb91f26a76634f2"}, - {file = "greenlet-3.0.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1d363666acc21d2c204dd8705c0e0457d7b2ee7a76cb16ffc099d6799744ac99"}, - {file = "greenlet-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:334ef6ed8337bd0b58bb0ae4f7f2dcc84c9f116e474bb4ec250a8bb9bd797a66"}, - {file = "greenlet-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6672fdde0fd1a60b44fb1751a7779c6db487e42b0cc65e7caa6aa686874e79fb"}, - {file = "greenlet-3.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:952256c2bc5b4ee8df8dfc54fc4de330970bf5d79253c863fb5e6761f00dda35"}, - {file = "greenlet-3.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:269d06fa0f9624455ce08ae0179430eea61085e3cf6457f05982b37fd2cefe17"}, - {file = "greenlet-3.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9adbd8ecf097e34ada8efde9b6fec4dd2a903b1e98037adf72d12993a1c80b51"}, - {file = "greenlet-3.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6b5ce7f40f0e2f8b88c28e6691ca6806814157ff05e794cdd161be928550f4c"}, - {file = "greenlet-3.0.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ecf94aa539e97a8411b5ea52fc6ccd8371be9550c4041011a091eb8b3ca1d810"}, - {file = "greenlet-3.0.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80dcd3c938cbcac986c5c92779db8e8ce51a89a849c135172c88ecbdc8c056b7"}, - {file = "greenlet-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e52a712c38e5fb4fd68e00dc3caf00b60cb65634d50e32281a9d6431b33b4af1"}, - {file = "greenlet-3.0.0-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5539f6da3418c3dc002739cb2bb8d169056aa66e0c83f6bacae0cd3ac26b423"}, - {file = "greenlet-3.0.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:343675e0da2f3c69d3fb1e894ba0a1acf58f481f3b9372ce1eb465ef93cf6fed"}, - {file = "greenlet-3.0.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:abe1ef3d780de56defd0c77c5ba95e152f4e4c4e12d7e11dd8447d338b85a625"}, - {file = "greenlet-3.0.0-cp37-cp37m-win32.whl", hash = "sha256:e693e759e172fa1c2c90d35dea4acbdd1d609b6936115d3739148d5e4cd11947"}, - {file = "greenlet-3.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:bdd696947cd695924aecb3870660b7545a19851f93b9d327ef8236bfc49be705"}, - {file = "greenlet-3.0.0-cp37-universal2-macosx_11_0_x86_64.whl", hash = "sha256:cc3e2679ea13b4de79bdc44b25a0c4fcd5e94e21b8f290791744ac42d34a0353"}, - {file = "greenlet-3.0.0-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:63acdc34c9cde42a6534518e32ce55c30f932b473c62c235a466469a710bfbf9"}, - {file = "greenlet-3.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a1a6244ff96343e9994e37e5b4839f09a0207d35ef6134dce5c20d260d0302c"}, - {file = "greenlet-3.0.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b822fab253ac0f330ee807e7485769e3ac85d5eef827ca224feaaefa462dc0d0"}, - {file = "greenlet-3.0.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8060b32d8586e912a7b7dac2d15b28dbbd63a174ab32f5bc6d107a1c4143f40b"}, - {file = "greenlet-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:621fcb346141ae08cb95424ebfc5b014361621b8132c48e538e34c3c93ac7365"}, - {file = "greenlet-3.0.0-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6bb36985f606a7c49916eff74ab99399cdfd09241c375d5a820bb855dfb4af9f"}, - {file = "greenlet-3.0.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:10b5582744abd9858947d163843d323d0b67be9432db50f8bf83031032bc218d"}, - {file = "greenlet-3.0.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f351479a6914fd81a55c8e68963609f792d9b067fb8a60a042c585a621e0de4f"}, - {file = "greenlet-3.0.0-cp38-cp38-win32.whl", hash = "sha256:9de687479faec7db5b198cc365bc34addd256b0028956501f4d4d5e9ca2e240a"}, - {file = "greenlet-3.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:3fd2b18432e7298fcbec3d39e1a0aa91ae9ea1c93356ec089421fabc3651572b"}, - {file = "greenlet-3.0.0-cp38-universal2-macosx_11_0_x86_64.whl", hash = "sha256:3c0d36f5adc6e6100aedbc976d7428a9f7194ea79911aa4bf471f44ee13a9464"}, - {file = "greenlet-3.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4cd83fb8d8e17633ad534d9ac93719ef8937568d730ef07ac3a98cb520fd93e4"}, - {file = "greenlet-3.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a5b2d4cdaf1c71057ff823a19d850ed5c6c2d3686cb71f73ae4d6382aaa7a06"}, - {file = "greenlet-3.0.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e7dcdfad252f2ca83c685b0fa9fba00e4d8f243b73839229d56ee3d9d219314"}, - {file = "greenlet-3.0.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c94e4e924d09b5a3e37b853fe5924a95eac058cb6f6fb437ebb588b7eda79870"}, - {file = "greenlet-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad6fb737e46b8bd63156b8f59ba6cdef46fe2b7db0c5804388a2d0519b8ddb99"}, - {file = "greenlet-3.0.0-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d55db1db455c59b46f794346efce896e754b8942817f46a1bada2d29446e305a"}, - {file = "greenlet-3.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:56867a3b3cf26dc8a0beecdb4459c59f4c47cdd5424618c08515f682e1d46692"}, - {file = "greenlet-3.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a812224a5fb17a538207e8cf8e86f517df2080c8ee0f8c1ed2bdaccd18f38f4"}, - {file = "greenlet-3.0.0-cp39-cp39-win32.whl", hash = "sha256:0d3f83ffb18dc57243e0151331e3c383b05e5b6c5029ac29f754745c800f8ed9"}, - {file = "greenlet-3.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:831d6f35037cf18ca5e80a737a27d822d87cd922521d18ed3dbc8a6967be50ce"}, - {file = "greenlet-3.0.0-cp39-universal2-macosx_11_0_x86_64.whl", hash = "sha256:a048293392d4e058298710a54dfaefcefdf49d287cd33fb1f7d63d55426e4355"}, - {file = "greenlet-3.0.0.tar.gz", hash = "sha256:19834e3f91f485442adc1ee440171ec5d9a4840a1f7bd5ed97833544719ce10b"}, -] - -[package.extras] -docs = ["Sphinx"] -test = ["objgraph", "psutil"] - [[package]] name = "huggingface-hub" version = "0.18.0" @@ -1776,13 +1702,13 @@ files = [ [[package]] name = "ipykernel" -version = "6.25.2" +version = "6.26.0" description = "IPython Kernel for Jupyter" optional = false python-versions = ">=3.8" files = [ - {file = "ipykernel-6.25.2-py3-none-any.whl", hash = 
"sha256:2e2ee359baba19f10251b99415bb39de1e97d04e1fab385646f24f0596510b77"}, - {file = "ipykernel-6.25.2.tar.gz", hash = "sha256:f468ddd1f17acb48c8ce67fcfa49ba6d46d4f9ac0438c1f441be7c3d1372230b"}, + {file = "ipykernel-6.26.0-py3-none-any.whl", hash = "sha256:3ba3dc97424b87b31bb46586b5167b3161b32d7820b9201a9e698c71e271602c"}, + {file = "ipykernel-6.26.0.tar.gz", hash = "sha256:553856658eb8430bbe9653ea041a41bff63e9606fc4628873fc92a6cf3abd404"}, ] [package.dependencies] @@ -2370,13 +2296,13 @@ qtconsole = "*" [[package]] name = "jupyter-client" -version = "8.4.0" +version = "8.5.0" description = "Jupyter protocol implementation and client libraries" optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_client-8.4.0-py3-none-any.whl", hash = "sha256:6a2a950ec23a8f62f9e4c66acec7f0ea6c7d1f80ba0992e747b10c56ce2e6dbe"}, - {file = "jupyter_client-8.4.0.tar.gz", hash = "sha256:dc1b857d5d7d76ac101766c6e9b646bf18742721126e72e5d484c75a993cada2"}, + {file = "jupyter_client-8.5.0-py3-none-any.whl", hash = "sha256:c3877aac7257ec68d79b5c622ce986bd2a992ca42f6ddc9b4dd1da50e89f7028"}, + {file = "jupyter_client-8.5.0.tar.gz", hash = "sha256:e8754066510ce456358df363f97eae64b50860f30dc1fe8c6771440db3be9a63"}, ] [package.dependencies] @@ -2462,13 +2388,13 @@ test = ["click", "pre-commit", "pytest (>=7.0)", "pytest-asyncio (>=0.19.0)", "p [[package]] name = "jupyter-server" -version = "2.8.0" +version = "2.9.1" description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications." optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_server-2.8.0-py3-none-any.whl", hash = "sha256:c57270faa6530393ae69783a2d2f1874c718b9f109080581ea076b05713249fa"}, - {file = "jupyter_server-2.8.0.tar.gz", hash = "sha256:b11e2ba80667c75f55630faf8ac3d5809f8734f9006d65cce117c46a0a516ab8"}, + {file = "jupyter_server-2.9.1-py3-none-any.whl", hash = "sha256:21ad1a3d455d5a79ce4bef5201925cd17510c17898cf9d54e3ccfb6b12734948"}, + {file = "jupyter_server-2.9.1.tar.gz", hash = "sha256:9ba71be4b9c16e479e4c50c929f8ac4b1015baf90237a08681397a98c76c7e5e"}, ] [package.dependencies] @@ -4309,26 +4235,6 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] -[[package]] -name = "psycopg2" -version = "2.9.9" -description = "psycopg2 - Python-PostgreSQL Database Adapter" -optional = false -python-versions = ">=3.7" -files = [ - {file = "psycopg2-2.9.9-cp310-cp310-win32.whl", hash = "sha256:38a8dcc6856f569068b47de286b472b7c473ac7977243593a288ebce0dc89516"}, - {file = "psycopg2-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:426f9f29bde126913a20a96ff8ce7d73fd8a216cfb323b1f04da402d452853c3"}, - {file = "psycopg2-2.9.9-cp311-cp311-win32.whl", hash = "sha256:ade01303ccf7ae12c356a5e10911c9e1c51136003a9a1d92f7aa9d010fb98372"}, - {file = "psycopg2-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:121081ea2e76729acfb0673ff33755e8703d45e926e416cb59bae3a86c6a4981"}, - {file = "psycopg2-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:5e0d98cade4f0e0304d7d6f25bbfbc5bd186e07b38eac65379309c4ca3193efa"}, - {file = "psycopg2-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:7e2dacf8b009a1c1e843b5213a87f7c544b2b042476ed7755be813eaf4e8347a"}, - {file = "psycopg2-2.9.9-cp38-cp38-win32.whl", hash = "sha256:ff432630e510709564c01dafdbe996cb552e0b9f3f065eb89bdce5bd31fabf4c"}, - {file = "psycopg2-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:bac58c024c9922c23550af2a581998624d6e02350f4ae9c5f0bc642c633a2d5e"}, - {file = "psycopg2-2.9.9-cp39-cp39-win32.whl", hash = 
"sha256:c92811b2d4c9b6ea0285942b2e7cac98a59e166d59c588fe5cfe1eda58e72d59"}, - {file = "psycopg2-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:de80739447af31525feddeb8effd640782cf5998e1a4e9192ebdf829717e3913"}, - {file = "psycopg2-2.9.9.tar.gz", hash = "sha256:d1454bde93fb1e224166811694d600e746430c006fbb031ea06ecc2ea41bf156"}, -] - [[package]] name = "ptyprocess" version = "0.7.0" @@ -4554,13 +4460,13 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pytest" -version = "7.4.2" +version = "7.4.3" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.4.2-py3-none-any.whl", hash = "sha256:1d881c6124e08ff0a1bb75ba3ec0bfd8b5354a01c194ddd5a0a870a48d99b002"}, - {file = "pytest-7.4.2.tar.gz", hash = "sha256:a766259cfab564a2ad52cb1aae1b881a75c3eb7e34ca3779697c23ed47c47069"}, + {file = "pytest-7.4.3-py3-none-any.whl", hash = "sha256:0d009c083ea859a71b76adf7c1d502e4bc170b80a8ef002da5806527b9591fac"}, + {file = "pytest-7.4.3.tar.gz", hash = "sha256:d989d136982de4e3b29dabcc838ad581c64e8ed52c11fbe86ddebd9da0818cd5"}, ] [package.dependencies] @@ -4926,13 +4832,13 @@ test = ["flaky", "pytest", "pytest-qt"] [[package]] name = "qtpy" -version = "2.4.0" +version = "2.4.1" description = "Provides an abstraction layer on top of the various Qt bindings (PyQt5/6 and PySide2/6)." optional = false python-versions = ">=3.7" files = [ - {file = "QtPy-2.4.0-py3-none-any.whl", hash = "sha256:4d4f045a41e09ac9fa57fcb47ef05781aa5af294a0a646acc1b729d14225e741"}, - {file = "QtPy-2.4.0.tar.gz", hash = "sha256:db2d508167aa6106781565c8da5c6f1487debacba33519cedc35fa8997d424d4"}, + {file = "QtPy-2.4.1-py3-none-any.whl", hash = "sha256:1c1d8c4fa2c884ae742b069151b0abe15b3f70491f3972698c683b8e38de839b"}, + {file = "QtPy-2.4.1.tar.gz", hash = "sha256:a5a15ffd519550a1361bdc56ffc07fda56a6af7292f17c7b395d4083af632987"}, ] [package.dependencies] @@ -5230,28 +5136,28 @@ files = [ [[package]] name = "ruff" -version = "0.1.1" +version = "0.1.2" description = "An extremely fast Python linter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.1-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b7cdc893aef23ccc14c54bd79a8109a82a2c527e11d030b62201d86f6c2b81c5"}, - {file = "ruff-0.1.1-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:620d4b34302538dbd8bbbe8fdb8e8f98d72d29bd47e972e2b59ce6c1e8862257"}, - {file = "ruff-0.1.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a909d3930afdbc2e9fd893b0034479e90e7981791879aab50ce3d9f55205bd6"}, - {file = "ruff-0.1.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3305d1cb4eb8ff6d3e63a48d1659d20aab43b49fe987b3ca4900528342367145"}, - {file = "ruff-0.1.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c34ae501d0ec71acf19ee5d4d889e379863dcc4b796bf8ce2934a9357dc31db7"}, - {file = "ruff-0.1.1-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:6aa7e63c3852cf8fe62698aef31e563e97143a4b801b57f920012d0e07049a8d"}, - {file = "ruff-0.1.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2d68367d1379a6b47e61bc9de144a47bcdb1aad7903bbf256e4c3d31f11a87ae"}, - {file = "ruff-0.1.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bc11955f6ce3398d2afe81ad7e49d0ebf0a581d8bcb27b8c300281737735e3a3"}, - {file = "ruff-0.1.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbbd8eead88ea83a250499074e2a8e9d80975f0b324b1e2e679e4594da318c25"}, - {file = "ruff-0.1.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:f4780e2bb52f3863a565ec3f699319d3493b83ff95ebbb4993e59c62aaf6e75e"}, - {file = "ruff-0.1.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8f5b24daddf35b6c207619301170cae5d2699955829cda77b6ce1e5fc69340df"}, - {file = "ruff-0.1.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d3f9ac658ba29e07b95c80fa742b059a55aefffa8b1e078bc3c08768bdd4b11a"}, - {file = "ruff-0.1.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:3521bf910104bf781e6753282282acc145cbe3eff79a1ce6b920404cd756075a"}, - {file = "ruff-0.1.1-py3-none-win32.whl", hash = "sha256:ba3208543ab91d3e4032db2652dcb6c22a25787b85b8dc3aeff084afdc612e5c"}, - {file = "ruff-0.1.1-py3-none-win_amd64.whl", hash = "sha256:3ff3006c97d9dc396b87fb46bb65818e614ad0181f059322df82bbfe6944e264"}, - {file = "ruff-0.1.1-py3-none-win_arm64.whl", hash = "sha256:e140bd717c49164c8feb4f65c644046fe929c46f42493672853e3213d7bdbce2"}, - {file = "ruff-0.1.1.tar.gz", hash = "sha256:c90461ae4abec261609e5ea436de4a4b5f2822921cf04c16d2cc9327182dbbcc"}, + {file = "ruff-0.1.2-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:0d3ee66b825b713611f89aa35d16de984f76f26c50982a25d52cd0910dff3923"}, + {file = "ruff-0.1.2-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:f85f850a320ff532b8f93e8d1da6a36ef03698c446357c8c43b46ef90bb321eb"}, + {file = "ruff-0.1.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:809c6d4e45683696d19ca79e4c6bd3b2e9204fe9546923f2eb3b126ec314b0dc"}, + {file = "ruff-0.1.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46005e4abb268e93cad065244e17e2ea16b6fcb55a5c473f34fbc1fd01ae34cb"}, + {file = "ruff-0.1.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10cdb302f519664d5e2cf954562ac86c9d20ca05855e5b5c2f9d542228f45da4"}, + {file = "ruff-0.1.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f89ebcbe57a1eab7d7b4ceb57ddf0af9ed13eae24e443a7c1dc078000bd8cc6b"}, + {file = 
"ruff-0.1.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7344eaca057d4c32373c9c3a7afb7274f56040c225b6193dd495fcf69453b436"}, + {file = "ruff-0.1.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dffa25f6e03c4950b6ac6f216bc0f98a4be9719cb0c5260c8e88d1bac36f1683"}, + {file = "ruff-0.1.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42ddaea52cb7ba7c785e8593a7532866c193bc774fe570f0e4b1ccedd95b83c5"}, + {file = "ruff-0.1.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a8533efda625bbec0bf27da2886bd641dae0c209104f6c39abc4be5b7b22de2a"}, + {file = "ruff-0.1.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:b0b1b82221ba7c50e03b7a86b983157b5d3f4d8d4f16728132bdf02c6d651f77"}, + {file = "ruff-0.1.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6c1362eb9288f8cc95535294cb03bd4665c8cef86ec32745476a4e5c6817034c"}, + {file = "ruff-0.1.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ffa7ef5ded0563329a35bd5a1cfdae40f05a75c0cc2dd30f00b1320b1fb461fc"}, + {file = "ruff-0.1.2-py3-none-win32.whl", hash = "sha256:6e8073f85e47072256e2e1909f1ae515cf61ff5a4d24730a63b8b4ac24b6704a"}, + {file = "ruff-0.1.2-py3-none-win_amd64.whl", hash = "sha256:b836ddff662a45385948ee0878b0a04c3a260949905ad861a37b931d6ee1c210"}, + {file = "ruff-0.1.2-py3-none-win_arm64.whl", hash = "sha256:b0c42d00db5639dbd5f7f9923c63648682dd197bf5de1151b595160c96172691"}, + {file = "ruff-0.1.2.tar.gz", hash = "sha256:afd4785ae060ce6edcd52436d0c197628a918d6d09e3107a892a1bad6a4c6608"}, ] [[package]] @@ -5989,13 +5895,13 @@ rtd = ["ipython", "myst-nb", "sphinx", "sphinx-book-theme", "sphinx-examples"] [[package]] name = "sphinx-rtd-theme" -version = "2.0.0rc2" +version = "2.0.0rc3" description = "Read the Docs theme for Sphinx" optional = false python-versions = ">=3.6" files = [ - {file = "sphinx_rtd_theme-2.0.0rc2-py2.py3-none-any.whl", hash = "sha256:f04df9213acf421c3b42f4f39005c8bc68fc4696c5b4ed4ef13d1678369713f7"}, - {file = "sphinx_rtd_theme-2.0.0rc2.tar.gz", hash = "sha256:d1270effe620df9164b1cd2d617909472a63531e21a716fd22d0fbcedf9d24ff"}, + {file = "sphinx_rtd_theme-2.0.0rc3-py2.py3-none-any.whl", hash = "sha256:9f1b959ba570ea3795cc3e057955d2720533291fa2b8b4e8f43c4ec94a21f732"}, + {file = "sphinx_rtd_theme-2.0.0rc3.tar.gz", hash = "sha256:c7157c46ee75f8c223781fb553b9881861613931ed7daccbb31f4113fb2c183e"}, ] [package.dependencies] @@ -6139,87 +6045,6 @@ Sphinx = ">=5" lint = ["docutils-stubs", "flake8", "mypy"] test = ["pytest"] -[[package]] -name = "sqlalchemy" -version = "1.4.49" -description = "Database Abstraction Library" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" -files = [ - {file = "SQLAlchemy-1.4.49-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2e126cf98b7fd38f1e33c64484406b78e937b1a280e078ef558b95bf5b6895f6"}, - {file = "SQLAlchemy-1.4.49-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:03db81b89fe7ef3857b4a00b63dedd632d6183d4ea5a31c5d8a92e000a41fc71"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:95b9df9afd680b7a3b13b38adf6e3a38995da5e162cc7524ef08e3be4e5ed3e1"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a63e43bf3f668c11bb0444ce6e809c1227b8f067ca1068898f3008a273f52b09"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ca46de16650d143a928d10842939dab208e8d8c3a9a8757600cae9b7c579c5cd"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f835c050ebaa4e48b18403bed2c0fda986525896efd76c245bdd4db995e51a4c"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c21b172dfb22e0db303ff6419451f0cac891d2e911bb9fbf8003d717f1bcf91"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-win32.whl", hash = "sha256:5fb1ebdfc8373b5a291485757bd6431de8d7ed42c27439f543c81f6c8febd729"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-win_amd64.whl", hash = "sha256:f8a65990c9c490f4651b5c02abccc9f113a7f56fa482031ac8cb88b70bc8ccaa"}, - {file = "SQLAlchemy-1.4.49-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8923dfdf24d5aa8a3adb59723f54118dd4fe62cf59ed0d0d65d940579c1170a4"}, - {file = "SQLAlchemy-1.4.49-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9ab2c507a7a439f13ca4499db6d3f50423d1d65dc9b5ed897e70941d9e135b0"}, - {file = "SQLAlchemy-1.4.49-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5debe7d49b8acf1f3035317e63d9ec8d5e4d904c6e75a2a9246a119f5f2fdf3d"}, - {file = "SQLAlchemy-1.4.49-cp311-cp311-win32.whl", hash = "sha256:82b08e82da3756765c2e75f327b9bf6b0f043c9c3925fb95fb51e1567fa4ee87"}, - {file = "SQLAlchemy-1.4.49-cp311-cp311-win_amd64.whl", hash = "sha256:171e04eeb5d1c0d96a544caf982621a1711d078dbc5c96f11d6469169bd003f1"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f23755c384c2969ca2f7667a83f7c5648fcf8b62a3f2bbd883d805454964a800"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8396e896e08e37032e87e7fbf4a15f431aa878c286dc7f79e616c2feacdb366c"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66da9627cfcc43bbdebd47bfe0145bb662041472393c03b7802253993b6b7c90"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-win32.whl", hash = "sha256:9a06e046ffeb8a484279e54bda0a5abfd9675f594a2e38ef3133d7e4d75b6214"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-win_amd64.whl", hash = "sha256:7cf8b90ad84ad3a45098b1c9f56f2b161601e4670827d6b892ea0e884569bd1d"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:36e58f8c4fe43984384e3fbe6341ac99b6b4e083de2fe838f0fdb91cebe9e9cb"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b31e67ff419013f99ad6f8fc73ee19ea31585e1e9fe773744c0f3ce58c039c30"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc22807a7e161c0d8f3da34018ab7c97ef6223578fcdd99b1d3e7ed1100a5db"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c14b29d9e1529f99efd550cd04dbb6db6ba5d690abb96d52de2bff4ed518bc95"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c40f3470e084d31247aea228aa1c39bbc0904c2b9ccbf5d3cfa2ea2dac06f26d"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-win32.whl", hash = "sha256:706bfa02157b97c136547c406f263e4c6274a7b061b3eb9742915dd774bbc264"}, - {file = 
"SQLAlchemy-1.4.49-cp36-cp36m-win_amd64.whl", hash = "sha256:a7f7b5c07ae5c0cfd24c2db86071fb2a3d947da7bd487e359cc91e67ac1c6d2e"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:4afbbf5ef41ac18e02c8dc1f86c04b22b7a2125f2a030e25bbb4aff31abb224b"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24e300c0c2147484a002b175f4e1361f102e82c345bf263242f0449672a4bccf"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393cd06c3b00b57f5421e2133e088df9cabcececcea180327e43b937b5a7caa5"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:201de072b818f8ad55c80d18d1a788729cccf9be6d9dc3b9d8613b053cd4836d"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653ed6817c710d0c95558232aba799307d14ae084cc9b1f4c389157ec50df5c"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-win32.whl", hash = "sha256:647e0b309cb4512b1f1b78471fdaf72921b6fa6e750b9f891e09c6e2f0e5326f"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-win_amd64.whl", hash = "sha256:ab73ed1a05ff539afc4a7f8cf371764cdf79768ecb7d2ec691e3ff89abbc541e"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:37ce517c011560d68f1ffb28af65d7e06f873f191eb3a73af5671e9c3fada08a"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1878ce508edea4a879015ab5215546c444233881301e97ca16fe251e89f1c55"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95ab792ca493891d7a45a077e35b418f68435efb3e1706cb8155e20e86a9013c"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0e8e608983e6f85d0852ca61f97e521b62e67969e6e640fe6c6b575d4db68557"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccf956da45290df6e809ea12c54c02ace7f8ff4d765d6d3dfb3655ee876ce58d"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-win32.whl", hash = "sha256:f167c8175ab908ce48bd6550679cc6ea20ae169379e73c7720a28f89e53aa532"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-win_amd64.whl", hash = "sha256:45806315aae81a0c202752558f0df52b42d11dd7ba0097bf71e253b4215f34f4"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:b6d0c4b15d65087738a6e22e0ff461b407533ff65a73b818089efc8eb2b3e1de"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a843e34abfd4c797018fd8d00ffffa99fd5184c421f190b6ca99def4087689bd"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738d7321212941ab19ba2acf02a68b8ee64987b248ffa2101630e8fccb549e0d"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c890421651b45a681181301b3497e4d57c0d01dc001e10438a40e9a9c25ee77"}, - {file = 
"SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26f280b8f0a8f497bc10573849ad6dc62e671d2468826e5c748d04ed9e670d5"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-win32.whl", hash = "sha256:ec2268de67f73b43320383947e74700e95c6770d0c68c4e615e9897e46296294"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-win_amd64.whl", hash = "sha256:bbdf16372859b8ed3f4d05f925a984771cd2abd18bd187042f24be4886c2a15f"}, - {file = "SQLAlchemy-1.4.49.tar.gz", hash = "sha256:06ff25cbae30c396c4b7737464f2a7fc37a67b7da409993b182b024cec80aed9"}, -] - -[package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")"} - -[package.extras] -aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] -aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing-extensions (!=3.10.0.1)"] -asyncio = ["greenlet (!=0.4.17)"] -asyncmy = ["asyncmy (>=0.2.3,!=0.2.4)", "greenlet (!=0.4.17)"] -mariadb-connector = ["mariadb (>=1.0.1,!=1.1.2)"] -mssql = ["pyodbc"] -mssql-pymssql = ["pymssql"] -mssql-pyodbc = ["pyodbc"] -mypy = ["mypy (>=0.910)", "sqlalchemy2-stubs"] -mysql = ["mysqlclient (>=1.4.0)", "mysqlclient (>=1.4.0,<2)"] -mysql-connector = ["mysql-connector-python"] -oracle = ["cx-oracle (>=7)", "cx-oracle (>=7,<8)"] -postgresql = ["psycopg2 (>=2.7)"] -postgresql-asyncpg = ["asyncpg", "greenlet (!=0.4.17)"] -postgresql-pg8000 = ["pg8000 (>=1.16.6,!=1.29.0)"] -postgresql-psycopg2binary = ["psycopg2-binary"] -postgresql-psycopg2cffi = ["psycopg2cffi"] -pymysql = ["pymysql", "pymysql (<1)"] -sqlcipher = ["sqlcipher3-binary"] - [[package]] name = "srsly" version = "2.4.8" @@ -6722,18 +6547,18 @@ telegram = ["requests"] [[package]] name = "traitlets" -version = "5.11.2" +version = "5.12.0" description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" files = [ - {file = "traitlets-5.11.2-py3-none-any.whl", hash = "sha256:98277f247f18b2c5cabaf4af369187754f4fb0e85911d473f72329db8a7f4fae"}, - {file = "traitlets-5.11.2.tar.gz", hash = "sha256:7564b5bf8d38c40fa45498072bf4dc5e8346eb087bbf1e2ae2d8774f6a0f078e"}, + {file = "traitlets-5.12.0-py3-none-any.whl", hash = "sha256:81539f07f7aebcde2e4b5ab76727f53eabf18ad155c6ed7979a681411602fa47"}, + {file = "traitlets-5.12.0.tar.gz", hash = "sha256:833273bf645d8ce31dcb613c56999e2e055b1ffe6d09168a164bcd91c36d5d35"}, ] [package.extras] docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] -test = ["argcomplete (>=3.0.3)", "mypy (>=1.5.1)", "pre-commit", "pytest (>=7.0,<7.5)", "pytest-mock", "pytest-mypy-testing"] +test = ["argcomplete (>=3.0.3)", "mypy (>=1.6.0)", "pre-commit", "pytest (>=7.0,<7.5)", "pytest-mock", "pytest-mypy-testing"] [[package]] name = "transformers" @@ -6891,13 +6716,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.24.5" +version = "20.24.6" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.24.5-py3-none-any.whl", hash = "sha256:b80039f280f4919c77b30f1c23294ae357c4c8701042086e3fc005963e4e537b"}, - {file = "virtualenv-20.24.5.tar.gz", hash = "sha256:e8361967f6da6fbdf1426483bfe9fca8287c242ac0bc30429905721cefbff752"}, + {file = "virtualenv-20.24.6-py3-none-any.whl", hash = 
"sha256:520d056652454c5098a00c0f073611ccbea4c79089331f60bf9d7ba247bb7381"}, + {file = "virtualenv-20.24.6.tar.gz", hash = "sha256:02ece4f56fbf939dbbc33c0715159951d6bf14aaf5457b092e4548e1382455af"}, ] [package.dependencies] @@ -7384,10 +7209,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] models = ["alibi", "alibi-detect", "hydra-core", "llvmlite", "scikit-learn", "torch", "torchxrayvision", "xgboost"] -query = ["SQLAlchemy", "hydra-core", "psycopg2"] report = ["kaleido", "pillow", "plotly", "pybtex", "pydantic", "scour", "spdx-tools"] [metadata] lock-version = "2.0" python-versions = ">=3.9, <3.12" -content-hash = "dc106051a4c981643b4baf50117da9c0853a087400c3337b0c0bfd1a9344a4e8" +content-hash = "e64e88e54c99e4cafd23e829a6d51090b738de4dccaea4a5d0d5ee1e86c47c97" diff --git a/pyproject.toml b/pyproject.toml index 3506f4022..898cb12f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,11 +19,9 @@ datasets = "^2.10.1" psutil = "^5.9.4" pyarrow = "^11.0.0" monai = { version = "^1.1.0", extras = ["itk"] } -dask = { version = "^2022.9.1", extras = ["dataframe"] } +dask = { version = "^2023.10.0", extras = ["dataframe"] } -psycopg2 = { version = "^2.9.6", optional = true } hydra-core = { version = "^1.2.0", optional = true } -SQLAlchemy = { version = "^1.4.32, <2.0", optional = true } scikit-learn = { version = "^1.2.2", optional = true } torch = { version = "^1.11.0", optional = true } torchxrayvision = { version = "^1.2.0", optional = true } @@ -39,10 +37,6 @@ scour = { version = "^0.38.2", optional = true } plotly = { version = "^5.7.0", optional = true } pillow = { version = "^9.5.0", optional = true } -[tool.poetry.group.query.dependencies] -psycopg2 = "^2.9.5" -SQLAlchemy = "^1.4.32, <2.0" - [tool.poetry.group.models.dependencies] hydra-core = "^1.2.0" scikit-learn = "^1.2.2" @@ -96,7 +90,6 @@ ipympl = "^0.9.3" ipywidgets = "^8.0.6" [tool.poetry.extras] -query = ["psycopg2", "hydra-core", "SQLAlchemy"] models = ["hydra-core", "scikit-learn", "torch", "torchxrayvision", "xgboost", "alibi", "alibi-detect", "llvmlite"] report = ["pydantic", "spdx-tools", "pybtex", "kaleido", "scour", "plotly", "pillow"] @@ -121,7 +114,6 @@ warn_return_any = true implicit_reexport = false strict_equality = true extra_checks = true -plugins = ["sqlalchemy.ext.mypy.plugin"] [tool.ruff] include = ["*.py", "pyproject.toml", "*.ipynb"] diff --git a/tests/cyclops/query/__init__.py b/tests/cyclops/query/__init__.py deleted file mode 100644 index 9783b8c41..000000000 --- a/tests/cyclops/query/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for query API package.""" diff --git a/tests/cyclops/query/post_process/__init__.py b/tests/cyclops/query/post_process/__init__.py deleted file mode 100644 index 27484c073..000000000 --- a/tests/cyclops/query/post_process/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for query postprocess functions.""" diff --git a/tests/cyclops/query/post_process/test_util.py b/tests/cyclops/query/post_process/test_util.py deleted file mode 100644 index 41296af9d..000000000 --- a/tests/cyclops/query/post_process/test_util.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Tests for post-processing functions in the query package.""" - -import pandas as pd - -from cyclops.query.post_process.util import event_time_between - - -def test_event_time_between(): - """Test event_time_between fn.""" - admit_ts = pd.Series( - [ - pd.Timestamp(year=2017, month=1, day=1, hour=12), - pd.Timestamp(year=2017, month=1, day=1, hour=12), - ], - ) - discharge_ts = pd.Series( 
-        [
-            pd.Timestamp(year=2017, month=1, day=7, hour=12),
-            pd.Timestamp(year=2018, month=1, day=9, hour=12),
-        ],
-    )
-    is_between = event_time_between(
-        pd.Timestamp(year=2017, month=1, day=2),
-        admit_ts,
-        discharge_ts,
-    )
-    assert is_between[0]
-    assert is_between[1]
-    is_between = event_time_between(
-        pd.Timestamp(year=2017, month=1, day=7, hour=12),
-        admit_ts,
-        discharge_ts,
-    )
-    assert not is_between[0]
-    assert is_between[1]
-    is_between = event_time_between(
-        pd.Timestamp(year=2017, month=1, day=7, hour=12),
-        admit_ts,
-        discharge_ts,
-        discharge_inclusive=True,
-    )
-    assert is_between[0]
-    assert is_between[1]
-    is_between = event_time_between(
-        pd.Timestamp(year=2017, month=1, day=1, hour=12),
-        admit_ts,
-        discharge_ts,
-        admit_inclusive=False,
-    )
-    assert not is_between[0]
-    assert not is_between[1]
diff --git a/tests/cyclops/query/test_base.py b/tests/cyclops/query/test_base.py
deleted file mode 100644
index 797b0c10a..000000000
--- a/tests/cyclops/query/test_base.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""Test base dataset querier, using OMOPQuerier as an example."""
-
-import pytest
-
-from cyclops.query import OMOPQuerier
-
-
-@pytest.mark.integration_test()
-def test_dataset_querier():
-    """Test base querier methods using OMOPQuerier."""
-    querier = OMOPQuerier(
-        database="synthea_integration_test",
-        schema_name="cdm_synthea10",
-        user="postgres",
-        password="pwd",
-    )
-    assert len(querier.list_tables()) == 69
-    assert len(querier.list_schemas()) == 4
-    assert len(querier.list_tables(schema_name="cdm_synthea10")) == 43
-    visit_occurrence_columns = querier.list_columns("cdm_synthea10", "visit_occurrence")
-    assert len(visit_occurrence_columns) == 17
-    assert "visit_occurrence_id" in visit_occurrence_columns
diff --git a/tests/cyclops/query/test_eicu.py b/tests/cyclops/query/test_eicu.py
deleted file mode 100644
index 42c490e59..000000000
--- a/tests/cyclops/query/test_eicu.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test eICU query API."""
-
-import pytest
-
-from cyclops.query import EICUQuerier
-
-
-@pytest.mark.integration_test()
-def test_eicu_querier():
-    """Test EICUQuerier on eICU-CRD."""
-    querier = EICUQuerier(
-        database="eicu",
-        user="postgres",
-        password="pwd",
-    )
-
-    patients = querier.eicu_crd.patient().run(limit=10)
-    assert len(patients) == 10
-    assert "age" in patients
-
-    diagnoses = querier.eicu_crd.diagnosis().run(limit=10)
-    assert len(diagnoses) == 10
-    assert "diagnosisstring" in diagnoses
-
-    vital_periods = querier.eicu_crd.vitalperiodic().run(limit=10)
-    assert "heartrate" in vital_periods
-
-    vital_aperiodic = querier.eicu_crd.vitalaperiodic().run(limit=10)
-    assert "pvri" in vital_aperiodic
diff --git a/tests/cyclops/query/test_interface.py b/tests/cyclops/query/test_interface.py
deleted file mode 100644
index 3cca1d5a8..000000000
--- a/tests/cyclops/query/test_interface.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""Test functions for interface module in query package."""
-
-import os
-import shutil
-from unittest.mock import patch
-
-import dask.dataframe as dd
-import pandas as pd
-import pytest
-
-from cyclops.query.interface import QueryInterface
-from cyclops.query.omop import OMOPQuerier
-
-
-@pytest.fixture()
-def test_data():
-    """Dummy dataframe for testing."""
-    return pd.DataFrame([[1, "a", 1], [5.1, "b", 0]], columns=["col1", "col2", "col3"])
-
-
-@patch("cyclops.query.orm.Database")
-@patch("sqlalchemy.sql.selectable.Subquery")
-def test_query_interface(
-    database,
-    query,
-    test_data,
-):
-    """Test QueryInterface."""
-    query_interface = QueryInterface(database, query)
-    query_interface.run()
-
-    query_interface._data = test_data
-    path = os.path.join("test_save", "test_features.parquet")
-    query_interface.save(path)
-    loaded_data = pd.read_parquet(path)
-    assert loaded_data.equals(test_data)
-    shutil.rmtree("test_save")
-    query_interface.clear_data()
-    assert not query_interface.data
-
-    with pytest.raises(ValueError):
-        query_interface.save(path, file_format="donkey")
-
-
-@pytest.mark.integration_test()
-def test_query_interface_integration():
-    """Test QueryInterface with OMOPQuerier."""
-    synthea = OMOPQuerier(
-        database="synthea_integration_test",
-        schema_name="cdm_synthea10",
-        user="postgres",
-        password="pwd",
-    )
-    visits = synthea.visit_occurrence()
-    assert isinstance(visits, QueryInterface)
-    visits_pd_df = visits.run()
-    assert isinstance(visits_pd_df, pd.DataFrame)
-    assert visits_pd_df.shape[0] > 0
-    visits_dd_df = visits.run(backend="dask", index_col="visit_occurrence_id")
-    assert isinstance(visits_dd_df, dd.DataFrame)
-    assert (
-        "visit_occurrence_id" in visits_dd_df.columns
-    )  # reset index and keep index column
-    assert visits_dd_df.shape[0].compute() > 0
-    visits_dd_df = visits.run(
-        backend="dask",
-        index_col="visit_occurrence_id",
-        n_partitions=2,
-    )
-    assert isinstance(visits_dd_df, dd.DataFrame)
-    assert visits_dd_df.npartitions == 2
-    visit_ids_0 = visits_dd_df.partitions[0].compute()["visit_occurrence_id"]
-    visit_ids_1 = visits_dd_df.partitions[1].compute()["visit_occurrence_id"]
-    # check that the partitions don't overlap
-    assert len(set(visit_ids_0).intersection(set(visit_ids_1))) == 0
-
-    # test running a query using SQL string
-    synthea_db = visits.database
-    visits_df = synthea_db.run_query("SELECT * FROM cdm_synthea10.visit_occurrence")
-    assert isinstance(visits_df, pd.DataFrame)
-    assert visits_df.shape[0] > 0
diff --git a/tests/cyclops/query/test_mimiciii.py b/tests/cyclops/query/test_mimiciii.py
deleted file mode 100644
index 1361d84ae..000000000
--- a/tests/cyclops/query/test_mimiciii.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""Test MIMICIII query API."""
-
-import pytest
-
-from cyclops.query import MIMICIIIQuerier
-
-
-@pytest.mark.integration_test()
-def test_mimiciii_querier():
-    """Test MIMICIIIQuerier."""
-    querier = MIMICIIIQuerier(
-        dbms="postgresql",
-        port=5432,
-        host="localhost",
-        database="mimiciii",
-        user="postgres",
-        password="pwd",
-    )
-    custom_tables = querier.list_custom_tables()
-    assert "diagnoses" in custom_tables
-    assert "labevents" in custom_tables
-    assert "chartevents" in custom_tables
-
-    diagnoses = querier.diagnoses().run(limit=10)
-    assert len(diagnoses) == 10
-    assert "long_title" in diagnoses
-
-    labevents = querier.labevents().run(limit=10)
-    assert len(labevents) == 10
-    assert "itemid" in labevents
-
-    chartevents = querier.chartevents().run(limit=10)
-    assert len(chartevents) == 10
-    assert "itemid" in chartevents
diff --git a/tests/cyclops/query/test_mimiciv.py b/tests/cyclops/query/test_mimiciv.py
deleted file mode 100644
index 8b356849b..000000000
--- a/tests/cyclops/query/test_mimiciv.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""Test MIMICIV-2.0 query API."""
-
-import pytest
-
-from cyclops.query import MIMICIVQuerier
-
-
-@pytest.mark.integration_test()
-def test_mimiciv_querier():
-    """Test MIMICIVQuerier on MIMICIV-2.0."""
-    querier = MIMICIVQuerier(
-        database="mimiciv-2.0",
-        user="postgres",
-        password="pwd",
-    )
-
-    patients = querier.patients().run(limit=10)
-    assert len(patients) == 10
-    assert "anchor_year_difference" in patients
-
-    diagnoses = querier.diagnoses().run(limit=10)
-    assert len(diagnoses) == 10
-    assert "long_title" in diagnoses
-
-    lab_events = querier.labevents().run(limit=10)
-    assert "category" in lab_events
-
-    chart_events = querier.chartevents().run(limit=10)
-    assert "value" in chart_events
-    assert "category" in chart_events
-
-    custom_tables = querier.list_custom_tables()
-    assert "patients" in custom_tables
-    assert "diagnoses" in custom_tables
-    assert "labevents" in custom_tables
-    assert "chartevents" in custom_tables
-
-    with pytest.raises(AttributeError):
-        querier.get_table("invalid_schema", "invalid_table")
diff --git a/tests/cyclops/query/test_omop.py b/tests/cyclops/query/test_omop.py
deleted file mode 100644
index 8995e37a5..000000000
--- a/tests/cyclops/query/test_omop.py
+++ /dev/null
@@ -1,51 +0,0 @@
-"""Test OMOP query API."""
-
-import pytest
-
-import cyclops.query.ops as qo
-from cyclops.query import OMOPQuerier
-
-
-@pytest.mark.integration_test()
-def test_omop_querier_synthea():
-    """Test OMOPQuerier on synthea data."""
-    querier = OMOPQuerier(
-        database="synthea_integration_test",
-        schema_name="cdm_synthea10",
-        user="postgres",
-        password="pwd",
-    )
-    ops = qo.Sequential(
-        qo.ConditionEquals("gender_source_value", "M"),
-        qo.Rename({"race_source_value": "race"}),
-    )
-    persons = querier.person()
-    persons = persons.ops(ops)
-    visits = querier.visit_occurrence()
-    visits = visits.join(persons, "person_id").run()
-    persons = persons.run()
-    observations = querier.observation().run()
-    measurements = querier.measurement().run()
-    visit_details = querier.visit_detail().run()
-    providers = querier.cdm_synthea10.provider().run()
-    conditions = querier.cdm_synthea10.condition_occurrence().run()
-    assert len(persons) == 54
-    assert len(visits) == 1798
-    assert len(visit_details) == 4320
-    assert len(observations) == 17202
-    assert len(measurements) == 19994
-    assert len(providers) == 212
-    assert len(conditions) == 1419
-
-
-@pytest.mark.integration_test()
-def test_omop_querier_mimiciii():
-    """Test OMOPQuerier on MIMICIII data."""
-    querier = OMOPQuerier(
-        database="mimiciii",
-        schema_name="omop",
-        user="postgres",
-        password="pwd",
-    )
-    visits = querier.visit_occurrence().run()
-    assert len(visits) == 58976
diff --git a/tests/cyclops/query/test_ops.py b/tests/cyclops/query/test_ops.py
deleted file mode 100644
index 8247c2083..000000000
--- a/tests/cyclops/query/test_ops.py
+++ /dev/null
@@ -1,655 +0,0 @@
-"""Test low-level query API processing functions."""
-
-from math import isclose
-
-import pandas as pd
-import pytest
-from sqlalchemy import column, select
-
-from cyclops.query.omop import OMOPQuerier
-from cyclops.query.ops import (
-    AddColumn,
-    AddNumeric,
-    And,
-    Apply,
-    Cast,
-    ConditionAfterDate,
-    ConditionBeforeDate,
-    ConditionEndsWith,
-    ConditionEquals,
-    ConditionGreaterThan,
-    ConditionIn,
-    ConditionInMonths,
-    ConditionInYears,
-    ConditionLessThan,
-    ConditionLike,
-    ConditionRegexMatch,
-    ConditionStartsWith,
-    ConditionSubstring,
-    Distinct,
-    Drop,
-    DropEmpty,
-    DropNulls,
-    ExtractTimestampComponent,
-    FillNull,
-    GroupByAggregate,
-    Limit,
-    Literal,
-    Or,
-    OrderBy,
-    QueryOp,
-    Rename,
-    ReorderAfter,
-    Sequential,
-    Substring,
-    Trim,
-    _addindent,
-    _none_add,
-    _process_checks,
-)
-from cyclops.query.util import process_column
-
-
-QUERIER = OMOPQuerier(
-    database="synthea_integration_test",
-    user="postgres",
-    password="pwd",
-    schema_name="cdm_synthea10",
-)
-
-
-@pytest.fixture()
-def table_input():
-    """Test table input."""
-    column_a = process_column(column("a"), to_timestamp=True)
-    return select(column_a, column("b"), column("c"))
-
-
-@pytest.fixture()
-def visits_table():
-    """Test visits table input."""
-    return QUERIER.visit_occurrence()
-
-
-@pytest.fixture()
-def measurements_table():
-    """Test measurement table input."""
-    return QUERIER.measurement()
-
-
-def test__none_add():
-    """Test _none_add fn."""
-    assert _none_add("1", "2") == "12"
-    assert _none_add("1", None) == "1"
-    assert _none_add(None, "2") == "2"
-
-
-def test__process_checks(table_input):
-    """Test _process_checks fn."""
-    _process_checks(table_input, cols=["a"], cols_not_in=["d"], timestamp_cols=["a"])
-    with pytest.raises(ValueError):
-        _process_checks(table_input, cols_not_in=["a"])
-
-
-class TestAddindent:
-    """Test _addindent fn."""
-
-    def test_addindent_multiple_lines(self):
-        """Test _addindent fn with multiple lines."""
-        input_string = "This is a\nmultiline\nstring"
-        expected_output = "This is a\n    multiline\n    string"
-        assert _addindent(input_string, 4) == expected_output
-
-    def test_addindent_single_line(self):
-        """Test _addindent fn with single line."""
-        input_string = "This is a single line string"
-        assert _addindent(input_string, 4) == input_string
-
-
-class TestQueryOp:
-    """Test QueryOp class."""
-
-    def test_add_child_operation(self):
-        """Test adding a child operation."""
-        query_op = QueryOp()
-        child_op = QueryOp()
-        query_op._add_op("child", child_op)
-        assert query_op.child == child_op
-
-    def test_get_query_op_name(self):
-        """Test getting the name of the query op."""
-        query_op = QueryOp()
-        assert query_op._get_name() == "QueryOp"
-
-    def test_set_attribute(self):
-        """Test setting an attribute of the query op."""
-        query_op = QueryOp()
-        child_op = QueryOp()
-        query_op.child = child_op
-        assert query_op.child == child_op
-
-    def test_string_representation(self):
-        """Test string representation of the query op."""
-        query_op = QueryOp()
-        child_op = QueryOp()
-        query_op._add_op("child", child_op)
-        assert repr(query_op) == "QueryOp(\n  (child): QueryOp()\n)"
-
-    def test_add_child_operation_empty_name(self):
-        """Test adding a child operation with an empty name."""
-        query_op = QueryOp()
-        child_op = QueryOp()
-        with pytest.raises(KeyError):
-            query_op._add_op("", child_op)
-
-    def test_add_child_operation_dot_name(self):
-        """Test adding a child operation with a dot in the name."""
-        query_op = QueryOp()
-        child_op = QueryOp()
-        with pytest.raises(KeyError):
-            query_op._add_op("child.name", child_op)
-
-
-@pytest.mark.integration_test()
-def test_drop(visits_table):
-    """Test Drop."""
-    visits = visits_table.ops(Drop("care_site_source_value")).run()
-    assert "care_site_source_value" not in visits.columns
-
-
-@pytest.mark.integration_test()
-def test_fill_null(visits_table):
-    """Test FillNull."""
-    visits_before = visits_table.run()
-    unique_before = visits_before["preceding_visit_occurrence_id"].unique()
-    visits_after = visits_table.ops(
-        FillNull(["preceding_visit_occurrence_id", "care_site_id"], 0),
-    ).run()
-    unique_after = visits_after["preceding_visit_occurrence_id"].unique()
-    assert visits_after["preceding_visit_occurrence_id"].isna().sum() == 0
-    assert visits_after["care_site_id"].isna().sum() == 0
-    assert 0 not in unique_before
-    assert len(unique_after) == len(unique_before)
-    assert len(visits_after["care_site_id"].unique()) == 1
-
-    visits_after = visits_table.ops(
-        FillNull(
-            ["preceding_visit_occurrence_id", "care_site_id"],
-            [0, -99],
-            ["col1", "col2"],
-        ),
-    ).run()
-    assert visits_after["preceding_visit_occurrence_id"].isna().sum() != 0
visits_after["preceding_visit_occurrence_id"].isna().sum() != 0 - assert visits_after["care_site_id"].isna().sum() != 0 - assert visits_after["col1"].isna().sum() == 0 - assert visits_after["col2"].isna().sum() == 0 - assert len(visits_after["col2"].unique()) == 1 - assert -99 in visits_after["col2"].unique() - - -@pytest.mark.integration_test() -def test_add_column(visits_table): - """Test AddColumn.""" - ops = Sequential( - Literal(2, "test_col1"), - Literal(3, "test_col2"), - AddColumn("test_col1", "test_col2", new_col_labels="test_col3"), - ) - visits = visits_table.ops(ops).run() - assert "test_col3" in visits.columns - assert (visits["test_col3"] == 5).all() - - ops = Sequential( - Literal(2, "test_col1"), - Literal(3, "test_col2"), - AddColumn( - "test_col1", - "test_col2", - negative=True, - new_col_labels="test_col3", - ), - ) - visits = visits_table.ops(ops).run() - assert "test_col3" in visits.columns - assert (visits["test_col3"] == -1).all() - - -@pytest.mark.integration_test() -def test_rename(visits_table): - """Test Rename.""" - rename_op = Rename({"care_site_name": "hospital_name"}) - visits = visits_table.ops(rename_op).run() - assert "hospital_name" in visits.columns - assert "care_site_name" not in visits.columns - - -@pytest.mark.integration_test() -def test_literal(visits_table): - """Test Literal.""" - literal_ops = Sequential(Literal(1, "new_col"), Literal("a", "new_col2")) - visits = visits_table.ops(literal_ops).run() - assert "new_col" in visits.columns - assert visits["new_col"].iloc[0] == 1 - assert "new_col2" in visits.columns - assert visits["new_col2"].iloc[0] == "a" - - -@pytest.mark.integration_test() -def test_reorder_after(visits_table): - """Test ReorderAfter.""" - reorder_op = ReorderAfter("visit_concept_name", "care_site_id") - visits = visits_table.ops(reorder_op).run() - assert list(visits.columns).index("care_site_id") + 1 == list(visits.columns).index( - "visit_concept_name", - ) - - -@pytest.mark.integration_test() -def test_limit(visits_table): - """Test Limit.""" - visits = visits_table.ops(Limit(10)).run() - assert len(visits) == 10 - - -@pytest.mark.integration_test() -def test_order_by(visits_table): - """Test OrderBy.""" - orderby_op = OrderBy("visit_concept_name") - visits = visits_table.ops(orderby_op).run() - assert visits["visit_concept_name"].is_monotonic_increasing - - -@pytest.mark.integration_test() -def test_substring(visits_table): - """Test Substring.""" - substring_op = Substring("visit_concept_name", 0, 3, "visit_concept_name_substr") - visits = visits_table.ops(substring_op).run() - assert visits["visit_concept_name_substr"].value_counts()["Ou"] == 4057 - - -@pytest.mark.integration_test() -def test_trim(visits_table): - """Test Trim.""" - trim_op = Trim("visit_concept_name", "visit_concept_name_trim") - visits = visits_table.ops(trim_op).run() - assert visits["visit_concept_name_trim"].value_counts()["Inpatient Visit"] == 108 - - -@pytest.mark.integration_test() -def test_extract_timestamp_component( - visits_table, -): - """Test ExtractTimestampComponent.""" - extract_ts_op = ExtractTimestampComponent( - "visit_start_date", - "year", - "visit_start_date_year", - ) - visits = visits_table.ops(extract_ts_op).run() - assert visits["visit_start_date_year"].value_counts()[2021] == 371 - - -@pytest.mark.integration_test() -def test_add_numeric(visits_table): - """Test AddNumeric.""" - ops = Sequential(Literal(1, "new_col"), AddNumeric("new_col", 1, "new_col_plus_1")) - visits = visits_table.ops(ops).run() - assert 
visits["new_col_plus_1"].iloc[0] == 2 - - -@pytest.mark.integration_test() -def test_apply(visits_table): - """Test Apply.""" - apply_op = Apply( - "visit_concept_name", - lambda x: x + "!", - "visit_concept_name_exclaim", - ) - visits = visits_table.ops(apply_op).run() - assert ( - visits["visit_concept_name_exclaim"].value_counts()["Outpatient Visit!"] == 4057 - ) - apply_op = Apply( - ["visit_occurrence_id", "person_id"], - lambda x, y: x + y, - "sum_id", - ) - visits = visits_table.ops(apply_op).run() - assert ( - visits["sum_id"].iloc[0] - == visits["visit_occurrence_id"].iloc[0] + visits["person_id"].iloc[0] - ) - assert visits["sum_id"].isna().sum() == visits["person_id"].isna().sum() - apply_op = Apply( - ["visit_occurrence_id", "person_id"], - [lambda x: x + 1, lambda x: x + 2], - ["sum_id", "sum_id2"], - ) - visits = visits_table.ops(apply_op).run() - assert visits["sum_id"].iloc[0] == visits["visit_occurrence_id"].iloc[0] + 1 - assert visits["sum_id2"].iloc[0] == visits["person_id"].iloc[0] + 2 - - -@pytest.mark.integration_test() -def test_condition_regex_match( - measurements_table, -): - """Test ConditionRegexMatch.""" - measurements_op = ConditionRegexMatch( - "value_source_value", - r"^[0-9]+(\.[0-9]+)?$", - binarize_col="value_source_value_match", - ) - measurements = measurements_table.ops(measurements_op).run() - assert "value_source_value_match" in measurements.columns - assert ( - measurements["value_source_value_match"].sum() - == measurements["value_source_value"].str.match(r"^[0-9]+(\.[0-9]+)?$").sum() - ) - - -@pytest.mark.integration_test() -def test_group_by_aggregate( - visits_table, - measurements_table, -): - """Test GroupByAggregate.""" - with pytest.raises(ValueError): - visits_table.ops( - GroupByAggregate("person_id", {"person_id": ("donkey", "visit_count")}), - ) - with pytest.raises(ValueError): - visits_table.ops( - GroupByAggregate("person_id", {"person_id": ("count", "person_id")}), - ) - - visits_count = visits_table.ops( - GroupByAggregate( - "person_id", - {"person_id": ("count", "num_visits")}, - ), - ).run() - visits_string_agg = visits_table.ops( - GroupByAggregate( - "person_id", - {"visit_concept_name": ("string_agg", "visit_concept_names")}, - {"visit_concept_name": ", "}, - ), - ).run() - measurements_sum = measurements_table.ops( - GroupByAggregate( - "person_id", - {"value_as_number": ("sum", "value_as_number_sum")}, - ), - ).run() - measurements_average = measurements_table.ops( - GroupByAggregate( - "person_id", - {"value_as_number": ("average", "value_as_number_average")}, - ), - ).run() - measurements_min = measurements_table.ops( - GroupByAggregate( - "person_id", - {"value_as_number": ("min", "value_as_number_min")}, - ), - ).run() - measurements_max = measurements_table.ops( - GroupByAggregate( - "person_id", - {"value_as_number": ("max", "value_as_number_max")}, - ), - ).run() - measurements_median = measurements_table.ops( - GroupByAggregate( - "person_id", - {"value_as_number": ("median", "value_as_number_median")}, - ), - ).run() - - assert "num_visits" in visits_count.columns - assert visits_count[visits_count["person_id"] == 33]["num_visits"][0] == 86 - assert "visit_concept_names" in visits_string_agg.columns - test_visit_concept_names = visits_string_agg[visits_string_agg["person_id"] == 33][ - "visit_concept_names" - ][0].split(",") - test_visit_concept_names = [item.strip() for item in test_visit_concept_names] - assert len(test_visit_concept_names) == 86 - assert "Outpatient Visit" in test_visit_concept_names - assert 
"value_as_number_sum" in measurements_sum.columns - assert ( - measurements_sum[measurements_sum["person_id"] == 33]["value_as_number_sum"][0] - == 9881.3 - ) - assert "value_as_number_average" in measurements_average.columns - assert isclose( - measurements_average[measurements_average["person_id"] == 33][ - "value_as_number_average" - ][0], - 75.42, - abs_tol=0.01, - ) - assert "value_as_number_min" in measurements_min.columns - assert ( - measurements_min[measurements_min["person_id"] == 33]["value_as_number_min"][0] - == 0.0 - ) - assert "value_as_number_max" in measurements_max.columns - assert ( - measurements_max[measurements_max["person_id"] == 33]["value_as_number_max"][0] - == 360.7 - ) - assert "value_as_number_median" in measurements_median.columns - assert ( - measurements_median[measurements_median["person_id"] == 33][ - "value_as_number_median" - ].item() - == 75.7 - ) - - -@pytest.mark.integration_test() -def test_drop_nulls(visits_table): - """Test DropNulls.""" - visits = visits_table.ops(DropNulls("preceding_visit_occurrence_id")).run() - assert visits["preceding_visit_occurrence_id"].isnull().sum() == 0 - - -@pytest.mark.integration_test() -def test_drop_empty(visits_table): - """Test DropEmpty.""" - visits = visits_table.ops(DropEmpty("visit_concept_name")).run() - assert (visits["visit_concept_name"] == "").sum() == 0 - - -@pytest.mark.integration_test() -def test_condition_before_date(visits_table): - """Test ConditionBeforeDate.""" - visits = visits_table.ops( - ConditionBeforeDate("visit_start_date", "2018-01-01"), - ).run() - assert pd.Timestamp(visits["visit_start_date"].max()) < pd.Timestamp("2018-01-01") - - -@pytest.mark.integration_test() -def test_condition_after_date(visits_table): - """Test ConditionAfterDate.""" - visits = visits_table.ops( - ConditionAfterDate("visit_start_date", "2018-01-01"), - ).run() - assert pd.Timestamp(visits["visit_start_date"].min()) > pd.Timestamp("2018-01-01") - - -@pytest.mark.integration_test() -def test_condition_in(visits_table): - """Test ConditionIn.""" - visits = visits_table.ops( - ConditionIn("visit_concept_name", ["Outpatient Visit"]), - ).run() - assert all(visits["visit_concept_name"] == "Outpatient Visit") - - -@pytest.mark.integration_test() -def test_condition_in_months(visits_table): - """Test ConditionInMonths.""" - ops = Sequential( - Cast("visit_start_date", "timestamp"), - ConditionInMonths("visit_start_date", 6), - ) - visits = visits_table.ops(ops).run() - assert (visits["visit_start_date"].dt.month == 6).all() - - -@pytest.mark.integration_test() -def test_condition_in_years(visits_table): - """Test ConditionInYears.""" - ops = Sequential( - Cast("visit_start_date", "timestamp"), - ConditionInYears("visit_start_date", 2018), - ) - visits = visits_table.ops(ops).run() - assert (visits["visit_start_date"].dt.year == 2018).all() - - -@pytest.mark.integration_test() -def test_condition_substring(visits_table): - """Test ConditionSubstring.""" - visits = visits_table.ops( - ConditionSubstring("visit_concept_name", "Outpatient"), - ).run() - assert all(visits["visit_concept_name"].str.contains("Outpatient")) - - -@pytest.mark.integration_test() -def test_condition_starts_with(visits_table): - """Test ConditionStartsWith.""" - visits = visits_table.ops( - ConditionStartsWith("visit_concept_name", "Outpatient"), - ).run() - assert all(visits["visit_concept_name"].str.startswith("Outpatient")) - - -@pytest.mark.integration_test() -def test_condition_ends_with(visits_table): - """Test ConditionEndsWith.""" - 
-    visits = visits_table.ops(ConditionEndsWith("visit_concept_name", "Visit")).run()
-    assert all(visits["visit_concept_name"].str.endswith("Visit"))
-
-
-@pytest.mark.integration_test()
-def test_condition_equals(visits_table):
-    """Test ConditionEquals."""
-    visits = visits_table.ops(
-        ConditionEquals("visit_concept_name", "Outpatient Visit"),
-    ).run()
-    assert all(visits["visit_concept_name"] == "Outpatient Visit")
-    visits = visits_table.ops(
-        ConditionEquals("visit_concept_name", "Outpatient Visit", not_=True),
-    ).run()
-    assert all(visits["visit_concept_name"] != "Outpatient Visit")
-
-
-@pytest.mark.integration_test()
-def test_condition_greater_than(visits_table):
-    """Test ConditionGreaterThan."""
-    visits = visits_table.ops(ConditionGreaterThan("visit_concept_id", 9300)).run()
-    assert all(visits["visit_concept_id"] > 9300)
-
-
-@pytest.mark.integration_test()
-def test_condition_less_than(visits_table):
-    """Test ConditionLessThan."""
-    visits = visits_table.ops(ConditionLessThan("visit_concept_id", 9300)).run()
-    assert all(visits["visit_concept_id"] < 9300)
-
-
-@pytest.mark.integration_test()
-def test_union(visits_table):
-    """Test Union."""
-    outpatient_filtered = visits_table.ops(
-        ConditionEquals("visit_concept_name", "Outpatient Visit"),
-    )
-    emergency_filtered = visits_table.ops(
-        ConditionEquals("visit_concept_name", "Emergency Room Visit"),
-    )
-    visits = emergency_filtered.union(outpatient_filtered).run()
-    assert len(visits) == 4212
-    assert all(
-        visits["visit_concept_name"].isin(["Outpatient Visit", "Emergency Room Visit"]),
-    )
-    visits = emergency_filtered.union_all(emergency_filtered).run()
-    assert len(visits) == 310
-
-
-@pytest.mark.integration_test()
-def test_sequential(visits_table):
-    """Test Sequential."""
-    substr_op = Substring("visit_concept_name", 0, 4, "visit_concept_name_substr")
-    operations = [
-        Literal(33, "const"),
-        Rename({"care_site_name": "hospital_name"}),
-        Apply("visit_concept_name", lambda x: x + "!", "visit_concept_name_exclaim"),
-        OrderBy(["person_id", "visit_start_date"]),
-        substr_op,
-    ]
-    sequential_ops = Sequential(operations)
-    visits = visits_table.ops(sequential_ops).run()
-    assert "hospital_name" in visits.columns
-    assert "visit_concept_name_exclaim" in visits.columns
-    assert list(visits[visits["person_id"] == 33]["visit_concept_name_exclaim"])[0] == (
-        "Outpatient Visit!"
-    )
-    assert "visit_concept_name_substr" in visits.columns
-    assert list(visits[visits["person_id"] == 33]["visit_concept_name_substr"])[0] == (
-        "Out"
-    )
-
-
-@pytest.mark.integration_test()
-def test_or(visits_table):
-    """Test Or."""
-    or_op = Or(
-        ConditionEquals("visit_concept_name", "Outpatient Visit"),
-        ConditionLike("visit_concept_name", "%Emergency%"),
-    )
-    visits = visits_table.ops(or_op).run()
-    assert len(visits) == 4212
-    assert all(
-        visits["visit_concept_name"].isin(["Outpatient Visit", "Emergency Room Visit"]),
-    )
-
-
-@pytest.mark.integration_test()
-def test_and(visits_table):
-    """Test And."""
-    and_op = And(
-        [
-            ConditionEquals("visit_concept_name", "Outpatient Visit"),
-            ConditionLike("visit_concept_name", "%Emergency%", not_=True),
-        ],
-    )
-    visits = visits_table.ops(and_op).run()
-    assert len(visits) == 4057
-    and_op = And(
-        ConditionEquals("visit_concept_name", "Outpatient Visit"),
-        ConditionLike("visit_concept_name", "%Emergency%", not_=True),
-    )
-    visits = visits_table.ops(and_op).run()
-    assert len(visits) == 4057
-
-
-@pytest.mark.integration_test()
-def test_distinct(visits_table):
-    """Test Distinct."""
-    distinct_op = Distinct(["person_id"])
-    visits = visits_table.ops(distinct_op).run()
-    assert len(visits) == 109
-
-
-@pytest.mark.integration_test()
-def test_condition_like(visits_table):
-    """Test ConditionLike."""
-    like_op = ConditionLike("visit_concept_name", "Outpatient%")
-    visits = visits_table.ops(like_op).run()
-    assert len(visits) == 4057
-    assert all(visits["visit_concept_name"].str.startswith("Outpatient"))
diff --git a/tests/cyclops/query/test_orm.py b/tests/cyclops/query/test_orm.py
deleted file mode 100644
index 1b6691d59..000000000
--- a/tests/cyclops/query/test_orm.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""Test cyclops.query.orm module."""
-
-import os
-
-import pandas as pd
-import pytest
-
-from cyclops.query import OMOPQuerier
-
-
-@pytest.mark.integration_test()
-def test_omop_querier():
-    """Test ORM using OMOPQuerier."""
-    querier = OMOPQuerier(
-        database="synthea_integration_test",
-        schema_name="cdm_synthea10",
-        user="postgres",
-        password="pwd",
-    )
-    assert querier is not None
-    db_ = querier.db
-    visits_query = querier.visit_occurrence().query
-    db_.save_query_to_csv(visits_query, "visits.csv")
-    visits_df = pd.read_csv("visits.csv")
-    assert len(visits_df) == 4320
-    os.remove("visits.csv")
-
-    db_.save_query_to_parquet(visits_query, "visits.parquet")
-    visits_df = pd.read_parquet("visits.parquet")
-    assert len(visits_df) == 4320
-    os.remove("visits.parquet")
diff --git a/tests/cyclops/query/test_util.py b/tests/cyclops/query/test_util.py
deleted file mode 100644
index 5e0b0fb6b..000000000
--- a/tests/cyclops/query/test_util.py
+++ /dev/null
@@ -1,208 +0,0 @@
-"""Test query API util functions."""
-
-import pytest
-from sqlalchemy import Table, column, select
-from sqlalchemy.sql.selectable import Select, Subquery
-from sqlalchemy.types import Integer
-
-from cyclops.query.util import (
-    DBTable,
-    _check_column_type,
-    _to_select,
-    _to_subquery,
-    drop_columns,
-    ends_with,
-    equals,
-    filter_columns,
-    get_column,
-    get_column_names,
-    get_columns,
-    greater_than,
-    has_columns,
-    less_than,
-    not_equals,
-    process_column,
-    process_elem,
-    process_list,
-    rename_columns,
-    reorder_columns,
-    starts_with,
-    table_params_to_type,
-    trim_columns,
-)
-
-
-@pytest.fixture()
-def test_table():
-    """Test table input."""
-    return select(process_column(column("a"), to_int=True), column("b"), column("c"))
-
-
-def test__check_column_type(test_table):
-    """Test _check_column_type fn."""
-    assert _check_column_type(test_table, ["a"], Integer)
-    with pytest.raises(ValueError):
-        assert _check_column_type(test_table, ["b"], Integer, raise_error=True)
-
-
-def test_ends_with():
-    """Test ends_with fn."""
-    test_col = column("a")
-    assert (
-        str(ends_with(test_col, "a")) == "trim(lower(CAST(a AS VARCHAR))) LIKE :trim_1"
-    )
-
-
-def test_starts_with():
-    """Test starts_with fn."""
-    test_col = column("a")
-    assert (
-        str(starts_with(test_col, "a"))
-        == "trim(lower(CAST(a AS VARCHAR))) LIKE :trim_1"
-    )
-
-
-def test__to_subquery():
-    """Test _to_subquery fn."""
-    assert isinstance(_to_subquery(select().subquery()), Subquery)
-    assert isinstance(_to_subquery(select()), Subquery)
-    assert isinstance(_to_subquery(Table()), Subquery)
-    assert isinstance(_to_subquery(DBTable("a", Table())), Subquery)
-    with pytest.raises(TypeError):
-        _to_subquery("a")
-    with pytest.raises(ValueError):
-        table_params_to_type(int)
-
-
-def test__to_select():
-    """Test _to_select fn."""
-    assert isinstance(_to_select(select().subquery()), Select)
-    assert isinstance(_to_select(select()), Select)
-    assert isinstance(_to_select(Table()), Select)
-    assert isinstance(_to_select(DBTable("a", Table())), Select)
-    with pytest.raises(TypeError):
-        _to_select("a")
-
-
-def test_get_column(test_table):
-    """Test get_column fn."""
-    assert str(get_column(test_table, "a")) == "anon_1.a"
-    with pytest.raises(ValueError):
-        get_column(select(column("a")), "b")
-
-
-def test_get_columns(test_table):
-    """Test get_columns fn."""
-    cols = get_columns(test_table, "c")
-    cols = [str(col) for col in cols]
-    assert cols == ["anon_1.c"]
-    with pytest.raises(ValueError):
-        get_column(select(column("a")), "b")
-
-
-def test_get_column_names(test_table):
-    """Test get_column_names fn."""
-    assert get_column_names(test_table) == ["a", "b", "c"]
-
-
-def test_filter_columns(test_table):
-    """Test filter_columns fn."""
-    filtered = filter_columns(test_table, ["a", "c", "d"])
-    assert get_column_names(filtered) == ["a", "c"]
-
-
-def test_has_columns(test_table):
-    """Test has_columns fn."""
-    assert not has_columns(test_table, ["a", "d"])
-    assert has_columns(test_table, ["a", "b"])
-    with pytest.raises(ValueError):
-        has_columns(test_table, ["a", "d"], raise_error=True)
-
-
-def test_drop_columns(test_table):
-    """Test drop_columns fn."""
-    after_drop = drop_columns(test_table, ["a"])
-    assert get_column_names(after_drop) == ["b", "c"]
-
-
-def test_rename_columns(test_table):
-    """Test rename_columns fn."""
-    after_rename = rename_columns(test_table, {"a": "apple", "b": "ball"})
-    assert get_column_names(after_rename) == ["apple", "ball", "c"]
-
-
-def test_reorder_columns(test_table):
-    """Test reorder_columns fn."""
-    with pytest.raises(ValueError):
-        reorder_columns(test_table, ["ball", "c", "a"])
-    with pytest.raises(ValueError):
-        reorder_columns(test_table, ["c", "a"])
-    after_reorder = reorder_columns(test_table, ["b", "c", "a"])
-    assert get_column_names(after_reorder) == ["b", "c", "a"]
-
-
-def test_trim_columns(test_table):
-    """Test trim_columns fn."""
-    after_trim = trim_columns(test_table, ["a"], ["apple"])
-    assert get_column_names(after_trim) == ["a", "b", "c", "apple"]
-
-
-def test_process_elem():
-    """Test process_elem fn."""
-    assert process_elem("Test", lower=True) == "test"
-    assert process_elem("Test ", lower=True, trim=True) == "test"
-    assert process_elem("1", to_int=True) == 1
-    assert process_elem("1.2", to_float=True) == 1.2
-    assert process_elem(1, to_bool=True) is True
-    assert process_elem(0, to_bool=True) is False
-
-
-def test_process_list():
-    """Test process_list fn."""
-    assert process_list([1, 2, 3, 0], to_bool=True) == [True, True, True, False]
-
-
-def test_process_column():
-    """Test process_column fn."""
-    test_col = column("a")
-    processed_col = process_column(test_col, to_int=True)
-    assert str(processed_col) == "CAST(a AS INTEGER)"
-    processed_col = process_column(test_col, to_float=True)
-    assert str(processed_col) == "CAST(a AS FLOAT)"
-    processed_col = process_column(test_col, to_str=True)
-    assert str(processed_col) == "CAST(a AS VARCHAR)"
-    processed_col = process_column(test_col, to_bool=True)
-    assert str(processed_col) == "CAST(a AS BOOLEAN)"
-    processed_col = process_column(test_col, to_date=True)
-    assert str(processed_col) == "CAST(a AS DATE)"
-    processed_col = process_column(test_col, to_timestamp=True)
-    assert str(processed_col) == "CAST(a AS DATETIME)"
-    test_col.type = "VARCHAR"
-    processed_col = process_column(test_col, lower=True, trim=True)
-    assert str(processed_col) == "trim(lower(a))"
-
-
-def test_equals():
-    """Test equals fn."""
-    test_col = column("a")
-    assert str(equals(test_col, "bat")) == "a = :a_1"
-
-
-def test_greater_than():
-    """Test greater_than fn."""
-    test_col = column("a")
-    assert str(greater_than(test_col, 1)) == "a > :a_1"
-    assert str(greater_than(test_col, 1, equal=True)) == "a >= :a_1"
-
-
-def test_less_than():
-    """Test less_than fn."""
-    test_col = column("a")
-    assert str(less_than(test_col, 1)) == "a < :a_1"
-    assert str(less_than(test_col, 1, equal=True)) == "a <= :a_1"
-
-
-def test_not_equals():
-    """Test not_equals fn."""
-    test_col = column("a")
-    assert str(not_equals(test_col, "bat")) == "a != :a_1"
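
Migration note: with the `query` extra and its pinned `psycopg2`/`SQLAlchemy` dependencies removed, code that relied on the deleted querier classes can fall back to SQLAlchemy and pandas directly. The sketch below is illustrative only and not part of this changeset; the connection string, schema, and table name are placeholders modeled on the deleted integration tests, and the driver must now be installed explicitly since no extra pins it.

"""Minimal sketch: querying an OMOP table without the removed cyclops.query API.

Assumes a reachable PostgreSQL instance; all connection details here are
illustrative placeholders, not values shipped with this repository.
"""

import pandas as pd
from sqlalchemy import create_engine

# SQLAlchemy and a PostgreSQL driver (e.g. psycopg2) must be installed
# explicitly now that the `query` extra no longer pulls them in.
engine = create_engine(
    "postgresql://postgres:pwd@localhost:5432/synthea_integration_test",
)

# Rough equivalent of the deleted querier.visit_occurrence().run(limit=10):
# read a bounded sample of the table into a pandas DataFrame.
visits = pd.read_sql(
    "SELECT * FROM cdm_synthea10.visit_occurrence LIMIT 10",
    con=engine,
)
print(visits.shape)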