From cc41e94c94b54d897c0e5b92f3fc16f8c9181dd7 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 2 Apr 2021 13:21:35 -0400 Subject: [PATCH 01/65] init branch --- docs/changelog.md | 3 +++ pipestat/_version.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 8419336f..e59f61c0 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.1.0] - unreleased + + ## [0.0.4] - 2021-04-02 ### Added - config validation diff --git a/pipestat/_version.py b/pipestat/_version.py index 81f0fdec..607f7a49 100644 --- a/pipestat/_version.py +++ b/pipestat/_version.py @@ -1 +1 @@ -__version__ = "0.0.4" +__version__ = "0.1.0-dev" From 9ac892646f98d35225807ec6dfebb02c46da9ae3 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 2 Apr 2021 15:27:48 -0400 Subject: [PATCH 02/65] update pre-commit cfg --- .pre-commit-config.yaml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 16cccb97..ab5489e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,18 +4,17 @@ repos: hooks: - id: trailing-whitespace - id: check-yaml - - id: check-json - id: end-of-file-fixer - - id: name-tests-test - id: requirements-txt-fixer - id: trailing-whitespace - - repo: https://github.com/psf/black - rev: 20.8b1 - hooks: - - id: black - - repo: https://github.com/PyCQA/isort rev: 5.7.0 hooks: - id: isort + args: ["--profile", "black"] + + - repo: https://github.com/psf/black + rev: 20.8b1 + hooks: + - id: black From 06dfab2854b0d71c536383c785131af26ea13b94 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 2 Apr 2021 15:28:17 -0400 Subject: [PATCH 03/65] start ORM implementation --- pipestat/pipestat_orm.py | 552 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 552 insertions(+) create mode 100644 pipestat/pipestat_orm.py diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py new file mode 100644 index 00000000..99556f52 --- /dev/null +++ b/pipestat/pipestat_orm.py @@ -0,0 +1,552 @@ +from contextlib import contextmanager +from copy import deepcopy +from logging import getLogger +from typing import Any, Dict, List, Optional, Union +from urllib.parse import quote_plus + +from attmap import PathExAttMap as PXAM +from jsonschema import validate +from sqlalchemy import Column, Float, ForeignKey, Integer, String, Table, create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import relationship, sessionmaker +from ubiquerg import create_lock, remove_lock +from yacman import YacAttMap + +from .const import * +from .exceptions import * +from .helpers import * + +_LOGGER = getLogger(PKG_NAME) + + +class PipestatManagerORM(dict): + """ + Pipestat standardizes reporting of pipeline results and + pipeline status management. It formalizes a way for pipeline developers + and downstream tools developers to communicate -- results produced by a + pipeline can easily and reliably become an input for downstream analyses. + The object exposes API for interacting with the results and + pipeline status and can be backed by either a YAML-formatted file + or a PostgreSQL database. 
+ """ + + def __init__( + self, + namespace: str = None, + record_identifier: str = None, + schema_path: str = None, + results_file_path: str = None, + database_only: bool = False, + config: Union[str, dict] = None, + status_schema_path: str = None, + flag_file_dir: str = None, + ): + """ + Initialize the object + + :param str namespace: namespace to report into. This will be the DB + table name if using DB as the object back-end + :param str record_identifier: record identifier to report for. This + creates a weak bound to the record, which can be overriden in + this object method calls + :param str schema_path: path to the output schema that formalizes + the results structure + :param str results_file_path: YAML file to report into, if file is + used as the object back-end + :param bool database_only: whether the reported data should not be + stored in the memory, but only in the database + :param str | dict config: path to the configuration file or a mapping + with the config file content + :param str status_schema_path: path to the status schema that formalizes + the status flags structure + """ + + def _check_cfg_key(cfg: dict, key: str) -> bool: + if key not in cfg: + _LOGGER.warning(f"Key '{key}' not found in config") + return False + return True + + def _mk_abs_via_cfg( + path: Optional[str], + cfg_path: Optional[str], + ) -> Optional[str]: + if path is None: + return path + assert isinstance(path, str), TypeError("Path is expected to be a str") + if os.path.isabs(path): + return path + if cfg_path is None: + rel_to_cwd = os.path.join(os.getcwd(), path) + if os.path.exists(rel_to_cwd) or os.access( + os.path.dirname(rel_to_cwd), os.W_OK + ): + return rel_to_cwd + raise OSError(f"Could not make this path absolute: {path}") + joined = os.path.join(os.path.dirname(cfg_path), path) + if os.path.isabs(joined): + return joined + raise OSError(f"Could not make this path absolute: {path}") + + def _select_value( + arg_name: str, + arg_value: Any, + cfg: dict, + strict: bool = True, + env_var: str = None, + ) -> Any: + if arg_value is not None: + return arg_value + if arg_name not in cfg or cfg[arg_name] is None: + if env_var is not None: + arg = os.getenv(env_var, None) + if arg is not None: + _LOGGER.debug(f"Value '{arg}' sourced from '{env_var}' env var") + return expandpath(arg) + if strict: + raise PipestatError( + f"Value for the required '{arg_name}' argument could not be" + f" determined. Provide it in the config or pass to the " + f"object constructor." 
+ ) + return + return cfg[arg_name] + + super(PipestatManagerORM, self).__init__() + self[CONFIG_KEY] = YacAttMap() + # read config or config data + config = config or os.getenv(ENV_VARS["config"]) + if config is not None: + if isinstance(config, str): + config = os.path.abspath(expandpath(config)) + self[CONFIG_KEY] = YacAttMap(filepath=config) + self._config_path = config + elif isinstance(config, dict): + self[CONFIG_KEY] = YacAttMap(entries=config) + self._config_path = None + else: + raise TypeError( + "database_config has to be either path to the " + "file to read or a dict" + ) + # validate config + cfg = self[CONFIG_KEY].to_dict(expand=True) + _, cfg_schema = read_yaml_data(CFG_SCHEMA, "config schema") + validate(cfg, cfg_schema) + + self[NAME_KEY] = _select_value( + "namespace", namespace, self[CONFIG_KEY], env_var=ENV_VARS["namespace"] + ) + self[RECORD_ID_KEY] = _select_value( + "record_identifier", + record_identifier, + self[CONFIG_KEY], + False, + ENV_VARS["record_identifier"], + ) + self[DB_ONLY_KEY] = database_only + # read results schema + self._schema_path = _select_value( + "schema_path", + schema_path, + self[CONFIG_KEY], + False, + env_var=ENV_VARS["schema"], + ) + if self._schema_path is not None: + _, self[SCHEMA_KEY] = read_yaml_data( + _mk_abs_via_cfg(self._schema_path, self.config_path), "schema" + ) + self.validate_schema() + # determine the highlighted results + self[HIGHLIGHTED_KEY] = [ + k + for k, v in self.schema.items() + if "highlight" in v and v["highlight"] is True + ] + if self[HIGHLIGHTED_KEY]: + assert isinstance(self[HIGHLIGHTED_KEY], list), TypeError( + f"highlighted results specification " + f"({self[HIGHLIGHTED_KEY]}) has to be a list" + ) + # read status schema + status_schema_path = ( + _mk_abs_via_cfg( + _select_value( + "status_schema_path", + status_schema_path, + self[CONFIG_KEY], + False, + env_var=ENV_VARS["status_schema"], + ), + self.config_path, + ) + or STATUS_SCHEMA + ) + self[STATUS_SCHEMA_SOURCE_KEY], self[STATUS_SCHEMA_KEY] = read_yaml_data( + status_schema_path, "status schema" + ) + # determine results file + results_file_path = _mk_abs_via_cfg( + _select_value( + "results_file_path", + results_file_path, + self[CONFIG_KEY], + False, + ENV_VARS["results_file"], + ), + self.config_path, + ) + if results_file_path: + if self[DB_ONLY_KEY]: + raise ValueError( + "Running in database only mode does not make " + "sense with a YAML file as a backend." 
+ ) + self[FILE_KEY] = results_file_path + self._init_results_file() + flag_file_dir = _select_value( + "flag_file_dir", flag_file_dir, self[CONFIG_KEY], False + ) or os.path.dirname(self.file) + self[STATUS_FILE_DIR] = _mk_abs_via_cfg(flag_file_dir, self.config_path) + elif CFG_DATABASE_KEY in self[CONFIG_KEY]: + if not all( + [ + _check_cfg_key(self[CONFIG_KEY][CFG_DATABASE_KEY], key) + for key in DB_CREDENTIALS + ] + ): + raise MissingConfigDataError( + "Must specify all database login " "credentials or result_file_path" + ) + self[DATA_KEY] = YacAttMap() + self._init_postgres_table() + # self._init_status_table() + else: + raise MissingConfigDataError( + "Must specify either database login " "credentials or a YAML file path" + ) + + def _get_flag_file( + self, record_identifier: str = None + ) -> Union[str, List[str], None]: + """ + Get path to the status flag file for the specified record + + :param str record_identifier: unique record identifier + :return str | list[str] | None: path to the status flag file + """ + from glob import glob + + r_id = self._strict_record_id(record_identifier) + if self.file is None: + return + if self.file is not None: + regex = os.path.join( + self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_*.flag" + ) + file_list = glob(regex) + if len(file_list) > 1: + _LOGGER.warning("Multiple flag files found") + return file_list + elif len(file_list) == 1: + return file_list[0] + else: + _LOGGER.debug("No flag files found") + return None + + @property + def record_count(self) -> int: + """ + Number of records reported + + :return int: number of records reported + """ + return ( + len(self.data[self.namespace]) + if self.file + else self._count_rows(self.namespace) + ) + + @property + def highlighted_results(self) -> List[str]: + """ + Highlighted results + + :return List[str]: a collection of highlighted results + """ + return self._get_attr(HIGHLIGHTED_KEY) or [] + + @property + def namespace(self) -> str: + """ + Namespace the object writes the results to + + :return str: namespace the object writes the results to + """ + return self._get_attr(NAME_KEY) + + @property + def record_identifier(self) -> str: + """ + Unique identifier of the record + + :return str: unique identifier of the record + """ + return self._get_attr(RECORD_ID_KEY) + + @property + def schema(self) -> Dict: + """ + Schema mapping + + :return dict: schema that formalizes the results structure + """ + return self._get_attr(SCHEMA_KEY) + + @property + def status_schema(self) -> Dict: + """ + Status schema mapping + + :return dict: schema that formalizes the pipeline status structure + """ + return self._get_attr(STATUS_SCHEMA_KEY) + + @property + def status_schema_source(self) -> Dict: + """ + Status schema source + + :return dict: source of the schema that formalizes + the pipeline status structure + """ + return self._get_attr(STATUS_SCHEMA_SOURCE_KEY) + + @property + def schema_path(self) -> str: + """ + Schema path + + :return str: path to the provided schema + """ + return self._schema_path + + @property + def config_path(self) -> str: + """ + Config path. 
None if the config was not provided or if provided
        as a mapping of the config contents

        :return str: path to the provided config
        """
        return getattr(self, "_config_path", None)

    @property
    def result_schemas(self) -> Dict:
        """
        Result schema mappings

        :return dict: schemas that formalize the structure of each result
            in a canonical jsonschema way
        """
        return self._get_attr(RES_SCHEMAS_KEY)

    @property
    def file(self) -> str:
        """
        File path that the object is reporting the results into

        :return str: file path that the object is reporting the results into
        """
        return self._get_attr(FILE_KEY)

    @property
    def data(self) -> YacAttMap:
        """
        Data object

        :return yacman.YacAttMap: the object that stores the reported data
        """
        return self._get_attr(DATA_KEY)

    @property
    @contextmanager
    def session(self):
        """
        Provide a transactional scope around a series of query
        operations, no commit afterwards.
        """
        if not self.is_db_connected():
            self.establish_postgres_connection_orm()
        with self[DB_SESSION_KEY]() as session:
            _LOGGER.debug("Created session")
            yield session
            _LOGGER.debug("Ending session")

    def _create_table_orm(self, table_name: str, schema: Dict[str, Any]):
        """
        Create a table

        :param str table_name: name of the table to create
        :param Dict[str, Any] schema: schema to base table creation on
        """
        tn = table_name or self.namespace
        attr_dict = dict(__tablename__=tn, id=Column(Integer, primary_key=True))
        for result_id, result_metadata in schema.items():
            col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]]
            _LOGGER.info(f"Adding object: {result_id} of type: {str(col_type)}")
            attr_dict.update({result_id: Column(col_type)})
        _LOGGER.debug(f"Creating '{tn}' ORM with args: {attr_dict}")
        Base = declarative_base()
        type(tn, (Base,), attr_dict)
        Base.metadata.create_all(bind=self[DB_ENGINE_KEY])

    def establish_postgres_connection_orm(self) -> bool:
        """
        Establish PostgreSQL connection using the config data

        :return bool: whether the connection has been established successfully
        """
        if self.is_db_connected():
            raise PipestatDatabaseError("Connection is already established")

        self[DB_ENGINE_KEY] = create_engine(self.db_url, echo=True)
        self[DB_SESSION_KEY] = sessionmaker(bind=self[DB_ENGINE_KEY])

    def is_db_connected(self) -> bool:
        """
        Check whether a PostgreSQL connection has been established

        :return bool: whether the connection has been established
        """
        if self.file is not None:
            raise PipestatDatabaseError(
                f"The {self.__class__.__name__} object is not backed by a database"
            )
        if DB_SESSION_KEY in self and isinstance(self[DB_SESSION_KEY], sessionmaker):
            return True
        return False

    @property
    def db_url(self) -> str:
        """
        Database URL, generated based on config credentials

        :return str: database URL
        """
        try:
            creds = dict(
                name=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_NAME_KEY],
                user=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_USER_KEY],
                passwd=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PASSWORD_KEY],
                host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY],
                port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY],
                dialect=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_DIALECT_KEY],
            )
        except (KeyError, AttributeError) as e:
            raise PipestatDatabaseError(
                f"Could not determine database URL. Caught error: {str(e)}"
            )
        # parsed_creds = {k: quote_plus(v) for k, v in creds.items()}
        parsed_creds = creds
        return "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(**parsed_creds)

    def validate_schema(self) -> None:
        """
        Check schema for any possible issues

        :raises SchemaError: if any schema format issue is detected
        """

        def _recursively_replace_custom_types(s: dict) -> Dict:
            """
            Replace the custom types in pipestat schema with canonical types

            :param dict s: schema to replace types in
            :return dict: schema with types replaced
            """
            for k, v in s.items():
                assert SCHEMA_TYPE_KEY in v, SchemaError(
                    f"Result '{k}' is missing '{SCHEMA_TYPE_KEY}' key"
                )
                if v[SCHEMA_TYPE_KEY] == "object" and SCHEMA_PROP_KEY in s[k]:
                    _recursively_replace_custom_types(s[k][SCHEMA_PROP_KEY])
                if v[SCHEMA_TYPE_KEY] in CANONICAL_TYPES.keys():
                    s.setdefault(k, {})
                    s[k].setdefault(SCHEMA_PROP_KEY, {})
                    s[k][SCHEMA_PROP_KEY].update(
                        CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][SCHEMA_PROP_KEY]
                    )
                    s[k].setdefault("required", [])
                    s[k]["required"].extend(
                        CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]]["required"]
                    )
                    s[k][SCHEMA_TYPE_KEY] = CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][
                        SCHEMA_TYPE_KEY
                    ]
            return s

        schema = deepcopy(self.schema)
        _LOGGER.debug(f"Validating input schema")
        assert isinstance(schema, dict), SchemaError(
            f"The schema has to be a {dict().__class__.__name__}"
        )
        for col_name in RESERVED_COLNAMES:
            assert col_name not in schema.keys(), PipestatError(
                f"'{col_name}' is an identifier reserved by pipestat"
            )
        self[RES_SCHEMAS_KEY] = {}
        schema = _recursively_replace_custom_types(schema)
        self[RES_SCHEMAS_KEY] = schema

    def _init_postgres_table(self) -> bool:
        """
        Initialize a PostgreSQL table based on the provided schema,
        if it does not exist. Read the data stored in the database into the
        memory otherwise.
+ + :return bool: whether the table has been created + """ + if self.schema is None: + raise SchemaNotFoundError("initialize the database table") + if self._check_table_exists(table_name=self.namespace): + _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") + if not self[DB_ONLY_KEY]: + self._table_to_dict() + return False + _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") + self._create_table_orm(table_name=self.namespace, schema=self.result_schemas) + return True + + def _init_status_table(self): + status_table_name = f"{self.namespace}_{STATUS}" + # self._create_status_type() + if not self._check_table_exists(table_name=status_table_name): + _LOGGER.info( + f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" + ) + self._create_table_orm( + table_name=status_table_name, schema=self.status_schema + ) + + def _get_attr(self, attr: str) -> Any: + """ + Safely get the name of the selected attribute of this object + + :param str attr: attr to select + :return: + """ + return self[attr] if attr in self else None + + def _check_table_exists(self, table_name: str) -> bool: + """ + Check if the specified table exists + + :param str table_name: table name to be checked + :return bool: whether the specified table exists + """ + from sqlalchemy import inspect + + with self.session as s: + return inspect(s.bind).has_table(table_name=table_name) From 179f6dcef35eedb413842eb3476f15b07e0d7d89 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 8 Apr 2021 09:37:29 -0400 Subject: [PATCH 04/65] fix db credentials parsing, other minor changes --- pipestat/const.py | 16 ++++++++++++++++ pipestat/pipestat_orm.py | 6 ++++-- tests/data/config.yaml | 1 + 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pipestat/const.py b/pipestat/const.py index 3c72253a..1a0c97a0 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -1,5 +1,7 @@ import os +from sqlalchemy.types import ARRAY, JSON, Boolean, Float, Integer, String + PKG_NAME = "pipestat" LOCK_PREFIX = "lock." REPORT_CMD = "report" @@ -43,6 +45,7 @@ CFG_PORT_KEY = "port" CFG_PASSWORD_KEY = "password" CFG_USER_KEY = "user" +CFG_DIALECT_KEY = "dialect" # sqlite, mysql, postgresql, oracle, or mssql DB_CREDENTIALS = [ CFG_HOST_KEY, @@ -66,6 +69,8 @@ FILE_KEY = "_file" RECORD_ID_KEY = "_record_id" DB_CONNECTION_KEY = "_db_connnection" +DB_SESSION_KEY = "_db_session" +DB_ENGINE_KEY = "_db_engine" HIGHLIGHTED_KEY = "_highlighted" # schema keys @@ -127,6 +132,17 @@ "boolean": bool, } +SQL_CLASSES_BY_TYPE = { + "number": Float, + "integer": Integer, + "object": JSON, + "image": JSON, + "file": JSON, + "string": String, + "array": ARRAY(String), + "boolean": Boolean, +} + CFG_SCHEMA = os.path.join( os.path.dirname(os.path.abspath(__file__)), "schemas", "pipestat_config_schema.yaml" ) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index 99556f52..6d8248df 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -413,6 +413,7 @@ def establish_postgres_connection_orm(self) -> bool: self[DB_ENGINE_KEY] = create_engine(self.db_url, echo=True) self[DB_SESSION_KEY] = sessionmaker(bind=self[DB_ENGINE_KEY]) + return True def is_db_connected(self) -> bool: """ @@ -448,8 +449,9 @@ def db_url(self) -> str: raise PipestatDatabaseError( f"Could not determine database URL. 
Caught error: {str(e)}" ) - # parsed_creds = {k: quote_plus(v) for k, v in creds.items()} - parsed_creds = creds + import urllib.parse + + parsed_creds = {k: urllib.parse.quote_plus(str(v)) for k, v in creds.items()} return "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(**parsed_creds) def validate_schema(self) -> None: diff --git a/tests/data/config.yaml b/tests/data/config.yaml index 1538ae9f..1e619592 100644 --- a/tests/data/config.yaml +++ b/tests/data/config.yaml @@ -2,6 +2,7 @@ namespace: test record_identifier: sample1 schema_path: sample_output_schema.yaml #$HOME/Desktop/sample_output_schema.yaml database: + dialect: postgresql name: pipestat-test user: postgres password: pipestat-password From 63f073f2548cdaae93d8111fb4e2de07eaf79818 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 8 Apr 2021 09:43:53 -0400 Subject: [PATCH 05/65] restrict DB dialects in schema --- pipestat/pipestat_orm.py | 4 +--- pipestat/schemas/pipestat_config_schema.yaml | 6 +++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index 6d8248df..e123ba47 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -449,9 +449,7 @@ def db_url(self) -> str: raise PipestatDatabaseError( f"Could not determine database URL. Caught error: {str(e)}" ) - import urllib.parse - - parsed_creds = {k: urllib.parse.quote_plus(str(v)) for k, v in creds.items()} + parsed_creds = {k: quote_plus(str(v)) for k, v in creds.items()} return "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(**parsed_creds) def validate_schema(self) -> None: diff --git a/pipestat/schemas/pipestat_config_schema.yaml b/pipestat/schemas/pipestat_config_schema.yaml index ee11bd51..03670227 100644 --- a/pipestat/schemas/pipestat_config_schema.yaml +++ b/pipestat/schemas/pipestat_config_schema.yaml @@ -1,6 +1,6 @@ description: "Pipestat configuration file schema; docs: http://pipestat.databio.org/en/latest/config/" properties: - name: + name: type: string pattern: "^\\S*$" description: "Namespace identifier with no whitespace" @@ -17,6 +17,10 @@ properties: database: type: object properties: + dialect: + type: string + enum: ["sqlite", "mysql", "postgresql", "oracle", "mssql"] + description: "Dialect names include the identifying name of the SQLAlchemy dialect, a name such as sqlite, mysql, postgresql, oracle, or mssql" name: type: string description: "DB name" From 96455c890e3396028c2b3e56df66168566b88a01 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 8 Apr 2021 11:00:54 -0400 Subject: [PATCH 06/65] status table initialization and schema processing --- pipestat/const.py | 4 ++++ pipestat/helpers.py | 18 ++++++++++++++++++ pipestat/pipestat_orm.py | 8 ++++---- pipestat/schemas/status_table_schema.yaml | 7 +++++++ 4 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 pipestat/schemas/status_table_schema.yaml diff --git a/pipestat/const.py b/pipestat/const.py index 1a0c97a0..b8438c7d 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -149,3 +149,7 @@ STATUS_SCHEMA = os.path.join( os.path.dirname(os.path.abspath(__file__)), "schemas", "status_schema.yaml" ) + +STATUS_TABLE_SCHEMA = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "schemas", "status_table_schema.yaml" +) diff --git a/pipestat/helpers.py b/pipestat/helpers.py index 44af23a3..d4cda351 100644 --- a/pipestat/helpers.py +++ b/pipestat/helpers.py @@ -1,5 +1,6 @@ import logging from re import findall +from typing import Any, Dict import jsonschema 
from oyaml import safe_load @@ -11,6 +12,23 @@ _LOGGER = logging.getLogger(__name__) +def get_status_table_schema(status_schema: Dict[str, Any]) -> Dict[str, Any]: + """ + Update and return a status_table_schema based on user-provided status schema + + :param Dict[str, Any] status_schema: status schema provided by the user + :return Dict[str, Any]: status_schema status table scheme + to use as a base for status table generation + """ + defined_status_codes = list(status_schema.keys()) + _, status_table_schema = read_yaml_data( + path=STATUS_TABLE_SCHEMA, what="status table schema" + ) + status_table_schema["status"].update({"enum": defined_status_codes}) + _LOGGER.debug(f"Updated status table schema: {status_table_schema}") + return status_table_schema + + def schema_to_columns(schema): """ Get a list of database table columns from a schema diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index e123ba47..eac36a0f 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -221,7 +221,7 @@ def _select_value( ) self[DATA_KEY] = YacAttMap() self._init_postgres_table() - # self._init_status_table() + self._init_status_table() else: raise MissingConfigDataError( "Must specify either database login " "credentials or a YAML file path" @@ -395,7 +395,7 @@ def _create_table_orm(self, table_name: str, schema: Dict[str, Any]): attr_dict = dict(__tablename__=tn, id=Column(Integer, primary_key=True)) for result_id, result_metadata in schema.items(): col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] - _LOGGER.info(f"Adding object: {result_id} of type: {str(col_type)}") + _LOGGER.debug(f"Adding object: {result_id} of type: {str(col_type)}") attr_dict.update({result_id: Column(col_type)}) _LOGGER.debug(f"Creating '{tn}' ORM with args: {attr_dict}") Base = declarative_base() @@ -521,13 +521,13 @@ def _init_postgres_table(self) -> bool: def _init_status_table(self): status_table_name = f"{self.namespace}_{STATUS}" - # self._create_status_type() if not self._check_table_exists(table_name=status_table_name): _LOGGER.info( f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" ) self._create_table_orm( - table_name=status_table_name, schema=self.status_schema + table_name=status_table_name, + schema=get_status_table_schema(status_schema=self.status_schema), ) def _get_attr(self, attr: str) -> Any: diff --git a/pipestat/schemas/status_table_schema.yaml b/pipestat/schemas/status_table_schema.yaml new file mode 100644 index 00000000..8bbcf2e8 --- /dev/null +++ b/pipestat/schemas/status_table_schema.yaml @@ -0,0 +1,7 @@ +record_identifier: + type: string + description: "record identifier" +status: + # this is updated with an 'enum' based on the status_schema provided by the user + type: string + description: "status code, only statuses defined in the status_schema are defined" From efd06b16b24a48f4c379dc8ae10cb7dbe5f5300f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 8 Apr 2021 13:22:59 -0400 Subject: [PATCH 07/65] multiple changes: - auto-generate repr for ORMs - store ORMs mapping in PipestatManager objs - add _count_rows method - status table --- pipestat/const.py | 2 ++ pipestat/pipestat_orm.py | 77 +++++++++++++++++++++++++++------------- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/pipestat/const.py b/pipestat/const.py index b8438c7d..0b75a84d 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -64,6 +64,8 @@ STATUS_SCHEMA_SOURCE_KEY = "_status_schema_source" STATUS_FILE_DIR = "_status_file_dir" 
RES_SCHEMAS_KEY = "_result_schemas" +DB_BASE_KEY = "_declarative_base" +DB_ORMS_KEY = "_orms" DATA_KEY = "_data" NAME_KEY = "_name" FILE_KEY = "_file" diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index eac36a0f..585d479b 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -27,7 +27,7 @@ class PipestatManagerORM(dict): pipeline can easily and reliably become an input for downstream analyses. The object exposes API for interacting with the results and pipeline status and can be backed by either a YAML-formatted file - or a PostgreSQL database. + or a database. """ def __init__( @@ -219,8 +219,10 @@ def _select_value( raise MissingConfigDataError( "Must specify all database login " "credentials or result_file_path" ) + self[DB_ORMS_KEY] = {} + self[DB_BASE_KEY] = declarative_base() self[DATA_KEY] = YacAttMap() - self._init_postgres_table() + self._init_db_table() self._init_status_table() else: raise MissingConfigDataError( @@ -378,7 +380,7 @@ def session(self): operations, no commit afterwards. """ if not self.is_db_connected(): - self.establish_postgres_connection_orm() + self.establish_db_connection_orm() with self[DB_SESSION_KEY]() as session: _LOGGER.debug("Created session") yield session @@ -391,20 +393,33 @@ def _create_table_orm(self, table_name: str, schema: Dict[str, Any]): :param str table_name: name of the table to create :param Dict[str, Any] schema: schema to base table creation on """ + + def _auto_repr(x: Any) -> str: + """ + Auto-generated __repr__ fun + + :param Any x: + :return str: string object representation + """ + attr_strs = [ + f"{k}={str(v)}" for k, v in x.__dict__.items() if not k.startswith("_") + ] + return "<{}: {}>".format(x.__class__.__name__, ", ".join(attr_strs)) + tn = table_name or self.namespace attr_dict = dict(__tablename__=tn, id=Column(Integer, primary_key=True)) for result_id, result_metadata in schema.items(): col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] _LOGGER.debug(f"Adding object: {result_id} of type: {str(col_type)}") attr_dict.update({result_id: Column(col_type)}) + attr_dict.update({"__repr__": _auto_repr}) _LOGGER.debug(f"Creating '{tn}' ORM with args: {attr_dict}") - Base = declarative_base() - type(tn, (Base,), attr_dict) - Base.metadata.create_all(bind=self[DB_ENGINE_KEY]) + self[DB_ORMS_KEY][tn] = type(tn.capitalize(), (self[DB_BASE_KEY],), attr_dict) + self[DB_BASE_KEY].metadata.create_all(bind=self[DB_ENGINE_KEY]) - def establish_postgres_connection_orm(self) -> bool: + def establish_db_connection_orm(self) -> bool: """ - Establish PostgreSQL connection using the config data + Establish DB connection using the config data :return bool: whether the connection has been established successfully """ @@ -417,7 +432,7 @@ def establish_postgres_connection_orm(self) -> bool: def is_db_connected(self) -> bool: """ - Check whether a PostgreSQL connection has been established + Check whether a DB connection has been established :return bool: whether the connection has been established """ @@ -500,9 +515,9 @@ def _recursively_replace_custom_types(s: dict) -> Dict: schema = _recursively_replace_custom_types(schema) self[RES_SCHEMAS_KEY] = schema - def _init_postgres_table(self) -> bool: + def _init_db_table(self) -> bool: """ - Initialize a PostgreSQL table based on the provided schema, + Initialize a database table based on the provided schema, if it does not exist. Read the data stored in the database into the memory otherwise. 
@@ -510,25 +525,29 @@ def _init_postgres_table(self) -> bool: """ if self.schema is None: raise SchemaNotFoundError("initialize the database table") - if self._check_table_exists(table_name=self.namespace): - _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") - if not self[DB_ONLY_KEY]: - self._table_to_dict() - return False + if not self.is_db_connected(): + self.establish_db_connection_orm() + # if self._check_table_exists(table_name=self.namespace): + # _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") + # if not self[DB_ONLY_KEY]: + # self._table_to_dict() + # # return False _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") self._create_table_orm(table_name=self.namespace, schema=self.result_schemas) return True def _init_status_table(self): status_table_name = f"{self.namespace}_{STATUS}" - if not self._check_table_exists(table_name=status_table_name): - _LOGGER.info( - f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" - ) - self._create_table_orm( - table_name=status_table_name, - schema=get_status_table_schema(status_schema=self.status_schema), - ) + if not self.is_db_connected(): + self.establish_db_connection_orm() + # if not self._check_table_exists(table_name=status_table_name): + _LOGGER.debug( + f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" + ) + self._create_table_orm( + table_name=status_table_name, + schema=get_status_table_schema(status_schema=self.status_schema), + ) def _get_attr(self, attr: str) -> Any: """ @@ -550,3 +569,13 @@ def _check_table_exists(self, table_name: str) -> bool: with self.session as s: return inspect(s.bind).has_table(table_name=table_name) + + def _count_rows(self, table_name: str) -> int: + """ + Count rows in a selected table + + :param str table_name: table to count rows for + :return int: number of rows in the selected table + """ + with self.session as s: + return s.query(self[DB_ORMS_KEY][table_name].id).count() From 5a89679fa6c60286301c134c0cc1441b8e8a8da0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 8 Apr 2021 14:50:36 -0400 Subject: [PATCH 08/65] add reporting skeleton, auto create record ID column --- pipestat/pipestat_orm.py | 75 ++++++++++++++++++++++- pipestat/schemas/status_table_schema.yaml | 3 - 2 files changed, 72 insertions(+), 6 deletions(-) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index 585d479b..4eea2306 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -8,7 +8,7 @@ from jsonschema import validate from sqlalchemy import Column, Float, ForeignKey, Integer, String, Table, create_engine from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, sessionmaker +from sqlalchemy.orm import DeclarativeMeta, relationship, sessionmaker from ubiquerg import create_lock, remove_lock from yacman import YacAttMap @@ -398,7 +398,7 @@ def _auto_repr(x: Any) -> str: """ Auto-generated __repr__ fun - :param Any x: + :param Any x: object to generate __repr__ method for :return str: string object representation """ attr_strs = [ @@ -407,7 +407,11 @@ def _auto_repr(x: Any) -> str: return "<{}: {}>".format(x.__class__.__name__, ", ".join(attr_strs)) tn = table_name or self.namespace - attr_dict = dict(__tablename__=tn, id=Column(Integer, primary_key=True)) + attr_dict = dict( + __tablename__=tn, + id=Column(Integer, primary_key=True), + record_identifier=Column(String, unique=True), + ) for result_id, result_metadata in 
schema.items(): col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] _LOGGER.debug(f"Adding object: {result_id} of type: {str(col_type)}") @@ -579,3 +583,68 @@ def _count_rows(self, table_name: str) -> int: """ with self.session as s: return s.query(self[DB_ORMS_KEY][table_name].id).count() + + def _get_orm(self, table_name: str = None) -> Any: + """ + Get an object relational mapper class + + :param str table_name: table name to get a class for + :return Any: Object relational mapper class + """ + if DB_ORMS_KEY not in self: + raise PipestatDatabaseError("Object relational mapper classes not defined") + tn = f"{table_name or self.namespace}" + if tn not in self[DB_ORMS_KEY]: + raise PipestatDatabaseError( + f"No object relational mapper class defined for table: {tn}" + ) + if not isinstance(self[DB_ORMS_KEY][tn], DeclarativeMeta): + raise PipestatDatabaseError( + f"Object relational mapper class for table '{tn}' is invalid" + ) + return self[DB_ORMS_KEY][tn] + + def check_record_exists( + self, record_identifier: str, table_name: str = None + ) -> bool: + """ + Check if the specified record exists in the table + + :param ste record_identifier: record to check for + :param str table_name: table name to check + :return bool: whether the record exists in the table + """ + with self.session as s: + return ( + s.query(self._get_orm(table_name).id) + .filter_by(record_identifier=record_identifier) + .first() + is not None + ) + + def _report( + self, value: Dict[str, Any], record_identifier: str, table_name: str = None + ) -> int: + """ + + + :param value: + :param record_identifier: + :param table_name: + :return: + """ + ORMClass = self._get_orm(table_name) + value.update({RECORD_ID: record_identifier}) + if not self.check_record_exists( + record_identifier=record_identifier, table_name=table_name + ): + x = ORMClass(**value) + with self.session as s: + s.add(x) + s.commit() + else: + with self.session as s: + s.query(ORMClass).filter( + getattr(ORMClass, RECORD_ID) == record_identifier + ).update(value) + s.commit() diff --git a/pipestat/schemas/status_table_schema.yaml b/pipestat/schemas/status_table_schema.yaml index 8bbcf2e8..530dfc07 100644 --- a/pipestat/schemas/status_table_schema.yaml +++ b/pipestat/schemas/status_table_schema.yaml @@ -1,6 +1,3 @@ -record_identifier: - type: string - description: "record identifier" status: # this is updated with an 'enum' based on the status_schema provided by the user type: string From 990925f3663f990df1682d566be4ba2757e497e8 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 8 Apr 2021 16:44:40 -0400 Subject: [PATCH 09/65] return row id when inserting/updating --- pipestat/pipestat_orm.py | 53 +++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index 4eea2306..efbfbd8c 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -386,6 +386,23 @@ def session(self): yield session _LOGGER.debug("Ending session") + def _strict_record_id(self, forced_value: str = None) -> str: + """ + Get record identifier from the outer source or stored with this object + + :param str forced_value: return this value + :return str: record identifier + """ + if forced_value is not None: + return forced_value + if self.record_identifier is not None: + return self.record_identifier + raise PipestatError( + f"You must provide the record identifier you want to perform " + f"the action on. 
Either in the {self.__class__.__name__} " + f"constructor or as an argument to the method." + ) + def _create_table_orm(self, table_name: str, schema: Dict[str, Any]): """ Create a table @@ -610,7 +627,7 @@ def check_record_exists( """ Check if the specified record exists in the table - :param ste record_identifier: record to check for + :param str record_identifier: record to check for :param str table_name: table name to check :return bool: whether the record exists in the table """ @@ -622,29 +639,37 @@ def check_record_exists( is not None ) - def _report( - self, value: Dict[str, Any], record_identifier: str, table_name: str = None + def _report_db( + self, values: Dict[str, Any], record_identifier: str, table_name: str = None ) -> int: """ + Report a result to a database - - :param value: - :param record_identifier: - :param table_name: - :return: + :param Dict[str, Any] values: values to report + :param str record_identifier: record to report the result for + :param str table_name: name of the table to report the result in + :return : """ + record_identifier = self._strict_record_id(record_identifier) ORMClass = self._get_orm(table_name) - value.update({RECORD_ID: record_identifier}) + values.update({RECORD_ID: record_identifier}) if not self.check_record_exists( record_identifier=record_identifier, table_name=table_name ): - x = ORMClass(**value) + new_record = ORMClass(**values) with self.session as s: - s.add(x) + s.add(new_record) s.commit() + returned_id = new_record.id else: with self.session as s: - s.query(ORMClass).filter( - getattr(ORMClass, RECORD_ID) == record_identifier - ).update(value) + record_to_update = ( + s.query(ORMClass) + .filter(getattr(ORMClass, RECORD_ID) == record_identifier) + .first() + ) + for k, v in values.items(): + setattr(record_to_update, k, v) s.commit() + returned_id = record_to_update.id + return returned_id From 9c0547de51d70207ca3cf224d2d126e6b0385235 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 9 Apr 2021 13:31:33 -0400 Subject: [PATCH 10/65] add check_results_exist method --- pipestat/pipestat_orm.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index efbfbd8c..0976b81f 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -639,6 +639,25 @@ def check_record_exists( is not None ) + def check_results_exist( + self, results: List[str], rid: str = None, table_name: str = None + ) -> List[str]: + """ + Check if the specified record exists in the table + + :param str rid: record to check for + :param List[str] results: results identifiers to check for + :return List[str]: results identifiers that exist + """ + table_name = table_name or self.namespace + with self.session as s: + record = ( + s.query(self._get_orm(table_name)) + .filter_by(record_identifier=rid) + .first() + ) + return [r for r in results if getattr(record, r, None) is not None] + def _report_db( self, values: Dict[str, Any], record_identifier: str, table_name: str = None ) -> int: @@ -648,7 +667,7 @@ def _report_db( :param Dict[str, Any] values: values to report :param str record_identifier: record to report the result for :param str table_name: name of the table to report the result in - :return : + :return int: updated/inserted row """ record_identifier = self._strict_record_id(record_identifier) ORMClass = self._get_orm(table_name) @@ -668,8 +687,8 @@ def _report_db( .filter(getattr(ORMClass, RECORD_ID) == record_identifier) .first() ) - for k, v 
in values.items(): - setattr(record_to_update, k, v) + for result_id, result_value in values.items(): + setattr(record_to_update, result_id, result_value) s.commit() returned_id = record_to_update.id return returned_id From 175f9c650cce684a0ae04858949e474db74746c8 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 9 Apr 2021 14:03:00 -0400 Subject: [PATCH 11/65] implement _retrieve_db method --- pipestat/pipestat_orm.py | 53 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index 0976b81f..8b3f8244 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -643,13 +643,15 @@ def check_results_exist( self, results: List[str], rid: str = None, table_name: str = None ) -> List[str]: """ - Check if the specified record exists in the table + Check if the specified results exist in the table :param str rid: record to check for :param List[str] results: results identifiers to check for + :param str table_name: name of the table to search for results in :return List[str]: results identifiers that exist """ table_name = table_name or self.namespace + rid = self._strict_record_id(rid) with self.session as s: record = ( s.query(self._get_orm(table_name)) @@ -658,6 +660,55 @@ def check_results_exist( ) return [r for r in results if getattr(record, r, None) is not None] + def _retrieve_db( + self, + result_identifier: str = None, + record_identifier: str = None, + table_name: str = None, + ) -> Dict[str, Any]: + """ + Retrieve a result for a record. + + If no result ID specified, results for the entire record will + be returned. + + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to be retrieved + :param str table_name: name of the table to search for results in + :return Dict[str, any]: a single result or a mapping with all the results + reported for the record + """ + table_name = table_name or self.namespace + record_identifier = self._strict_record_id(record_identifier) + if result_identifier is not None: + existing = self.check_results_exist( + results=[result_identifier], + rid=record_identifier, + table_name=table_name, + ) + if not existing: + raise PipestatDatabaseError( + f"Result '{result_identifier}' not found for record " + f"'{record_identifier}'" + ) + + with self.session as s: + record = ( + s.query(self._get_orm(table_name)) + .filter_by(record_identifier=record_identifier) + .first() + ) + + if record is not None: + if result_identifier is not None: + return {result_identifier: getattr(record, result_identifier)} + return { + column: getattr(record, column) + for column in [c.name for c in record.__table__.columns] + if getattr(record, column, None) is not None + } + raise PipestatDatabaseError(f"Record '{record_identifier}' not found") + def _report_db( self, values: Dict[str, Any], record_identifier: str, table_name: str = None ) -> int: From a67bf393bebd315326de8121afc415fca4fd9e35 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 9 Apr 2021 15:07:37 -0400 Subject: [PATCH 12/65] implement _remove_db and check_result_exists methods --- pipestat/pipestat_orm.py | 74 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index 8b3f8244..3de4625a 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -639,7 +639,7 @@ def check_record_exists( is not None ) - def 
check_results_exist( + def check_which_results_exist( self, results: List[str], rid: str = None, table_name: str = None ) -> List[str]: """ @@ -660,6 +660,26 @@ def check_results_exist( ) return [r for r in results if getattr(record, r, None) is not None] + def check_result_exists( + self, + result_identifier: str, + record_identifier: str = None, + table_name: str = None, + ) -> bool: + """ + Check if the result has been reported + + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to check + :param str table_name: name of the table to search for results in + :return bool: whether the specified result has been reported for the + indicated record in current namespace + """ + record_identifier = self._strict_record_id(record_identifier) + return self.check_which_results_exist( + results=[result_identifier], rid=record_identifier, table_name=table_name + ) + def _retrieve_db( self, result_identifier: str = None, @@ -681,7 +701,7 @@ def _retrieve_db( table_name = table_name or self.namespace record_identifier = self._strict_record_id(record_identifier) if result_identifier is not None: - existing = self.check_results_exist( + existing = self.check_which_results_exist( results=[result_identifier], rid=record_identifier, table_name=table_name, @@ -743,3 +763,53 @@ def _report_db( s.commit() returned_id = record_to_update.id return returned_id + + def _remove_db( + self, + record_identifier: str = None, + result_identifier: str = None, + table_name: str = None, + ) -> bool: + """ + Remove a result. + + If no result ID specified or last result is removed, the entire record + will be removed. + + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to be removed or None + if the record should be removed. 
+ :param str table_name: name of the table to report the result in + :return bool: whether the result has been removed + :raise PipestatDatabaseError: if either record or result specified are not found + """ + table_name = table_name or self.namespace + record_identifier = self._strict_record_id(record_identifier) + ORMClass = self._get_orm(table_name=table_name) + if self.check_record_exists( + record_identifier=record_identifier, table_name=table_name + ): + with self.session as s: + record = ( + s.query(ORMClass) + .filter(getattr(ORMClass, RECORD_ID) == record_identifier) + .first() + ) + if result_identifier is None: + # delete row + record.delete() + else: + # set the value to None + if not self.check_result_exists( + record_identifier=record_identifier, + result_identifier=result_identifier, + table_name=table_name, + ): + raise PipestatDatabaseError( + f"Result '{result_identifier}' not found for record " + f"'{record_identifier}'" + ) + setattr(record, result_identifier, None) + s.commit() + else: + raise PipestatDatabaseError(f"Record '{record_identifier}' not found") From 661ccadddb9df020515b0397184233e6f9d8a942 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 9 Apr 2021 15:37:57 -0400 Subject: [PATCH 13/65] implement status management --- pipestat/pipestat_orm.py | 155 ++++++++++++++++++++++++++++++++++----- 1 file changed, 138 insertions(+), 17 deletions(-) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py index 3de4625a..01f5dfb1 100644 --- a/pipestat/pipestat_orm.py +++ b/pipestat/pipestat_orm.py @@ -372,6 +372,29 @@ def data(self) -> YacAttMap: """ return self._get_attr(DATA_KEY) + @property + def db_url(self) -> str: + """ + Database URL, generated based on config credentials + + :return str: database URL + """ + try: + creds = dict( + name=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_NAME_KEY], + user=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_USER_KEY], + passwd=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PASSWORD_KEY], + host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY], + port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY], + dialect=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_DIALECT_KEY], + ) + except (KeyError, AttributeError) as e: + raise PipestatDatabaseError( + f"Could not determine database URL. Caught error: {str(e)}" + ) + parsed_creds = {k: quote_plus(str(v)) for k, v in creds.items()} + return "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(**parsed_creds) + @property @contextmanager def session(self): @@ -465,28 +488,126 @@ def is_db_connected(self) -> bool: return True return False - @property - def db_url(self) -> str: + def set_status(self, status_identifier: str, record_identifier: str = None) -> None: """ - Database URL, generated based on config credentials + Set pipeline run status. - :return str: database URL + The status identifier needs to match one of identifiers specified in + the status schema. A basic, ready to use, status schema is shipped with + this package. 
+ + :param str status_identifier: status to set, one of statuses defined + in the status schema + :param str record_identifier: record identifier to set the + pipeline status for """ - try: - creds = dict( - name=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_NAME_KEY], - user=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_USER_KEY], - passwd=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PASSWORD_KEY], - host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY], - port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY], - dialect=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_DIALECT_KEY], + r_id = self._strict_record_id(record_identifier) + known_status_identifiers = self.status_schema.keys() + if status_identifier not in known_status_identifiers: + raise PipestatError( + f"'{status_identifier}' is not a defined status identifier. " + f"These are allowed: {known_status_identifiers}" ) - except (KeyError, AttributeError) as e: - raise PipestatDatabaseError( - f"Could not determine database URL. Caught error: {str(e)}" + prev_status = self.get_status(r_id) + if self.file is not None: + if prev_status: + prev_flag_path = self.get_status_flag_path(prev_status, r_id) + os.remove(prev_flag_path) + flag_path = self.get_status_flag_path(status_identifier, r_id) + create_lock(flag_path) + with open(flag_path, "w") as f: + f.write(status_identifier) + remove_lock(flag_path) + else: + try: + self._report_db( + values={STATUS: status_identifier}, + record_identifier=r_id, + table_name=f"{self.namespace}_{STATUS}", + ) + except Exception as e: + _LOGGER.error(f"Could not insert into the status table. Exception: {e}") + raise + if prev_status: + _LOGGER.debug( + f"Changed status from '{prev_status}' to '{status_identifier}'" ) - parsed_creds = {k: quote_plus(str(v)) for k, v in creds.items()} - return "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(**parsed_creds) + + def get_status(self, record_identifier: str = None) -> Optional[str]: + """ + Get the current pipeline status + + :return str: status identifier, like 'running' + """ + r_id = self._strict_record_id(record_identifier) + if self.file is None: + try: + result = self._retrieve_db( + result_identifier=STATUS, + record_identifier=r_id, + table_name=f"{self.namespace}_{STATUS}", + ) + except PipestatDatabaseError: + return None + return result[STATUS] + else: + flag_file = self._get_flag_file(record_identifier=r_id) + if flag_file is not None: + assert isinstance(flag_file, str), TypeError( + "Flag file path is expected to be a str, were multiple flags found?" + ) + with open(flag_file, "r") as f: + status = f.read() + return status + _LOGGER.debug( + f"Could not determine status for '{r_id}' record. " + f"No flags found in: {self[STATUS_FILE_DIR]}" + ) + return None + + def clear_status( + self, record_identifier: str = None, flag_names: List[str] = None + ) -> List[str]: + """ + Remove status flags + + :param str record_identifier: name of the record to remove flags for + :param Iterable[str] flag_names: Names of flags to remove, optional; if + unspecified, all schema-defined flag names will be used. 
+ :return List[str]: Collection of names of flags removed + """ + r_id = self._strict_record_id(record_identifier) + if self.file is not None: + flag_names = flag_names or list(self.status_schema.keys()) + if isinstance(flag_names, str): + flag_names = [flag_names] + removed = [] + for f in flag_names: + path_flag_file = self.get_status_flag_path( + status_identifier=f, record_identifier=r_id + ) + try: + os.remove(path_flag_file) + except: + pass + else: + _LOGGER.info(f"Removed existing flag: {path_flag_file}") + removed.append(f) + return removed + else: + removed = self.get_status(r_id) + try: + self._remove_db( + record_identifier=r_id, + table_name=f"{self.namespace}_{STATUS}", + ) + except Exception as e: + _LOGGER.error( + f"Could not remove the status from the database. Exception: {e}" + ) + return [] + else: + return [removed] def validate_schema(self) -> None: """ From f5128deda6b8ebcb4f949cffb7c56c4cf64ad0a3 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 12 Apr 2021 13:41:34 -0400 Subject: [PATCH 14/65] fix versions --- requirements/requirements-all.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index e0d427df..ab2db6b7 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -3,5 +3,6 @@ jsonschema logmuse>=0.2.5 oyaml psycopg2-binary +sqlalchemy>=1.4.7 ubiquerg>=0.6.1 -yacman>=0.8.0 +yacman>=0.8.1 From 1b49f9e4254b4b85b011552c86e381070fce7d86 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 13 Apr 2021 09:49:45 -0400 Subject: [PATCH 15/65] implement file support, rename files, major refactoring --- docs_jupyter/python_api.ipynb | 21 +- pipestat/pipestat.py | 1367 ++++++++++++++++----------------- pipestat/pipestat_dirver.py | 1270 ++++++++++++++++++++++++++++++ pipestat/pipestat_orm.py | 936 ---------------------- 4 files changed, 1963 insertions(+), 1631 deletions(-) create mode 100644 pipestat/pipestat_dirver.py delete mode 100644 pipestat/pipestat_orm.py diff --git a/docs_jupyter/python_api.ipynb b/docs_jupyter/python_api.ipynb index f70bce46..77b349fd 100644 --- a/docs_jupyter/python_api.ipynb +++ b/docs_jupyter/python_api.ipynb @@ -22,6 +22,7 @@ "outputs": [], "source": [ "import pipestat\n", + "from pipestat.pipestat_orm import PipestatManagerORM as PipestatManager, SchemaNotFoundError\n", "from jsonschema import ValidationError" ] }, @@ -57,7 +58,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmpa6poai5_.yaml\n" + "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmp9cozi7pc.yaml\n" ] } ], @@ -81,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "psm = pipestat.PipestatManager(\n", + "psm = PipestatManager(\n", " namespace=\"test\",\n", " record_identifier=\"sample1\",\n", " results_file_path=temp_file,\n", @@ -460,7 +461,7 @@ "metadata": {}, "outputs": [], "source": [ - "psm1 = pipestat.PipestatManager(\n", + "psm1 = PipestatManager(\n", " namespace=\"test\",\n", " record_identifier=\"sample1\",\n", " results_file_path=temp_file,\n", @@ -508,7 +509,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmpa6poai5_.yaml\n", + "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmp9cozi7pc.yaml\n", "test:\n", " sample1:\n", " output_file:\n", @@ -828,7 +829,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmpf9m81e1n.yaml\n" + 
"/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmpsfzryntz.yaml\n" ] } ], @@ -838,7 +839,7 @@ "_, temp_file_highlight = mkstemp(suffix=\".yaml\")\n", "print(temp_file_highlight)\n", "\n", - "psm_highlight = pipestat.PipestatManager(\n", + "psm_highlight = PipestatManager(\n", " namespace=\"test_highlight\",\n", " record_identifier=\"sample1\",\n", " results_file_path=temp_file_highlight,\n", @@ -1035,7 +1036,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmpq81inuvn.yaml\n" + "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmpxwy566e3.yaml\n" ] } ], @@ -1043,7 +1044,7 @@ "_, temp_file_no_schema = mkstemp(suffix=\".yaml\")\n", "print(temp_file_no_schema)\n", "\n", - "psm_no_schema = pipestat.PipestatManager(\n", + "psm_no_schema = PipestatManager(\n", " namespace=\"test_no_schema\", results_file_path=temp_file_no_schema\n", ")" ] @@ -1073,7 +1074,7 @@ "source": [ "try:\n", " psm_no_schema.report(record_identifier=\"sample1\", values={\"key\": \"val\"})\n", - "except pipestat.SchemaNotFoundError as e:\n", + "except SchemaNotFoundError as e:\n", " print(e)" ] }, @@ -1122,7 +1123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.9.4" } }, "nbformat": 4, diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index f3f75262..f3149fcc 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -2,12 +2,13 @@ from copy import deepcopy from logging import getLogger from typing import Any, Dict, List, Optional, Union +from urllib.parse import quote_plus -import psycopg2 from attmap import PathExAttMap as PXAM from jsonschema import validate -from psycopg2.extensions import connection -from psycopg2.extras import DictCursor, Json +from sqlalchemy import Column, Float, ForeignKey, Integer, String, Table, create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import DeclarativeMeta, relationship, sessionmaker from ubiquerg import create_lock, remove_lock from yacman import YacAttMap @@ -18,25 +19,6 @@ _LOGGER = getLogger(PKG_NAME) -class LoggingCursor(psycopg2.extras.DictCursor): - """ - Logging db cursor - """ - - def execute(self, query, vars=None): - """ - Execute a database operation (query or command) and issue a debug - and info level log messages - - :param query: - :param vars: - :return: - """ - _LOGGER.debug(f"Executing query: {self.mogrify(query, vars)}") - super(LoggingCursor, self).execute(query=query, vars=vars) - _LOGGER.debug(f"Executed query: {self.query}") - - class PipestatManager(dict): """ Pipestat standardizes reporting of pipeline results and @@ -45,7 +27,7 @@ class PipestatManager(dict): pipeline can easily and reliably become an input for downstream analyses. The object exposes API for interacting with the results and pipeline status and can be backed by either a YAML-formatted file - or a PostgreSQL database. + or a database. 
""" def __init__( @@ -237,31 +219,16 @@ def _select_value( raise MissingConfigDataError( "Must specify all database login " "credentials or result_file_path" ) + self[DB_ORMS_KEY] = {} + self[DB_BASE_KEY] = declarative_base() self[DATA_KEY] = YacAttMap() - self._init_postgres_table() + self._init_db_table() self._init_status_table() else: raise MissingConfigDataError( "Must specify either database login " "credentials or a YAML file path" ) - def __str__(self): - """ - Generate string representation of the object - - :return str: string representation of the object - """ - res = f"{self.__class__.__name__} ({self.namespace})" - res += "\nBackend: {}".format( - f"file ({self.file})" if self.file else "PostgreSQL" - ) - res += f"\nResults schema source: {self.schema_path}" - res += f"\nStatus schema source: {self.status_schema_source}" - res += f"\nRecords count: {self.record_count}" - if self.highlighted_results: - res += f"\nHighlighted results: {', '.join(self.highlighted_results)}" - return res - def _get_flag_file( self, record_identifier: str = None ) -> Union[str, List[str], None]: @@ -290,15 +257,6 @@ def _get_flag_file( _LOGGER.debug("No flag files found") return None - @property - def highlighted_results(self) -> List[str]: - """ - Highlighted results - - :return List[str]: a collection of highlighted results - """ - return self._get_attr(HIGHLIGHTED_KEY) or [] - @property def record_count(self) -> int: """ @@ -312,6 +270,15 @@ def record_count(self) -> int: else self._count_rows(self.namespace) ) + @property + def highlighted_results(self) -> List[str]: + """ + Highlighted results + + :return List[str]: a collection of highlighted results + """ + return self._get_attr(HIGHLIGHTED_KEY) or [] + @property def namespace(self) -> str: """ @@ -406,302 +373,162 @@ def data(self) -> YacAttMap: return self._get_attr(DATA_KEY) @property - @contextmanager - def db_cursor(self): + def db_url(self) -> str: """ - Establish connection and get a PostgreSQL database cursor, - commit and close the connection afterwards + Database URL, generated based on config credentials - :return LoggingCursor: Database cursor object + :return str: database URL + :raise PipestatDatabaseError: if the object is not backed by a database """ + if self.file is not None: + raise PipestatDatabaseError( + "Can't determine database URL if the object is backed by a file" + ) try: - if not self.check_connection(): - self.establish_postgres_connection() - with self[DB_CONNECTION_KEY] as c, c.cursor( - cursor_factory=LoggingCursor - ) as cur: - yield cur - except Exception: - raise - finally: - self.close_postgres_connection() - - def get_status(self, record_identifier: str = None) -> Optional[str]: - """ - Get the current pipeline status - - :return str: status identifier, like 'running' - """ - r_id = self._strict_record_id(record_identifier) - if self.file is None: - with self.db_cursor as cur: - query = sql.SQL( - f"SELECT {STATUS} " - f"FROM {f'{self.namespace}_{STATUS}'} " - f"WHERE {RECORD_ID}=%s" - ) - cur.execute(query, (r_id,)) - result = cur.fetchone() - return result[0] if result is not None else None - else: - flag_file = self._get_flag_file(record_identifier=r_id) - if flag_file is not None: - assert isinstance(flag_file, str), TypeError( - "Flag file path is expected to be a str, were multiple flags found?" - ) - with open(flag_file, "r") as f: - status = f.read() - return status - _LOGGER.debug( - f"Could not determine status for '{r_id}' record. 
" - f"No flags found in: {self[STATUS_FILE_DIR]}" + creds = dict( + name=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_NAME_KEY], + user=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_USER_KEY], + passwd=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PASSWORD_KEY], + host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY], + port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY], + dialect=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_DIALECT_KEY], ) - return None - - def _get_attr(self, attr: str) -> Any: - """ - Safely get the name of the selected attribute of this object - - :param str attr: attr to select - :return: - """ - return self[attr] if attr in self else None + except (KeyError, AttributeError) as e: + raise PipestatDatabaseError( + f"Could not determine database URL. Caught error: {str(e)}" + ) + parsed_creds = {k: quote_plus(str(v)) for k, v in creds.items()} + return "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(**parsed_creds) - def _table_to_dict(self) -> None: + @property + @contextmanager + def session(self): """ - Create a dictionary from the database table data + Provide a transactional scope around a series of query + operations, no commit afterwards. """ - with self.db_cursor as cur: - cur.execute(f"SELECT * FROM {self.namespace}") - data = cur.fetchall() - _LOGGER.debug(f"Reading data from database for '{self.namespace}' namespace") - for record in data: - record_id = record[RECORD_ID] - for res_id, val in record.items(): - if val is not None: - self._report_data_element( - record_identifier=record_id, values={res_id: val} - ) + if not self.is_db_connected(): + self.establish_db_connection_orm() + with self[DB_SESSION_KEY]() as session: + _LOGGER.debug("Created session") + yield session + _LOGGER.debug("Ending session") - def _init_postgres_table(self) -> bool: + def _strict_record_id(self, forced_value: str = None) -> str: """ - Initialize a PostgreSQL table based on the provided schema, - if it does not exist. Read the data stored in the database into the - memory otherwise. 
+ Get record identifier from the outer source or stored with this object - :return bool: whether the table has been created + :param str forced_value: return this value + :return str: record identifier """ - if self.schema is None: - raise SchemaNotFoundError("initialize the database table") - if self._check_table_exists(table_name=self.namespace): - _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") - if not self[DB_ONLY_KEY]: - self._table_to_dict() - return False - _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") - columns = FIXED_COLUMNS + schema_to_columns(schema=self.schema) - self._create_table(table_name=self.namespace, columns=columns) - return True - - # def _create_status_type(self): - # with self.db_cursor as cur: - # s = sql.SQL(f"SELECT exists (SELECT 1 FROM pg_type WHERE typname = '{STATUS}');") - # cur.execute(s) - # if cur.fetchone()[0]: - # return - # with self.db_cursor as cur: - # status_strs = [f"'{st_id}'" for st_id in self.status_schema.keys()] - # status_str = ", ".join(status_strs) - # s = sql.SQL(f"CREATE TYPE {STATUS} as enum({status_str});") - # cur.execute(s) - - def _init_status_table(self): - status_table_name = f"{self.namespace}_{STATUS}" - # self._create_status_type() - if not self._check_table_exists(table_name=status_table_name): - _LOGGER.info( - f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" - ) - self._create_table(status_table_name, STATUS_TABLE_COLUMNS) + if forced_value is not None: + return forced_value + if self.record_identifier is not None: + return self.record_identifier + raise PipestatError( + f"You must provide the record identifier you want to perform " + f"the action on. Either in the {self.__class__.__name__} " + f"constructor or as an argument to the method." + ) - def _create_table(self, table_name: str, columns: List[str]): + def _create_table_orm(self, table_name: str, schema: Dict[str, Any]): """ Create a table :param str table_name: name of the table to create - :param str | List[str] columns: columns definition list, - for instance: ['name VARCHAR(50) NOT NULL'] + :param Dict[str, Any] schema: schema to base table creation on """ - columns = mk_list_of_str(columns) - with self.db_cursor as cur: - s = sql.SQL(f"CREATE TABLE {table_name} ({','.join(columns)})") - cur.execute(s) - def _init_results_file(self) -> bool: - """ - Initialize YAML results file if it does not exist. - Read the data stored in the existing file into the memory otherwise. 
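
`_create_table_orm`, taking shape here, builds one model class per namespace at runtime with `type()`, since the columns depend on the results schema. A self-contained sketch of that pattern (table and column names invented; the in-memory SQLite engine is only for illustration):

```python
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
attr_dict = dict(
    __tablename__="test",
    id=Column(Integer, primary_key=True),
    record_identifier=Column(String, unique=True),
    output_file=Column(String),  # one column per schema result
)
# the declarative metaclass turns this dict into a mapped class
Test = type("Test", (Base,), attr_dict)
engine = create_engine("sqlite://")
Base.metadata.create_all(bind=engine)
```
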
+ def _auto_repr(x: Any) -> str: + """ + Auto-generated __repr__ fun - :return bool: whether the file has been created - """ - if not os.path.exists(self.file): - _LOGGER.info(f"Initializing results file '{self.file}'") - data = YacAttMap(entries={self.namespace: "{}"}) - data.write(filepath=self.file) - data.make_readonly() - self[DATA_KEY] = data - return True - _LOGGER.debug(f"Reading data from '{self.file}'") - data = YacAttMap(filepath=self.file) - filtered = list(filter(lambda x: not x.startswith("_"), data.keys())) - if filtered and self.namespace not in filtered: - raise PipestatDatabaseError( - f"'{self.file}' is already used to report results for " - f"other namespace: {filtered[0]}" - ) - self[DATA_KEY] = data - return False + :param Any x: object to generate __repr__ method for + :return str: string object representation + """ + attr_strs = [ + f"{k}={str(v)}" for k, v in x.__dict__.items() if not k.startswith("_") + ] + return "<{}: {}>".format(x.__class__.__name__, ", ".join(attr_strs)) - def _check_table_exists(self, table_name: str) -> bool: - """ - Check if the specified table exists + tn = table_name or self.namespace + attr_dict = dict( + __tablename__=tn, + id=Column(Integer, primary_key=True), + record_identifier=Column(String, unique=True), + ) + for result_id, result_metadata in schema.items(): + col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] + _LOGGER.debug(f"Adding object: {result_id} of type: {str(col_type)}") + attr_dict.update({result_id: Column(col_type)}) + attr_dict.update({"__repr__": _auto_repr}) + _LOGGER.debug(f"Creating '{tn}' ORM with args: {attr_dict}") + self[DB_ORMS_KEY][tn] = type(tn.capitalize(), (self[DB_BASE_KEY],), attr_dict) + self[DB_BASE_KEY].metadata.create_all(bind=self[DB_ENGINE_KEY]) - :param str table_name: table name to be checked - :return bool: whether the specified table exists + def establish_db_connection_orm(self) -> bool: """ - with self.db_cursor as cur: - cur.execute( - "SELECT EXISTS(SELECT * FROM information_schema.tables " - "WHERE table_name=%s)", - (table_name,), - ) - return cur.fetchone()[0] + Establish DB connection using the config data - def _check_record( - self, condition_col: str, condition_val: str, table_name: str - ) -> bool: + :return bool: whether the connection has been established successfully """ - Check if the record matching the condition is in the table + if self.is_db_connected(): + raise PipestatDatabaseError("Connection is already established") - :param str condition_col: column to base the check on - :param str condition_val: value in the selected column - :param str table_name: name of the table ot check the record in - :return bool: whether any record matches the provided condition - """ - with self.db_cursor as cur: - statement = ( - f"SELECT EXISTS(SELECT 1 from {table_name} " - f"WHERE {condition_col}=%s)" - ) - cur.execute(statement, (condition_val,)) - return cur.fetchone()[0] + self[DB_ENGINE_KEY] = create_engine(self.db_url, echo=True) + self[DB_SESSION_KEY] = sessionmaker(bind=self[DB_ENGINE_KEY]) + return True - def _count_rows(self, table_name: str) -> int: + def is_db_connected(self) -> bool: """ - Count rows in a selected table + Check whether a DB connection has been established - :param str table_name: table to count rows for - :return int: number of rows in the selected table + :return bool: whether the connection has been established """ - with self.db_cursor as cur: - statement = sql.SQL("SELECT COUNT(*) FROM {}").format( - sql.Identifier(table_name) + if 
self.file is not None: + raise PipestatDatabaseError( + f"The {self.__class__.__name__} object is not backed by a database" ) - cur.execute(statement) - return cur.fetchall()[0][0] - - def _report_postgres( - self, value: Dict[str, Any], record_identifier: str, table_name: str = None - ) -> int: - """ - Check if record with this record identifier in table, create new record - if not (INSERT), update the record if yes (UPDATE). - - Currently supports just one column at a time. + if DB_SESSION_KEY in self and isinstance(self[DB_SESSION_KEY], sessionmaker): + return True + return False - :param str record_identifier: unique identifier of the record, value to - in 'record_identifier' column to look for to determine if the record - already exists in the table - :param dict value: a mapping of pair of table column names and - respective values to be inserted to the database - :return int: id of the row just inserted + def set_status(self, status_identifier: str, record_identifier: str = None) -> None: """ - table_name = table_name or self.namespace - if not self._check_record( - condition_col=RECORD_ID, - condition_val=record_identifier, - table_name=table_name, - ): - with self.db_cursor as cur: - cur.execute( - f"INSERT INTO {table_name} ({RECORD_ID}) VALUES (%s)", - (record_identifier,), - ) - # prep a list of SQL objects with column-named value placeholders - columns = sql.SQL(",").join( - [ - sql.SQL("{}=%({})s").format(sql.Identifier(k), sql.SQL(k)) - for k in list(value.keys()) - ] - ) - # construct the query template to execute - query = sql.SQL("UPDATE {n} SET {c} WHERE {id}=%({id})s RETURNING id").format( - n=sql.Identifier(table_name), c=columns, id=sql.SQL(RECORD_ID) - ) - # preprocess the values, dict -> Json - values = {k: Json(v) if isinstance(v, dict) else v for k, v in value.items()} - # add record_identifier column, which is specified outside of values - values.update({RECORD_ID: record_identifier}) - with self.db_cursor as cur: - cur.execute(query, values) - return cur.fetchone()[0] + Set pipeline run status. - def clear_status( - self, record_identifier: str = None, flag_names: List[str] = None - ) -> List[str]: - """ - Remove status flags + The status identifier needs to match one of identifiers specified in + the status schema. A basic, ready to use, status schema is shipped with + this package. - :param str record_identifier: name of the record to remove flags for - :param Iterable[str] flag_names: Names of flags to remove, optional; if - unspecified, all schema-defined flag names will be used. - :return List[str]: Collection of names of flags removed + :param str status_identifier: status to set, one of statuses defined + in the status schema + :param str record_identifier: record identifier to set the + pipeline status for """ r_id = self._strict_record_id(record_identifier) + known_status_identifiers = self.status_schema.keys() + if status_identifier not in known_status_identifiers: + raise PipestatError( + f"'{status_identifier}' is not a defined status identifier. 
" + f"These are allowed: {known_status_identifiers}" + ) + prev_status = self.get_status(r_id) if self.file is not None: - flag_names = flag_names or list(self.status_schema.keys()) - if isinstance(flag_names, str): - flag_names = [flag_names] - removed = [] - for f in flag_names: - path_flag_file = self.get_status_flag_path( - status_identifier=f, record_identifier=r_id - ) - try: - os.remove(path_flag_file) - except: - pass - else: - _LOGGER.info(f"Removed existing flag: {path_flag_file}") - removed.append(f) - return removed + self._set_status_file( + status_identifier=status_identifier, + record_identifier=r_id, + prev_status=prev_status, + ) else: - removed = self.get_status(r_id) - status_table_name = f"{self.namespace}_{STATUS}" - with self.db_cursor as cur: - try: - cur.execute( - f"DELETE FROM {status_table_name} WHERE " - f"{RECORD_ID}='{r_id}'" - ) - except Exception as e: - _LOGGER.error( - f"Could not remove the status from the " - f"database. Exception: {e}" - ) - return [] - else: - return [removed] + self._set_status_db( + status_identifier=status_identifier, + record_identifier=r_id, + ) + if prev_status: + _LOGGER.debug( + f"Changed status from '{prev_status}' to '{status_identifier}'" + ) def get_status_flag_path( self, status_identifier: str, record_identifier=None @@ -723,125 +550,476 @@ def get_status_flag_path( self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_{status_identifier}.flag" ) - def set_status(self, status_identifier: str, record_identifier: str = None) -> None: - """ - Set pipeline run status. + def _set_status_file( + self, + status_identifier: str, + record_identifier: str, + prev_status: Optional[str] = None, + ) -> None: + if prev_status is not None: + prev_flag_path = self.get_status_flag_path(prev_status, record_identifier) + os.remove(prev_flag_path) + flag_path = self.get_status_flag_path(status_identifier, record_identifier) + create_lock(flag_path) + with open(flag_path, "w") as f: + f.write(status_identifier) + remove_lock(flag_path) + + def _set_status_db( + self, + status_identifier: str, + record_identifier: str, + ) -> None: + try: + self._report_db( + values={STATUS: status_identifier}, + record_identifier=record_identifier, + table_name=f"{self.namespace}_{STATUS}", + ) + except Exception as e: + _LOGGER.error(f"Could not insert into the status table. Exception: {e}") + raise - The status identifier needs to match one of identifiers specified in - the status schema. A basic, ready to use, status schema is shipped with - this package. + def get_status(self, record_identifier: str = None) -> Optional[str]: + """ + Get the current pipeline status - :param str status_identifier: status to set, one of statuses defined - in the status schema - :param str record_identifier: record identifier to set the - pipeline status for + :return str: status identifier, like 'running' """ r_id = self._strict_record_id(record_identifier) - known_status_identifiers = self.status_schema.keys() - if status_identifier not in known_status_identifiers: - raise PipestatError( - f"'{status_identifier}' is not a defined status identifier. 
" - f"These are allowed: {known_status_identifiers}" - ) - prev_status = self.get_status(r_id) - if self.file is not None: - if prev_status: - prev_flag_path = self.get_status_flag_path(prev_status, r_id) - os.remove(prev_flag_path) - flag_path = self.get_status_flag_path(status_identifier, r_id) - create_lock(flag_path) - with open(flag_path, "w") as f: - f.write(status_identifier) - remove_lock(flag_path) + if self.file is None: + return self._get_status_db(record_identifier=r_id) else: - try: - self._report_postgres( - value={STATUS: status_identifier}, - record_identifier=r_id, - table_name=f"{self.namespace}_{STATUS}", - ) - except Exception as e: - _LOGGER.error( - f"Could not insert into the status table. " f"Exception: {e}" - ) - raise - if prev_status: - _LOGGER.debug( - f"Changed status from '{prev_status}' to '{status_identifier}'" + return self._get_status_file(record_identifier=r_id) + + def _get_status_file(self, record_identifier: str) -> Optional[str]: + r_id = self._strict_record_id(record_identifier) + flag_file = self._get_flag_file(record_identifier=record_identifier) + if flag_file is not None: + assert isinstance(flag_file, str), TypeError( + "Flag file path is expected to be a str, were multiple flags found?" ) + with open(flag_file, "r") as f: + status = f.read() + return status + _LOGGER.debug( + f"Could not determine status for '{r_id}' record. " + f"No flags found in: {self[STATUS_FILE_DIR]}" + ) + return None - def check_result_exists(self, result_identifier, record_identifier=None): - """ - Check if the result has been reported + def _get_status_db(self, record_identifier: str) -> Optional[str]: + try: + result = self._retrieve_db( + result_identifier=STATUS, + record_identifier=record_identifier, + table_name=f"{self.namespace}_{STATUS}", + ) + except PipestatDatabaseError: + return None + return result[STATUS] - :param str record_identifier: unique identifier of the record - :param str result_identifier: name of the result to check - :return bool: whether the specified result has been reported for the - indicated record in current namespace + def clear_status( + self, record_identifier: str = None, flag_names: List[str] = None + ) -> List[Union[str, None]]: """ - record_identifier = self._strict_record_id(record_identifier) - return self._check_which_results_exist( - results=[result_identifier], rid=record_identifier - ) + Remove status flags - def _check_which_results_exist( - self, results: List[str], rid: str = None - ) -> List[str]: + :param str record_identifier: name of the record to remove flags for + :param Iterable[str] flag_names: Names of flags to remove, optional; if + unspecified, all schema-defined flag names will be used. 
+ :return List[str]: Collection of names of flags removed """ - Check which results have been reported - - :param str rid: unique identifier of the record + r_id = self._strict_record_id(record_identifier) + if self.file is not None: + return self._clear_status_file( + record_identifier=r_id, flag_names=flag_names + ) + else: + return self._clear_status_db(record_identifier=r_id) + + def _clear_status_file( + self, record_identifier: str = None, flag_names: List[str] = None + ) -> List[Union[str, None]]: + flag_names = flag_names or list(self.status_schema.keys()) + if isinstance(flag_names, str): + flag_names = [flag_names] + removed = [] + for f in flag_names: + path_flag_file = self.get_status_flag_path( + status_identifier=f, record_identifier=record_identifier + ) + try: + os.remove(path_flag_file) + except: + pass + else: + _LOGGER.info(f"Removed existing flag: {path_flag_file}") + removed.append(f) + return removed + + def _clear_status_db(self, record_identifier: str = None) -> List[Union[str, None]]: + removed = self.get_status(record_identifier) + try: + self._remove_db( + record_identifier=record_identifier, + table_name=f"{self.namespace}_{STATUS}", + ) + except Exception as e: + _LOGGER.error( + f"Could not remove the status from the database. Exception: {e}" + ) + return [] + else: + return [removed] + + def validate_schema(self) -> None: + """ + Check schema for any possible issues + + :raises SchemaError: if any schema format issue is detected + """ + + def _recursively_replace_custom_types(s: dict) -> Dict: + """ + Replace the custom types in pipestat schema with canonical types + + :param dict s: schema to replace types in + :return dict: schema with types replaced + """ + for k, v in s.items(): + assert SCHEMA_TYPE_KEY in v, SchemaError( + f"Result '{k}' is missing '{SCHEMA_TYPE_KEY}' key" + ) + if v[SCHEMA_TYPE_KEY] == "object" and SCHEMA_PROP_KEY in s[k]: + _recursively_replace_custom_types(s[k][SCHEMA_PROP_KEY]) + if v[SCHEMA_TYPE_KEY] in CANONICAL_TYPES.keys(): + s.setdefault(k, {}) + s[k].setdefault(SCHEMA_PROP_KEY, {}) + s[k][SCHEMA_PROP_KEY].update( + CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][SCHEMA_PROP_KEY] + ) + s[k].setdefault("required", []) + s[k]["required"].extend( + CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]]["required"] + ) + s[k][SCHEMA_TYPE_KEY] = CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][ + SCHEMA_TYPE_KEY + ] + return s + + schema = deepcopy(self.schema) + _LOGGER.debug(f"Validating input schema") + assert isinstance(schema, dict), SchemaError( + f"The schema has to be a {dict().__class__.__name__}" + ) + for col_name in RESERVED_COLNAMES: + assert col_name not in schema.keys(), PipestatError( + f"'{col_name}' is an identifier reserved by pipestat" + ) + self[RES_SCHEMAS_KEY] = {} + schema = _recursively_replace_custom_types(schema) + self[RES_SCHEMAS_KEY] = schema + + def _init_results_file(self) -> bool: + """ + Initialize YAML results file if it does not exist. + Read the data stored in the existing file into the memory otherwise. 
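
`_recursively_replace_custom_types` above rewrites pipestat-specific result types into canonical jsonschema before the schemas are stored. A toy illustration with a stand-in `CANONICAL_TYPES` entry (the real mapping ships with the package constants):

```python
schema = {"my_image": {"type": "image"}}
CANONICAL_TYPES = {  # stand-in for the package-level mapping
    "image": {
        "type": "object",
        "properties": {"path": {"type": "string"}, "title": {"type": "string"}},
        "required": ["path", "title"],
    }
}
result = schema["my_image"]
canonical = CANONICAL_TYPES[result["type"]]
result.setdefault("properties", {}).update(canonical["properties"])
result.setdefault("required", []).extend(canonical["required"])
result["type"] = canonical["type"]
# schema["my_image"]["type"] is now 'object', with path/title required
```
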
+ + :return bool: whether the file has been created + """ + if not os.path.exists(self.file): + _LOGGER.info(f"Initializing results file '{self.file}'") + data = YacAttMap(entries={self.namespace: "{}"}) + data.write(filepath=self.file) + data.make_readonly() + self[DATA_KEY] = data + return True + _LOGGER.debug(f"Reading data from '{self.file}'") + data = YacAttMap(filepath=self.file) + filtered = list(filter(lambda x: not x.startswith("_"), data.keys())) + if filtered and self.namespace not in filtered: + raise PipestatDatabaseError( + f"'{self.file}' is already used to report results for " + f"other namespace: {filtered[0]}" + ) + self[DATA_KEY] = data + return False + + def _init_db_table(self) -> bool: + """ + Initialize a database table based on the provided schema, + if it does not exist. Read the data stored in the database into the + memory otherwise. + + :return bool: whether the table has been created + """ + if self.schema is None: + raise SchemaNotFoundError("initialize the database table") + if not self.is_db_connected(): + self.establish_db_connection_orm() + # if self._check_table_exists(table_name=self.namespace): + # _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") + # if not self[DB_ONLY_KEY]: + # self._table_to_dict() + # # return False + _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") + self._create_table_orm(table_name=self.namespace, schema=self.result_schemas) + return True + + def _init_status_table(self): + status_table_name = f"{self.namespace}_{STATUS}" + if not self.is_db_connected(): + self.establish_db_connection_orm() + # if not self._check_table_exists(table_name=status_table_name): + _LOGGER.debug( + f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" + ) + self._create_table_orm( + table_name=status_table_name, + schema=get_status_table_schema(status_schema=self.status_schema), + ) + + def _get_attr(self, attr: str) -> Any: + """ + Safely get the name of the selected attribute of this object + + :param str attr: attr to select + :return: + """ + return self[attr] if attr in self else None + + def _check_table_exists(self, table_name: str) -> bool: + """ + Check if the specified table exists + + :param str table_name: table name to be checked + :return bool: whether the specified table exists + """ + from sqlalchemy import inspect + + with self.session as s: + return inspect(s.bind).has_table(table_name=table_name) + + def _count_rows(self, table_name: str) -> int: + """ + Count rows in a selected table + + :param str table_name: table to count rows for + :return int: number of rows in the selected table + """ + with self.session as s: + return s.query(self[DB_ORMS_KEY][table_name].id).count() + + def _get_orm(self, table_name: str = None) -> Any: + """ + Get an object relational mapper class + + :param str table_name: table name to get a class for + :return Any: Object relational mapper class + """ + if DB_ORMS_KEY not in self: + raise PipestatDatabaseError("Object relational mapper classes not defined") + tn = f"{table_name or self.namespace}" + if tn not in self[DB_ORMS_KEY]: + raise PipestatDatabaseError( + f"No object relational mapper class defined for table: {tn}" + ) + if not isinstance(self[DB_ORMS_KEY][tn], DeclarativeMeta): + raise PipestatDatabaseError( + f"Object relational mapper class for table '{tn}' is invalid" + ) + return self[DB_ORMS_KEY][tn] + + def check_record_exists( + self, record_identifier: str, table_name: str = None + ) -> bool: + """ + Check if the 
specified record exists in the table + + :param str record_identifier: record to check for + :param str table_name: table name to check + :return bool: whether the record exists in the table + """ + if self.file is None: + with self.session as s: + return ( + s.query(self._get_orm(table_name).id) + .filter_by(record_identifier=record_identifier) + .first() + is not None + ) + else: + if ( + self.namespace in self.data + and record_identifier in self.data[table_name] + ): + return True + return False + + def check_which_results_exist( + self, results: List[str], rid: str = None + ) -> List[str]: + """ + Check which results have been reported + + :param str rid: unique identifier of the record :param List[str] results: names of the results to check :return List[str]: whether the specified result has been reported for the indicated record in current namespace """ rid = self._strict_record_id(rid) - existing = [] - for r in results: - if not self[DB_ONLY_KEY]: + if self.file is None: + existing = self._check_which_results_exist_db(results=results, rid=rid) + else: + existing = [] + for r in results: if ( self.namespace in self.data and rid in self.data[self.namespace] and r in self.data[self.namespace][rid] ): existing.append(r) - else: - with self.db_cursor as cur: - try: - cur.execute( - f"SELECT {r} FROM {self.namespace} WHERE {RECORD_ID}=%s", - (rid,), - ) - except Exception: - continue - else: - res = cur.fetchone() - if res is not None and res[0] is not None: - existing.append(r) return existing - def check_record_exists(self, record_identifier: str = None) -> bool: + def _check_which_results_exist_db( + self, results: List[str], rid: str = None, table_name: str = None + ) -> List[str]: """ - Check if the record exists + Check if the specified results exist in the table + + :param str rid: record to check for + :param List[str] results: results identifiers to check for + :param str table_name: name of the table to search for results in + :return List[str]: results identifiers that exist + """ + table_name = table_name or self.namespace + rid = self._strict_record_id(rid) + with self.session as s: + record = ( + s.query(self._get_orm(table_name)) + .filter_by(record_identifier=rid) + .first() + ) + return [r for r in results if getattr(record, r, None) is not None] + + def check_result_exists( + self, + result_identifier: str, + record_identifier: str = None, + ) -> bool: + """ + Check if the result has been reported :param str record_identifier: unique identifier of the record - :return bool: whether the record exists + :param str result_identifier: name of the result to check + :return bool: whether the specified result has been reported for the + indicated record in current namespace """ record_identifier = self._strict_record_id(record_identifier) - if self[DB_ONLY_KEY]: - with self.db_cursor as cur: - cur.execute( - f"SELECT exists(SELECT 1 from {self.namespace} " - f"WHERE {RECORD_ID}=%s)", - (record_identifier,), + return ( + len( + self.check_which_results_exist( + results=[result_identifier], + rid=record_identifier, ) - return cur.fetchone() - if ( - self.namespace in self.data - and record_identifier in self.data[self.namespace] - ): - return True - return False + ) + > 0 + ) + + def retrieve( + self, record_identifier: str = None, result_identifier: str = None + ) -> Union[Any, Dict[str, Any]]: + """ + Retrieve a result for a record. + + If no result ID specified, results for the entire record will + be returned. 
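
The DB-side existence checks reduce to `filter_by` lookups on the dynamically built model. Reusing the `Test` model and `engine` from the earlier sketch:

```python
from sqlalchemy.orm import sessionmaker

SessionFactory = sessionmaker(bind=engine)
with SessionFactory() as s:
    exists = (
        s.query(Test.id).filter_by(record_identifier="sample1").first() is not None
    )
```
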
+ + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to be retrieved + :return any | Dict[str, any]: a single result or a mapping with all the + results reported for the record + """ + r_id = self._strict_record_id(record_identifier) + if self.file is None: + return self._retrieve_db( + result_identifier=result_identifier, record_identifier=r_id + ) + else: + if r_id not in self.data[self.namespace]: + raise PipestatDatabaseError(f"Record '{r_id}' not found") + if result_identifier is None: + return self.data[self.namespace][r_id].to_dict() + if result_identifier not in self.data[self.namespace][r_id]: + raise PipestatDatabaseError( + f"Result '{result_identifier}' not found for record '{r_id}'" + ) + return self.data[self.namespace][r_id][result_identifier] + + def _retrieve_db( + self, + result_identifier: str = None, + record_identifier: str = None, + table_name: str = None, + ) -> Dict[str, Any]: + """ + Retrieve a result for a record. + + If no result ID specified, results for the entire record will + be returned. + + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to be retrieved + :param str table_name: name of the table to search for results in + :return Dict[str, any]: a single result or a mapping with all the results + reported for the record + """ + table_name = table_name or self.namespace + record_identifier = self._strict_record_id(record_identifier) + if result_identifier is not None: + existing = self.check_which_results_exist( + results=[result_identifier], + rid=record_identifier, + table_name=table_name, + ) + if not existing: + raise PipestatDatabaseError( + f"Result '{result_identifier}' not found for record " + f"'{record_identifier}'" + ) + + with self.session as s: + record = ( + s.query(self._get_orm(table_name)) + .filter_by(record_identifier=record_identifier) + .first() + ) + + if record is not None: + if result_identifier is not None: + return {result_identifier: getattr(record, result_identifier)} + return { + column: getattr(record, column) + for column in [c.name for c in record.__table__.columns] + if getattr(record, column, None) is not None + } + raise PipestatDatabaseError(f"Record '{record_identifier}' not found") + + def assert_results_defined(self, results: List[str]) -> None: + """ + Assert provided list of results is defined in the schema + + :param List[str] results: list of results to + check for existence in the schema + :raises SchemaError: if any of the results is not defined in the schema + """ + known_results = self.result_schemas.keys() + for r in results: + assert r in known_results, SchemaError( + f"'{r}' is not a known result. Results defined in the " + f"schema are: {list(known_results)}." 
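
Usage of `retrieve` is unchanged by the refactor: with a result identifier it returns a single value, without one it returns the whole record (the result name follows the notebook example):

```python
psm.retrieve(record_identifier="sample1", result_identifier="output_file")
psm.retrieve(record_identifier="sample1")  # mapping of all reported results
```
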
+            )
 
     def report(
         self,
@@ -873,12 +1051,12 @@ def report(
             "There is no way to return the updated object ID while using "
             "results file as the object backend"
         )
-        updated_ids = False
         if self.schema is None:
             raise SchemaNotFoundError("report results")
+        updated_ids = False
         result_identifiers = list(values.keys())
         self.assert_results_defined(results=result_identifiers)
-        existing = self._check_which_results_exist(
+        existing = self.check_which_results_exist(
             rid=record_identifier, results=result_identifiers
         )
         if existing:
@@ -903,12 +1081,12 @@
                 self.data.make_readonly()
         else:
             try:
-                updated_ids = self._report_postgres(
-                    record_identifier=record_identifier, value=values
+                updated_ids = self._report_db(
+                    record_identifier=record_identifier, values=values
                 )
             except Exception as e:
                 _LOGGER.error(
-                    f"Could not insert the result into the database. " f"Exception: {e}"
+                    f"Could not insert the result into the database. Exception: {e}"
                 )
             if not self[DB_ONLY_KEY]:
                 for r in result_identifiers:
@@ -922,6 +1100,41 @@
             )
         return True if not return_id else updated_ids
 
+    def _report_db(
+        self, values: Dict[str, Any], record_identifier: str, table_name: str = None
+    ) -> int:
+        """
+        Report a result to a database
+
+        :param Dict[str, Any] values: values to report
+        :param str record_identifier: record to report the result for
+        :param str table_name: name of the table to report the result in
+        :return int: id of the updated/inserted row
+        """
+        record_identifier = self._strict_record_id(record_identifier)
+        ORMClass = self._get_orm(table_name)
+        values.update({RECORD_ID: record_identifier})
+        if not self.check_record_exists(
+            record_identifier=record_identifier, table_name=table_name
+        ):
+            new_record = ORMClass(**values)
+            with self.session as s:
+                s.add(new_record)
+                s.commit()
+                returned_id = new_record.id
+        else:
+            with self.session as s:
+                record_to_update = (
+                    s.query(ORMClass)
+                    .filter(getattr(ORMClass, RECORD_ID) == record_identifier)
+                    .first()
+                )
+                for result_id, result_value in values.items():
+                    setattr(record_to_update, result_id, result_value)
+                s.commit()
+                returned_id = record_to_update.id
+        return returned_id
+
     def _report_data_element(
         self, record_identifier: str, values: Dict[str, Any]
     ) -> None:
@@ -932,7 +1145,7 @@ def _report_data_element(
         hierarchical mapping structure if needed.
 
         :param str record_identifier: unique identifier of the record
-        :param Dict[str,Any] values: dict of results identifiers and values
+        :param Dict[str, Any] values: dict of results identifiers and values
             to be reported
         """
         self[DATA_KEY].setdefault(self.namespace, PXAM())
@@ -940,104 +1153,10 @@
         for res_id, val in values.items():
             self[DATA_KEY][self.namespace][record_identifier][res_id] = val
 
-    def select(
-        self,
-        columns: Union[str, List[str]] = None,
-        condition: str = None,
-        condition_val: str = None,
-        offset: int = None,
-        limit: int = None,
-    ) -> List[psycopg2.extras.DictRow]:
-        """
-        Get all the contents from the selected table, possibly restricted by
-        the provided condition. 
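
And the corresponding `report` call, which routes to `_report_db` for a database back-end; the reported value mirrors the file-type result from the notebook:

```python
psm.report(
    record_identifier="sample1",
    values={"output_file": {"path": "/tmp/out.txt", "title": "Output file"}},
)
```
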
- - :param str | List[str] columns: columns to select - :param str condition: condition to restrict the results - with, will be appended to the end of the SELECT statement and - safely populated with 'condition_val', - for example: `"id=%s"` - :param list condition_val: values to fill the placeholder - in 'condition' with - :param int offset: number of records to be skipped - :param int limit: max number of records to be returned - :return List[psycopg2.extras.DictRow]: all table contents - """ - if self.file: - raise NotImplementedError( - "Selection is not supported on objects backed by results files." - " Use 'retrieve' method instead." - ) - condition, condition_val = preprocess_condition_pair(condition, condition_val) - if not columns: - columns = sql.SQL("*") - else: - columns = sql.SQL(",").join( - [sql.Identifier(x) for x in mk_list_of_str(columns)] - ) - statement = sql.SQL("SELECT {} FROM {}").format( - columns, sql.Identifier(self.namespace) - ) - if condition: - statement += sql.SQL(" WHERE ") - statement += condition - statement = paginate_query(statement, offset, limit) - with self.db_cursor as cur: - cur.execute(query=statement, vars=condition_val) - result = cur.fetchall() - return result - - def retrieve( - self, record_identifier: str = None, result_identifier: str = None - ) -> Union[Any, Dict[str, Any]]: - """ - Retrieve a result for a record. - - If no result ID specified, results for the entire record will - be returned. - - :param str record_identifier: unique identifier of the record - :param str result_identifier: name of the result to be retrieved - :return any | Dict[str, any]: a single result or a mapping with all the - results reported for the record - """ - record_identifier = self._strict_record_id(record_identifier) - if self[DB_ONLY_KEY]: - if result_identifier is not None: - existing = self._check_which_results_exist( - results=[result_identifier], rid=record_identifier - ) - if not existing: - raise PipestatDatabaseError( - f"Result '{result_identifier}' not found for record " - f"'{record_identifier}'" - ) - with self.db_cursor as cur: - query = sql.SQL( - f"SELECT {result_identifier or '*'} " - f"FROM {self.namespace} WHERE {RECORD_ID}=%s" - ) - cur.execute(query, (record_identifier,)) - result = cur.fetchall() - if len(result) > 0: - if result_identifier is None: - return {k: v for k, v in dict(result[0]).items() if v is not None} - return dict(result[0])[result_identifier] - raise PipestatDatabaseError(f"Record '{record_identifier}' not found") - else: - if record_identifier not in self.data[self.namespace]: - raise PipestatDatabaseError(f"Record '{record_identifier}' not found") - if result_identifier is None: - return self.data[self.namespace][record_identifier].to_dict() - if result_identifier not in self.data[self.namespace][record_identifier]: - raise PipestatDatabaseError( - f"Result '{result_identifier}' not found for record " - f"'{record_identifier}'" - ) - return self.data[self.namespace][record_identifier][result_identifier] - def remove( - self, record_identifier: str = None, result_identifier: str = None + self, + record_identifier: str = None, + result_identifier: str = None, ) -> bool: """ Remove a result. @@ -1050,221 +1169,99 @@ def remove( if the record should be removed. 
:return bool: whether the result has been removed """ - record_identifier = self._strict_record_id(record_identifier) + r_id = self._strict_record_id(record_identifier) rm_record = True if result_identifier is None else False - if not self.check_record_exists(record_identifier): - _LOGGER.error(f"Record '{record_identifier}' not found") - return False - if result_identifier and not self.check_result_exists( - result_identifier, record_identifier + if not self.check_record_exists( + record_identifier=r_id, table_name=self.namespace ): - _LOGGER.error( - f"'{result_identifier}' has not been reported for " - f"'{record_identifier}'" - ) + _LOGGER.error(f"Record '{r_id}' not found") + return False + if result_identifier and not self.check_result_exists(result_identifier, r_id): + _LOGGER.error(f"'{result_identifier}' has not been reported for '{r_id}'") return False if self.file: self.data.make_writable() if not self[DB_ONLY_KEY]: if rm_record: - _LOGGER.info(f"Removing '{record_identifier}' record") - del self[DATA_KEY][self.namespace][record_identifier] + _LOGGER.info(f"Removing '{r_id}' record") + del self[DATA_KEY][self.namespace][r_id] else: - val_backup = self[DATA_KEY][self.namespace][record_identifier][ - result_identifier - ] - del self[DATA_KEY][self.namespace][record_identifier][result_identifier] + val_backup = self[DATA_KEY][self.namespace][r_id][result_identifier] + del self[DATA_KEY][self.namespace][r_id][result_identifier] _LOGGER.info( f"Removed result '{result_identifier}' for record " - f"'{record_identifier}' from '{self.namespace}' namespace" + f"'{r_id}' from '{self.namespace}' namespace" ) - if not self[DATA_KEY][self.namespace][record_identifier]: + if not self[DATA_KEY][self.namespace][r_id]: _LOGGER.info( - f"Last result removed for '{record_identifier}'. " - f"Removing the record" + f"Last result removed for '{r_id}'. " f"Removing the record" ) - del self[DATA_KEY][self.namespace][record_identifier] + del self[DATA_KEY][self.namespace][r_id] rm_record = True if self.file: self.data.write() self.data.make_readonly() if self.file is None: - if rm_record: - try: - with self.db_cursor as cur: - cur.execute( - f"DELETE FROM {self.namespace} WHERE " - f"{RECORD_ID}='{record_identifier}'" - ) - except Exception as e: - _LOGGER.error( - f"Could not remove the result from the " - f"database. Exception: {e}" - ) - self[DATA_KEY][self.namespace].setdefault(record_identifier, PXAM()) - raise - return True try: - with self.db_cursor as cur: - cur.execute( - f"UPDATE {self.namespace} SET {result_identifier}=null " - f"WHERE {RECORD_ID}='{record_identifier}'" - ) + self._remove_db( + record_identifier=r_id, + result_identifier=None if rm_record else result_identifier, + ) except Exception as e: _LOGGER.error( - f"Could not remove the result from the database. " f"Exception: {e}" + f"Could not remove the result from the database. 
Exception: {e}" ) - if not self[DB_ONLY_KEY]: - self[DATA_KEY][self.namespace][record_identifier][ - result_identifier - ] = val_backup + if not self[DB_ONLY_KEY] and not rm_record: + self[DATA_KEY][self.namespace][r_id][result_identifier] = val_backup raise return True - def validate_schema(self) -> None: - """ - Check schema for any possible issues - - :raises SchemaError: if any schema format issue is detected - """ - - def _recursively_replace_custom_types(s: dict) -> Dict: - """ - Replace the custom types in pipestat schema with canonical types - - :param dict s: schema to replace types in - :return dict: schema with types replaced - """ - for k, v in s.items(): - assert SCHEMA_TYPE_KEY in v, SchemaError( - f"Result '{k}' is missing '{SCHEMA_TYPE_KEY}' key" - ) - if v[SCHEMA_TYPE_KEY] == "object" and SCHEMA_PROP_KEY in s[k]: - _recursively_replace_custom_types(s[k][SCHEMA_PROP_KEY]) - if v[SCHEMA_TYPE_KEY] in CANONICAL_TYPES.keys(): - s.setdefault(k, {}) - s[k].setdefault(SCHEMA_PROP_KEY, {}) - s[k][SCHEMA_PROP_KEY].update( - CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][SCHEMA_PROP_KEY] - ) - s[k].setdefault("required", []) - s[k]["required"].extend( - CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]]["required"] - ) - s[k][SCHEMA_TYPE_KEY] = CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][ - SCHEMA_TYPE_KEY - ] - return s - - schema = deepcopy(self.schema) - _LOGGER.debug(f"Validating input schema") - assert isinstance(schema, dict), SchemaError( - f"The schema has to be a {dict().__class__.__name__}" - ) - for col_name in RESERVED_COLNAMES: - assert col_name not in schema.keys(), PipestatError( - f"'{col_name}' is an identifier reserved by pipestat" - ) - self[RES_SCHEMAS_KEY] = {} - schema = _recursively_replace_custom_types(schema) - self[RES_SCHEMAS_KEY] = schema - - def assert_results_defined(self, results: List[str]) -> None: - """ - Assert provided list of results is defined in the schema - - :param List[str] results: list of results to - check for existence in the schema - :raises SchemaError: if any of the results is not defined in the schema + def _remove_db( + self, + record_identifier: str = None, + result_identifier: str = None, + table_name: str = None, + ) -> bool: """ - known_results = self.result_schemas.keys() - for r in results: - assert r in known_results, SchemaError( - f"'{r}' is not a known result. Results defined in the " - f"schema are: {list(known_results)}." - ) + Remove a result. - def check_connection(self) -> bool: - """ - Check whether a PostgreSQL connection has been established + If no result ID specified or last result is removed, the entire record + will be removed. - :return bool: whether the connection has been established + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to be removed or None + if the record should be removed. 
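
`remove` keeps its public behavior: it drops a single result, or the whole record once the last result is gone:

```python
psm.remove(record_identifier="sample1", result_identifier="output_file")
psm.remove(record_identifier="sample1")  # removes the entire record
```
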
+ :param str table_name: name of the table to report the result in + :return bool: whether the result has been removed + :raise PipestatDatabaseError: if either record or result specified are not found """ - if self.file is not None: - raise PipestatDatabaseError( - f"The {self.__class__.__name__} object " f"is not backed by a database" - ) - if DB_CONNECTION_KEY in self and isinstance( - self[DB_CONNECTION_KEY], psycopg2.extensions.connection + table_name = table_name or self.namespace + record_identifier = self._strict_record_id(record_identifier) + ORMClass = self._get_orm(table_name=table_name) + if self.check_record_exists( + record_identifier=record_identifier, table_name=table_name ): - return True - return False - - def establish_postgres_connection(self, suppress: bool = False) -> bool: - """ - Establish PostgreSQL connection using the config data - - :param bool suppress: whether to suppress any connection errors - :return bool: whether the connection has been established successfully - """ - if self.check_connection(): - raise PipestatDatabaseError( - f"Connection is already established: " - f"{self[DB_CONNECTION_KEY].info.host}" - ) - try: - self[DB_CONNECTION_KEY] = psycopg2.connect( - dbname=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_NAME_KEY], - user=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_USER_KEY], - password=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PASSWORD_KEY], - host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY], - port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY], - ) - except psycopg2.Error as e: - _LOGGER.error( - f"Could not connect to: " - f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" - ) - _LOGGER.info(f"Caught error: {e}") - if suppress: - return False - raise + with self.session as s: + record = ( + s.query(ORMClass) + .filter(getattr(ORMClass, RECORD_ID) == record_identifier) + .first() + ) + if result_identifier is None: + # delete row + record.delete() + else: + # set the value to None + if not self.check_result_exists( + record_identifier=record_identifier, + result_identifier=result_identifier, + table_name=table_name, + ): + raise PipestatDatabaseError( + f"Result '{result_identifier}' not found for record " + f"'{record_identifier}'" + ) + setattr(record, result_identifier, None) + s.commit() else: - _LOGGER.debug( - f"Established connection with PostgreSQL: " - f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" - ) - return True - - def close_postgres_connection(self) -> None: - """ - Close connection and remove client bound - """ - if not self.check_connection(): - raise PipestatDatabaseError( - f"The connection has not been established: " - f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" - ) - self[DB_CONNECTION_KEY].close() - del self[DB_CONNECTION_KEY] - _LOGGER.debug( - f"Closed connection with PostgreSQL: " - f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" - ) - - def _strict_record_id(self, forced_value: str = None) -> str: - """ - Get record identifier from the outer source or stored with this object - - :param str forced_value: return this value - :return str: record identifier - """ - if forced_value is not None: - return forced_value - if self.record_identifier is not None: - return self.record_identifier - raise PipestatError( - f"You must provide the record identifier you want to perform " - f"the action on. Either in the {self.__class__.__name__} " - f"constructor or as an argument to the method." 
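
Note that with the ORM, deleting a whole row goes through the session rather than the mapped instance: `Session.delete()` marks an instance for deletion, while `Query.delete()` is the bulk variant. A minimal sketch reusing the earlier `Test` model and `SessionFactory`:

```python
with SessionFactory() as s:
    record = s.query(Test).filter_by(record_identifier="sample1").first()
    if record is not None:
        s.delete(record)  # schedule the row for deletion
        s.commit()
```
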
- ) + raise PipestatDatabaseError(f"Record '{record_identifier}' not found") diff --git a/pipestat/pipestat_dirver.py b/pipestat/pipestat_dirver.py new file mode 100644 index 00000000..ea9cdb42 --- /dev/null +++ b/pipestat/pipestat_dirver.py @@ -0,0 +1,1270 @@ +from contextlib import contextmanager +from copy import deepcopy +from logging import getLogger +from typing import Any, Dict, List, Optional, Union + +import psycopg2 +from attmap import PathExAttMap as PXAM +from jsonschema import validate +from psycopg2.extensions import connection +from psycopg2.extras import DictCursor, Json +from ubiquerg import create_lock, remove_lock +from yacman import YacAttMap + +from .const import * +from .exceptions import * +from .helpers import * + +_LOGGER = getLogger(PKG_NAME) + + +class LoggingCursor(psycopg2.extras.DictCursor): + """ + Logging db cursor + """ + + def execute(self, query, vars=None): + """ + Execute a database operation (query or command) and issue a debug + and info level log messages + + :param query: + :param vars: + :return: + """ + _LOGGER.debug(f"Executing query: {self.mogrify(query, vars)}") + super(LoggingCursor, self).execute(query=query, vars=vars) + _LOGGER.debug(f"Executed query: {self.query}") + + +class PipestatManagerDirver(dict): + """ + Pipestat standardizes reporting of pipeline results and + pipeline status management. It formalizes a way for pipeline developers + and downstream tools developers to communicate -- results produced by a + pipeline can easily and reliably become an input for downstream analyses. + The object exposes API for interacting with the results and + pipeline status and can be backed by either a YAML-formatted file + or a PostgreSQL database. + """ + + def __init__( + self, + namespace: str = None, + record_identifier: str = None, + schema_path: str = None, + results_file_path: str = None, + database_only: bool = False, + config: Union[str, dict] = None, + status_schema_path: str = None, + flag_file_dir: str = None, + ): + """ + Initialize the object + + :param str namespace: namespace to report into. This will be the DB + table name if using DB as the object back-end + :param str record_identifier: record identifier to report for. 
This
+            creates a weak bond to the record, which can be overridden in
+            this object's method calls
+        :param str schema_path: path to the output schema that formalizes
+            the results structure
+        :param str results_file_path: YAML file to report into, if file is
+            used as the object back-end
+        :param bool database_only: whether the reported data should not be
+            stored in the memory, but only in the database
+        :param str | dict config: path to the configuration file or a mapping
+            with the config file content
+        :param str status_schema_path: path to the status schema that formalizes
+            the status flags structure
+        """
+
+        def _check_cfg_key(cfg: dict, key: str) -> bool:
+            if key not in cfg:
+                _LOGGER.warning(f"Key '{key}' not found in config")
+                return False
+            return True
+
+        def _mk_abs_via_cfg(
+            path: Optional[str],
+            cfg_path: Optional[str],
+        ) -> Optional[str]:
+            if path is None:
+                return path
+            assert isinstance(path, str), TypeError("Path is expected to be a str")
+            if os.path.isabs(path):
+                return path
+            if cfg_path is None:
+                rel_to_cwd = os.path.join(os.getcwd(), path)
+                if os.path.exists(rel_to_cwd) or os.access(
+                    os.path.dirname(rel_to_cwd), os.W_OK
+                ):
+                    return rel_to_cwd
+                raise OSError(f"Could not make this path absolute: {path}")
+            joined = os.path.join(os.path.dirname(cfg_path), path)
+            if os.path.isabs(joined):
+                return joined
+            raise OSError(f"Could not make this path absolute: {path}")
+
+        def _select_value(
+            arg_name: str,
+            arg_value: Any,
+            cfg: dict,
+            strict: bool = True,
+            env_var: str = None,
+        ) -> Any:
+            if arg_value is not None:
+                return arg_value
+            if arg_name not in cfg or cfg[arg_name] is None:
+                if env_var is not None:
+                    arg = os.getenv(env_var, None)
+                    if arg is not None:
+                        _LOGGER.debug(f"Value '{arg}' sourced from '{env_var}' env var")
+                        return expandpath(arg)
+                if strict:
+                    raise PipestatError(
+                        f"Value for the required '{arg_name}' argument could not be"
+                        f" determined. Provide it in the config or pass to the "
+                        f"object constructor." 
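
`_mk_abs_via_cfg` resolves relative paths against the config file's directory when a config is given; the core of that resolution is just:

```python
import os

cfg_path = "/home/user/project/pipestat_config.yaml"  # hypothetical config location
os.path.join(os.path.dirname(cfg_path), "schema.yaml")
# '/home/user/project/schema.yaml'
```
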
+ ) + return + return cfg[arg_name] + + super(PipestatManagerDirver, self).__init__() + self[CONFIG_KEY] = YacAttMap() + # read config or config data + config = config or os.getenv(ENV_VARS["config"]) + if config is not None: + if isinstance(config, str): + config = os.path.abspath(expandpath(config)) + self[CONFIG_KEY] = YacAttMap(filepath=config) + self._config_path = config + elif isinstance(config, dict): + self[CONFIG_KEY] = YacAttMap(entries=config) + self._config_path = None + else: + raise TypeError( + "database_config has to be either path to the " + "file to read or a dict" + ) + # validate config + cfg = self[CONFIG_KEY].to_dict(expand=True) + _, cfg_schema = read_yaml_data(CFG_SCHEMA, "config schema") + validate(cfg, cfg_schema) + + self[NAME_KEY] = _select_value( + "namespace", namespace, self[CONFIG_KEY], env_var=ENV_VARS["namespace"] + ) + self[RECORD_ID_KEY] = _select_value( + "record_identifier", + record_identifier, + self[CONFIG_KEY], + False, + ENV_VARS["record_identifier"], + ) + self[DB_ONLY_KEY] = database_only + # read results schema + self._schema_path = _select_value( + "schema_path", + schema_path, + self[CONFIG_KEY], + False, + env_var=ENV_VARS["schema"], + ) + if self._schema_path is not None: + _, self[SCHEMA_KEY] = read_yaml_data( + _mk_abs_via_cfg(self._schema_path, self.config_path), "schema" + ) + self.validate_schema() + # determine the highlighted results + self[HIGHLIGHTED_KEY] = [ + k + for k, v in self.schema.items() + if "highlight" in v and v["highlight"] is True + ] + if self[HIGHLIGHTED_KEY]: + assert isinstance(self[HIGHLIGHTED_KEY], list), TypeError( + f"highlighted results specification " + f"({self[HIGHLIGHTED_KEY]}) has to be a list" + ) + # read status schema + status_schema_path = ( + _mk_abs_via_cfg( + _select_value( + "status_schema_path", + status_schema_path, + self[CONFIG_KEY], + False, + env_var=ENV_VARS["status_schema"], + ), + self.config_path, + ) + or STATUS_SCHEMA + ) + self[STATUS_SCHEMA_SOURCE_KEY], self[STATUS_SCHEMA_KEY] = read_yaml_data( + status_schema_path, "status schema" + ) + # determine results file + results_file_path = _mk_abs_via_cfg( + _select_value( + "results_file_path", + results_file_path, + self[CONFIG_KEY], + False, + ENV_VARS["results_file"], + ), + self.config_path, + ) + if results_file_path: + if self[DB_ONLY_KEY]: + raise ValueError( + "Running in database only mode does not make " + "sense with a YAML file as a backend." 
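
Highlighting is driven by the output schema itself: any result carrying `highlight: true` ends up in the highlighted set, exactly as the constructor logic above filters it. In dict form:

```python
schema = {
    "log_file": {"type": "file", "highlight": True},
    "output_file": {"type": "file"},
}
highlighted = [k for k, v in schema.items() if v.get("highlight") is True]
# ['log_file']
```
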
+ ) + self[FILE_KEY] = results_file_path + self._init_results_file() + flag_file_dir = _select_value( + "flag_file_dir", flag_file_dir, self[CONFIG_KEY], False + ) or os.path.dirname(self.file) + self[STATUS_FILE_DIR] = _mk_abs_via_cfg(flag_file_dir, self.config_path) + elif CFG_DATABASE_KEY in self[CONFIG_KEY]: + if not all( + [ + _check_cfg_key(self[CONFIG_KEY][CFG_DATABASE_KEY], key) + for key in DB_CREDENTIALS + ] + ): + raise MissingConfigDataError( + "Must specify all database login " "credentials or result_file_path" + ) + self[DATA_KEY] = YacAttMap() + self._init_postgres_table() + self._init_status_table() + else: + raise MissingConfigDataError( + "Must specify either database login " "credentials or a YAML file path" + ) + + def __str__(self): + """ + Generate string representation of the object + + :return str: string representation of the object + """ + res = f"{self.__class__.__name__} ({self.namespace})" + res += "\nBackend: {}".format( + f"file ({self.file})" if self.file else "PostgreSQL" + ) + res += f"\nResults schema source: {self.schema_path}" + res += f"\nStatus schema source: {self.status_schema_source}" + res += f"\nRecords count: {self.record_count}" + if self.highlighted_results: + res += f"\nHighlighted results: {', '.join(self.highlighted_results)}" + return res + + def _get_flag_file( + self, record_identifier: str = None + ) -> Union[str, List[str], None]: + """ + Get path to the status flag file for the specified record + + :param str record_identifier: unique record identifier + :return str | list[str] | None: path to the status flag file + """ + from glob import glob + + r_id = self._strict_record_id(record_identifier) + if self.file is None: + return + if self.file is not None: + regex = os.path.join( + self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_*.flag" + ) + file_list = glob(regex) + if len(file_list) > 1: + _LOGGER.warning("Multiple flag files found") + return file_list + elif len(file_list) == 1: + return file_list[0] + else: + _LOGGER.debug("No flag files found") + return None + + @property + def highlighted_results(self) -> List[str]: + """ + Highlighted results + + :return List[str]: a collection of highlighted results + """ + return self._get_attr(HIGHLIGHTED_KEY) or [] + + @property + def record_count(self) -> int: + """ + Number of records reported + + :return int: number of records reported + """ + return ( + len(self.data[self.namespace]) + if self.file + else self._count_rows(self.namespace) + ) + + @property + def namespace(self) -> str: + """ + Namespace the object writes the results to + + :return str: namespace the object writes the results to + """ + return self._get_attr(NAME_KEY) + + @property + def record_identifier(self) -> str: + """ + Unique identifier of the record + + :return str: unique identifier of the record + """ + return self._get_attr(RECORD_ID_KEY) + + @property + def schema(self) -> Dict: + """ + Schema mapping + + :return dict: schema that formalizes the results structure + """ + return self._get_attr(SCHEMA_KEY) + + @property + def status_schema(self) -> Dict: + """ + Status schema mapping + + :return dict: schema that formalizes the pipeline status structure + """ + return self._get_attr(STATUS_SCHEMA_KEY) + + @property + def status_schema_source(self) -> Dict: + """ + Status schema source + + :return dict: source of the schema that formalizes + the pipeline status structure + """ + return self._get_attr(STATUS_SCHEMA_SOURCE_KEY) + + @property + def schema_path(self) -> str: + """ + Schema path + + :return str: 
path to the provided schema + """ + return self._schema_path + + @property + def config_path(self) -> str: + """ + Config path. None if the config was not provided or if provided + as a mapping of the config contents + + :return str: path to the provided config + """ + return getattr(self, "_config_path", None) + + @property + def result_schemas(self) -> Dict: + """ + Result schema mappings + + :return dict: schemas that formalize the structure of each result + in a canonical jsonschema way + """ + return self._get_attr(RES_SCHEMAS_KEY) + + @property + def file(self) -> str: + """ + File path that the object is reporting the results into + + :return str: file path that the object is reporting the results into + """ + return self._get_attr(FILE_KEY) + + @property + def data(self) -> YacAttMap: + """ + Data object + + :return yacman.YacAttMap: the object that stores the reported data + """ + return self._get_attr(DATA_KEY) + + @property + @contextmanager + def db_cursor(self): + """ + Establish connection and get a PostgreSQL database cursor, + commit and close the connection afterwards + + :return LoggingCursor: Database cursor object + """ + try: + if not self.check_connection(): + self.establish_postgres_connection() + with self[DB_CONNECTION_KEY] as c, c.cursor( + cursor_factory=LoggingCursor + ) as cur: + yield cur + except Exception: + raise + finally: + self.close_postgres_connection() + + def get_status(self, record_identifier: str = None) -> Optional[str]: + """ + Get the current pipeline status + + :return str: status identifier, like 'running' + """ + r_id = self._strict_record_id(record_identifier) + if self.file is None: + with self.db_cursor as cur: + query = sql.SQL( + f"SELECT {STATUS} " + f"FROM {f'{self.namespace}_{STATUS}'} " + f"WHERE {RECORD_ID}=%s" + ) + cur.execute(query, (r_id,)) + result = cur.fetchone() + return result[0] if result is not None else None + else: + flag_file = self._get_flag_file(record_identifier=r_id) + if flag_file is not None: + assert isinstance(flag_file, str), TypeError( + "Flag file path is expected to be a str, were multiple flags found?" + ) + with open(flag_file, "r") as f: + status = f.read() + return status + _LOGGER.debug( + f"Could not determine status for '{r_id}' record. " + f"No flags found in: {self[STATUS_FILE_DIR]}" + ) + return None + + def _get_attr(self, attr: str) -> Any: + """ + Safely get the name of the selected attribute of this object + + :param str attr: attr to select + :return: + """ + return self[attr] if attr in self else None + + def _table_to_dict(self) -> None: + """ + Create a dictionary from the database table data + """ + with self.db_cursor as cur: + cur.execute(f"SELECT * FROM {self.namespace}") + data = cur.fetchall() + _LOGGER.debug(f"Reading data from database for '{self.namespace}' namespace") + for record in data: + record_id = record[RECORD_ID] + for res_id, val in record.items(): + if val is not None: + self._report_data_element( + record_identifier=record_id, values={res_id: val} + ) + + def _init_postgres_table(self) -> bool: + """ + Initialize a PostgreSQL table based on the provided schema, + if it does not exist. Read the data stored in the database into the + memory otherwise. 
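
`db_cursor` hides the whole connect-use-commit-close cycle from callers. The same pattern in isolation, as a generic psycopg2 sketch (the DSN is a placeholder):

    from contextlib import contextmanager

    import psycopg2

    @contextmanager
    def db_cursor(dsn="dbname=pipestat-test user=postgres"):  # placeholder DSN
        conn = psycopg2.connect(dsn)
        try:
            # the outer 'with' commits on success and rolls back on error
            with conn, conn.cursor() as cur:
                yield cur
        finally:
            conn.close()
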
+ + :return bool: whether the table has been created + """ + if self.schema is None: + raise SchemaNotFoundError("initialize the database table") + if self._check_table_exists(table_name=self.namespace): + _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") + if not self[DB_ONLY_KEY]: + self._table_to_dict() + return False + _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") + columns = FIXED_COLUMNS + schema_to_columns(schema=self.schema) + self._create_table(table_name=self.namespace, columns=columns) + return True + + # def _create_status_type(self): + # with self.db_cursor as cur: + # s = sql.SQL(f"SELECT exists (SELECT 1 FROM pg_type WHERE typname = '{STATUS}');") + # cur.execute(s) + # if cur.fetchone()[0]: + # return + # with self.db_cursor as cur: + # status_strs = [f"'{st_id}'" for st_id in self.status_schema.keys()] + # status_str = ", ".join(status_strs) + # s = sql.SQL(f"CREATE TYPE {STATUS} as enum({status_str});") + # cur.execute(s) + + def _init_status_table(self): + status_table_name = f"{self.namespace}_{STATUS}" + # self._create_status_type() + if not self._check_table_exists(table_name=status_table_name): + _LOGGER.info( + f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" + ) + self._create_table(status_table_name, STATUS_TABLE_COLUMNS) + + def _create_table(self, table_name: str, columns: List[str]): + """ + Create a table + + :param str table_name: name of the table to create + :param str | List[str] columns: columns definition list, + for instance: ['name VARCHAR(50) NOT NULL'] + """ + columns = mk_list_of_str(columns) + with self.db_cursor as cur: + s = sql.SQL(f"CREATE TABLE {table_name} ({','.join(columns)})") + cur.execute(s) + + def _init_results_file(self) -> bool: + """ + Initialize YAML results file if it does not exist. + Read the data stored in the existing file into the memory otherwise. 
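
`_create_table` simply joins the column definitions into one DDL statement. For a schema with a single float result, the composed statement would look roughly like the sketch below; the exact fixed columns live in `FIXED_COLUMNS`, so the first two entries here are assumptions:

    columns = [
        "id BIGSERIAL PRIMARY KEY",       # assumed FIXED_COLUMNS content
        "record_identifier TEXT UNIQUE",  # assumed FIXED_COLUMNS content
        "log10_pvalue FLOAT",             # derived from the results schema
    ]
    statement = f"CREATE TABLE test ({','.join(columns)})"
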
+
+        :return bool: whether the file has been created
+        """
+        if not os.path.exists(self.file):
+            _LOGGER.info(f"Initializing results file '{self.file}'")
+            data = YacAttMap(entries={self.namespace: "{}"})
+            data.write(filepath=self.file)
+            data.make_readonly()
+            self[DATA_KEY] = data
+            return True
+        _LOGGER.debug(f"Reading data from '{self.file}'")
+        data = YacAttMap(filepath=self.file)
+        filtered = list(filter(lambda x: not x.startswith("_"), data.keys()))
+        if filtered and self.namespace not in filtered:
+            raise PipestatDatabaseError(
+                f"'{self.file}' is already used to report results for "
+                f"other namespace: {filtered[0]}"
+            )
+        self[DATA_KEY] = data
+        return False
+
+    def _check_table_exists(self, table_name: str) -> bool:
+        """
+        Check if the specified table exists
+
+        :param str table_name: table name to be checked
+        :return bool: whether the specified table exists
+        """
+        with self.db_cursor as cur:
+            cur.execute(
+                "SELECT EXISTS(SELECT * FROM information_schema.tables "
+                "WHERE table_name=%s)",
+                (table_name,),
+            )
+            return cur.fetchone()[0]
+
+    def _check_record(
+        self, condition_col: str, condition_val: str, table_name: str
+    ) -> bool:
+        """
+        Check if the record matching the condition is in the table
+
+        :param str condition_col: column to base the check on
+        :param str condition_val: value in the selected column
+        :param str table_name: name of the table to check the record in
+        :return bool: whether any record matches the provided condition
+        """
+        with self.db_cursor as cur:
+            statement = (
+                f"SELECT EXISTS(SELECT 1 from {table_name} "
+                f"WHERE {condition_col}=%s)"
+            )
+            cur.execute(statement, (condition_val,))
+            return cur.fetchone()[0]
+
+    def _count_rows(self, table_name: str) -> int:
+        """
+        Count rows in a selected table
+
+        :param str table_name: table to count rows for
+        :return int: number of rows in the selected table
+        """
+        with self.db_cursor as cur:
+            statement = sql.SQL("SELECT COUNT(*) FROM {}").format(
+                sql.Identifier(table_name)
+            )
+            cur.execute(statement)
+            return cur.fetchall()[0][0]
+
+    def _report_postgres(
+        self, value: Dict[str, Any], record_identifier: str, table_name: str = None
+    ) -> int:
+        """
+        Check if a record with this record identifier is in the table; create
+        a new record if not (INSERT), update the record if yes (UPDATE).
+
+        Currently supports just one column at a time.
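
With the file backend, the YAML initialized above grows one mapping per record. After reporting a single result for `sample1` in the `test` namespace, the file would contain roughly the following (values illustrative):

    test:
      sample1:
        log10_pvalue: 8.2
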
+ + :param str record_identifier: unique identifier of the record, value to + in 'record_identifier' column to look for to determine if the record + already exists in the table + :param dict value: a mapping of pair of table column names and + respective values to be inserted to the database + :return int: id of the row just inserted + """ + table_name = table_name or self.namespace + if not self._check_record( + condition_col=RECORD_ID, + condition_val=record_identifier, + table_name=table_name, + ): + with self.db_cursor as cur: + cur.execute( + f"INSERT INTO {table_name} ({RECORD_ID}) VALUES (%s)", + (record_identifier,), + ) + # prep a list of SQL objects with column-named value placeholders + columns = sql.SQL(",").join( + [ + sql.SQL("{}=%({})s").format(sql.Identifier(k), sql.SQL(k)) + for k in list(value.keys()) + ] + ) + # construct the query template to execute + query = sql.SQL("UPDATE {n} SET {c} WHERE {id}=%({id})s RETURNING id").format( + n=sql.Identifier(table_name), c=columns, id=sql.SQL(RECORD_ID) + ) + # preprocess the values, dict -> Json + values = {k: Json(v) if isinstance(v, dict) else v for k, v in value.items()} + # add record_identifier column, which is specified outside of values + values.update({RECORD_ID: record_identifier}) + with self.db_cursor as cur: + cur.execute(query, values) + return cur.fetchone()[0] + + def clear_status( + self, record_identifier: str = None, flag_names: List[str] = None + ) -> List[str]: + """ + Remove status flags + + :param str record_identifier: name of the record to remove flags for + :param Iterable[str] flag_names: Names of flags to remove, optional; if + unspecified, all schema-defined flag names will be used. + :return List[str]: Collection of names of flags removed + """ + r_id = self._strict_record_id(record_identifier) + if self.file is not None: + flag_names = flag_names or list(self.status_schema.keys()) + if isinstance(flag_names, str): + flag_names = [flag_names] + removed = [] + for f in flag_names: + path_flag_file = self.get_status_flag_path( + status_identifier=f, record_identifier=r_id + ) + try: + os.remove(path_flag_file) + except: + pass + else: + _LOGGER.info(f"Removed existing flag: {path_flag_file}") + removed.append(f) + return removed + else: + removed = self.get_status(r_id) + status_table_name = f"{self.namespace}_{STATUS}" + with self.db_cursor as cur: + try: + cur.execute( + f"DELETE FROM {status_table_name} WHERE " + f"{RECORD_ID}='{r_id}'" + ) + except Exception as e: + _LOGGER.error( + f"Could not remove the status from the " + f"database. Exception: {e}" + ) + return [] + else: + return [removed] + + def get_status_flag_path( + self, status_identifier: str, record_identifier=None + ) -> str: + """ + Get the path to the status file flag + + :param str status_identifier: one of the defined status IDs in schema + :param str record_identifier: unique record ID, optional if + specified in the object constructor + :return str: absolute path to the flag file or None if object is + backed by a DB + """ + if self.file is None: + # DB as the backend + return + r_id = self._strict_record_id(record_identifier) + return os.path.join( + self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_{status_identifier}.flag" + ) + + def set_status(self, status_identifier: str, record_identifier: str = None) -> None: + """ + Set pipeline run status. + + The status identifier needs to match one of identifiers specified in + the status schema. A basic, ready to use, status schema is shipped with + this package. 
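
`_report_postgres` is a two-step upsert: make sure the row exists with a bare INSERT, then UPDATE the named columns and return the row id. Stripped of the psycopg2 SQL composition, the flow reduces to this sketch (assumes a cursor `cur` and a table named `test`):

    def upsert(cur, rid, values, table="test"):
        # step 1: create the row if this record identifier is not present yet
        cur.execute(
            f"SELECT EXISTS(SELECT 1 FROM {table} WHERE record_identifier=%s)", (rid,)
        )
        if not cur.fetchone()[0]:
            cur.execute(f"INSERT INTO {table} (record_identifier) VALUES (%s)", (rid,))
        # step 2: update the reported columns and hand back the row id
        cols = ",".join(f"{k}=%({k})s" for k in values)
        cur.execute(
            f"UPDATE {table} SET {cols} "
            f"WHERE record_identifier=%(record_identifier)s RETURNING id",
            {**values, "record_identifier": rid},
        )
        return cur.fetchone()[0]
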
+ + :param str status_identifier: status to set, one of statuses defined + in the status schema + :param str record_identifier: record identifier to set the + pipeline status for + """ + r_id = self._strict_record_id(record_identifier) + known_status_identifiers = self.status_schema.keys() + if status_identifier not in known_status_identifiers: + raise PipestatError( + f"'{status_identifier}' is not a defined status identifier. " + f"These are allowed: {known_status_identifiers}" + ) + prev_status = self.get_status(r_id) + if self.file is not None: + if prev_status: + prev_flag_path = self.get_status_flag_path(prev_status, r_id) + os.remove(prev_flag_path) + flag_path = self.get_status_flag_path(status_identifier, r_id) + create_lock(flag_path) + with open(flag_path, "w") as f: + f.write(status_identifier) + remove_lock(flag_path) + else: + try: + self._report_postgres( + value={STATUS: status_identifier}, + record_identifier=r_id, + table_name=f"{self.namespace}_{STATUS}", + ) + except Exception as e: + _LOGGER.error( + f"Could not insert into the status table. " f"Exception: {e}" + ) + raise + if prev_status: + _LOGGER.debug( + f"Changed status from '{prev_status}' to '{status_identifier}'" + ) + + def check_result_exists(self, result_identifier, record_identifier=None): + """ + Check if the result has been reported + + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to check + :return bool: whether the specified result has been reported for the + indicated record in current namespace + """ + record_identifier = self._strict_record_id(record_identifier) + return self._check_which_results_exist( + results=[result_identifier], rid=record_identifier + ) + + def _check_which_results_exist( + self, results: List[str], rid: str = None + ) -> List[str]: + """ + Check which results have been reported + + :param str rid: unique identifier of the record + :param List[str] results: names of the results to check + :return List[str]: whether the specified result has been reported for the + indicated record in current namespace + """ + rid = self._strict_record_id(rid) + existing = [] + for r in results: + if not self[DB_ONLY_KEY]: + if ( + self.namespace in self.data + and rid in self.data[self.namespace] + and r in self.data[self.namespace][rid] + ): + existing.append(r) + else: + with self.db_cursor as cur: + try: + cur.execute( + f"SELECT {r} FROM {self.namespace} WHERE {RECORD_ID}=%s", + (rid,), + ) + except Exception: + continue + else: + res = cur.fetchone() + if res is not None and res[0] is not None: + existing.append(r) + return existing + + def check_record_exists(self, record_identifier: str = None) -> bool: + """ + Check if the record exists + + :param str record_identifier: unique identifier of the record + :return bool: whether the record exists + """ + record_identifier = self._strict_record_id(record_identifier) + if self[DB_ONLY_KEY]: + with self.db_cursor as cur: + cur.execute( + f"SELECT exists(SELECT 1 from {self.namespace} " + f"WHERE {RECORD_ID}=%s)", + (record_identifier,), + ) + return cur.fetchone() + if ( + self.namespace in self.data + and record_identifier in self.data[self.namespace] + ): + return True + return False + + def report( + self, + values: Dict[str, Any], + record_identifier: str = None, + force_overwrite: bool = False, + strict_type: bool = True, + return_id: bool = False, + ) -> Union[bool, int]: + """ + Report a result. 
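
Status management in practice, assuming the bundled status schema defines the usual identifiers like `running` and `completed` (an assumption about its content):

    psm.set_status("running", record_identifier="sample1")
    psm.get_status("sample1")                      # 'running'
    # the file backend swaps flags on disk, e.g. test_sample1_running.flag
    psm.clear_status(record_identifier="sample1")  # ['running']
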
+ + :param Dict[str, any] values: dictionary of result-value pairs + :param str record_identifier: unique identifier of the record, value + in 'record_identifier' column to look for to determine if the record + already exists + :param bool force_overwrite: whether to overwrite the existing record + :param bool strict_type: whether the type of the reported values should + remain as is. Pipestat would attempt to convert to the + schema-defined one otherwise + :param bool return_id: PostgreSQL IDs of the records that have been + updated. Not available with results file as backend + :return bool | int: whether the result has been reported or the ID of + the updated record in the table, if requested + """ + record_identifier = self._strict_record_id(record_identifier) + if return_id and self.file is not None: + raise NotImplementedError( + "There is no way to return the updated object ID while using " + "results file as the object backend" + ) + if self.schema is None: + raise SchemaNotFoundError("report results") + updated_ids = False + result_identifiers = list(values.keys()) + self.assert_results_defined(results=result_identifiers) + existing = self._check_which_results_exist( + rid=record_identifier, results=result_identifiers + ) + if existing: + _LOGGER.warning( + f"These results exist for '{record_identifier}': {existing}" + ) + if not force_overwrite: + return False + _LOGGER.info(f"Overwriting existing results: {existing}") + for r in result_identifiers: + validate_type( + value=values[r], schema=self.result_schemas[r], strict_type=strict_type + ) + if self.file is not None: + self.data.make_writable() + if not self[DB_ONLY_KEY]: + self._report_data_element( + record_identifier=record_identifier, values=values + ) + if self.file is not None: + self.data.write() + self.data.make_readonly() + else: + try: + updated_ids = self._report_postgres( + record_identifier=record_identifier, value=values + ) + except Exception as e: + _LOGGER.error( + f"Could not insert the result into the database. " f"Exception: {e}" + ) + if not self[DB_ONLY_KEY]: + for r in result_identifiers: + del self[DATA_KEY][self.namespace][record_identifier][r] + raise + nl = "\n" + rep_strs = [f"{k}: {v}" for k, v in values.items()] + _LOGGER.info( + f"Reported records for '{record_identifier}' in '{self.namespace}' " + f"namespace:{nl} - {(nl + ' - ').join(rep_strs)}" + ) + return True if not return_id else updated_ids + + def _report_data_element( + self, record_identifier: str, values: Dict[str, Any] + ) -> None: + """ + Update the value of a result in a current namespace. + + This method overwrites any existing data and creates the required + hierarchical mapping structure if needed. + + :param str record_identifier: unique identifier of the record + :param Dict[str, Any] values: dict of results identifiers and values + to be reported + """ + self[DATA_KEY].setdefault(self.namespace, PXAM()) + self[DATA_KEY][self.namespace].setdefault(record_identifier, PXAM()) + for res_id, val in values.items(): + self[DATA_KEY][self.namespace][record_identifier][res_id] = val + + def select( + self, + columns: Union[str, List[str]] = None, + condition: str = None, + condition_val: str = None, + offset: int = None, + limit: int = None, + ) -> List[psycopg2.extras.DictRow]: + """ + Get all the contents from the selected table, possibly restricted by + the provided condition. 
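
A typical reporting round trip, assuming `log10_pvalue` is declared in the schema used above:

    psm.report(values={"log10_pvalue": 8.2}, record_identifier="sample1")  # True
    psm.report(values={"log10_pvalue": 8.3}, record_identifier="sample1")  # False; exists
    psm.report(
        values={"log10_pvalue": 8.3},
        record_identifier="sample1",
        force_overwrite=True,
    )                                                                      # True
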
+ + :param str | List[str] columns: columns to select + :param str condition: condition to restrict the results + with, will be appended to the end of the SELECT statement and + safely populated with 'condition_val', + for example: `"id=%s"` + :param list condition_val: values to fill the placeholder + in 'condition' with + :param int offset: number of records to be skipped + :param int limit: max number of records to be returned + :return List[psycopg2.extras.DictRow]: all table contents + """ + if self.file: + raise NotImplementedError( + "Selection is not supported on objects backed by results files." + " Use 'retrieve' method instead." + ) + condition, condition_val = preprocess_condition_pair(condition, condition_val) + if not columns: + columns = sql.SQL("*") + else: + columns = sql.SQL(",").join( + [sql.Identifier(x) for x in mk_list_of_str(columns)] + ) + statement = sql.SQL("SELECT {} FROM {}").format( + columns, sql.Identifier(self.namespace) + ) + if condition: + statement += sql.SQL(" WHERE ") + statement += condition + statement = paginate_query(statement, offset, limit) + with self.db_cursor as cur: + cur.execute(query=statement, vars=condition_val) + result = cur.fetchall() + return result + + def retrieve( + self, record_identifier: str = None, result_identifier: str = None + ) -> Union[Any, Dict[str, Any]]: + """ + Retrieve a result for a record. + + If no result ID specified, results for the entire record will + be returned. + + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to be retrieved + :return any | Dict[str, any]: a single result or a mapping with all the + results reported for the record + """ + record_identifier = self._strict_record_id(record_identifier) + if self[DB_ONLY_KEY]: + if result_identifier is not None: + existing = self._check_which_results_exist( + results=[result_identifier], rid=record_identifier + ) + if not existing: + raise PipestatDatabaseError( + f"Result '{result_identifier}' not found for record " + f"'{record_identifier}'" + ) + with self.db_cursor as cur: + query = sql.SQL( + f"SELECT {result_identifier or '*'} " + f"FROM {self.namespace} WHERE {RECORD_ID}=%s" + ) + cur.execute(query, (record_identifier,)) + result = cur.fetchall() + if len(result) > 0: + if result_identifier is None: + return {k: v for k, v in dict(result[0]).items() if v is not None} + return dict(result[0])[result_identifier] + raise PipestatDatabaseError(f"Record '{record_identifier}' not found") + else: + if record_identifier not in self.data[self.namespace]: + raise PipestatDatabaseError(f"Record '{record_identifier}' not found") + if result_identifier is None: + return self.data[self.namespace][record_identifier].to_dict() + if result_identifier not in self.data[self.namespace][record_identifier]: + raise PipestatDatabaseError( + f"Result '{result_identifier}' not found for record " + f"'{record_identifier}'" + ) + return self.data[self.namespace][record_identifier][result_identifier] + + def remove( + self, record_identifier: str = None, result_identifier: str = None + ) -> bool: + """ + Remove a result. + + If no result ID specified or last result is removed, the entire record + will be removed. + + :param str record_identifier: unique identifier of the record + :param str result_identifier: name of the result to be removed or None + if the record should be removed. 
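
`select` composes a parameterized, optionally paginated SELECT and is available only with the DB backend, while `retrieve` works with either backend; a usage sketch:

    # DB backend only
    psm.select(
        columns=["record_identifier", "log10_pvalue"],
        condition="log10_pvalue>%s",  # placeholder filled safely server-side
        condition_val=[5],
        limit=10,
    )
    # either backend: one result, or the whole record as a dict
    psm.retrieve(record_identifier="sample1", result_identifier="log10_pvalue")
    psm.retrieve(record_identifier="sample1")
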
+        :return bool: whether the result has been removed
+        """
+        record_identifier = self._strict_record_id(record_identifier)
+        rm_record = True if result_identifier is None else False
+        if not self.check_record_exists(record_identifier):
+            _LOGGER.error(f"Record '{record_identifier}' not found")
+            return False
+        if result_identifier and not self.check_result_exists(
+            result_identifier, record_identifier
+        ):
+            _LOGGER.error(
+                f"'{result_identifier}' has not been reported for "
+                f"'{record_identifier}'"
+            )
+            return False
+        if self.file:
+            self.data.make_writable()
+        if not self[DB_ONLY_KEY]:
+            if rm_record:
+                _LOGGER.info(f"Removing '{record_identifier}' record")
+                del self[DATA_KEY][self.namespace][record_identifier]
+            else:
+                val_backup = self[DATA_KEY][self.namespace][record_identifier][
+                    result_identifier
+                ]
+                del self[DATA_KEY][self.namespace][record_identifier][result_identifier]
+                _LOGGER.info(
+                    f"Removed result '{result_identifier}' for record "
+                    f"'{record_identifier}' from '{self.namespace}' namespace"
+                )
+                if not self[DATA_KEY][self.namespace][record_identifier]:
+                    _LOGGER.info(
+                        f"Last result removed for '{record_identifier}'. "
+                        f"Removing the record"
+                    )
+                    del self[DATA_KEY][self.namespace][record_identifier]
+                    rm_record = True
+        if self.file:
+            self.data.write()
+            self.data.make_readonly()
+        if self.file is None:
+            if rm_record:
+                try:
+                    with self.db_cursor as cur:
+                        cur.execute(
+                            f"DELETE FROM {self.namespace} WHERE "
+                            f"{RECORD_ID}='{record_identifier}'"
+                        )
+                except Exception as e:
+                    _LOGGER.error(
+                        f"Could not remove the result from the "
+                        f"database. Exception: {e}"
+                    )
+                    self[DATA_KEY][self.namespace].setdefault(record_identifier, PXAM())
+                    raise
+                return True
+            try:
+                with self.db_cursor as cur:
+                    cur.execute(
+                        f"UPDATE {self.namespace} SET {result_identifier}=null "
+                        f"WHERE {RECORD_ID}='{record_identifier}'"
+                    )
+            except Exception as e:
+                _LOGGER.error(
+                    f"Could not remove the result from the database. " f"Exception: {e}"
+                )
+                if not self[DB_ONLY_KEY]:
+                    self[DATA_KEY][self.namespace][record_identifier][
+                        result_identifier
+                    ] = val_backup
+                raise
+            return True
+
+    def validate_schema(self) -> None:
+        """
+        Check schema for any possible issues
+
+        :raises SchemaError: if any schema format issue is detected
+        """
+
+        def _recursively_replace_custom_types(s: dict) -> Dict:
+            """
+            Replace the custom types in pipestat schema with canonical types
+
+            :param dict s: schema to replace types in
+            :return dict: schema with types replaced
+            """
+            for k, v in s.items():
+                assert SCHEMA_TYPE_KEY in v, SchemaError(
+                    f"Result '{k}' is missing '{SCHEMA_TYPE_KEY}' key"
+                )
+                if v[SCHEMA_TYPE_KEY] == "object" and SCHEMA_PROP_KEY in s[k]:
+                    _recursively_replace_custom_types(s[k][SCHEMA_PROP_KEY])
+                if v[SCHEMA_TYPE_KEY] in CANONICAL_TYPES.keys():
+                    s.setdefault(k, {})
+                    s[k].setdefault(SCHEMA_PROP_KEY, {})
+                    s[k][SCHEMA_PROP_KEY].update(
+                        CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][SCHEMA_PROP_KEY]
+                    )
+                    s[k].setdefault("required", [])
+                    s[k]["required"].extend(
+                        CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]]["required"]
+                    )
+                    s[k][SCHEMA_TYPE_KEY] = CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][
+                        SCHEMA_TYPE_KEY
+                    ]
+            return s
+
+        schema = deepcopy(self.schema)
+        _LOGGER.debug(f"Validating input schema")
+        assert isinstance(schema, dict), SchemaError(
+            f"The schema has to be a {dict().__class__.__name__}"
+        )
+        for col_name in RESERVED_COLNAMES:
+            assert col_name not in schema.keys(), PipestatError(
+                f"'{col_name}' is an identifier reserved by pipestat"
+            )
+        self[RES_SCHEMAS_KEY] = {}
+        schema = _recursively_replace_custom_types(schema)
+        self[RES_SCHEMAS_KEY] = schema
+
+    def assert_results_defined(self, results: List[str]) -> None:
+        """
+        Assert provided list of results is defined in the schema
+
+        :param List[str] results: list of results to
+            check for existence in the schema
+        :raises SchemaError: if any of the results is not defined in the schema
+        """
+        known_results = self.result_schemas.keys()
+        for r in results:
+            assert r in known_results, SchemaError(
+                f"'{r}' is not a known result. Results defined in the "
+                f"schema are: {list(known_results)}."
+ ) + + def check_connection(self) -> bool: + """ + Check whether a PostgreSQL connection has been established + + :return bool: whether the connection has been established + """ + if self.file is not None: + raise PipestatDatabaseError( + f"The {self.__class__.__name__} object " f"is not backed by a database" + ) + if DB_CONNECTION_KEY in self and isinstance( + self[DB_CONNECTION_KEY], psycopg2.extensions.connection + ): + return True + return False + + def establish_postgres_connection(self, suppress: bool = False) -> bool: + """ + Establish PostgreSQL connection using the config data + + :param bool suppress: whether to suppress any connection errors + :return bool: whether the connection has been established successfully + """ + if self.check_connection(): + raise PipestatDatabaseError( + f"Connection is already established: " + f"{self[DB_CONNECTION_KEY].info.host}" + ) + try: + self[DB_CONNECTION_KEY] = psycopg2.connect( + dbname=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_NAME_KEY], + user=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_USER_KEY], + password=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PASSWORD_KEY], + host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY], + port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY], + ) + except psycopg2.Error as e: + _LOGGER.error( + f"Could not connect to: " + f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" + ) + _LOGGER.info(f"Caught error: {e}") + if suppress: + return False + raise + else: + _LOGGER.debug( + f"Established connection with PostgreSQL: " + f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" + ) + return True + + def close_postgres_connection(self) -> None: + """ + Close connection and remove client bound + """ + if not self.check_connection(): + raise PipestatDatabaseError( + f"The connection has not been established: " + f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" + ) + self[DB_CONNECTION_KEY].close() + del self[DB_CONNECTION_KEY] + _LOGGER.debug( + f"Closed connection with PostgreSQL: " + f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" + ) + + def _strict_record_id(self, forced_value: str = None) -> str: + """ + Get record identifier from the outer source or stored with this object + + :param str forced_value: return this value + :return str: record identifier + """ + if forced_value is not None: + return forced_value + if self.record_identifier is not None: + return self.record_identifier + raise PipestatError( + f"You must provide the record identifier you want to perform " + f"the action on. Either in the {self.__class__.__name__} " + f"constructor or as an argument to the method." 
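
The credentials read here come from the `database` section of the config file; its expected shape is roughly the following YAML (key names inferred from the `CFG_*_KEY` constants, values are placeholders):

    database:
      name: pipestat-test
      user: postgres
      password: pipestat-password
      host: localhost
      port: 5432
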
+ ) diff --git a/pipestat/pipestat_orm.py b/pipestat/pipestat_orm.py deleted file mode 100644 index 01f5dfb1..00000000 --- a/pipestat/pipestat_orm.py +++ /dev/null @@ -1,936 +0,0 @@ -from contextlib import contextmanager -from copy import deepcopy -from logging import getLogger -from typing import Any, Dict, List, Optional, Union -from urllib.parse import quote_plus - -from attmap import PathExAttMap as PXAM -from jsonschema import validate -from sqlalchemy import Column, Float, ForeignKey, Integer, String, Table, create_engine -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import DeclarativeMeta, relationship, sessionmaker -from ubiquerg import create_lock, remove_lock -from yacman import YacAttMap - -from .const import * -from .exceptions import * -from .helpers import * - -_LOGGER = getLogger(PKG_NAME) - - -class PipestatManagerORM(dict): - """ - Pipestat standardizes reporting of pipeline results and - pipeline status management. It formalizes a way for pipeline developers - and downstream tools developers to communicate -- results produced by a - pipeline can easily and reliably become an input for downstream analyses. - The object exposes API for interacting with the results and - pipeline status and can be backed by either a YAML-formatted file - or a database. - """ - - def __init__( - self, - namespace: str = None, - record_identifier: str = None, - schema_path: str = None, - results_file_path: str = None, - database_only: bool = False, - config: Union[str, dict] = None, - status_schema_path: str = None, - flag_file_dir: str = None, - ): - """ - Initialize the object - - :param str namespace: namespace to report into. This will be the DB - table name if using DB as the object back-end - :param str record_identifier: record identifier to report for. 
This - creates a weak bound to the record, which can be overriden in - this object method calls - :param str schema_path: path to the output schema that formalizes - the results structure - :param str results_file_path: YAML file to report into, if file is - used as the object back-end - :param bool database_only: whether the reported data should not be - stored in the memory, but only in the database - :param str | dict config: path to the configuration file or a mapping - with the config file content - :param str status_schema_path: path to the status schema that formalizes - the status flags structure - """ - - def _check_cfg_key(cfg: dict, key: str) -> bool: - if key not in cfg: - _LOGGER.warning(f"Key '{key}' not found in config") - return False - return True - - def _mk_abs_via_cfg( - path: Optional[str], - cfg_path: Optional[str], - ) -> Optional[str]: - if path is None: - return path - assert isinstance(path, str), TypeError("Path is expected to be a str") - if os.path.isabs(path): - return path - if cfg_path is None: - rel_to_cwd = os.path.join(os.getcwd(), path) - if os.path.exists(rel_to_cwd) or os.access( - os.path.dirname(rel_to_cwd), os.W_OK - ): - return rel_to_cwd - raise OSError(f"Could not make this path absolute: {path}") - joined = os.path.join(os.path.dirname(cfg_path), path) - if os.path.isabs(joined): - return joined - raise OSError(f"Could not make this path absolute: {path}") - - def _select_value( - arg_name: str, - arg_value: Any, - cfg: dict, - strict: bool = True, - env_var: str = None, - ) -> Any: - if arg_value is not None: - return arg_value - if arg_name not in cfg or cfg[arg_name] is None: - if env_var is not None: - arg = os.getenv(env_var, None) - if arg is not None: - _LOGGER.debug(f"Value '{arg}' sourced from '{env_var}' env var") - return expandpath(arg) - if strict: - raise PipestatError( - f"Value for the required '{arg_name}' argument could not be" - f" determined. Provide it in the config or pass to the " - f"object constructor." 
- ) - return - return cfg[arg_name] - - super(PipestatManagerORM, self).__init__() - self[CONFIG_KEY] = YacAttMap() - # read config or config data - config = config or os.getenv(ENV_VARS["config"]) - if config is not None: - if isinstance(config, str): - config = os.path.abspath(expandpath(config)) - self[CONFIG_KEY] = YacAttMap(filepath=config) - self._config_path = config - elif isinstance(config, dict): - self[CONFIG_KEY] = YacAttMap(entries=config) - self._config_path = None - else: - raise TypeError( - "database_config has to be either path to the " - "file to read or a dict" - ) - # validate config - cfg = self[CONFIG_KEY].to_dict(expand=True) - _, cfg_schema = read_yaml_data(CFG_SCHEMA, "config schema") - validate(cfg, cfg_schema) - - self[NAME_KEY] = _select_value( - "namespace", namespace, self[CONFIG_KEY], env_var=ENV_VARS["namespace"] - ) - self[RECORD_ID_KEY] = _select_value( - "record_identifier", - record_identifier, - self[CONFIG_KEY], - False, - ENV_VARS["record_identifier"], - ) - self[DB_ONLY_KEY] = database_only - # read results schema - self._schema_path = _select_value( - "schema_path", - schema_path, - self[CONFIG_KEY], - False, - env_var=ENV_VARS["schema"], - ) - if self._schema_path is not None: - _, self[SCHEMA_KEY] = read_yaml_data( - _mk_abs_via_cfg(self._schema_path, self.config_path), "schema" - ) - self.validate_schema() - # determine the highlighted results - self[HIGHLIGHTED_KEY] = [ - k - for k, v in self.schema.items() - if "highlight" in v and v["highlight"] is True - ] - if self[HIGHLIGHTED_KEY]: - assert isinstance(self[HIGHLIGHTED_KEY], list), TypeError( - f"highlighted results specification " - f"({self[HIGHLIGHTED_KEY]}) has to be a list" - ) - # read status schema - status_schema_path = ( - _mk_abs_via_cfg( - _select_value( - "status_schema_path", - status_schema_path, - self[CONFIG_KEY], - False, - env_var=ENV_VARS["status_schema"], - ), - self.config_path, - ) - or STATUS_SCHEMA - ) - self[STATUS_SCHEMA_SOURCE_KEY], self[STATUS_SCHEMA_KEY] = read_yaml_data( - status_schema_path, "status schema" - ) - # determine results file - results_file_path = _mk_abs_via_cfg( - _select_value( - "results_file_path", - results_file_path, - self[CONFIG_KEY], - False, - ENV_VARS["results_file"], - ), - self.config_path, - ) - if results_file_path: - if self[DB_ONLY_KEY]: - raise ValueError( - "Running in database only mode does not make " - "sense with a YAML file as a backend." 
- ) - self[FILE_KEY] = results_file_path - self._init_results_file() - flag_file_dir = _select_value( - "flag_file_dir", flag_file_dir, self[CONFIG_KEY], False - ) or os.path.dirname(self.file) - self[STATUS_FILE_DIR] = _mk_abs_via_cfg(flag_file_dir, self.config_path) - elif CFG_DATABASE_KEY in self[CONFIG_KEY]: - if not all( - [ - _check_cfg_key(self[CONFIG_KEY][CFG_DATABASE_KEY], key) - for key in DB_CREDENTIALS - ] - ): - raise MissingConfigDataError( - "Must specify all database login " "credentials or result_file_path" - ) - self[DB_ORMS_KEY] = {} - self[DB_BASE_KEY] = declarative_base() - self[DATA_KEY] = YacAttMap() - self._init_db_table() - self._init_status_table() - else: - raise MissingConfigDataError( - "Must specify either database login " "credentials or a YAML file path" - ) - - def _get_flag_file( - self, record_identifier: str = None - ) -> Union[str, List[str], None]: - """ - Get path to the status flag file for the specified record - - :param str record_identifier: unique record identifier - :return str | list[str] | None: path to the status flag file - """ - from glob import glob - - r_id = self._strict_record_id(record_identifier) - if self.file is None: - return - if self.file is not None: - regex = os.path.join( - self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_*.flag" - ) - file_list = glob(regex) - if len(file_list) > 1: - _LOGGER.warning("Multiple flag files found") - return file_list - elif len(file_list) == 1: - return file_list[0] - else: - _LOGGER.debug("No flag files found") - return None - - @property - def record_count(self) -> int: - """ - Number of records reported - - :return int: number of records reported - """ - return ( - len(self.data[self.namespace]) - if self.file - else self._count_rows(self.namespace) - ) - - @property - def highlighted_results(self) -> List[str]: - """ - Highlighted results - - :return List[str]: a collection of highlighted results - """ - return self._get_attr(HIGHLIGHTED_KEY) or [] - - @property - def namespace(self) -> str: - """ - Namespace the object writes the results to - - :return str: namespace the object writes the results to - """ - return self._get_attr(NAME_KEY) - - @property - def record_identifier(self) -> str: - """ - Unique identifier of the record - - :return str: unique identifier of the record - """ - return self._get_attr(RECORD_ID_KEY) - - @property - def schema(self) -> Dict: - """ - Schema mapping - - :return dict: schema that formalizes the results structure - """ - return self._get_attr(SCHEMA_KEY) - - @property - def status_schema(self) -> Dict: - """ - Status schema mapping - - :return dict: schema that formalizes the pipeline status structure - """ - return self._get_attr(STATUS_SCHEMA_KEY) - - @property - def status_schema_source(self) -> Dict: - """ - Status schema source - - :return dict: source of the schema that formalizes - the pipeline status structure - """ - return self._get_attr(STATUS_SCHEMA_SOURCE_KEY) - - @property - def schema_path(self) -> str: - """ - Schema path - - :return str: path to the provided schema - """ - return self._schema_path - - @property - def config_path(self) -> str: - """ - Config path. 
None if the config was not provided or if provided - as a mapping of the config contents - - :return str: path to the provided config - """ - return getattr(self, "_config_path", None) - - @property - def result_schemas(self) -> Dict: - """ - Result schema mappings - - :return dict: schemas that formalize the structure of each result - in a canonical jsonschema way - """ - return self._get_attr(RES_SCHEMAS_KEY) - - @property - def file(self) -> str: - """ - File path that the object is reporting the results into - - :return str: file path that the object is reporting the results into - """ - return self._get_attr(FILE_KEY) - - @property - def data(self) -> YacAttMap: - """ - Data object - - :return yacman.YacAttMap: the object that stores the reported data - """ - return self._get_attr(DATA_KEY) - - @property - def db_url(self) -> str: - """ - Database URL, generated based on config credentials - - :return str: database URL - """ - try: - creds = dict( - name=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_NAME_KEY], - user=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_USER_KEY], - passwd=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PASSWORD_KEY], - host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY], - port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY], - dialect=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_DIALECT_KEY], - ) - except (KeyError, AttributeError) as e: - raise PipestatDatabaseError( - f"Could not determine database URL. Caught error: {str(e)}" - ) - parsed_creds = {k: quote_plus(str(v)) for k, v in creds.items()} - return "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(**parsed_creds) - - @property - @contextmanager - def session(self): - """ - Provide a transactional scope around a series of query - operations, no commit afterwards. - """ - if not self.is_db_connected(): - self.establish_db_connection_orm() - with self[DB_SESSION_KEY]() as session: - _LOGGER.debug("Created session") - yield session - _LOGGER.debug("Ending session") - - def _strict_record_id(self, forced_value: str = None) -> str: - """ - Get record identifier from the outer source or stored with this object - - :param str forced_value: return this value - :return str: record identifier - """ - if forced_value is not None: - return forced_value - if self.record_identifier is not None: - return self.record_identifier - raise PipestatError( - f"You must provide the record identifier you want to perform " - f"the action on. Either in the {self.__class__.__name__} " - f"constructor or as an argument to the method." 
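
The `db_url` property percent-encodes each credential before templating, so passwords with special characters survive URL assembly; the same logic standalone:

    from urllib.parse import quote_plus

    creds = dict(dialect="postgresql", user="postgres", passwd="p@ss:word",
                 host="localhost", port=5432, name="pipestat-test")
    url = "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(
        **{k: quote_plus(str(v)) for k, v in creds.items()}
    )
    # 'postgresql://postgres:p%40ss%3Aword@localhost:5432/pipestat-test'
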
- ) - - def _create_table_orm(self, table_name: str, schema: Dict[str, Any]): - """ - Create a table - - :param str table_name: name of the table to create - :param Dict[str, Any] schema: schema to base table creation on - """ - - def _auto_repr(x: Any) -> str: - """ - Auto-generated __repr__ fun - - :param Any x: object to generate __repr__ method for - :return str: string object representation - """ - attr_strs = [ - f"{k}={str(v)}" for k, v in x.__dict__.items() if not k.startswith("_") - ] - return "<{}: {}>".format(x.__class__.__name__, ", ".join(attr_strs)) - - tn = table_name or self.namespace - attr_dict = dict( - __tablename__=tn, - id=Column(Integer, primary_key=True), - record_identifier=Column(String, unique=True), - ) - for result_id, result_metadata in schema.items(): - col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] - _LOGGER.debug(f"Adding object: {result_id} of type: {str(col_type)}") - attr_dict.update({result_id: Column(col_type)}) - attr_dict.update({"__repr__": _auto_repr}) - _LOGGER.debug(f"Creating '{tn}' ORM with args: {attr_dict}") - self[DB_ORMS_KEY][tn] = type(tn.capitalize(), (self[DB_BASE_KEY],), attr_dict) - self[DB_BASE_KEY].metadata.create_all(bind=self[DB_ENGINE_KEY]) - - def establish_db_connection_orm(self) -> bool: - """ - Establish DB connection using the config data - - :return bool: whether the connection has been established successfully - """ - if self.is_db_connected(): - raise PipestatDatabaseError("Connection is already established") - - self[DB_ENGINE_KEY] = create_engine(self.db_url, echo=True) - self[DB_SESSION_KEY] = sessionmaker(bind=self[DB_ENGINE_KEY]) - return True - - def is_db_connected(self) -> bool: - """ - Check whether a DB connection has been established - - :return bool: whether the connection has been established - """ - if self.file is not None: - raise PipestatDatabaseError( - f"The {self.__class__.__name__} object is not backed by a database" - ) - if DB_SESSION_KEY in self and isinstance(self[DB_SESSION_KEY], sessionmaker): - return True - return False - - def set_status(self, status_identifier: str, record_identifier: str = None) -> None: - """ - Set pipeline run status. - - The status identifier needs to match one of identifiers specified in - the status schema. A basic, ready to use, status schema is shipped with - this package. - - :param str status_identifier: status to set, one of statuses defined - in the status schema - :param str record_identifier: record identifier to set the - pipeline status for - """ - r_id = self._strict_record_id(record_identifier) - known_status_identifiers = self.status_schema.keys() - if status_identifier not in known_status_identifiers: - raise PipestatError( - f"'{status_identifier}' is not a defined status identifier. " - f"These are allowed: {known_status_identifiers}" - ) - prev_status = self.get_status(r_id) - if self.file is not None: - if prev_status: - prev_flag_path = self.get_status_flag_path(prev_status, r_id) - os.remove(prev_flag_path) - flag_path = self.get_status_flag_path(status_identifier, r_id) - create_lock(flag_path) - with open(flag_path, "w") as f: - f.write(status_identifier) - remove_lock(flag_path) - else: - try: - self._report_db( - values={STATUS: status_identifier}, - record_identifier=r_id, - table_name=f"{self.namespace}_{STATUS}", - ) - except Exception as e: - _LOGGER.error(f"Could not insert into the status table. 
Exception: {e}") - raise - if prev_status: - _LOGGER.debug( - f"Changed status from '{prev_status}' to '{status_identifier}'" - ) - - def get_status(self, record_identifier: str = None) -> Optional[str]: - """ - Get the current pipeline status - - :return str: status identifier, like 'running' - """ - r_id = self._strict_record_id(record_identifier) - if self.file is None: - try: - result = self._retrieve_db( - result_identifier=STATUS, - record_identifier=r_id, - table_name=f"{self.namespace}_{STATUS}", - ) - except PipestatDatabaseError: - return None - return result[STATUS] - else: - flag_file = self._get_flag_file(record_identifier=r_id) - if flag_file is not None: - assert isinstance(flag_file, str), TypeError( - "Flag file path is expected to be a str, were multiple flags found?" - ) - with open(flag_file, "r") as f: - status = f.read() - return status - _LOGGER.debug( - f"Could not determine status for '{r_id}' record. " - f"No flags found in: {self[STATUS_FILE_DIR]}" - ) - return None - - def clear_status( - self, record_identifier: str = None, flag_names: List[str] = None - ) -> List[str]: - """ - Remove status flags - - :param str record_identifier: name of the record to remove flags for - :param Iterable[str] flag_names: Names of flags to remove, optional; if - unspecified, all schema-defined flag names will be used. - :return List[str]: Collection of names of flags removed - """ - r_id = self._strict_record_id(record_identifier) - if self.file is not None: - flag_names = flag_names or list(self.status_schema.keys()) - if isinstance(flag_names, str): - flag_names = [flag_names] - removed = [] - for f in flag_names: - path_flag_file = self.get_status_flag_path( - status_identifier=f, record_identifier=r_id - ) - try: - os.remove(path_flag_file) - except: - pass - else: - _LOGGER.info(f"Removed existing flag: {path_flag_file}") - removed.append(f) - return removed - else: - removed = self.get_status(r_id) - try: - self._remove_db( - record_identifier=r_id, - table_name=f"{self.namespace}_{STATUS}", - ) - except Exception as e: - _LOGGER.error( - f"Could not remove the status from the database. 
Exception: {e}" - ) - return [] - else: - return [removed] - - def validate_schema(self) -> None: - """ - Check schema for any possible issues - - :raises SchemaError: if any schema format issue is detected - """ - - def _recursively_replace_custom_types(s: dict) -> Dict: - """ - Replace the custom types in pipestat schema with canonical types - - :param dict s: schema to replace types in - :return dict: schema with types replaced - """ - for k, v in s.items(): - assert SCHEMA_TYPE_KEY in v, SchemaError( - f"Result '{k}' is missing '{SCHEMA_TYPE_KEY}' key" - ) - if v[SCHEMA_TYPE_KEY] == "object" and SCHEMA_PROP_KEY in s[k]: - _recursively_replace_custom_types(s[k][SCHEMA_PROP_KEY]) - if v[SCHEMA_TYPE_KEY] in CANONICAL_TYPES.keys(): - s.setdefault(k, {}) - s[k].setdefault(SCHEMA_PROP_KEY, {}) - s[k][SCHEMA_PROP_KEY].update( - CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][SCHEMA_PROP_KEY] - ) - s[k].setdefault("required", []) - s[k]["required"].extend( - CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]]["required"] - ) - s[k][SCHEMA_TYPE_KEY] = CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][ - SCHEMA_TYPE_KEY - ] - return s - - schema = deepcopy(self.schema) - _LOGGER.debug(f"Validating input schema") - assert isinstance(schema, dict), SchemaError( - f"The schema has to be a {dict().__class__.__name__}" - ) - for col_name in RESERVED_COLNAMES: - assert col_name not in schema.keys(), PipestatError( - f"'{col_name}' is an identifier reserved by pipestat" - ) - self[RES_SCHEMAS_KEY] = {} - schema = _recursively_replace_custom_types(schema) - self[RES_SCHEMAS_KEY] = schema - - def _init_db_table(self) -> bool: - """ - Initialize a database table based on the provided schema, - if it does not exist. Read the data stored in the database into the - memory otherwise. - - :return bool: whether the table has been created - """ - if self.schema is None: - raise SchemaNotFoundError("initialize the database table") - if not self.is_db_connected(): - self.establish_db_connection_orm() - # if self._check_table_exists(table_name=self.namespace): - # _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") - # if not self[DB_ONLY_KEY]: - # self._table_to_dict() - # # return False - _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") - self._create_table_orm(table_name=self.namespace, schema=self.result_schemas) - return True - - def _init_status_table(self): - status_table_name = f"{self.namespace}_{STATUS}" - if not self.is_db_connected(): - self.establish_db_connection_orm() - # if not self._check_table_exists(table_name=status_table_name): - _LOGGER.debug( - f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" - ) - self._create_table_orm( - table_name=status_table_name, - schema=get_status_table_schema(status_schema=self.status_schema), - ) - - def _get_attr(self, attr: str) -> Any: - """ - Safely get the name of the selected attribute of this object - - :param str attr: attr to select - :return: - """ - return self[attr] if attr in self else None - - def _check_table_exists(self, table_name: str) -> bool: - """ - Check if the specified table exists - - :param str table_name: table name to be checked - :return bool: whether the specified table exists - """ - from sqlalchemy import inspect - - with self.session as s: - return inspect(s.bind).has_table(table_name=table_name) - - def _count_rows(self, table_name: str) -> int: - """ - Count rows in a selected table - - :param str table_name: table to count rows for - :return int: number of rows in the selected table - """ - with 
self.session as s: - return s.query(self[DB_ORMS_KEY][table_name].id).count() - - def _get_orm(self, table_name: str = None) -> Any: - """ - Get an object relational mapper class - - :param str table_name: table name to get a class for - :return Any: Object relational mapper class - """ - if DB_ORMS_KEY not in self: - raise PipestatDatabaseError("Object relational mapper classes not defined") - tn = f"{table_name or self.namespace}" - if tn not in self[DB_ORMS_KEY]: - raise PipestatDatabaseError( - f"No object relational mapper class defined for table: {tn}" - ) - if not isinstance(self[DB_ORMS_KEY][tn], DeclarativeMeta): - raise PipestatDatabaseError( - f"Object relational mapper class for table '{tn}' is invalid" - ) - return self[DB_ORMS_KEY][tn] - - def check_record_exists( - self, record_identifier: str, table_name: str = None - ) -> bool: - """ - Check if the specified record exists in the table - - :param str record_identifier: record to check for - :param str table_name: table name to check - :return bool: whether the record exists in the table - """ - with self.session as s: - return ( - s.query(self._get_orm(table_name).id) - .filter_by(record_identifier=record_identifier) - .first() - is not None - ) - - def check_which_results_exist( - self, results: List[str], rid: str = None, table_name: str = None - ) -> List[str]: - """ - Check if the specified results exist in the table - - :param str rid: record to check for - :param List[str] results: results identifiers to check for - :param str table_name: name of the table to search for results in - :return List[str]: results identifiers that exist - """ - table_name = table_name or self.namespace - rid = self._strict_record_id(rid) - with self.session as s: - record = ( - s.query(self._get_orm(table_name)) - .filter_by(record_identifier=rid) - .first() - ) - return [r for r in results if getattr(record, r, None) is not None] - - def check_result_exists( - self, - result_identifier: str, - record_identifier: str = None, - table_name: str = None, - ) -> bool: - """ - Check if the result has been reported - - :param str record_identifier: unique identifier of the record - :param str result_identifier: name of the result to check - :param str table_name: name of the table to search for results in - :return bool: whether the specified result has been reported for the - indicated record in current namespace - """ - record_identifier = self._strict_record_id(record_identifier) - return self.check_which_results_exist( - results=[result_identifier], rid=record_identifier, table_name=table_name - ) - - def _retrieve_db( - self, - result_identifier: str = None, - record_identifier: str = None, - table_name: str = None, - ) -> Dict[str, Any]: - """ - Retrieve a result for a record. - - If no result ID specified, results for the entire record will - be returned. 
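
The classes `_get_orm` hands back are never written out as source; `_create_table_orm` assembles them at runtime with `type()`. A self-contained sketch of that trick, using an in-memory SQLite engine instead of PostgreSQL, followed by the add-or-mutate upsert that `_report_db` implements:

    from sqlalchemy import Column, Float, Integer, String, create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    Base = declarative_base()
    attrs = {
        "__tablename__": "test",
        "id": Column(Integer, primary_key=True),
        "record_identifier": Column(String, unique=True),
        "log10_pvalue": Column(Float),  # one column per schema result
    }
    Test = type("Test", (Base,), attrs)  # dynamically built ORM class
    engine = create_engine("sqlite://")  # in-memory DB just for the sketch
    Base.metadata.create_all(bind=engine)

    with sessionmaker(bind=engine)() as s:
        rec = s.query(Test).filter_by(record_identifier="sample1").first()
        if rec is None:
            s.add(Test(record_identifier="sample1", log10_pvalue=8.2))
        else:
            rec.log10_pvalue = 8.2
        s.commit()
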
- - :param str record_identifier: unique identifier of the record - :param str result_identifier: name of the result to be retrieved - :param str table_name: name of the table to search for results in - :return Dict[str, any]: a single result or a mapping with all the results - reported for the record - """ - table_name = table_name or self.namespace - record_identifier = self._strict_record_id(record_identifier) - if result_identifier is not None: - existing = self.check_which_results_exist( - results=[result_identifier], - rid=record_identifier, - table_name=table_name, - ) - if not existing: - raise PipestatDatabaseError( - f"Result '{result_identifier}' not found for record " - f"'{record_identifier}'" - ) - - with self.session as s: - record = ( - s.query(self._get_orm(table_name)) - .filter_by(record_identifier=record_identifier) - .first() - ) - - if record is not None: - if result_identifier is not None: - return {result_identifier: getattr(record, result_identifier)} - return { - column: getattr(record, column) - for column in [c.name for c in record.__table__.columns] - if getattr(record, column, None) is not None - } - raise PipestatDatabaseError(f"Record '{record_identifier}' not found") - - def _report_db( - self, values: Dict[str, Any], record_identifier: str, table_name: str = None - ) -> int: - """ - Report a result to a database - - :param Dict[str, Any] values: values to report - :param str record_identifier: record to report the result for - :param str table_name: name of the table to report the result in - :return int: updated/inserted row - """ - record_identifier = self._strict_record_id(record_identifier) - ORMClass = self._get_orm(table_name) - values.update({RECORD_ID: record_identifier}) - if not self.check_record_exists( - record_identifier=record_identifier, table_name=table_name - ): - new_record = ORMClass(**values) - with self.session as s: - s.add(new_record) - s.commit() - returned_id = new_record.id - else: - with self.session as s: - record_to_update = ( - s.query(ORMClass) - .filter(getattr(ORMClass, RECORD_ID) == record_identifier) - .first() - ) - for result_id, result_value in values.items(): - setattr(record_to_update, result_id, result_value) - s.commit() - returned_id = record_to_update.id - return returned_id - - def _remove_db( - self, - record_identifier: str = None, - result_identifier: str = None, - table_name: str = None, - ) -> bool: - """ - Remove a result. - - If no result ID specified or last result is removed, the entire record - will be removed. - - :param str record_identifier: unique identifier of the record - :param str result_identifier: name of the result to be removed or None - if the record should be removed. 
- :param str table_name: name of the table to report the result in - :return bool: whether the result has been removed - :raise PipestatDatabaseError: if either record or result specified are not found - """ - table_name = table_name or self.namespace - record_identifier = self._strict_record_id(record_identifier) - ORMClass = self._get_orm(table_name=table_name) - if self.check_record_exists( - record_identifier=record_identifier, table_name=table_name - ): - with self.session as s: - record = ( - s.query(ORMClass) - .filter(getattr(ORMClass, RECORD_ID) == record_identifier) - .first() - ) - if result_identifier is None: - # delete row - record.delete() - else: - # set the value to None - if not self.check_result_exists( - record_identifier=record_identifier, - result_identifier=result_identifier, - table_name=table_name, - ): - raise PipestatDatabaseError( - f"Result '{result_identifier}' not found for record " - f"'{record_identifier}'" - ) - setattr(record, result_identifier, None) - s.commit() - else: - raise PipestatDatabaseError(f"Record '{record_identifier}' not found") From 505c8308df460de1288c2f80ffda3244c6052d2b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 13 Apr 2021 11:41:17 -0400 Subject: [PATCH 16/65] read DB to dict at init if requested --- docs_jupyter/python_api.ipynb | 10 +-- pipestat/pipestat.py | 115 +++++++++++++++++++++------------- tests/test_pipestat.py | 45 +------------ 3 files changed, 79 insertions(+), 91 deletions(-) diff --git a/docs_jupyter/python_api.ipynb b/docs_jupyter/python_api.ipynb index 77b349fd..aded8c5d 100644 --- a/docs_jupyter/python_api.ipynb +++ b/docs_jupyter/python_api.ipynb @@ -17,12 +17,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "import pipestat\n", - "from pipestat.pipestat_orm import PipestatManagerORM as PipestatManager, SchemaNotFoundError\n", + "from pipestat import PipestatManager, SchemaNotFoundError\n", "from jsonschema import ValidationError" ] }, @@ -51,14 +51,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmp9cozi7pc.yaml\n" + "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmplgf49zfa.yaml\n" ] } ], @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index f3149fcc..6f9d384d 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -229,33 +229,22 @@ def _select_value( "Must specify either database login " "credentials or a YAML file path" ) - def _get_flag_file( - self, record_identifier: str = None - ) -> Union[str, List[str], None]: + def __str__(self): """ - Get path to the status flag file for the specified record + Generate string representation of the object - :param str record_identifier: unique record identifier - :return str | list[str] | None: path to the status flag file + :return str: string representation of the object """ - from glob import glob - - r_id = self._strict_record_id(record_identifier) - if self.file is None: - return - if self.file is not None: - regex = os.path.join( - self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_*.flag" - ) - file_list = glob(regex) - if len(file_list) > 1: - _LOGGER.warning("Multiple flag files found") - return file_list - elif len(file_list) == 1: - return file_list[0] 
- else: - _LOGGER.debug("No flag files found") - return None + res = f"{self.__class__.__name__} ({self.namespace})" + res += "\nBackend: {}".format( + f"file ({self.file})" if self.file else "PostgreSQL" + ) + res += f"\nResults schema source: {self.schema_path}" + res += f"\nStatus schema source: {self.status_schema_source}" + res += f"\nRecords count: {self.record_count}" + if self.highlighted_results: + res += f"\nHighlighted results: {', '.join(self.highlighted_results)}" + return res @property def record_count(self) -> int: @@ -408,12 +397,40 @@ def session(self): operations, no commit afterwards. """ if not self.is_db_connected(): - self.establish_db_connection_orm() + self.establish_db_connection() with self[DB_SESSION_KEY]() as session: _LOGGER.debug("Created session") yield session _LOGGER.debug("Ending session") + def _get_flag_file( + self, record_identifier: str = None + ) -> Union[str, List[str], None]: + """ + Get path to the status flag file for the specified record + + :param str record_identifier: unique record identifier + :return str | list[str] | None: path to the status flag file + """ + from glob import glob + + r_id = self._strict_record_id(record_identifier) + if self.file is None: + return + if self.file is not None: + regex = os.path.join( + self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_*.flag" + ) + file_list = glob(regex) + if len(file_list) > 1: + _LOGGER.warning("Multiple flag files found") + return file_list + elif len(file_list) == 1: + return file_list[0] + else: + _LOGGER.debug("No flag files found") + return None + def _strict_record_id(self, forced_value: str = None) -> str: """ Get record identifier from the outer source or stored with this object @@ -466,7 +483,7 @@ def _auto_repr(x: Any) -> str: self[DB_ORMS_KEY][tn] = type(tn.capitalize(), (self[DB_BASE_KEY],), attr_dict) self[DB_BASE_KEY].metadata.create_all(bind=self[DB_ENGINE_KEY]) - def establish_db_connection_orm(self) -> bool: + def establish_db_connection(self) -> bool: """ Establish DB connection using the config data @@ -757,20 +774,33 @@ def _init_db_table(self) -> bool: if self.schema is None: raise SchemaNotFoundError("initialize the database table") if not self.is_db_connected(): - self.establish_db_connection_orm() - # if self._check_table_exists(table_name=self.namespace): - # _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") - # if not self[DB_ONLY_KEY]: - # self._table_to_dict() - # # return False + self.establish_db_connection() _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") self._create_table_orm(table_name=self.namespace, schema=self.result_schemas) + if not self[DB_ONLY_KEY]: + self._table_to_dict() return True + def _table_to_dict(self) -> None: + """ + Create a dictionary from the database table data + """ + with self.session as s: + records = s.query(self._get_orm(self.namespace)).all() + _LOGGER.debug(f"Reading data from database for '{self.namespace}' namespace") + for record in records: + record_id = getattr(record, RECORD_ID) + for column in record.__table__.columns: + val = getattr(record, column.name, None) + if val is not None: + self._report_data_element( + record_identifier=record_id, values={column.name: val} + ) + def _init_status_table(self): status_table_name = f"{self.namespace}_{STATUS}" if not self.is_db_connected(): - self.establish_db_connection_orm() + self.establish_db_connection() # if not self._check_table_exists(table_name=status_table_name): _LOGGER.debug( f"Initializing '{status_table_name}' 
table in " f"'{PKG_NAME}' database" @@ -943,9 +973,12 @@ def retrieve( """ r_id = self._strict_record_id(record_identifier) if self.file is None: - return self._retrieve_db( + results = self._retrieve_db( result_identifier=result_identifier, record_identifier=r_id ) + if result_identifier is not None: + return results[result_identifier] + return results else: if r_id not in self.data[self.namespace]: raise PipestatDatabaseError(f"Record '{r_id}' not found") @@ -981,7 +1014,6 @@ def _retrieve_db( existing = self.check_which_results_exist( results=[result_identifier], rid=record_identifier, - table_name=table_name, ) if not existing: raise PipestatDatabaseError( @@ -1082,7 +1114,7 @@ def report( else: try: updated_ids = self._report_db( - record_identifier=record_identifier, value=values + record_identifier=record_identifier, values=values ) except Exception as e: _LOGGER.error( @@ -1242,26 +1274,23 @@ def _remove_db( record_identifier=record_identifier, table_name=table_name ): with self.session as s: - record = ( - s.query(ORMClass) - .filter(getattr(ORMClass, RECORD_ID) == record_identifier) - .first() + records = s.query(ORMClass).filter( + getattr(ORMClass, RECORD_ID) == record_identifier ) if result_identifier is None: # delete row - record.delete() + records.delete() else: # set the value to None if not self.check_result_exists( record_identifier=record_identifier, result_identifier=result_identifier, - table_name=table_name, ): raise PipestatDatabaseError( f"Result '{result_identifier}' not found for record " f"'{record_identifier}'" ) - setattr(record, result_identifier, None) + setattr(records.first(), result_identifier, None) s.commit() else: raise PipestatDatabaseError(f"Record '{record_identifier}' not found") diff --git a/tests/test_pipestat.py b/tests/test_pipestat.py index b8eef8df..9e87819f 100644 --- a/tests/test_pipestat.py +++ b/tests/test_pipestat.py @@ -39,10 +39,7 @@ def test_connection_checker(self, config_file_path, schema_file_path): schema_path=schema_file_path, namespace="test", ) - assert not pm.check_connection() - pm.establish_postgres_connection() - assert pm.check_connection() - pm.close_postgres_connection() + assert pm.is_db_connected() def test_connection_overwrite_error(self, config_file_path, schema_file_path): pm = PipestatManager( @@ -51,46 +48,8 @@ def test_connection_overwrite_error(self, config_file_path, schema_file_path): schema_path=schema_file_path, namespace="test", ) - pm.establish_postgres_connection() with pytest.raises(PipestatDatabaseError): - pm.establish_postgres_connection() - pm.close_postgres_connection() - - @pytest.mark.parametrize("suppress", [True, False]) - def test_connection_error(self, config_file_path, schema_file_path, suppress): - pm = PipestatManager( - config=config_file_path, - database_only=True, - schema_path=schema_file_path, - namespace="test", - ) - pm[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY] = "bogus_host" - if suppress: - assert not pm.establish_postgres_connection(suppress=suppress) - else: - with pytest.raises(psycopg2Error): - pm.establish_postgres_connection(suppress=suppress) - - def test_connection_closing(self, config_file_path, schema_file_path): - pm = PipestatManager( - config=config_file_path, - database_only=True, - schema_path=schema_file_path, - namespace="test", - ) - pm.establish_postgres_connection() - pm.close_postgres_connection() - assert not pm.check_connection() - - def test_connection_closing_closed(self, config_file_path, schema_file_path): - pm = PipestatManager( - 
config=config_file_path, - database_only=True, - schema_path=schema_file_path, - namespace="test", - ) - with pytest.raises(PipestatDatabaseError): - pm.close_postgres_connection() + pm.establish_db_connection() class TestPipestatManagerInstantiation: From 978f2831fa2c199a2538f07c59974cd85c24751c Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 14 Apr 2021 14:20:18 -0400 Subject: [PATCH 17/65] implement select --- pipestat/pipestat.py | 93 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 6f9d384d..8b26067c 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -1,9 +1,10 @@ from contextlib import contextmanager from copy import deepcopy from logging import getLogger -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union from urllib.parse import quote_plus +import sqlalchemy.orm from attmap import PathExAttMap as PXAM from jsonschema import validate from sqlalchemy import Column, Float, ForeignKey, Integer, String, Table, create_engine @@ -957,6 +958,96 @@ def check_result_exists( > 0 ) + def select( + self, + table_name: Optional[str] = None, + columns: Optional[List[str]] = None, + filter_condition: Optional[List[Tuple[str, str, Union[str, List[str]]]]] = None, + offset: Optional[int] = None, + limit: Optional[int] = None, + ) -> List[Any]: + """ + Perform a SELECT on the table, optionally filtering and paginating the results + + :param str table_name: name of the table to SELECT from + :param List[str] columns: columns to include in the result + :param [(key,operator,value)] filter_condition: e.g. [("id", "eq", 1)] operator list + - eq for == + - lt for < + - ge for >= + - in for in_ + - like for like + :param int offset: skip this number of rows + :param int limit: include this number of rows + """ + + def _dynamic_filter( + ORM: sqlalchemy.orm.DeclarativeMeta, + query: sqlalchemy.orm.Query, + filter_condition: List[Tuple[str, str, Union[str, List[str]]]], + ): + """ + Return filtered query based on condition. + + :param sqlalchemy.orm.DeclarativeMeta ORM: ORM class to build the query against + :param sqlalchemy.orm.Query query: query to apply the filters to + :param [(key,operator,value)] filter_condition: e.g.
[("id", "eq", 1)] operator list + - eq for == + - lt for < + - ge for >= + - in for in_ + - like for like + :return: query + """ + for raw in filter_condition: + try: + key, op, value = raw + except ValueError: + raise Exception("Invalid filter: %s" % raw) + column = getattr(ORM, key, None) + if column is None: + raise Exception("Invalid filter column: %s" % key) + if op == "in": + if isinstance(value, list): + filt = column.in_(value) + else: + filt = column.in_(value.split(",")) + else: + try: + attr = ( + list( + filter( + lambda e: hasattr(column, e % op), + ["%s", "%s_", "__%s__"], + ) + )[0] + % op + ) + except IndexError: + raise Exception(f"Invalid filter operator: {op}") + if value == "null": + value = None + filt = getattr(column, attr)(value) + query = query.filter(filt) + return query + + ORM = self._get_orm(table_name or self.namespace) + with self.session as s: + if columns is not None: + query = s.query(*[getattr(ORM, column) for column in columns]) + else: + query = s.query(ORM) + if filter_condition is not None: + query = _dynamic_filter( + ORM=ORM, query=query, filter_condition=filter_condition + ) + if isinstance(offset, int): + query = query.offset(offset) + if isinstance(limit, int): + query = query.limit(limit) + result = query.all() + return result + def retrieve( self, record_identifier: str = None, result_identifier: str = None ) -> Union[Any, Dict[str, Any]]: From 879d26160620e7bb1384a91814d6dfba8f7d8c92 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 14 Apr 2021 15:35:39 -0400 Subject: [PATCH 18/65] tests refactoring, add table_name parameter to methods --- pipestat/pipestat.py | 10 +- tests/test_db_only_mode.py | 86 +++++++++++ tests/test_init.py | 146 +++++++++++++++++++ tests/test_pipestat.py | 286 ------------------------------------- tests/test_status.py | 62 ++++++++ 5 files changed, 302 insertions(+), 288 deletions(-) create mode 100644 tests/test_db_only_mode.py create mode 100644 tests/test_init.py create mode 100644 tests/test_status.py diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 8b26067c..5b814480 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -889,7 +889,10 @@ def check_record_exists( return False def check_which_results_exist( - self, results: List[str], rid: str = None + self, + results: List[str], + rid: Optional[str] = None, + table_name: Optional[str] = None, ) -> List[str]: """ Check which results have been reported @@ -901,7 +904,9 @@ def check_which_results_exist( """ rid = self._strict_record_id(rid) if self.file is None: - existing = self._check_which_results_exist_db(results=results, rid=rid) + existing = self._check_which_results_exist_db( + results=results, rid=rid, table_name=table_name + ) else: existing = [] for r in results: @@ -1105,6 +1110,7 @@ def _retrieve_db( existing = self.check_which_results_exist( results=[result_identifier], rid=record_identifier, + table_name=table_name, ) if not existing: raise PipestatDatabaseError( diff --git a/tests/test_db_only_mode.py b/tests/test_db_only_mode.py new file mode 100644 index 00000000..660006ec --- /dev/null +++ b/tests/test_db_only_mode.py @@ -0,0 +1,86 @@ +import pytest + +from pipestat import PipestatManager +from pipestat.const import * + + +class TestDatabaseOnly: + @pytest.mark.parametrize( + "val", + [ + {"name_of_something": "test_name"}, + {"number_of_things": 1}, + {"percentage_of_things": 10.1}, + ], + ) + def test_report(self, val, config_file_path, schema_file_path, results_file_path): + REC_ID = "constant_record_id" + psm = 
PipestatManager( + schema_path=schema_file_path, + namespace="test", + record_identifier=REC_ID, + database_only=True, + config=config_file_path, + ) + psm.report(values=val) + assert len(psm.data) == 0 + val_name = list(val.keys())[0] + assert psm.select(filter_condition=[(val_name, "eq", str(val[val_name]))]) + + @pytest.mark.parametrize(["rec_id", "res_id"], [("sample2", "number_of_things")]) + @pytest.mark.parametrize("backend", ["db"]) + @pytest.mark.parametrize("limit", [1, 2, 3, 15555]) + def test_select_limit( + self, + rec_id, + res_id, + config_file_path, + results_file_path, + schema_file_path, + backend, + limit, + ): + args = dict( + schema_path=schema_file_path, namespace="test", config=config_file_path + ) + psm = PipestatManager(**args) + result = psm.select( + filter_condition=[(RECORD_ID, "eq", rec_id)], + columns=[res_id], + limit=limit, + ) + assert len(result) <= limit + + @pytest.mark.parametrize("backend", ["db"]) + @pytest.mark.parametrize("offset", [0, 1, 2, 3, 15555]) + def test_select_offset( + self, config_file_path, results_file_path, schema_file_path, backend, offset + ): + args = dict( + schema_path=schema_file_path, namespace="test", config=config_file_path + ) + psm = PipestatManager(**args) + result = psm.select(offset=offset) + print(result) + assert len(result) == max((psm.record_count - offset), 0) + + @pytest.mark.parametrize("backend", ["db"]) + @pytest.mark.parametrize( + ["offset", "limit"], [(0, 0), (0, 1), (0, 2), (0, 11111), (1, 1), (1, 0)] + ) + def test_select_pagination( + self, + config_file_path, + results_file_path, + schema_file_path, + backend, + offset, + limit, + ): + args = dict( + schema_path=schema_file_path, namespace="test", config=config_file_path + ) + psm = PipestatManager(**args) + result = psm.select(offset=offset, limit=limit) + print(result) + assert len(result) == min(max((psm.record_count - offset), 0), limit) diff --git a/tests/test_init.py b/tests/test_init.py new file mode 100644 index 00000000..1490a78a --- /dev/null +++ b/tests/test_init.py @@ -0,0 +1,146 @@ +from tempfile import mkdtemp + +import pytest +from yaml import dump + +from pipestat import PipestatManager +from pipestat.const import * +from pipestat.exceptions import * + + +class TestConnection: + def test_connection_checker(self, config_file_path, schema_file_path): + pm = PipestatManager( + config=config_file_path, + database_only=True, + schema_path=schema_file_path, + namespace="test", + ) + assert pm.is_db_connected() + + def test_connection_overwrite_error(self, config_file_path, schema_file_path): + pm = PipestatManager( + config=config_file_path, + database_only=True, + schema_path=schema_file_path, + namespace="test", + ) + with pytest.raises(PipestatDatabaseError): + pm.establish_db_connection() + + +class TestPipestatManagerInstantiation: + def test_obj_creation_file(self, schema_file_path, results_file_path): + """ Object constructor works with file as backend""" + assert isinstance( + PipestatManager( + namespace="test", + results_file_path=results_file_path, + schema_path=schema_file_path, + ), + PipestatManager, + ) + + def test_obj_creation_db(self, config_file_path): + """ Object constructor works with database as backend""" + assert isinstance(PipestatManager(config=config_file_path), PipestatManager) + + @pytest.mark.xfail(reason="schema is no longer required to init the object") + def test_schema_req(self, results_file_path): + """ + Object constructor raises exception if schema is not provided + """ + with pytest.raises(PipestatError): 
+ PipestatManager(namespace="test", results_file_path=results_file_path) + + def test_schema_recursive_custom_type_conversion( + self, recursive_schema_file_path, results_file_path + ): + psm = PipestatManager( + namespace="test", + results_file_path=results_file_path, + schema_path=recursive_schema_file_path, + ) + assert ( + "path" + in psm.result_schemas["output_file_in_object"]["properties"]["prop1"][ + "properties" + ] + ) + assert ( + "thumbnail_path" + in psm.result_schemas["output_file_in_object"]["properties"]["prop2"][ + "properties" + ] + ) + + def test_missing_cfg_data(self, schema_file_path): + """ Object constructor raises exception if cfg is missing data """ + tmp_pth = os.path.join(mkdtemp(), "res.yml") + with open(tmp_pth, "w") as file: + dump({"database": {"host": "localhost"}}, file) + with pytest.raises(MissingConfigDataError): + PipestatManager( + namespace="test", config=tmp_pth, schema_path=schema_file_path + ) + + def test_unknown_backend(self, schema_file_path): + """ Either db config or results file path needs to be provided """ + with pytest.raises(MissingConfigDataError): + PipestatManager(namespace="test", schema_path=schema_file_path) + + def test_create_results_file(self, schema_file_path): + """ Results file is created if a nonexistent path provided """ + tmp_res_file = os.path.join(mkdtemp(), "res.yml") + print(f"Temporary results file: {tmp_res_file}") + assert not os.path.exists(tmp_res_file) + PipestatManager( + namespace="test", + results_file_path=tmp_res_file, + schema_path=schema_file_path, + ) + assert os.path.exists(tmp_res_file) + + def test_use_other_namespace_file(self, schema_file_path): + """ Results file can be used with just one namespace """ + tmp_res_file = os.path.join(mkdtemp(), "res.yml") + print(f"Temporary results file: {tmp_res_file}") + assert not os.path.exists(tmp_res_file) + PipestatManager( + namespace="test", + results_file_path=tmp_res_file, + schema_path=schema_file_path, + ) + assert os.path.exists(tmp_res_file) + with pytest.raises(PipestatDatabaseError): + PipestatManager( + namespace="new_test", + results_file_path=tmp_res_file, + schema_path=schema_file_path, + ) + + @pytest.mark.parametrize("pth", [["/$HOME/path.yaml"], 1]) + def test_wrong_class_results_file(self, schema_file_path, pth): + """ Input string that is not a file path raises an informative error """ + with pytest.raises((TypeError, AssertionError)): + PipestatManager( + namespace="test", results_file_path=pth, schema_path=schema_file_path + ) + + def test_results_file_contents_loaded(self, results_file_path, schema_file_path): + """ Contents of the results file are present after loading """ + psm = PipestatManager( + namespace="test", + results_file_path=results_file_path, + schema_path=schema_file_path, + ) + assert "test" in psm.data + + def test_str_representation(self, results_file_path, schema_file_path): + """ Test string representation identifies number of records """ + psm = PipestatManager( + namespace="test", + results_file_path=results_file_path, + schema_path=schema_file_path, + ) + assert f"Records count: {len(psm.data[psm.namespace])}" in str(psm) diff --git a/tests/test_pipestat.py b/tests/test_pipestat.py index 9e87819f..f8c199d6 100644 --- a/tests/test_pipestat.py +++ b/tests/test_pipestat.py @@ -1,12 +1,7 @@ -import os from collections import Mapping -from tempfile import mkdtemp import pytest -from _pytest.monkeypatch import monkeypatch from jsonschema import ValidationError -from psycopg2 import Error as psycopg2Error -from yaml import 
dump from pipestat import PipestatManager from pipestat.const import * @@ -31,144 +26,6 @@ def is_in_file(fs, s, reverse=False): assert s in fh.read() -class TestConnection: - def test_connection_checker(self, config_file_path, schema_file_path): - pm = PipestatManager( - config=config_file_path, - database_only=True, - schema_path=schema_file_path, - namespace="test", - ) - assert pm.is_db_connected() - - def test_connection_overwrite_error(self, config_file_path, schema_file_path): - pm = PipestatManager( - config=config_file_path, - database_only=True, - schema_path=schema_file_path, - namespace="test", - ) - with pytest.raises(PipestatDatabaseError): - pm.establish_db_connection() - - -class TestPipestatManagerInstantiation: - def test_obj_creation_file(self, schema_file_path, results_file_path): - """ Object constructor works with file as backend""" - assert isinstance( - PipestatManager( - namespace="test", - results_file_path=results_file_path, - schema_path=schema_file_path, - ), - PipestatManager, - ) - - def test_obj_creation_db(self, config_file_path): - """ Object constructor works with database as backend""" - assert isinstance(PipestatManager(config=config_file_path), PipestatManager) - - @pytest.mark.xfail(reason="schema is no longer required to init the object") - def test_schema_req(self, results_file_path): - """ - Object constructor raises exception if schema is not provided - """ - with pytest.raises(PipestatError): - PipestatManager(namespace="test", results_file_path=results_file_path) - - def test_schema_recursive_custom_type_conversion( - self, recursive_schema_file_path, results_file_path - ): - psm = PipestatManager( - namespace="test", - results_file_path=results_file_path, - schema_path=recursive_schema_file_path, - ) - assert ( - "path" - in psm.result_schemas["output_file_in_object"]["properties"]["prop1"][ - "properties" - ] - ) - assert ( - "thumbnail_path" - in psm.result_schemas["output_file_in_object"]["properties"]["prop2"][ - "properties" - ] - ) - - def test_missing_cfg_data(self, schema_file_path): - """ Object constructor raises exception if cfg is missing data """ - tmp_pth = os.path.join(mkdtemp(), "res.yml") - with open(tmp_pth, "w") as file: - dump({"database": {"host": "localhost"}}, file) - with pytest.raises(MissingConfigDataError): - PipestatManager( - namespace="test", config=tmp_pth, schema_path=schema_file_path - ) - - def test_unknown_backend(self, schema_file_path): - """ Either db config or results file path needs to be provided """ - with pytest.raises(MissingConfigDataError): - PipestatManager(namespace="test", schema_path=schema_file_path) - - def test_create_results_file(self, schema_file_path): - """ Results file is created if a nonexistent path provided """ - tmp_res_file = os.path.join(mkdtemp(), "res.yml") - print(f"Temporary results file: {tmp_res_file}") - assert not os.path.exists(tmp_res_file) - PipestatManager( - namespace="test", - results_file_path=tmp_res_file, - schema_path=schema_file_path, - ) - assert os.path.exists(tmp_res_file) - - def test_use_other_namespace_file(self, schema_file_path): - """ Results file can be used with just one namespace """ - tmp_res_file = os.path.join(mkdtemp(), "res.yml") - print(f"Temporary results file: {tmp_res_file}") - assert not os.path.exists(tmp_res_file) - PipestatManager( - namespace="test", - results_file_path=tmp_res_file, - schema_path=schema_file_path, - ) - assert os.path.exists(tmp_res_file) - with pytest.raises(PipestatDatabaseError): - PipestatManager( - 
namespace="new_test", - results_file_path=tmp_res_file, - schema_path=schema_file_path, - ) - - @pytest.mark.parametrize("pth", [["/$HOME/path.yaml"], 1]) - def test_wrong_class_results_file(self, schema_file_path, pth): - """ Input string that is not a file path raises an informative error """ - with pytest.raises((TypeError, AssertionError)): - PipestatManager( - namespace="test", results_file_path=pth, schema_path=schema_file_path - ) - - def test_results_file_contents_loaded(self, results_file_path, schema_file_path): - """ Contents of the results file are present after loading """ - psm = PipestatManager( - namespace="test", - results_file_path=results_file_path, - schema_path=schema_file_path, - ) - assert "test" in psm.data - - def test_str_representation(self, results_file_path, schema_file_path): - """ Test string representation identifies number of records """ - psm = PipestatManager( - namespace="test", - results_file_path=results_file_path, - schema_path=schema_file_path, - ) - assert f"Records count: {len(psm.data[psm.namespace])}" in str(psm) - - class TestReporting: @pytest.mark.parametrize( ["rec_id", "val"], @@ -576,91 +433,6 @@ def test_remove( assert psm.remove(result_identifier=list(val.keys())[0]) -class TestDatabaseOnly: - @pytest.mark.parametrize( - "val", - [ - {"name_of_something": "test_name"}, - {"number_of_things": 1}, - {"percentage_of_things": 10.1}, - ], - ) - def test_report(self, val, config_file_path, schema_file_path, results_file_path): - REC_ID = "constant_record_id" - psm = PipestatManager( - schema_path=schema_file_path, - namespace="test", - record_identifier=REC_ID, - database_only=True, - config=config_file_path, - ) - psm.report(values=val) - assert len(psm.data) == 0 - val_name = list(val.keys())[0] - assert psm.select( - condition=val_name + "=%s", condition_val=[str(val[val_name])] - ) - - @pytest.mark.parametrize(["rec_id", "res_id"], [("sample2", "number_of_things")]) - @pytest.mark.parametrize("backend", ["db"]) - @pytest.mark.parametrize("limit", [1, 2, 3, 15555]) - def test_select_limit( - self, - rec_id, - res_id, - config_file_path, - results_file_path, - schema_file_path, - backend, - limit, - ): - args = dict( - schema_path=schema_file_path, namespace="test", config=config_file_path - ) - psm = PipestatManager(**args) - result = psm.select( - condition=f"{RECORD_ID}=%s", - condition_val=[rec_id], - columns=[res_id], - limit=limit, - ) - assert len(result) <= limit - - @pytest.mark.parametrize("backend", ["db"]) - @pytest.mark.parametrize("offset", [0, 1, 2, 3, 15555]) - def test_select_offset( - self, config_file_path, results_file_path, schema_file_path, backend, offset - ): - args = dict( - schema_path=schema_file_path, namespace="test", config=config_file_path - ) - psm = PipestatManager(**args) - result = psm.select(offset=offset) - print(result) - assert len(result) == max((psm.record_count - offset), 0) - - @pytest.mark.parametrize("backend", ["db"]) - @pytest.mark.parametrize( - ["offset", "limit"], [(0, 0), (0, 1), (0, 2), (0, 11111), (1, 1), (1, 0)] - ) - def test_select_pagination( - self, - config_file_path, - results_file_path, - schema_file_path, - backend, - offset, - limit, - ): - args = dict( - schema_path=schema_file_path, namespace="test", config=config_file_path - ) - psm = PipestatManager(**args) - result = psm.select(offset=offset, limit=limit) - print(result) - assert len(result) == min(max((psm.record_count - offset), 0), limit) - - class TestHighlighting: def test_highlighting_works(self, 
highlight_schema_file_path, results_file_path): """the highlighted results are sourced from the schema and only ones @@ -677,64 +449,6 @@ def test_highlighting_works(self, highlight_schema_file_path, results_file_path) assert psm.highlighted_results == schema_highlighted_results -class TestStatus: - def test_status_file_defult_location(self, schema_file_path, results_file_path): - """status file location is set to the results file dir - if not specified""" - psm = PipestatManager( - namespace="test", - results_file_path=results_file_path, - schema_path=schema_file_path, - ) - assert psm[STATUS_FILE_DIR] == os.path.dirname(psm.file) - - @pytest.mark.parametrize("backend", ["file", "db"]) - @pytest.mark.parametrize("status_id", ["running", "failed", "completed"]) - def test_status_not_configured( - self, schema_file_path, config_file_path, results_file_path, backend, status_id - ): - """ status management works even in case it has not been configured""" - args = dict(schema_path=schema_file_path, namespace="test") - backend_data = ( - {"config": config_file_path} - if backend == "db" - else {"results_file_path": results_file_path} - ) - args.update(backend_data) - psm = PipestatManager(**args) - psm.set_status(record_identifier="sample1", status_identifier=status_id) - assert psm.get_status(record_identifier="sample1") == status_id - - @pytest.mark.parametrize("backend", ["file", "db"]) - @pytest.mark.parametrize( - "status_id", ["running_custom", "failed_custom", "completed_custom"] - ) - def test_custom_status_schema( - self, - schema_file_path, - config_file_path, - results_file_path, - backend, - status_id, - custom_status_schema, - ): - """ status management works even in case it has not been configured""" - args = dict( - schema_path=schema_file_path, - namespace="test", - status_schema_path=custom_status_schema, - ) - backend_data = ( - {"config": config_file_path} - if backend == "db" - else {"results_file_path": results_file_path} - ) - args.update(backend_data) - psm = PipestatManager(**args) - psm.set_status(record_identifier="sample1", status_identifier=status_id) - assert psm.get_status(record_identifier="sample1") == status_id - - class TestEnvVars: def test_no_config(self, monkeypatch, results_file_path, schema_file_path): """ diff --git a/tests/test_status.py b/tests/test_status.py new file mode 100644 index 00000000..a8addf6f --- /dev/null +++ b/tests/test_status.py @@ -0,0 +1,62 @@ +import pytest + +from pipestat import PipestatManager +from pipestat.const import * + + +class TestStatus: + def test_status_file_defult_location(self, schema_file_path, results_file_path): + """status file location is set to the results file dir + if not specified""" + psm = PipestatManager( + namespace="test", + results_file_path=results_file_path, + schema_path=schema_file_path, + ) + assert psm[STATUS_FILE_DIR] == os.path.dirname(psm.file) + + @pytest.mark.parametrize("backend", ["file", "db"]) + @pytest.mark.parametrize("status_id", ["running", "failed", "completed"]) + def test_status_not_configured( + self, schema_file_path, config_file_path, results_file_path, backend, status_id + ): + """ status management works even in case it has not been configured""" + args = dict(schema_path=schema_file_path, namespace="test") + backend_data = ( + {"config": config_file_path} + if backend == "db" + else {"results_file_path": results_file_path} + ) + args.update(backend_data) + psm = PipestatManager(**args) + psm.set_status(record_identifier="sample1", status_identifier=status_id) + assert 
psm.get_status(record_identifier="sample1") == status_id + + @pytest.mark.parametrize("backend", ["file", "db"]) + @pytest.mark.parametrize( + "status_id", ["running_custom", "failed_custom", "completed_custom"] + ) + def test_custom_status_schema( + self, + schema_file_path, + config_file_path, + results_file_path, + backend, + status_id, + custom_status_schema, + ): + """ status management works even in case it has not been configured""" + args = dict( + schema_path=schema_file_path, + namespace="test", + status_schema_path=custom_status_schema, + ) + backend_data = ( + {"config": config_file_path} + if backend == "db" + else {"results_file_path": results_file_path} + ) + args.update(backend_data) + psm = PipestatManager(**args) + psm.set_status(record_identifier="sample1", status_identifier=status_id) + assert psm.get_status(record_identifier="sample1") == status_id From dd9f62feb4f9f6e6b820cfa2a8ed0acdf8b1c904 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 16 Apr 2021 09:26:36 -0400 Subject: [PATCH 19/65] fix deprecated import --- tests/test_pipestat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pipestat.py b/tests/test_pipestat.py index f8c199d6..1822c9a1 100644 --- a/tests/test_pipestat.py +++ b/tests/test_pipestat.py @@ -1,4 +1,4 @@ -from collections import Mapping +from collections.abc import Mapping import pytest from jsonschema import ValidationError From f12ce065c3bf01e98f932f180ce7b2acafad60b6 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 16 Apr 2021 09:33:51 -0400 Subject: [PATCH 20/65] remove prev implementation --- pipestat/const.py | 21 +- pipestat/helpers.py | 20 - pipestat/pipestat_dirver.py | 1270 ------------------------------ tests/data/config_no_schema.yaml | 1 + 4 files changed, 2 insertions(+), 1310 deletions(-) delete mode 100644 pipestat/pipestat_dirver.py diff --git a/pipestat/const.py b/pipestat/const.py index 0b75a84d..09178d07 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -25,17 +25,6 @@ STATUS_GET_CMD: "Get status.", } -TABLE_COLS_BY_TYPE = { - "integer": "{} INT", - "number": "{} NUMERIC", - "string": "{} TEXT", - "boolean": "{} BOOLEAN", - "object": "{} JSONB", - "array": "{} TEXT[]", - "file": "{} JSONB", - "image": "{} JSONB", -} - DOC_URL = "http://pipestat.databio.org/en/latest/db_config/" # DB config keys @@ -53,6 +42,7 @@ CFG_PASSWORD_KEY, CFG_USER_KEY, CFG_NAME_KEY, + CFG_DIALECT_KEY, ] # object attribute names @@ -70,7 +60,6 @@ NAME_KEY = "_name" FILE_KEY = "_file" RECORD_ID_KEY = "_record_id" -DB_CONNECTION_KEY = "_db_connnection" DB_SESSION_KEY = "_db_session" DB_ENGINE_KEY = "_db_engine" HIGHLIGHTED_KEY = "_highlighted" @@ -86,14 +75,6 @@ RESERVED_COLNAMES = [ID, RECORD_ID] -FIXED_COLUMNS = [f"{ID} BIGSERIAL PRIMARY KEY", f"{RECORD_ID} TEXT UNIQUE NOT NULL"] - -STATUS_TABLE_COLUMNS = [ - f"{ID} BIGSERIAL PRIMARY KEY", - f"{RECORD_ID} TEXT UNIQUE NOT NULL", - f"{STATUS} TEXT", -] -# f"{STATUS} {STATUS}"] # custom type 'status' CANONICAL_TYPES = { "image": { "type": "object", diff --git a/pipestat/helpers.py b/pipestat/helpers.py index d4cda351..b24c0a0c 100644 --- a/pipestat/helpers.py +++ b/pipestat/helpers.py @@ -29,26 +29,6 @@ def get_status_table_schema(status_schema: Dict[str, Any]) -> Dict[str, Any]: return status_table_schema -def schema_to_columns(schema): - """ - Get a list of database table columns from a schema - - :param dict schema: schema to parse - :return list[str]: columns to inial ize database table with - """ - columns = [] - for colname, col_dict in 
schema.items(): - if col_dict[SCHEMA_TYPE_KEY] not in TABLE_COLS_BY_TYPE: - _LOGGER.warning( - f"'{col_dict[SCHEMA_TYPE_KEY]}' result type defined" - f" in schema is not supported" - ) - continue - columns.append(TABLE_COLS_BY_TYPE[col_dict[SCHEMA_TYPE_KEY]].format(colname)) - _LOGGER.info(f"Table columns created based on schema: {columns}") - return columns - - def validate_type(value, schema, strict_type=False): """ Validate reported result against a partial schema, in case of failure try diff --git a/pipestat/pipestat_dirver.py b/pipestat/pipestat_dirver.py deleted file mode 100644 index ea9cdb42..00000000 --- a/pipestat/pipestat_dirver.py +++ /dev/null @@ -1,1270 +0,0 @@ -from contextlib import contextmanager -from copy import deepcopy -from logging import getLogger -from typing import Any, Dict, List, Optional, Union - -import psycopg2 -from attmap import PathExAttMap as PXAM -from jsonschema import validate -from psycopg2.extensions import connection -from psycopg2.extras import DictCursor, Json -from ubiquerg import create_lock, remove_lock -from yacman import YacAttMap - -from .const import * -from .exceptions import * -from .helpers import * - -_LOGGER = getLogger(PKG_NAME) - - -class LoggingCursor(psycopg2.extras.DictCursor): - """ - Logging db cursor - """ - - def execute(self, query, vars=None): - """ - Execute a database operation (query or command) and issue a debug - and info level log messages - - :param query: - :param vars: - :return: - """ - _LOGGER.debug(f"Executing query: {self.mogrify(query, vars)}") - super(LoggingCursor, self).execute(query=query, vars=vars) - _LOGGER.debug(f"Executed query: {self.query}") - - -class PipestatManagerDirver(dict): - """ - Pipestat standardizes reporting of pipeline results and - pipeline status management. It formalizes a way for pipeline developers - and downstream tools developers to communicate -- results produced by a - pipeline can easily and reliably become an input for downstream analyses. - The object exposes API for interacting with the results and - pipeline status and can be backed by either a YAML-formatted file - or a PostgreSQL database. - """ - - def __init__( - self, - namespace: str = None, - record_identifier: str = None, - schema_path: str = None, - results_file_path: str = None, - database_only: bool = False, - config: Union[str, dict] = None, - status_schema_path: str = None, - flag_file_dir: str = None, - ): - """ - Initialize the object - - :param str namespace: namespace to report into. This will be the DB - table name if using DB as the object back-end - :param str record_identifier: record identifier to report for. 
This - creates a weak bound to the record, which can be overriden in - this object method calls - :param str schema_path: path to the output schema that formalizes - the results structure - :param str results_file_path: YAML file to report into, if file is - used as the object back-end - :param bool database_only: whether the reported data should not be - stored in the memory, but only in the database - :param str | dict config: path to the configuration file or a mapping - with the config file content - :param str status_schema_path: path to the status schema that formalizes - the status flags structure - """ - - def _check_cfg_key(cfg: dict, key: str) -> bool: - if key not in cfg: - _LOGGER.warning(f"Key '{key}' not found in config") - return False - return True - - def _mk_abs_via_cfg( - path: Optional[str], - cfg_path: Optional[str], - ) -> Optional[str]: - if path is None: - return path - assert isinstance(path, str), TypeError("Path is expected to be a str") - if os.path.isabs(path): - return path - if cfg_path is None: - rel_to_cwd = os.path.join(os.getcwd(), path) - if os.path.exists(rel_to_cwd) or os.access( - os.path.dirname(rel_to_cwd), os.W_OK - ): - return rel_to_cwd - raise OSError(f"Could not make this path absolute: {path}") - joined = os.path.join(os.path.dirname(cfg_path), path) - if os.path.isabs(joined): - return joined - raise OSError(f"Could not make this path absolute: {path}") - - def _select_value( - arg_name: str, - arg_value: Any, - cfg: dict, - strict: bool = True, - env_var: str = None, - ) -> Any: - if arg_value is not None: - return arg_value - if arg_name not in cfg or cfg[arg_name] is None: - if env_var is not None: - arg = os.getenv(env_var, None) - if arg is not None: - _LOGGER.debug(f"Value '{arg}' sourced from '{env_var}' env var") - return expandpath(arg) - if strict: - raise PipestatError( - f"Value for the required '{arg_name}' argument could not be" - f" determined. Provide it in the config or pass to the " - f"object constructor." 
- ) - return - return cfg[arg_name] - - super(PipestatManagerDirver, self).__init__() - self[CONFIG_KEY] = YacAttMap() - # read config or config data - config = config or os.getenv(ENV_VARS["config"]) - if config is not None: - if isinstance(config, str): - config = os.path.abspath(expandpath(config)) - self[CONFIG_KEY] = YacAttMap(filepath=config) - self._config_path = config - elif isinstance(config, dict): - self[CONFIG_KEY] = YacAttMap(entries=config) - self._config_path = None - else: - raise TypeError( - "database_config has to be either path to the " - "file to read or a dict" - ) - # validate config - cfg = self[CONFIG_KEY].to_dict(expand=True) - _, cfg_schema = read_yaml_data(CFG_SCHEMA, "config schema") - validate(cfg, cfg_schema) - - self[NAME_KEY] = _select_value( - "namespace", namespace, self[CONFIG_KEY], env_var=ENV_VARS["namespace"] - ) - self[RECORD_ID_KEY] = _select_value( - "record_identifier", - record_identifier, - self[CONFIG_KEY], - False, - ENV_VARS["record_identifier"], - ) - self[DB_ONLY_KEY] = database_only - # read results schema - self._schema_path = _select_value( - "schema_path", - schema_path, - self[CONFIG_KEY], - False, - env_var=ENV_VARS["schema"], - ) - if self._schema_path is not None: - _, self[SCHEMA_KEY] = read_yaml_data( - _mk_abs_via_cfg(self._schema_path, self.config_path), "schema" - ) - self.validate_schema() - # determine the highlighted results - self[HIGHLIGHTED_KEY] = [ - k - for k, v in self.schema.items() - if "highlight" in v and v["highlight"] is True - ] - if self[HIGHLIGHTED_KEY]: - assert isinstance(self[HIGHLIGHTED_KEY], list), TypeError( - f"highlighted results specification " - f"({self[HIGHLIGHTED_KEY]}) has to be a list" - ) - # read status schema - status_schema_path = ( - _mk_abs_via_cfg( - _select_value( - "status_schema_path", - status_schema_path, - self[CONFIG_KEY], - False, - env_var=ENV_VARS["status_schema"], - ), - self.config_path, - ) - or STATUS_SCHEMA - ) - self[STATUS_SCHEMA_SOURCE_KEY], self[STATUS_SCHEMA_KEY] = read_yaml_data( - status_schema_path, "status schema" - ) - # determine results file - results_file_path = _mk_abs_via_cfg( - _select_value( - "results_file_path", - results_file_path, - self[CONFIG_KEY], - False, - ENV_VARS["results_file"], - ), - self.config_path, - ) - if results_file_path: - if self[DB_ONLY_KEY]: - raise ValueError( - "Running in database only mode does not make " - "sense with a YAML file as a backend." 
- ) - self[FILE_KEY] = results_file_path - self._init_results_file() - flag_file_dir = _select_value( - "flag_file_dir", flag_file_dir, self[CONFIG_KEY], False - ) or os.path.dirname(self.file) - self[STATUS_FILE_DIR] = _mk_abs_via_cfg(flag_file_dir, self.config_path) - elif CFG_DATABASE_KEY in self[CONFIG_KEY]: - if not all( - [ - _check_cfg_key(self[CONFIG_KEY][CFG_DATABASE_KEY], key) - for key in DB_CREDENTIALS - ] - ): - raise MissingConfigDataError( - "Must specify all database login " "credentials or result_file_path" - ) - self[DATA_KEY] = YacAttMap() - self._init_postgres_table() - self._init_status_table() - else: - raise MissingConfigDataError( - "Must specify either database login " "credentials or a YAML file path" - ) - - def __str__(self): - """ - Generate string representation of the object - - :return str: string representation of the object - """ - res = f"{self.__class__.__name__} ({self.namespace})" - res += "\nBackend: {}".format( - f"file ({self.file})" if self.file else "PostgreSQL" - ) - res += f"\nResults schema source: {self.schema_path}" - res += f"\nStatus schema source: {self.status_schema_source}" - res += f"\nRecords count: {self.record_count}" - if self.highlighted_results: - res += f"\nHighlighted results: {', '.join(self.highlighted_results)}" - return res - - def _get_flag_file( - self, record_identifier: str = None - ) -> Union[str, List[str], None]: - """ - Get path to the status flag file for the specified record - - :param str record_identifier: unique record identifier - :return str | list[str] | None: path to the status flag file - """ - from glob import glob - - r_id = self._strict_record_id(record_identifier) - if self.file is None: - return - if self.file is not None: - regex = os.path.join( - self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_*.flag" - ) - file_list = glob(regex) - if len(file_list) > 1: - _LOGGER.warning("Multiple flag files found") - return file_list - elif len(file_list) == 1: - return file_list[0] - else: - _LOGGER.debug("No flag files found") - return None - - @property - def highlighted_results(self) -> List[str]: - """ - Highlighted results - - :return List[str]: a collection of highlighted results - """ - return self._get_attr(HIGHLIGHTED_KEY) or [] - - @property - def record_count(self) -> int: - """ - Number of records reported - - :return int: number of records reported - """ - return ( - len(self.data[self.namespace]) - if self.file - else self._count_rows(self.namespace) - ) - - @property - def namespace(self) -> str: - """ - Namespace the object writes the results to - - :return str: namespace the object writes the results to - """ - return self._get_attr(NAME_KEY) - - @property - def record_identifier(self) -> str: - """ - Unique identifier of the record - - :return str: unique identifier of the record - """ - return self._get_attr(RECORD_ID_KEY) - - @property - def schema(self) -> Dict: - """ - Schema mapping - - :return dict: schema that formalizes the results structure - """ - return self._get_attr(SCHEMA_KEY) - - @property - def status_schema(self) -> Dict: - """ - Status schema mapping - - :return dict: schema that formalizes the pipeline status structure - """ - return self._get_attr(STATUS_SCHEMA_KEY) - - @property - def status_schema_source(self) -> Dict: - """ - Status schema source - - :return dict: source of the schema that formalizes - the pipeline status structure - """ - return self._get_attr(STATUS_SCHEMA_SOURCE_KEY) - - @property - def schema_path(self) -> str: - """ - Schema path - - :return str: 
path to the provided schema - """ - return self._schema_path - - @property - def config_path(self) -> str: - """ - Config path. None if the config was not provided or if provided - as a mapping of the config contents - - :return str: path to the provided config - """ - return getattr(self, "_config_path", None) - - @property - def result_schemas(self) -> Dict: - """ - Result schema mappings - - :return dict: schemas that formalize the structure of each result - in a canonical jsonschema way - """ - return self._get_attr(RES_SCHEMAS_KEY) - - @property - def file(self) -> str: - """ - File path that the object is reporting the results into - - :return str: file path that the object is reporting the results into - """ - return self._get_attr(FILE_KEY) - - @property - def data(self) -> YacAttMap: - """ - Data object - - :return yacman.YacAttMap: the object that stores the reported data - """ - return self._get_attr(DATA_KEY) - - @property - @contextmanager - def db_cursor(self): - """ - Establish connection and get a PostgreSQL database cursor, - commit and close the connection afterwards - - :return LoggingCursor: Database cursor object - """ - try: - if not self.check_connection(): - self.establish_postgres_connection() - with self[DB_CONNECTION_KEY] as c, c.cursor( - cursor_factory=LoggingCursor - ) as cur: - yield cur - except Exception: - raise - finally: - self.close_postgres_connection() - - def get_status(self, record_identifier: str = None) -> Optional[str]: - """ - Get the current pipeline status - - :return str: status identifier, like 'running' - """ - r_id = self._strict_record_id(record_identifier) - if self.file is None: - with self.db_cursor as cur: - query = sql.SQL( - f"SELECT {STATUS} " - f"FROM {f'{self.namespace}_{STATUS}'} " - f"WHERE {RECORD_ID}=%s" - ) - cur.execute(query, (r_id,)) - result = cur.fetchone() - return result[0] if result is not None else None - else: - flag_file = self._get_flag_file(record_identifier=r_id) - if flag_file is not None: - assert isinstance(flag_file, str), TypeError( - "Flag file path is expected to be a str, were multiple flags found?" - ) - with open(flag_file, "r") as f: - status = f.read() - return status - _LOGGER.debug( - f"Could not determine status for '{r_id}' record. " - f"No flags found in: {self[STATUS_FILE_DIR]}" - ) - return None - - def _get_attr(self, attr: str) -> Any: - """ - Safely get the name of the selected attribute of this object - - :param str attr: attr to select - :return: - """ - return self[attr] if attr in self else None - - def _table_to_dict(self) -> None: - """ - Create a dictionary from the database table data - """ - with self.db_cursor as cur: - cur.execute(f"SELECT * FROM {self.namespace}") - data = cur.fetchall() - _LOGGER.debug(f"Reading data from database for '{self.namespace}' namespace") - for record in data: - record_id = record[RECORD_ID] - for res_id, val in record.items(): - if val is not None: - self._report_data_element( - record_identifier=record_id, values={res_id: val} - ) - - def _init_postgres_table(self) -> bool: - """ - Initialize a PostgreSQL table based on the provided schema, - if it does not exist. Read the data stored in the database into the - memory otherwise. 
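The ORM rewrite above (PATCH 16) keeps this contract: a database-backed manager creates the namespace table if needed and, unless `database_only=True`, reads existing table rows back into its in-memory mapping at construction time. A minimal sketch of that behavior; the config and schema file paths below are hypothetical placeholders:

```python
from pipestat import PipestatManager

# Database backend; both paths are hypothetical.
psm = PipestatManager(
    namespace="test",
    schema_path="output_schema.yaml",
    config="pipestat_config.yaml",
)
# database_only defaults to False, so rows already present in the
# 'test' table were read into memory at init:
print(psm.data)

# With database_only=True nothing is cached in memory:
psm_db = PipestatManager(
    namespace="test",
    schema_path="output_schema.yaml",
    config="pipestat_config.yaml",
    database_only=True,
)
assert len(psm_db.data) == 0
```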
- - :return bool: whether the table has been created - """ - if self.schema is None: - raise SchemaNotFoundError("initialize the database table") - if self._check_table_exists(table_name=self.namespace): - _LOGGER.debug(f"Table '{self.namespace}' already exists in the database") - if not self[DB_ONLY_KEY]: - self._table_to_dict() - return False - _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") - columns = FIXED_COLUMNS + schema_to_columns(schema=self.schema) - self._create_table(table_name=self.namespace, columns=columns) - return True - - # def _create_status_type(self): - # with self.db_cursor as cur: - # s = sql.SQL(f"SELECT exists (SELECT 1 FROM pg_type WHERE typname = '{STATUS}');") - # cur.execute(s) - # if cur.fetchone()[0]: - # return - # with self.db_cursor as cur: - # status_strs = [f"'{st_id}'" for st_id in self.status_schema.keys()] - # status_str = ", ".join(status_strs) - # s = sql.SQL(f"CREATE TYPE {STATUS} as enum({status_str});") - # cur.execute(s) - - def _init_status_table(self): - status_table_name = f"{self.namespace}_{STATUS}" - # self._create_status_type() - if not self._check_table_exists(table_name=status_table_name): - _LOGGER.info( - f"Initializing '{status_table_name}' table in " f"'{PKG_NAME}' database" - ) - self._create_table(status_table_name, STATUS_TABLE_COLUMNS) - - def _create_table(self, table_name: str, columns: List[str]): - """ - Create a table - - :param str table_name: name of the table to create - :param str | List[str] columns: columns definition list, - for instance: ['name VARCHAR(50) NOT NULL'] - """ - columns = mk_list_of_str(columns) - with self.db_cursor as cur: - s = sql.SQL(f"CREATE TABLE {table_name} ({','.join(columns)})") - cur.execute(s) - - def _init_results_file(self) -> bool: - """ - Initialize YAML results file if it does not exist. - Read the data stored in the existing file into the memory otherwise. 
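The file backend keeps the same initialize-or-load logic. A minimal sketch of reporting against a YAML results file, with hypothetical paths; the trailing comment shows roughly what the file holds afterwards, mirroring the namespace/record/result nesting used throughout:

```python
from pipestat import PipestatManager

psm = PipestatManager(
    namespace="test",
    record_identifier="sample1",
    schema_path="output_schema.yaml",   # hypothetical path
    results_file_path="results.yaml",   # created if it does not exist
)
psm.report(values={"number_of_things": 10})

# results.yaml now holds, roughly:
# test:
#   sample1:
#     number_of_things: 10
```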
- - :return bool: whether the file has been created - """ - if not os.path.exists(self.file): - _LOGGER.info(f"Initializing results file '{self.file}'") - data = YacAttMap(entries={self.namespace: "{}"}) - data.write(filepath=self.file) - data.make_readonly() - self[DATA_KEY] = data - return True - _LOGGER.debug(f"Reading data from '{self.file}'") - data = YacAttMap(filepath=self.file) - filtered = list(filter(lambda x: not x.startswith("_"), data.keys())) - if filtered and self.namespace not in filtered: - raise PipestatDatabaseError( - f"'{self.file}' is already used to report results for " - f"other namespace: {filtered[0]}" - ) - self[DATA_KEY] = data - return False - - def _check_table_exists(self, table_name: str) -> bool: - """ - Check if the specified table exists - - :param str table_name: table name to be checked - :return bool: whether the specified table exists - """ - with self.db_cursor as cur: - cur.execute( - "SELECT EXISTS(SELECT * FROM information_schema.tables " - "WHERE table_name=%s)", - (table_name,), - ) - return cur.fetchone()[0] - - def _check_record( - self, condition_col: str, condition_val: str, table_name: str - ) -> bool: - """ - Check if the record matching the condition is in the table - - :param str condition_col: column to base the check on - :param str condition_val: value in the selected column - :param str table_name: name of the table ot check the record in - :return bool: whether any record matches the provided condition - """ - with self.db_cursor as cur: - statement = ( - f"SELECT EXISTS(SELECT 1 from {table_name} " - f"WHERE {condition_col}=%s)" - ) - cur.execute(statement, (condition_val,)) - return cur.fetchone()[0] - - def _count_rows(self, table_name: str) -> int: - """ - Count rows in a selected table - - :param str table_name: table to count rows for - :return int: number of rows in the selected table - """ - with self.db_cursor as cur: - statement = sql.SQL("SELECT COUNT(*) FROM {}").format( - sql.Identifier(table_name) - ) - cur.execute(statement) - return cur.fetchall()[0][0] - - def _report_postgres( - self, value: Dict[str, Any], record_identifier: str, table_name: str = None - ) -> int: - """ - Check if record with this record identifier in table, create new record - if not (INSERT), update the record if yes (UPDATE). - - Currently supports just one column at a time. 
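For a single reported result, the INSERT-if-missing, UPDATE-otherwise flow described here reduces to two parameterized statements along these lines (the table and column names are illustrative only, not taken from a real schema):

```python
# Step 1: ensure a row for the record exists.
insert_stmt = "INSERT INTO test (record_identifier) VALUES (%s)"

# Step 2: set the result column on that row and get its id back.
update_stmt = (
    "UPDATE test SET number_of_things=%(number_of_things)s"
    " WHERE record_identifier=%(record_identifier)s RETURNING id"
)
```

Mapping values are wrapped in `psycopg2.extras.Json` before execution, which is how `dict`-typed results end up in the `JSONB` columns created from the schema.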
- - :param str record_identifier: unique identifier of the record, value to - in 'record_identifier' column to look for to determine if the record - already exists in the table - :param dict value: a mapping of pair of table column names and - respective values to be inserted to the database - :return int: id of the row just inserted - """ - table_name = table_name or self.namespace - if not self._check_record( - condition_col=RECORD_ID, - condition_val=record_identifier, - table_name=table_name, - ): - with self.db_cursor as cur: - cur.execute( - f"INSERT INTO {table_name} ({RECORD_ID}) VALUES (%s)", - (record_identifier,), - ) - # prep a list of SQL objects with column-named value placeholders - columns = sql.SQL(",").join( - [ - sql.SQL("{}=%({})s").format(sql.Identifier(k), sql.SQL(k)) - for k in list(value.keys()) - ] - ) - # construct the query template to execute - query = sql.SQL("UPDATE {n} SET {c} WHERE {id}=%({id})s RETURNING id").format( - n=sql.Identifier(table_name), c=columns, id=sql.SQL(RECORD_ID) - ) - # preprocess the values, dict -> Json - values = {k: Json(v) if isinstance(v, dict) else v for k, v in value.items()} - # add record_identifier column, which is specified outside of values - values.update({RECORD_ID: record_identifier}) - with self.db_cursor as cur: - cur.execute(query, values) - return cur.fetchone()[0] - - def clear_status( - self, record_identifier: str = None, flag_names: List[str] = None - ) -> List[str]: - """ - Remove status flags - - :param str record_identifier: name of the record to remove flags for - :param Iterable[str] flag_names: Names of flags to remove, optional; if - unspecified, all schema-defined flag names will be used. - :return List[str]: Collection of names of flags removed - """ - r_id = self._strict_record_id(record_identifier) - if self.file is not None: - flag_names = flag_names or list(self.status_schema.keys()) - if isinstance(flag_names, str): - flag_names = [flag_names] - removed = [] - for f in flag_names: - path_flag_file = self.get_status_flag_path( - status_identifier=f, record_identifier=r_id - ) - try: - os.remove(path_flag_file) - except: - pass - else: - _LOGGER.info(f"Removed existing flag: {path_flag_file}") - removed.append(f) - return removed - else: - removed = self.get_status(r_id) - status_table_name = f"{self.namespace}_{STATUS}" - with self.db_cursor as cur: - try: - cur.execute( - f"DELETE FROM {status_table_name} WHERE " - f"{RECORD_ID}='{r_id}'" - ) - except Exception as e: - _LOGGER.error( - f"Could not remove the status from the " - f"database. Exception: {e}" - ) - return [] - else: - return [removed] - - def get_status_flag_path( - self, status_identifier: str, record_identifier=None - ) -> str: - """ - Get the path to the status file flag - - :param str status_identifier: one of the defined status IDs in schema - :param str record_identifier: unique record ID, optional if - specified in the object constructor - :return str: absolute path to the flag file or None if object is - backed by a DB - """ - if self.file is None: - # DB as the backend - return - r_id = self._strict_record_id(record_identifier) - return os.path.join( - self[STATUS_FILE_DIR], f"{self.namespace}_{r_id}_{status_identifier}.flag" - ) - - def set_status(self, status_identifier: str, record_identifier: str = None) -> None: - """ - Set pipeline run status. - - The status identifier needs to match one of identifiers specified in - the status schema. A basic, ready to use, status schema is shipped with - this package. 
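The status API is symmetric across both backends. A short usage sketch, assuming a manager `psm` like the ones constructed above; `"running"` is one of the identifiers in the default status schema:

```python
psm.set_status(record_identifier="sample1", status_identifier="running")
assert psm.get_status(record_identifier="sample1") == "running"

# Flag files (or the status table row) can be removed again:
psm.clear_status(record_identifier="sample1")
```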
- - :param str status_identifier: status to set, one of statuses defined - in the status schema - :param str record_identifier: record identifier to set the - pipeline status for - """ - r_id = self._strict_record_id(record_identifier) - known_status_identifiers = self.status_schema.keys() - if status_identifier not in known_status_identifiers: - raise PipestatError( - f"'{status_identifier}' is not a defined status identifier. " - f"These are allowed: {known_status_identifiers}" - ) - prev_status = self.get_status(r_id) - if self.file is not None: - if prev_status: - prev_flag_path = self.get_status_flag_path(prev_status, r_id) - os.remove(prev_flag_path) - flag_path = self.get_status_flag_path(status_identifier, r_id) - create_lock(flag_path) - with open(flag_path, "w") as f: - f.write(status_identifier) - remove_lock(flag_path) - else: - try: - self._report_postgres( - value={STATUS: status_identifier}, - record_identifier=r_id, - table_name=f"{self.namespace}_{STATUS}", - ) - except Exception as e: - _LOGGER.error( - f"Could not insert into the status table. " f"Exception: {e}" - ) - raise - if prev_status: - _LOGGER.debug( - f"Changed status from '{prev_status}' to '{status_identifier}'" - ) - - def check_result_exists(self, result_identifier, record_identifier=None): - """ - Check if the result has been reported - - :param str record_identifier: unique identifier of the record - :param str result_identifier: name of the result to check - :return bool: whether the specified result has been reported for the - indicated record in current namespace - """ - record_identifier = self._strict_record_id(record_identifier) - return self._check_which_results_exist( - results=[result_identifier], rid=record_identifier - ) - - def _check_which_results_exist( - self, results: List[str], rid: str = None - ) -> List[str]: - """ - Check which results have been reported - - :param str rid: unique identifier of the record - :param List[str] results: names of the results to check - :return List[str]: whether the specified result has been reported for the - indicated record in current namespace - """ - rid = self._strict_record_id(rid) - existing = [] - for r in results: - if not self[DB_ONLY_KEY]: - if ( - self.namespace in self.data - and rid in self.data[self.namespace] - and r in self.data[self.namespace][rid] - ): - existing.append(r) - else: - with self.db_cursor as cur: - try: - cur.execute( - f"SELECT {r} FROM {self.namespace} WHERE {RECORD_ID}=%s", - (rid,), - ) - except Exception: - continue - else: - res = cur.fetchone() - if res is not None and res[0] is not None: - existing.append(r) - return existing - - def check_record_exists(self, record_identifier: str = None) -> bool: - """ - Check if the record exists - - :param str record_identifier: unique identifier of the record - :return bool: whether the record exists - """ - record_identifier = self._strict_record_id(record_identifier) - if self[DB_ONLY_KEY]: - with self.db_cursor as cur: - cur.execute( - f"SELECT exists(SELECT 1 from {self.namespace} " - f"WHERE {RECORD_ID}=%s)", - (record_identifier,), - ) - return cur.fetchone() - if ( - self.namespace in self.data - and record_identifier in self.data[self.namespace] - ): - return True - return False - - def report( - self, - values: Dict[str, Any], - record_identifier: str = None, - force_overwrite: bool = False, - strict_type: bool = True, - return_id: bool = False, - ) -> Union[bool, int]: - """ - Report a result. 
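Typical `report` usage, again assuming a manager `psm` with a bound record identifier and a schema defining `name_of_something`; an existing result is only replaced when `force_overwrite=True`:

```python
assert psm.report(values={"name_of_something": "test_name"})
# A second report of the same result is refused and returns False...
assert not psm.report(values={"name_of_something": "other_name"})
# ...unless overwriting is explicitly forced:
assert psm.report(
    values={"name_of_something": "other_name"}, force_overwrite=True
)
```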
- - :param Dict[str, any] values: dictionary of result-value pairs - :param str record_identifier: unique identifier of the record, value - in 'record_identifier' column to look for to determine if the record - already exists - :param bool force_overwrite: whether to overwrite the existing record - :param bool strict_type: whether the type of the reported values should - remain as is. Pipestat would attempt to convert to the - schema-defined one otherwise - :param bool return_id: PostgreSQL IDs of the records that have been - updated. Not available with results file as backend - :return bool | int: whether the result has been reported or the ID of - the updated record in the table, if requested - """ - record_identifier = self._strict_record_id(record_identifier) - if return_id and self.file is not None: - raise NotImplementedError( - "There is no way to return the updated object ID while using " - "results file as the object backend" - ) - if self.schema is None: - raise SchemaNotFoundError("report results") - updated_ids = False - result_identifiers = list(values.keys()) - self.assert_results_defined(results=result_identifiers) - existing = self._check_which_results_exist( - rid=record_identifier, results=result_identifiers - ) - if existing: - _LOGGER.warning( - f"These results exist for '{record_identifier}': {existing}" - ) - if not force_overwrite: - return False - _LOGGER.info(f"Overwriting existing results: {existing}") - for r in result_identifiers: - validate_type( - value=values[r], schema=self.result_schemas[r], strict_type=strict_type - ) - if self.file is not None: - self.data.make_writable() - if not self[DB_ONLY_KEY]: - self._report_data_element( - record_identifier=record_identifier, values=values - ) - if self.file is not None: - self.data.write() - self.data.make_readonly() - else: - try: - updated_ids = self._report_postgres( - record_identifier=record_identifier, value=values - ) - except Exception as e: - _LOGGER.error( - f"Could not insert the result into the database. " f"Exception: {e}" - ) - if not self[DB_ONLY_KEY]: - for r in result_identifiers: - del self[DATA_KEY][self.namespace][record_identifier][r] - raise - nl = "\n" - rep_strs = [f"{k}: {v}" for k, v in values.items()] - _LOGGER.info( - f"Reported records for '{record_identifier}' in '{self.namespace}' " - f"namespace:{nl} - {(nl + ' - ').join(rep_strs)}" - ) - return True if not return_id else updated_ids - - def _report_data_element( - self, record_identifier: str, values: Dict[str, Any] - ) -> None: - """ - Update the value of a result in a current namespace. - - This method overwrites any existing data and creates the required - hierarchical mapping structure if needed. - - :param str record_identifier: unique identifier of the record - :param Dict[str, Any] values: dict of results identifiers and values - to be reported - """ - self[DATA_KEY].setdefault(self.namespace, PXAM()) - self[DATA_KEY][self.namespace].setdefault(record_identifier, PXAM()) - for res_id, val in values.items(): - self[DATA_KEY][self.namespace][record_identifier][res_id] = val - - def select( - self, - columns: Union[str, List[str]] = None, - condition: str = None, - condition_val: str = None, - offset: int = None, - limit: int = None, - ) -> List[psycopg2.extras.DictRow]: - """ - Get all the contents from the selected table, possibly restricted by - the provided condition. 
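For the record, the condition interface documented below took a raw SQL snippet with `%s` placeholders plus a separate list of values. A sketch against the psycopg2-era backend; the column name and value are illustrative:

```python
# Old psycopg2-backed usage (this API is removed later in this series):
rows = psm.select(
    columns=["record_identifier", "number_of_things"],
    condition="number_of_things=%s",
    condition_val=[100],
    offset=0,
    limit=10,
)
```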
- - :param str | List[str] columns: columns to select - :param str condition: condition to restrict the results - with, will be appended to the end of the SELECT statement and - safely populated with 'condition_val', - for example: `"id=%s"` - :param list condition_val: values to fill the placeholder - in 'condition' with - :param int offset: number of records to be skipped - :param int limit: max number of records to be returned - :return List[psycopg2.extras.DictRow]: all table contents - """ - if self.file: - raise NotImplementedError( - "Selection is not supported on objects backed by results files." - " Use 'retrieve' method instead." - ) - condition, condition_val = preprocess_condition_pair(condition, condition_val) - if not columns: - columns = sql.SQL("*") - else: - columns = sql.SQL(",").join( - [sql.Identifier(x) for x in mk_list_of_str(columns)] - ) - statement = sql.SQL("SELECT {} FROM {}").format( - columns, sql.Identifier(self.namespace) - ) - if condition: - statement += sql.SQL(" WHERE ") - statement += condition - statement = paginate_query(statement, offset, limit) - with self.db_cursor as cur: - cur.execute(query=statement, vars=condition_val) - result = cur.fetchall() - return result - - def retrieve( - self, record_identifier: str = None, result_identifier: str = None - ) -> Union[Any, Dict[str, Any]]: - """ - Retrieve a result for a record. - - If no result ID specified, results for the entire record will - be returned. - - :param str record_identifier: unique identifier of the record - :param str result_identifier: name of the result to be retrieved - :return any | Dict[str, any]: a single result or a mapping with all the - results reported for the record - """ - record_identifier = self._strict_record_id(record_identifier) - if self[DB_ONLY_KEY]: - if result_identifier is not None: - existing = self._check_which_results_exist( - results=[result_identifier], rid=record_identifier - ) - if not existing: - raise PipestatDatabaseError( - f"Result '{result_identifier}' not found for record " - f"'{record_identifier}'" - ) - with self.db_cursor as cur: - query = sql.SQL( - f"SELECT {result_identifier or '*'} " - f"FROM {self.namespace} WHERE {RECORD_ID}=%s" - ) - cur.execute(query, (record_identifier,)) - result = cur.fetchall() - if len(result) > 0: - if result_identifier is None: - return {k: v for k, v in dict(result[0]).items() if v is not None} - return dict(result[0])[result_identifier] - raise PipestatDatabaseError(f"Record '{record_identifier}' not found") - else: - if record_identifier not in self.data[self.namespace]: - raise PipestatDatabaseError(f"Record '{record_identifier}' not found") - if result_identifier is None: - return self.data[self.namespace][record_identifier].to_dict() - if result_identifier not in self.data[self.namespace][record_identifier]: - raise PipestatDatabaseError( - f"Result '{result_identifier}' not found for record " - f"'{record_identifier}'" - ) - return self.data[self.namespace][record_identifier][result_identifier] - - def remove( - self, record_identifier: str = None, result_identifier: str = None - ) -> bool: - """ - Remove a result. - - If no result ID specified or last result is removed, the entire record - will be removed. - - :param str record_identifier: unique identifier of the record - :param str result_identifier: name of the result to be removed or None - if the record should be removed. 
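Retrieval and removal are symmetric operations; a minimal sketch reusing `psm`, with illustrative record and result names:

```python
# All results for a record, or a single named result.
all_results = psm.retrieve(record_identifier="sample1")
one_result = psm.retrieve(
    record_identifier="sample1", result_identifier="number_of_things"
)

# Remove one result; removing the last remaining result drops the record too.
psm.remove(record_identifier="sample1", result_identifier="number_of_things")

# Or drop the whole record at once (returns False if it no longer exists).
psm.remove(record_identifier="sample1")
```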
- :return bool: whether the result has been removed - """ - record_identifier = self._strict_record_id(record_identifier) - rm_record = True if result_identifier is None else False - if not self.check_record_exists(record_identifier): - _LOGGER.error(f"Record '{record_identifier}' not found") - return False - if result_identifier and not self.check_result_exists( - result_identifier, record_identifier - ): - _LOGGER.error( - f"'{result_identifier}' has not been reported for " - f"'{record_identifier}'" - ) - return False - if self.file: - self.data.make_writable() - if not self[DB_ONLY_KEY]: - if rm_record: - _LOGGER.info(f"Removing '{record_identifier}' record") - del self[DATA_KEY][self.namespace][record_identifier] - else: - val_backup = self[DATA_KEY][self.namespace][record_identifier][ - result_identifier - ] - del self[DATA_KEY][self.namespace][record_identifier][result_identifier] - _LOGGER.info( - f"Removed result '{result_identifier}' for record " - f"'{record_identifier}' from '{self.namespace}' namespace" - ) - if not self[DATA_KEY][self.namespace][record_identifier]: - _LOGGER.info( - f"Last result removed for '{record_identifier}'. " - f"Removing the record" - ) - del self[DATA_KEY][self.namespace][record_identifier] - rm_record = True - if self.file: - self.data.write() - self.data.make_readonly() - if self.file is None: - if rm_record: - try: - with self.db_cursor as cur: - cur.execute( - f"DELETE FROM {self.namespace} WHERE " - f"{RECORD_ID}='{record_identifier}'" - ) - except Exception as e: - _LOGGER.error( - f"Could not remove the result from the " - f"database. Exception: {e}" - ) - self[DATA_KEY][self.namespace].setdefault(record_identifier, PXAM()) - raise - return True - try: - with self.db_cursor as cur: - cur.execute( - f"UPDATE {self.namespace} SET {result_identifier}=null " - f"WHERE {RECORD_ID}='{record_identifier}'" - ) - except Exception as e: - _LOGGER.error( - f"Could not remove the result from the database. 
" f"Exception: {e}" - ) - if not self[DB_ONLY_KEY]: - self[DATA_KEY][self.namespace][record_identifier][ - result_identifier - ] = val_backup - raise - return True - - def validate_schema(self) -> None: - """ - Check schema for any possible issues - - :raises SchemaError: if any schema format issue is detected - """ - - def _recursively_replace_custom_types(s: dict) -> Dict: - """ - Replace the custom types in pipestat schema with canonical types - - :param dict s: schema to replace types in - :return dict: schema with types replaced - """ - for k, v in s.items(): - assert SCHEMA_TYPE_KEY in v, SchemaError( - f"Result '{k}' is missing '{SCHEMA_TYPE_KEY}' key" - ) - if v[SCHEMA_TYPE_KEY] == "object" and SCHEMA_PROP_KEY in s[k]: - _recursively_replace_custom_types(s[k][SCHEMA_PROP_KEY]) - if v[SCHEMA_TYPE_KEY] in CANONICAL_TYPES.keys(): - s.setdefault(k, {}) - s[k].setdefault(SCHEMA_PROP_KEY, {}) - s[k][SCHEMA_PROP_KEY].update( - CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][SCHEMA_PROP_KEY] - ) - s[k].setdefault("required", []) - s[k]["required"].extend( - CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]]["required"] - ) - s[k][SCHEMA_TYPE_KEY] = CANONICAL_TYPES[v[SCHEMA_TYPE_KEY]][ - SCHEMA_TYPE_KEY - ] - return s - - schema = deepcopy(self.schema) - _LOGGER.debug(f"Validating input schema") - assert isinstance(schema, dict), SchemaError( - f"The schema has to be a {dict().__class__.__name__}" - ) - for col_name in RESERVED_COLNAMES: - assert col_name not in schema.keys(), PipestatError( - f"'{col_name}' is an identifier reserved by pipestat" - ) - self[RES_SCHEMAS_KEY] = {} - schema = _recursively_replace_custom_types(schema) - self[RES_SCHEMAS_KEY] = schema - - def assert_results_defined(self, results: List[str]) -> None: - """ - Assert provided list of results is defined in the schema - - :param List[str] results: list of results to - check for existence in the schema - :raises SchemaError: if any of the results is not defined in the schema - """ - known_results = self.result_schemas.keys() - for r in results: - assert r in known_results, SchemaError( - f"'{r}' is not a known result. Results defined in the " - f"schema are: {list(known_results)}." 
- ) - - def check_connection(self) -> bool: - """ - Check whether a PostgreSQL connection has been established - - :return bool: whether the connection has been established - """ - if self.file is not None: - raise PipestatDatabaseError( - f"The {self.__class__.__name__} object " f"is not backed by a database" - ) - if DB_CONNECTION_KEY in self and isinstance( - self[DB_CONNECTION_KEY], psycopg2.extensions.connection - ): - return True - return False - - def establish_postgres_connection(self, suppress: bool = False) -> bool: - """ - Establish PostgreSQL connection using the config data - - :param bool suppress: whether to suppress any connection errors - :return bool: whether the connection has been established successfully - """ - if self.check_connection(): - raise PipestatDatabaseError( - f"Connection is already established: " - f"{self[DB_CONNECTION_KEY].info.host}" - ) - try: - self[DB_CONNECTION_KEY] = psycopg2.connect( - dbname=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_NAME_KEY], - user=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_USER_KEY], - password=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PASSWORD_KEY], - host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY], - port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY], - ) - except psycopg2.Error as e: - _LOGGER.error( - f"Could not connect to: " - f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" - ) - _LOGGER.info(f"Caught error: {e}") - if suppress: - return False - raise - else: - _LOGGER.debug( - f"Established connection with PostgreSQL: " - f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" - ) - return True - - def close_postgres_connection(self) -> None: - """ - Close connection and remove client bound - """ - if not self.check_connection(): - raise PipestatDatabaseError( - f"The connection has not been established: " - f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" - ) - self[DB_CONNECTION_KEY].close() - del self[DB_CONNECTION_KEY] - _LOGGER.debug( - f"Closed connection with PostgreSQL: " - f"{self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY]}" - ) - - def _strict_record_id(self, forced_value: str = None) -> str: - """ - Get record identifier from the outer source or stored with this object - - :param str forced_value: return this value - :return str: record identifier - """ - if forced_value is not None: - return forced_value - if self.record_identifier is not None: - return self.record_identifier - raise PipestatError( - f"You must provide the record identifier you want to perform " - f"the action on. Either in the {self.__class__.__name__} " - f"constructor or as an argument to the method." 
- ) diff --git a/tests/data/config_no_schema.yaml b/tests/data/config_no_schema.yaml index a82d0c92..e9202da9 100644 --- a/tests/data/config_no_schema.yaml +++ b/tests/data/config_no_schema.yaml @@ -2,6 +2,7 @@ namespace: test record_identifier: sample1 #schema_path: sample_output_schema.yaml #$HOME/Desktop/sample_output_schema.yaml database: + dialect: postgresql name: pipestat-test user: postgres password: pipestat-password From b384c37aed04fc710f3b718f690ef73662eaef15 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 16 Apr 2021 13:54:32 -0400 Subject: [PATCH 21/65] accept a custom declarative base if needed, repr --- pipestat/pipestat.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 5b814480..b6ef451a 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -1,15 +1,15 @@ from contextlib import contextmanager from copy import deepcopy from logging import getLogger -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from urllib.parse import quote_plus import sqlalchemy.orm from attmap import PathExAttMap as PXAM from jsonschema import validate -from sqlalchemy import Column, Float, ForeignKey, Integer, String, Table, create_engine +from sqlalchemy import Column, create_engine from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import DeclarativeMeta, relationship, sessionmaker +from sqlalchemy.orm import DeclarativeMeta, sessionmaker from ubiquerg import create_lock, remove_lock from yacman import YacAttMap @@ -33,14 +33,15 @@ class PipestatManager(dict): def __init__( self, - namespace: str = None, - record_identifier: str = None, - schema_path: str = None, - results_file_path: str = None, - database_only: bool = False, - config: Union[str, dict] = None, - status_schema_path: str = None, - flag_file_dir: str = None, + namespace: Optional[str] = None, + record_identifier: Optional[str] = None, + schema_path: Optional[str] = None, + results_file_path: Optional[str] = None, + database_only: Optional[bool] = False, + config: Optional[Union[str, dict]] = None, + status_schema_path: Optional[str] = None, + flag_file_dir: Optional[str] = None, + custom_declarative_base: Optional[sqlalchemy.orm.DeclarativeMeta] = None, ): """ Initialize the object @@ -60,6 +61,8 @@ def __init__( with the config file content :param str status_schema_path: path to the status schema that formalizes the status flags structure + :param sqlalchemy.orm.DeclarativeMeta custom_declarative_base: a declarative base to + use for ORMs creation a new instance will be created if not provided """ def _check_cfg_key(cfg: dict, key: str) -> bool: @@ -218,16 +221,16 @@ def _select_value( ] ): raise MissingConfigDataError( - "Must specify all database login " "credentials or result_file_path" + "Must specify all database login credentials or result_file_path" ) self[DB_ORMS_KEY] = {} - self[DB_BASE_KEY] = declarative_base() + self[DB_BASE_KEY] = custom_declarative_base or declarative_base() self[DATA_KEY] = YacAttMap() self._init_db_table() self._init_status_table() else: raise MissingConfigDataError( - "Must specify either database login " "credentials or a YAML file path" + "Must specify either database login credentials or a YAML file path" ) def __str__(self): @@ -238,7 +241,9 @@ def __str__(self): """ res = f"{self.__class__.__name__} ({self.namespace})" res += "\nBackend: {}".format( - f"file ({self.file})" if 
self.file else "PostgreSQL" + f"file ({self.file})" + if self.file + else f"Database (dialect: {self[DB_ENGINE_KEY].dialect.name})" ) res += f"\nResults schema source: {self.schema_path}" res += f"\nStatus schema source: {self.status_schema_source}" @@ -1054,7 +1059,9 @@ def _dynamic_filter( return result def retrieve( - self, record_identifier: str = None, result_identifier: str = None + self, + record_identifier: Optional[str] = None, + result_identifier: Optional[str] = None, ) -> Union[Any, Dict[str, Any]]: """ Retrieve a result for a record. From 420bbfbfd5b638df0aeaef980fcf7cd05c606028 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 19 Apr 2021 09:23:20 -0400 Subject: [PATCH 22/65] enable selection of custom DBAPI drivers --- pipestat/const.py | 2 ++ pipestat/pipestat.py | 5 ++++- tests/data/config.yaml | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pipestat/const.py b/pipestat/const.py index 09178d07..acee68b6 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -35,6 +35,7 @@ CFG_PASSWORD_KEY = "password" CFG_USER_KEY = "user" CFG_DIALECT_KEY = "dialect" # sqlite, mysql, postgresql, oracle, or mssql +CFG_DRIVER_KEY = "driver" DB_CREDENTIALS = [ CFG_HOST_KEY, @@ -43,6 +44,7 @@ CFG_USER_KEY, CFG_NAME_KEY, CFG_DIALECT_KEY, + CFG_DRIVER_KEY, ] # object attribute names diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index b6ef451a..39d91417 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -387,13 +387,16 @@ def db_url(self) -> str: host=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_HOST_KEY], port=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_PORT_KEY], dialect=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_DIALECT_KEY], + driver=self[CONFIG_KEY][CFG_DATABASE_KEY][CFG_DRIVER_KEY], ) except (KeyError, AttributeError) as e: raise PipestatDatabaseError( f"Could not determine database URL. 
Caught error: {str(e)}" ) parsed_creds = {k: quote_plus(str(v)) for k, v in creds.items()} - return "{dialect}://{user}:{passwd}@{host}:{port}/{name}".format(**parsed_creds) + return "{dialect}+{driver}://{user}:{passwd}@{host}:{port}/{name}".format( + **parsed_creds + ) @property @contextmanager diff --git a/tests/data/config.yaml b/tests/data/config.yaml index 1e619592..7385e630 100644 --- a/tests/data/config.yaml +++ b/tests/data/config.yaml @@ -3,9 +3,10 @@ record_identifier: sample1 schema_path: sample_output_schema.yaml #$HOME/Desktop/sample_output_schema.yaml database: dialect: postgresql + driver: psycopg2 name: pipestat-test user: postgres password: pipestat-password - host: localhost + host: 127.0.0.1 port: 5432 #results_file_path: results_file.yaml #$HOME/Desktop/results.yaml From 4bbc6cbe3af228184902c701e986788a15b2c064 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 19 Apr 2021 10:24:51 -0400 Subject: [PATCH 23/65] make pipestat DB type agnostic --- pipestat/const.py | 4 +-- pipestat/helpers.py | 56 ++++++++++++++++++++++++++++++++++++++ pipestat/pipestat.py | 65 ++++++-------------------------------------- 3 files changed, 66 insertions(+), 59 deletions(-) diff --git a/pipestat/const.py b/pipestat/const.py index acee68b6..e72ea111 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -123,8 +123,8 @@ "object": JSON, "image": JSON, "file": JSON, - "string": String, - "array": ARRAY(String), + "string": String(500), + "array": JSON, "boolean": Boolean, } diff --git a/pipestat/helpers.py b/pipestat/helpers.py index b24c0a0c..dbd35ef7 100644 --- a/pipestat/helpers.py +++ b/pipestat/helpers.py @@ -163,3 +163,59 @@ def paginate_query(query, offset, limit): ) query += sql.SQL(f" LIMIT {limit}") return query + + +from typing import Dict, List, Optional, Tuple, Union + +from sqlalchemy.orm import DeclarativeMeta, Query + + +def dynamic_filter( + ORM: DeclarativeMeta, + query: Query, + filter_conditions: List[Tuple[str, str, Union[str, List[str]]]], +): + """ + Return filtered query based on condition. + + :param sqlalchemy.orm.DeclarativeMeta ORM: + :param sqlalchemy.orm.Query query: takes query + :param [(key,operator,value)] filter_conditions: e.g. 
[("id", "eq", 1)] operator list + - eq for == + - lt for < + - ge for >= + - in for in_ + - like for like + :return: query + """ + for raw in filter_conditions: + try: + key, op, value = raw + except ValueError: + raise Exception(f"Invalid filter: {raw}") + column = getattr(ORM, key, None) + if column is None: + raise Exception(f"Invalid filter column: {key}") + if op == "in": + if isinstance(value, list): + filt = column.in_(value) + else: + filt = column.in_(value.split(",")) + else: + try: + attr = ( + list( + filter( + lambda e: hasattr(column, e % op), + ["%s", "%s_", "__%s__"], + ) + )[0] + % op + ) + except IndexError: + raise Exception(f"Invalid filter operator: {op}") + if value == "null": + value = None + filt = getattr(column, attr)(value) + query = query.filter(filt) + return query diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 39d91417..73bff113 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -481,7 +481,7 @@ def _auto_repr(x: Any) -> str: attr_dict = dict( __tablename__=tn, id=Column(Integer, primary_key=True), - record_identifier=Column(String, unique=True), + record_identifier=Column(SQL_CLASSES_BY_TYPE["string"], unique=True), ) for result_id, result_metadata in schema.items(): col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] @@ -500,7 +500,6 @@ def establish_db_connection(self) -> bool: """ if self.is_db_connected(): raise PipestatDatabaseError("Connection is already established") - self[DB_ENGINE_KEY] = create_engine(self.db_url, echo=True) self[DB_SESSION_KEY] = sessionmaker(bind=self[DB_ENGINE_KEY]) return True @@ -975,7 +974,9 @@ def select( self, table_name: Optional[str] = None, columns: Optional[List[str]] = None, - filter_condition: Optional[List[Tuple[str, str, Union[str, List[str]]]]] = None, + filter_conditions: Optional[ + List[Tuple[str, str, Union[str, List[str]]]] + ] = None, offset: Optional[int] = None, limit: Optional[int] = None, ) -> List[Any]: @@ -984,7 +985,7 @@ def select( :param str table_name: name of the table to SELECT from :param List[str] columns: columns to include in the result - :param [(key,operator,value)] filter_condition: e.g. [("id", "eq", 1)] operator list + :param [(key,operator,value)] filter_conditions: e.g. [("id", "eq", 1)] operator list - eq for == - lt for < - ge for >= @@ -994,65 +995,15 @@ def select( :param int limit: include this number of rows """ - def _dynamic_filter( - ORM: sqlalchemy.orm.DeclarativeMeta, - query: sqlalchemy.orm.Query, - filter_condition: List[Tuple[str, str, Union[str, List[str]]]], - ): - """ - Return filtered query based on condition. - - :param sqlalchemy.orm.DeclarativeMeta ORM: - :param sqlalchemy.orm.Query query: takes query - :param [(key,operator,value)] filter_condition: e.g. 
[("id", "eq", 1)] operator list - - eq for == - - lt for < - - ge for >= - - in for in_ - - like for like - :return: query - """ - for raw in filter_condition: - try: - key, op, value = raw - except ValueError: - raise Exception("Invalid filter: %s" % raw) - column = getattr(ORM, key, None) - if column is None: - raise Exception("Invalid filter column: %s" % key) - if op == "in": - if isinstance(value, list): - filt = column.in_(value) - else: - filt = column.in_(value.split(",")) - else: - try: - attr = ( - list( - filter( - lambda e: hasattr(column, e % op), - ["%s", "%s_", "__%s__"], - ) - )[0] - % op - ) - except IndexError: - raise Exception(f"Invalid filter operator: {op}") - if value == "null": - value = None - filt = getattr(column, attr)(value) - query = query.filter(filt) - return query - ORM = self._get_orm(table_name or self.namespace) with self.session as s: if columns is not None: query = s.query(*[getattr(ORM, column) for column in columns]) else: query = s.query(ORM) - if filter_condition is not None: - query = _dynamic_filter( - ORM=ORM, query=query, filter_condition=filter_condition + if filter_conditions is not None: + query = dynamic_filter( + ORM=ORM, query=query, filter_conditions=filter_conditions ) if isinstance(offset, int): query = query.offset(offset) From ace33a8f07b95ba61d7dd24021421d9fd2aa4d11 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 20 Apr 2021 11:10:50 -0400 Subject: [PATCH 24/65] add JSONB querying support --- pipestat/const.py | 9 +++--- pipestat/helpers.py | 73 ++++++++++++++++++++++++++------------------ pipestat/pipestat.py | 19 ++++++++---- 3 files changed, 62 insertions(+), 39 deletions(-) diff --git a/pipestat/const.py b/pipestat/const.py index e72ea111..1772e5fd 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -1,5 +1,6 @@ import os +from sqlalchemy.dialects.postgresql.json import JSONB from sqlalchemy.types import ARRAY, JSON, Boolean, Float, Integer, String PKG_NAME = "pipestat" @@ -120,11 +121,11 @@ SQL_CLASSES_BY_TYPE = { "number": Float, "integer": Integer, - "object": JSON, - "image": JSON, - "file": JSON, + "object": JSONB, + "image": JSONB, + "file": JSONB, "string": String(500), - "array": JSON, + "array": JSONB, "boolean": Boolean, } diff --git a/pipestat/helpers.py b/pipestat/helpers.py index dbd35ef7..e51dd49e 100644 --- a/pipestat/helpers.py +++ b/pipestat/helpers.py @@ -3,6 +3,7 @@ from typing import Any, Dict import jsonschema +import sqlalchemy.orm from oyaml import safe_load from psycopg2 import sql from ubiquerg import expandpath @@ -173,8 +174,9 @@ def paginate_query(query, offset, limit): def dynamic_filter( ORM: DeclarativeMeta, query: Query, - filter_conditions: List[Tuple[str, str, Union[str, List[str]]]], -): + filter_conditions: Optional[List[Tuple[str, str, Union[str, List[str]]]]] = None, + json_filter_conditions: Optional[List[Tuple[str, str, str]]] = None, +) -> sqlalchemy.orm.Query: """ Return filtered query based on condition. @@ -186,36 +188,49 @@ def dynamic_filter( - ge for >= - in for in_ - like for like + :param [(col,key,value)] json_filter_conditions: conditions for JSONB column to query. + Only '==' is supported e.g. 
[("other", "genome", "hg38")] :return: query """ - for raw in filter_conditions: + + def _unpack_tripartite(x): try: - key, op, value = raw + e1, e2, e3 = x + return e1, e2, e3 except ValueError: - raise Exception(f"Invalid filter: {raw}") - column = getattr(ORM, key, None) - if column is None: - raise Exception(f"Invalid filter column: {key}") - if op == "in": - if isinstance(value, list): - filt = column.in_(value) + raise Exception(f"Invalid tripartite element: {x}") + + if filter_conditions is not None: + for filter_condition in filter_conditions: + key, op, value = _unpack_tripartite(filter_condition) + column = getattr(ORM, key, None) + if column is None: + raise Exception(f"Invalid filter column: {key}") + if op == "in": + if isinstance(value, list): + filt = column.in_(value) + else: + filt = column.in_(value.split(",")) else: - filt = column.in_(value.split(",")) - else: - try: - attr = ( - list( - filter( - lambda e: hasattr(column, e % op), - ["%s", "%s_", "__%s__"], - ) - )[0] - % op - ) - except IndexError: - raise Exception(f"Invalid filter operator: {op}") - if value == "null": - value = None - filt = getattr(column, attr)(value) - query = query.filter(filt) + try: + attr = ( + list( + filter( + lambda e: hasattr(column, e % op), + ["%s", "%s_", "__%s__"], + ) + )[0] + % op + ) + except IndexError: + raise Exception(f"Invalid filter operator: {op}") + if value == "null": + value = None + filt = getattr(column, attr)(value) + query = query.filter(filt) + + if json_filter_conditions is not None: + for json_filter_condition in json_filter_conditions: + col, key, value = _unpack_tripartite(json_filter_condition) + query = query.filter(getattr(ORM, col)[key].astext == value) return query diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 73bff113..79e0779f 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -977,20 +977,25 @@ def select( filter_conditions: Optional[ List[Tuple[str, str, Union[str, List[str]]]] ] = None, + json_filter_conditions: Optional[List[Tuple[str, str, str]]] = None, offset: Optional[int] = None, limit: Optional[int] = None, ) -> List[Any]: """ - Perform a SELECT on the table, filtering limited to a single condition + Perform a SELECT on the table :param str table_name: name of the table to SELECT from :param List[str] columns: columns to include in the result - :param [(key,operator,value)] filter_conditions: e.g. [("id", "eq", 1)] operator list + :param [(key,operator,value)] filter_conditions: e.g. [("id", "eq", 1)], operator list: - eq for == - lt for < - ge for >= - in for in_ - like for like + :param [(col,key,value)] json_filter_conditions: conditions for JSONB column to + query that include JSON column name, key withing the JSON object in that + column and the value to check the identity against. Therefore only '==' is + supported in non-nested checks, e.g. 
[("other", "genome", "hg38")] :param int offset: skip this number of rows :param int limit: include this number of rows """ @@ -1001,10 +1006,12 @@ def select( query = s.query(*[getattr(ORM, column) for column in columns]) else: query = s.query(ORM) - if filter_conditions is not None: - query = dynamic_filter( - ORM=ORM, query=query, filter_conditions=filter_conditions - ) + query = dynamic_filter( + ORM=ORM, + query=query, + filter_conditions=filter_conditions, + json_filter_conditions=json_filter_conditions, + ) if isinstance(offset, int): query = query.offset(offset) if isinstance(limit, int): From ca1df08fb6fb3ea24414782c4f5e7c3552594542 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 27 Apr 2021 15:15:51 -0400 Subject: [PATCH 25/65] store scoped_session as obj attr --- pipestat/const.py | 1 + pipestat/pipestat.py | 4 +++- tests/data/config_no_schema.yaml | 1 + tests/test_db_only_mode.py | 4 ++-- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pipestat/const.py b/pipestat/const.py index 1772e5fd..eac46bcd 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -64,6 +64,7 @@ FILE_KEY = "_file" RECORD_ID_KEY = "_record_id" DB_SESSION_KEY = "_db_session" +DB_SCOPED_SESSION_KEY = "_db_scoped_session" DB_ENGINE_KEY = "_db_engine" HIGHLIGHTED_KEY = "_highlighted" diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 79e0779f..8e19b635 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -9,7 +9,7 @@ from jsonschema import validate from sqlalchemy import Column, create_engine from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import DeclarativeMeta, sessionmaker +from sqlalchemy.orm import DeclarativeMeta, scoped_session, sessionmaker from ubiquerg import create_lock, remove_lock from yacman import YacAttMap @@ -482,6 +482,7 @@ def _auto_repr(x: Any) -> str: __tablename__=tn, id=Column(Integer, primary_key=True), record_identifier=Column(SQL_CLASSES_BY_TYPE["string"], unique=True), + query=self[DB_SCOPED_SESSION_KEY].query_property(), ) for result_id, result_metadata in schema.items(): col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] @@ -502,6 +503,7 @@ def establish_db_connection(self) -> bool: raise PipestatDatabaseError("Connection is already established") self[DB_ENGINE_KEY] = create_engine(self.db_url, echo=True) self[DB_SESSION_KEY] = sessionmaker(bind=self[DB_ENGINE_KEY]) + self[DB_SCOPED_SESSION_KEY] = scoped_session(self[DB_SESSION_KEY]) return True def is_db_connected(self) -> bool: diff --git a/tests/data/config_no_schema.yaml b/tests/data/config_no_schema.yaml index e9202da9..0248ec5a 100644 --- a/tests/data/config_no_schema.yaml +++ b/tests/data/config_no_schema.yaml @@ -3,6 +3,7 @@ record_identifier: sample1 #schema_path: sample_output_schema.yaml #$HOME/Desktop/sample_output_schema.yaml database: dialect: postgresql + driver: psycopg2 name: pipestat-test user: postgres password: pipestat-password diff --git a/tests/test_db_only_mode.py b/tests/test_db_only_mode.py index 660006ec..dd259f3a 100644 --- a/tests/test_db_only_mode.py +++ b/tests/test_db_only_mode.py @@ -25,7 +25,7 @@ def test_report(self, val, config_file_path, schema_file_path, results_file_path psm.report(values=val) assert len(psm.data) == 0 val_name = list(val.keys())[0] - assert psm.select(filter_condition=[(val_name, "eq", str(val[val_name]))]) + assert psm.select(filter_conditions=[(val_name, "eq", str(val[val_name]))]) @pytest.mark.parametrize(["rec_id", "res_id"], [("sample2", "number_of_things")]) 
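As a usage note on the renamed `filter_conditions` argument, combined with the `json_filter_conditions` support added in the previous patch, a database-backed query could look like the sketch below; the config path, column names, and values are illustrative:

```python
from pipestat import PipestatManager

psm = PipestatManager(
    namespace="test",
    schema_path="sample_output_schema.yaml",
    config="config.yaml",  # database-backed configuration
)

hits = psm.select(
    columns=["record_identifier"],
    filter_conditions=[("number_of_things", "eq", "100")],
    json_filter_conditions=[("output_file", "path", "/home/user/path.csv")],
    offset=0,
    limit=10,
)
```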
@pytest.mark.parametrize("backend", ["db"]) @@ -45,7 +45,7 @@ def test_select_limit( ) psm = PipestatManager(**args) result = psm.select( - filter_condition=[(RECORD_ID, "eq", rec_id)], + filter_conditions=[(RECORD_ID, "eq", rec_id)], columns=[res_id], limit=limit, ) From 294031e19281c8de689f9749bc7c0f7c95bf60cf Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 27 Apr 2021 15:18:58 -0400 Subject: [PATCH 26/65] reformat --- pipestat/cli.py | 2 +- pipestat/exceptions.py | 14 +++++++------- tests/test_init.py | 18 +++++++++--------- tests/test_status.py | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pipestat/cli.py b/pipestat/cli.py index ab52c931..73bfe4f1 100644 --- a/pipestat/cli.py +++ b/pipestat/cli.py @@ -12,7 +12,7 @@ def main(): - """ Primary workflow """ + """Primary workflow""" from inspect import getdoc parser = logmuse.add_logging_options(build_argparser(getdoc(PipestatManager))) diff --git a/pipestat/exceptions.py b/pipestat/exceptions.py index 8d4f4522..6185aa39 100644 --- a/pipestat/exceptions.py +++ b/pipestat/exceptions.py @@ -16,20 +16,20 @@ class PipestatError(Exception): - """ Base exception type for this package """ + """Base exception type for this package""" __metaclass__ = abc.ABCMeta class SchemaError(PipestatError): - """ Schema error """ + """Schema error""" def __init__(self, msg): super(SchemaError, self).__init__(msg) class SchemaNotFoundError(SchemaError): - """ Schema not found error """ + """Schema not found error""" def __init__(self, msg): txt = ( @@ -40,7 +40,7 @@ def __init__(self, msg): class MissingConfigDataError(PipestatError): - """ Exception for invalid config file. """ + """Exception for invalid config file.""" def __init__(self, msg): spacing = " " if msg[-1] in ["?", ".", "\n"] else "; " @@ -49,14 +49,14 @@ def __init__(self, msg): class PipestatDatabaseError(PipestatError): - """ Database error """ + """Database error""" def __init__(self, msg): super(PipestatDatabaseError, self).__init__(msg) class InvalidTypeError(PipestatError): - """ Type of the reported value is not supported """ + """Type of the reported value is not supported""" def __init__(self, type): super(InvalidTypeError, self).__init__( @@ -66,7 +66,7 @@ def __init__(self, type): class IncompatibleClassError(PipestatError): - """ Class of the reported value is not supported """ + """Class of the reported value is not supported""" def __init__(self, cls, req_cls, type): super(IncompatibleClassError, self).__init__( diff --git a/tests/test_init.py b/tests/test_init.py index 1490a78a..e614c52e 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -31,7 +31,7 @@ def test_connection_overwrite_error(self, config_file_path, schema_file_path): class TestPipestatManagerInstantiation: def test_obj_creation_file(self, schema_file_path, results_file_path): - """ Object constructor works with file as backend""" + """Object constructor works with file as backend""" assert isinstance( PipestatManager( namespace="test", @@ -42,7 +42,7 @@ def test_obj_creation_file(self, schema_file_path, results_file_path): ) def test_obj_creation_db(self, config_file_path): - """ Object constructor works with database as backend""" + """Object constructor works with database as backend""" assert isinstance(PipestatManager(config=config_file_path), PipestatManager) @pytest.mark.xfail(reason="schema is no longer required to init the object") @@ -75,7 +75,7 @@ def test_schema_recursive_custom_type_conversion( ) def test_missing_cfg_data(self, schema_file_path): - """ Object 
constructor raises exception if cfg is missing data """ + """Object constructor raises exception if cfg is missing data""" tmp_pth = os.path.join(mkdtemp(), "res.yml") with open(tmp_pth, "w") as file: dump({"database": {"host": "localhost"}}, file) @@ -85,12 +85,12 @@ def test_missing_cfg_data(self, schema_file_path): ) def test_unknown_backend(self, schema_file_path): - """ Either db config or results file path needs to be provided """ + """Either db config or results file path needs to be provided""" with pytest.raises(MissingConfigDataError): PipestatManager(namespace="test", schema_path=schema_file_path) def test_create_results_file(self, schema_file_path): - """ Results file is created if a nonexistent path provided """ + """Results file is created if a nonexistent path provided""" tmp_res_file = os.path.join(mkdtemp(), "res.yml") print(f"Temporary results file: {tmp_res_file}") assert not os.path.exists(tmp_res_file) @@ -102,7 +102,7 @@ def test_create_results_file(self, schema_file_path): assert os.path.exists(tmp_res_file) def test_use_other_namespace_file(self, schema_file_path): - """ Results file can be used with just one namespace """ + """Results file can be used with just one namespace""" tmp_res_file = os.path.join(mkdtemp(), "res.yml") print(f"Temporary results file: {tmp_res_file}") assert not os.path.exists(tmp_res_file) @@ -121,14 +121,14 @@ def test_use_other_namespace_file(self, schema_file_path): @pytest.mark.parametrize("pth", [["/$HOME/path.yaml"], 1]) def test_wrong_class_results_file(self, schema_file_path, pth): - """ Input string that is not a file path raises an informative error """ + """Input string that is not a file path raises an informative error""" with pytest.raises((TypeError, AssertionError)): PipestatManager( namespace="test", results_file_path=pth, schema_path=schema_file_path ) def test_results_file_contents_loaded(self, results_file_path, schema_file_path): - """ Contents of the results file are present after loading """ + """Contents of the results file are present after loading""" psm = PipestatManager( namespace="test", results_file_path=results_file_path, @@ -137,7 +137,7 @@ def test_results_file_contents_loaded(self, results_file_path, schema_file_path) assert "test" in psm.data def test_str_representation(self, results_file_path, schema_file_path): - """ Test string representation identifies number of records """ + """Test string representation identifies number of records""" psm = PipestatManager( namespace="test", results_file_path=results_file_path, diff --git a/tests/test_status.py b/tests/test_status.py index a8addf6f..58fd3f86 100644 --- a/tests/test_status.py +++ b/tests/test_status.py @@ -20,7 +20,7 @@ def test_status_file_defult_location(self, schema_file_path, results_file_path): def test_status_not_configured( self, schema_file_path, config_file_path, results_file_path, backend, status_id ): - """ status management works even in case it has not been configured""" + """status management works even in case it has not been configured""" args = dict(schema_path=schema_file_path, namespace="test") backend_data = ( {"config": config_file_path} @@ -45,7 +45,7 @@ def test_custom_status_schema( status_id, custom_status_schema, ): - """ status management works even in case it has not been configured""" + """status management works even in case it has not been configured""" args = dict( schema_path=schema_file_path, namespace="test", From f3354acc1bf5b8b76cbc5a1c1dd0dfab1df83c11 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 29 
Apr 2021 08:31:33 -0400 Subject: [PATCH 27/65] add documentation to the columns --- pipestat/pipestat.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 8e19b635..44887790 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -481,13 +481,19 @@ def _auto_repr(x: Any) -> str: attr_dict = dict( __tablename__=tn, id=Column(Integer, primary_key=True), - record_identifier=Column(SQL_CLASSES_BY_TYPE["string"], unique=True), + record_identifier=Column( + SQL_CLASSES_BY_TYPE["string"], + unique=True, + doc="A unique identifier of the record", + ), query=self[DB_SCOPED_SESSION_KEY].query_property(), ) for result_id, result_metadata in schema.items(): col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] _LOGGER.debug(f"Adding object: {result_id} of type: {str(col_type)}") - attr_dict.update({result_id: Column(col_type)}) + attr_dict.update( + {result_id: Column(col_type, doc=result_metadata["description"])} + ) attr_dict.update({"__repr__": _auto_repr}) _LOGGER.debug(f"Creating '{tn}' ORM with args: {attr_dict}") self[DB_ORMS_KEY][tn] = type(tn.capitalize(), (self[DB_BASE_KEY],), attr_dict) From 8291db023ccff5d2ce3a451e7bde4604668430ac Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 29 Apr 2021 09:15:22 -0400 Subject: [PATCH 28/65] require description in results schema --- pipestat/const.py | 1 + pipestat/pipestat.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/pipestat/const.py b/pipestat/const.py index eac46bcd..c1e105fe 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -71,6 +71,7 @@ # schema keys SCHEMA_PROP_KEY = "properties" SCHEMA_TYPE_KEY = "type" +SCHEMA_DESC_KEY = "description" # DB column names ID = "id" diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 44887790..81597426 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -724,6 +724,9 @@ def _recursively_replace_custom_types(s: dict) -> Dict: assert SCHEMA_TYPE_KEY in v, SchemaError( f"Result '{k}' is missing '{SCHEMA_TYPE_KEY}' key" ) + assert SCHEMA_DESC_KEY in v, SchemaError( + f"Result '{k}' is missing '{SCHEMA_DESC_KEY}' key" + ) if v[SCHEMA_TYPE_KEY] == "object" and SCHEMA_PROP_KEY in s[k]: _recursively_replace_custom_types(s[k][SCHEMA_PROP_KEY]) if v[SCHEMA_TYPE_KEY] in CANONICAL_TYPES.keys(): From 821da6009d922870375268d0049919a0bbae669b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Apr 2021 13:50:45 -0400 Subject: [PATCH 29/65] move/update log --- pipestat/pipestat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 81597426..a6def569 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -477,6 +477,9 @@ def _auto_repr(x: Any) -> str: ] return "<{}: {}>".format(x.__class__.__name__, ", ".join(attr_strs)) + _LOGGER.info( + f"Creating models for '{self.namespace}' table in '{PKG_NAME}' database" + ) tn = table_name or self.namespace attr_dict = dict( __tablename__=tn, @@ -794,7 +797,6 @@ def _init_db_table(self) -> bool: raise SchemaNotFoundError("initialize the database table") if not self.is_db_connected(): self.establish_db_connection() - _LOGGER.info(f"Initializing '{self.namespace}' table in '{PKG_NAME}' database") self._create_table_orm(table_name=self.namespace, schema=self.result_schemas) if not self[DB_ONLY_KEY]: self._table_to_dict() From 45d1cd27851f9c59e33a2aecaa8f6750cb9a51d1 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 4 May 2021 17:08:41 -0400 Subject: 
[PATCH 30/65] dont show DB engine logs by default --- pipestat/__init__.py | 2 +- pipestat/pipestat.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pipestat/__init__.py b/pipestat/__init__.py index 02cdabbb..57d036ff 100644 --- a/pipestat/__init__.py +++ b/pipestat/__init__.py @@ -9,4 +9,4 @@ __classes__ = ["PipestatManager"] __all__ = __classes__ -logmuse.init_logger("pipestat") +logmuse.init_logger(PKG_NAME) diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index a6def569..959e5df9 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -42,6 +42,7 @@ def __init__( status_schema_path: Optional[str] = None, flag_file_dir: Optional[str] = None, custom_declarative_base: Optional[sqlalchemy.orm.DeclarativeMeta] = None, + show_db_logs: bool = False, ): """ Initialize the object @@ -226,6 +227,7 @@ def _select_value( self[DB_ORMS_KEY] = {} self[DB_BASE_KEY] = custom_declarative_base or declarative_base() self[DATA_KEY] = YacAttMap() + self._show_db_logs = show_db_logs self._init_db_table() self._init_status_table() else: @@ -477,7 +479,7 @@ def _auto_repr(x: Any) -> str: ] return "<{}: {}>".format(x.__class__.__name__, ", ".join(attr_strs)) - _LOGGER.info( + _LOGGER.debug( f"Creating models for '{self.namespace}' table in '{PKG_NAME}' database" ) tn = table_name or self.namespace @@ -510,7 +512,7 @@ def establish_db_connection(self) -> bool: """ if self.is_db_connected(): raise PipestatDatabaseError("Connection is already established") - self[DB_ENGINE_KEY] = create_engine(self.db_url, echo=True) + self[DB_ENGINE_KEY] = create_engine(self.db_url, echo=self._show_db_logs) self[DB_SESSION_KEY] = sessionmaker(bind=self[DB_ENGINE_KEY]) self[DB_SCOPED_SESSION_KEY] = scoped_session(self[DB_SESSION_KEY]) return True From b69c0a138321bb428ef100ad60b86b60aed0198c Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 5 May 2021 13:56:40 -0400 Subject: [PATCH 31/65] dont store data in memory be default when using DB --- pipestat/pipestat.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 959e5df9..6a9cf7f2 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -37,7 +37,7 @@ def __init__( record_identifier: Optional[str] = None, schema_path: Optional[str] = None, results_file_path: Optional[str] = None, - database_only: Optional[bool] = False, + database_only: Optional[bool] = True, config: Optional[Union[str, dict]] = None, status_schema_path: Optional[str] = None, flag_file_dir: Optional[str] = None, @@ -203,11 +203,6 @@ def _select_value( self.config_path, ) if results_file_path: - if self[DB_ONLY_KEY]: - raise ValueError( - "Running in database only mode does not make " - "sense with a YAML file as a backend." 
- ) self[FILE_KEY] = results_file_path self._init_results_file() flag_file_dir = _select_value( @@ -1172,12 +1167,13 @@ def report( rid=record_identifier, results=result_identifiers ) if existing: + existing_str = ", ".join(existing) _LOGGER.warning( - f"These results exist for '{record_identifier}': {existing}" + f"These results exist for '{record_identifier}': {existing_str}" ) if not force_overwrite: return False - _LOGGER.info(f"Overwriting existing results: {existing}") + _LOGGER.info(f"Overwriting existing results: {existing_str}") for r in result_identifiers: validate_type( value=values[r], schema=self.result_schemas[r], strict_type=strict_type From fbdd3003c151fde8a62e1c70f247b1289f2d6427 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 5 May 2021 14:03:34 -0400 Subject: [PATCH 32/65] add descriptions to test properties --- tests/data/sample_output_schema_recursive.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/data/sample_output_schema_recursive.yaml b/tests/data/sample_output_schema_recursive.yaml index f398275d..74a281c8 100644 --- a/tests/data/sample_output_schema_recursive.yaml +++ b/tests/data/sample_output_schema_recursive.yaml @@ -7,8 +7,10 @@ output_file_in_object: type: object properties: prop1: + description: "This is an example file" type: file prop2: + description: "This is an example image" type: image description: "Object output" output_file: @@ -16,4 +18,4 @@ output_file: description: "This a path to the output file" output_image: type: image - description: "This a path to the output image" \ No newline at end of file + description: "This a path to the output image" From 0b11661245c08f5fc759ff62b7d95d7af2b6f83a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 5 May 2021 14:08:02 -0400 Subject: [PATCH 33/65] update tests after chaging mem data behavior --- tests/test_pipestat.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/test_pipestat.py b/tests/test_pipestat.py index 1822c9a1..3bfb272a 100644 --- a/tests/test_pipestat.py +++ b/tests/test_pipestat.py @@ -48,7 +48,7 @@ def test_report_basic( results_file_path, backend, ): - args = dict(schema_path=schema_file_path, namespace="test") + args = dict(schema_path=schema_file_path, namespace="test", database_only=False) backend_data = ( {"config": config_file_path} if backend == "db" @@ -115,7 +115,7 @@ def test_report_overwrite( results_file_path, backend, ): - args = dict(schema_path=schema_file_path, namespace="test") + args = dict(schema_path=schema_file_path, namespace="test", database_only=False) backend_data = ( {"config": config_file_path} if backend == "db" @@ -263,7 +263,7 @@ def test_remove_basic( schema_file_path, backend, ): - args = dict(schema_path=schema_file_path, namespace="test") + args = dict(schema_path=schema_file_path, namespace="test", database_only=False) backend_data = ( {"config": config_file_path} if backend == "db" @@ -279,7 +279,7 @@ def test_remove_basic( def test_remove_record( self, rec_id, schema_file_path, config_file_path, results_file_path, backend ): - args = dict(schema_path=schema_file_path, namespace="test") + args = dict(schema_path=schema_file_path, namespace="test", database_only=False) backend_data = ( {"config": config_file_path} if backend == "db" @@ -339,7 +339,7 @@ def test_last_result_removal_removes_record( results_file_path, backend, ): - args = dict(schema_path=schema_file_path, namespace="test") + args = dict(schema_path=schema_file_path, namespace="test", 
database_only=False)
         backend_data = (
             {"config": config_file_path}
             if backend == "db"
@@ -366,7 +366,10 @@ def test_report(
     ):
         REC_ID = "constant_record_id"
         args = dict(
-            schema_path=schema_file_path, namespace="test", record_identifier=REC_ID
+            schema_path=schema_file_path,
+            namespace="test",
+            record_identifier=REC_ID,
+            database_only=False,
         )
         backend_data = (
             {"config": config_file_path}

From 5e33a1313d40716d729e210bd751cae341064eef Mon Sep 17 00:00:00 2001
From: Michal Stolarczyk
Date: Fri, 7 May 2021 10:17:38 -0400
Subject: [PATCH 34/65] always save results in memory in case of file backend

---
 pipestat/pipestat.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py
index 6a9cf7f2..7912488f 100644
--- a/pipestat/pipestat.py
+++ b/pipestat/pipestat.py
@@ -203,6 +203,12 @@ def _select_value(
             self.config_path,
         )
         if results_file_path:
+            if self[DB_ONLY_KEY]:
+                _LOGGER.warning(
+                    "Running in database only mode does not make sense with a YAML file as a backend. "
+                    "Changing back to using memory."
+                )
+                self[DB_ONLY_KEY] = not self[DB_ONLY_KEY]
             self[FILE_KEY] = results_file_path
             self._init_results_file()
         flag_file_dir = _select_value(

From aadc4211e4e6c194ddbdaeebd48c5f057c90ad3c Mon Sep 17 00:00:00 2001
From: Michal Stolarczyk
Date: Mon, 10 May 2021 10:53:25 -0400
Subject: [PATCH 35/65] add pipestat configuration docs

---
 docs/configuration.md | 17 +++++++++++++++++
 pipestat/pipestat.py  |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 docs/configuration.md

diff --git a/docs/configuration.md b/docs/configuration.md
new file mode 100644
index 00000000..0ac6460f
--- /dev/null
+++ b/docs/configuration.md
@@ -0,0 +1,17 @@
+# Pipestat configuration
+
+Pipestat *requires* a few pieces of information to run:
+
+- a **namespace** to write into, for example the name of the pipeline
+- a path to the **schema** file that describes results that can be reported
+- **backend info**: either path to a YAML-formatted file or pipestat config with PostgreSQL database login credentials
+
+Apart from that, there are many other *optional* configuration points that have defaults. Please refer to the [environment variables reference](http://pipestat.databio.org/en/dev/env_vars/) to learn about the optional configuration options and their meaning.
+
+## Configuration sources
+
+Pipestat configuration can come from 3 sources, with the following priority:
+
+1. `PipestatManager` constructor
+2. Pipestat configuration file
+3. Environment variables

diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py
index 7912488f..be23e0fe 100644
--- a/pipestat/pipestat.py
+++ b/pipestat/pipestat.py
@@ -204,7 +204,7 @@ def _select_value(
         )
         if results_file_path:
             if self[DB_ONLY_KEY]:
-                _LOGGER.warning(
+                _LOGGER.debug(
                     "Running in database only mode does not make sense with a YAML file as a backend. "
                     "Changing back to using memory."
) From de5ab704da0d984ee2d146394c98dc79c14dc13a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 10 May 2021 13:18:51 -0400 Subject: [PATCH 36/65] remove obsolete code, add tests, adjust exception class --- docs_jupyter/cli.ipynb | 35 +++++++-------- docs_jupyter/python_api.ipynb | 20 ++++----- mkdocs.yml | 5 ++- pipestat/helpers.py | 81 +++-------------------------------- tests/test_db_only_mode.py | 44 +++++++++++++++++++ 5 files changed, 79 insertions(+), 106 deletions(-) diff --git a/docs_jupyter/cli.ipynb b/docs_jupyter/cli.ipynb index 914ace6f..50115062 100644 --- a/docs_jupyter/cli.ipynb +++ b/docs_jupyter/cli.ipynb @@ -51,7 +51,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "version: 0.0.3\n", + "version: 0.1.0-dev\n", "usage: pipestat [-h] [--version] [--silent] [--verbosity V] [--logdev]\n", " {report,inspect,remove,retrieve,status} ...\n", "\n", @@ -77,7 +77,7 @@ "developers to communicate -- results produced by a pipeline can easily and\n", "reliably become an input for downstream analyses. The object exposes API for\n", "interacting with the results and pipeline status and can be backed by either a\n", - "YAML-formatted file or a PostgreSQL database.\n" + "YAML-formatted file or a database.\n" ] } ], @@ -116,8 +116,7 @@ " will be used. Currently set to:\n", " ../tests/data/sample_output_schema.yaml\n", " --status-schema ST Path to the status schema. Default will be used if not\n", - " provided: /Library/Frameworks/Python.framework/Versions/3.6\n", - " /lib/python3.6/site-\n", + " provided: /opt/homebrew/lib/python3.9/site-\n", " packages/pipestat/schemas/status_schema.yaml\n", " --flag-dir FD Path to the flag directory in case YAML file is the\n", " pipestat backend.\n", @@ -169,8 +168,7 @@ " will be used. Currently set to:\n", " ../tests/data/sample_output_schema.yaml\n", " --status-schema ST Path to the status schema. Default will be used if not\n", - " provided: /Library/Frameworks/Python.framework/Versions/3.6\n", - " /lib/python3.6/site-\n", + " provided: /opt/homebrew/lib/python3.9/site-\n", " packages/pipestat/schemas/status_schema.yaml\n", " --flag-dir FD Path to the flag directory in case YAML file is the\n", " pipestat backend.\n", @@ -217,8 +215,7 @@ " will be used. Currently set to:\n", " ../tests/data/sample_output_schema.yaml\n", " --status-schema ST Path to the status schema. Default will be used if not\n", - " provided: /Library/Frameworks/Python.framework/Versions/3.6\n", - " /lib/python3.6/site-\n", + " provided: /opt/homebrew/lib/python3.9/site-\n", " packages/pipestat/schemas/status_schema.yaml\n", " --flag-dir FD Path to the flag directory in case YAML file is the\n", " pipestat backend.\n", @@ -265,8 +262,8 @@ " be used. Currently set to:\n", " ../tests/data/sample_output_schema.yaml\n", " --status-schema ST Path to the status schema. Default will be used if not provided:\n", - " /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/\n", - " site-packages/pipestat/schemas/status_schema.yaml\n", + " /opt/homebrew/lib/python3.9/site-\n", + " packages/pipestat/schemas/status_schema.yaml\n", " --flag-dir FD Path to the flag directory in case YAML file is the pipestat\n", " backend.\n", " -d, --data Whether to display the data\n" @@ -335,8 +332,7 @@ " will be used. Currently set to:\n", " ../tests/data/sample_output_schema.yaml\n", " --status-schema ST Path to the status schema. 
Default will be used if not\n", - " provided: /Library/Frameworks/Python.framework/Versions/3.6\n", - " /lib/python3.6/site-\n", + " provided: /opt/homebrew/lib/python3.9/site-\n", " packages/pipestat/schemas/status_schema.yaml\n", " --flag-dir FD Path to the flag directory in case YAML file is the\n", " pipestat backend.\n", @@ -382,8 +378,7 @@ " will be used. Currently set to:\n", " ../tests/data/sample_output_schema.yaml\n", " --status-schema ST Path to the status schema. Default will be used if not\n", - " provided: /Library/Frameworks/Python.framework/Versions/3.6\n", - " /lib/python3.6/site-\n", + " provided: /opt/homebrew/lib/python3.9/site-\n", " packages/pipestat/schemas/status_schema.yaml\n", " --flag-dir FD Path to the flag directory in case YAML file is the\n", " pipestat backend.\n", @@ -518,9 +513,9 @@ "\n", "\n", "PipestatManager (test)\n", - "Backend: file (/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmp.Zid7BMd1)\n", + "Backend: file (/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmp.h5YnTVAA)\n", "Results schema source: ../tests/data/sample_output_schema.yaml\n", - "Status schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml\n", + "Status schema source: /opt/homebrew/lib/python3.9/site-packages/pipestat/schemas/status_schema.yaml\n", "Records count: 1\n" ] } @@ -548,9 +543,9 @@ "\n", "\n", "PipestatManager (test)\n", - "Backend: file (/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmp.Zid7BMd1)\n", + "Backend: file (/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmp.h5YnTVAA)\n", "Results schema source: ../tests/data/sample_output_schema.yaml\n", - "Status schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml\n", + "Status schema source: /opt/homebrew/lib/python3.9/site-packages/pipestat/schemas/status_schema.yaml\n", "Records count: 1\n", "\n", "Data:\n", @@ -655,9 +650,9 @@ "\n", "\n", "PipestatManager (test)\n", - "Backend: file (/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmp.Zid7BMd1)\n", + "Backend: file (/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmp.h5YnTVAA)\n", "Results schema source: ../tests/data/sample_output_schema.yaml\n", - "Status schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml\n", + "Status schema source: /opt/homebrew/lib/python3.9/site-packages/pipestat/schemas/status_schema.yaml\n", "Records count: 1\n", "\n", "Data:\n", diff --git a/docs_jupyter/python_api.ipynb b/docs_jupyter/python_api.ipynb index aded8c5d..85d972a4 100644 --- a/docs_jupyter/python_api.ipynb +++ b/docs_jupyter/python_api.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -51,14 +51,14 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmplgf49zfa.yaml\n" + "/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmpylsi8onv.yaml\n" ] } ], @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -380,7 +380,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "These results exist for 'sample1': ['output_file']\n" + "These results exist for 'sample1': output_file\n" ] }, { @@ -414,8 +414,8 @@ "name": 
"stderr", "output_type": "stream", "text": [ - "These results exist for 'sample1': ['output_file']\n", - "Overwriting existing results: ['output_file']\n", + "These results exist for 'sample1': output_file\n", + "Overwriting existing results: output_file\n", "Reported records for 'sample1' in 'test' namespace:\n", " - output_file: {'path': '/home/user/path_new.csv', 'title': 'new CSV file with some data'}\n" ] @@ -509,7 +509,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmp9cozi7pc.yaml\n", + "/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmpylsi8onv.yaml\n", "test:\n", " sample1:\n", " output_file:\n", @@ -829,7 +829,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmpsfzryntz.yaml\n" + "/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmplfz72deu.yaml\n" ] } ], @@ -1036,7 +1036,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/tmpxwy566e3.yaml\n" + "/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmp8q7552ae.yaml\n" ] } ], diff --git a/mkdocs.yml b/mkdocs.yml index c76d9ec4..b4ebeaf7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -5,12 +5,13 @@ repo_url: http://github.com/pepkit/pipestat pypi_name: pipestat nav: - - Getting Started: + - Getting Started: - Introduction: README.md - Install and configure: install.md - How-to guides: - Use Python API: python_api.md - Use command line interface: cli.md + - Configure pipestat: configuration.md - Reference: - Configuration format : config.md - Environment variables: env_vars.md @@ -29,4 +30,4 @@ plugins: autodoc_build: "docs/autodoc_build" autodoc_package: "pipestat" no_top_level: true -- search \ No newline at end of file +- search diff --git a/pipestat/helpers.py b/pipestat/helpers.py index e51dd49e..71f60a30 100644 --- a/pipestat/helpers.py +++ b/pipestat/helpers.py @@ -1,11 +1,12 @@ import logging from re import findall -from typing import Any, Dict +from typing import Any, Dict, List, Optional, Tuple, Union import jsonschema import sqlalchemy.orm from oyaml import safe_load from psycopg2 import sql +from sqlalchemy.orm import DeclarativeMeta, Query from ubiquerg import expandpath from .const import * @@ -101,76 +102,6 @@ def mk_list_of_str(x): ) -def preprocess_condition_pair(condition, condition_val): - """ - Preprocess query condition and values to ensure sanity and compatibility - - :param str condition: condition string - :param tuple condition_val: values to populate condition string with - :return (psycopg2.sql.SQL, tuple): condition pair - """ - - def _check_semicolon(x): - """ - recursively check for semicolons in an object - - :param aby x: object to inspect - :raises ValueError: if semicolon detected - """ - if isinstance(x, str): - assert ";" not in x, ValueError( - f"semicolons are not permitted in condition values: '{str(x)}'" - ) - if isinstance(x, list): - list(map(lambda v: _check_semicolon(v), x)) - - if condition: - if not isinstance(condition, str): - raise TypeError("Condition has to be a string") - else: - _check_semicolon(condition) - placeholders = findall("%s", condition) - condition = sql.SQL(condition) - if not condition_val: - raise ValueError("condition provided but condition_val missing") - assert isinstance(condition_val, list), TypeError( - "condition_val has to be a list" - ) - condition_val = tuple(condition_val) - assert len(placeholders) == len(condition_val), ValueError( - f"Number of condition ({len(condition_val)}) 
values not equal "
-            f"number of placeholders in: {condition}"
-        )
-    return condition, condition_val
-
-
-def paginate_query(query, offset, limit):
-    """
-    Apply offset and limit to the query string
-
-    :param sql.SQL query: query string to apply limit and offset to
-    :param int offset: offset to apply; no. of records to skip
-    :param int limit: limit to apply; max no. of records to return
-    :return sql.SQL: a possibly paginated query
-    """
-    if offset is not None:
-        assert isinstance(offset, int), TypeError(
-            f"Provided offset ({offset}) must be an int"
-        )
-        query += sql.SQL(f" OFFSET {offset}")
-    if limit is not None:
-        assert isinstance(limit, int), TypeError(
-            f"Provided limit ({limit}) must be an int"
-        )
-        query += sql.SQL(f" LIMIT {limit}")
-    return query
-
-
-from typing import Dict, List, Optional, Tuple, Union
-
-from sqlalchemy.orm import DeclarativeMeta, Query
-
-
 def dynamic_filter(
     ORM: DeclarativeMeta,
     query: Query,
@@ -198,14 +129,16 @@ def _unpack_tripartite(x):
             e1, e2, e3 = x
             return e1, e2, e3
         except ValueError:
-            raise Exception(f"Invalid tripartite element: {x}")
+            raise ValueError(
+                f"Invalid filter value: {x}. The filters must be tripartite"
+            )
 
     if filter_conditions is not None:
         for filter_condition in filter_conditions:
             key, op, value = _unpack_tripartite(filter_condition)
             column = getattr(ORM, key, None)
             if column is None:
-                raise Exception(f"Invalid filter column: {key}")
+                raise ValueError(f"Selected filter column does not exist: {key}")
             if op == "in":
                 if isinstance(value, list):
                     filt = column.in_(value)
@@ -223,7 +156,7 @@ def _unpack_tripartite(x):
                     % op
                 )
             except IndexError:
-                raise Exception(f"Invalid filter operator: {op}")
+                raise ValueError(f"Invalid filter operator: {op}")
             if value == "null":
                 value = None
             filt = getattr(column, attr)(value)
diff --git a/tests/test_db_only_mode.py b/tests/test_db_only_mode.py
index dd259f3a..d2fffcd8 100644
--- a/tests/test_db_only_mode.py
+++ b/tests/test_db_only_mode.py
@@ -27,6 +27,50 @@ def test_report(self, val, config_file_path, schema_file_path, results_file_path
         val_name = list(val.keys())[0]
         assert psm.select(filter_conditions=[(val_name, "eq", str(val[val_name]))])
 
+    @pytest.mark.parametrize(["rec_id", "res_id"], [("sample2", "number_of_things")])
+    @pytest.mark.parametrize("backend", ["db"])
+    def test_select_invalid_filter_column(
+        self,
+        rec_id,
+        res_id,
+        config_file_path,
+        results_file_path,
+        schema_file_path,
+        backend,
+    ):
+        args = dict(
+            schema_path=schema_file_path, namespace="test", config=config_file_path
+        )
+        psm = PipestatManager(**args)
+        with pytest.raises(ValueError):
+            psm.select(
+                filter_conditions=[("bogus_column", "eq", rec_id)],
+                columns=[res_id],
+            )
+
+    @pytest.mark.parametrize(["rec_id", "res_id"], [("sample2", "number_of_things")])
+    @pytest.mark.parametrize("backend", ["db"])
+    @pytest.mark.parametrize("filter", [("column", "eq", 1), "a", [1, 2, 3]])
+    def test_select_invalid_filter_structure(
+        self,
+        rec_id,
+        res_id,
+        config_file_path,
+        results_file_path,
+        schema_file_path,
+        backend,
+        filter,
+    ):
+        args = dict(
+            schema_path=schema_file_path, namespace="test", config=config_file_path
+        )
+        psm = PipestatManager(**args)
+        with pytest.raises(ValueError):
+            psm.select(
+                filter_conditions=[filter],
+                columns=[res_id],
+            )
+
 @pytest.mark.parametrize(["rec_id", "res_id"], [("sample2", "number_of_things")])
 @pytest.mark.parametrize("backend", ["db"])
 @pytest.mark.parametrize("limit", [1, 2, 3, 15555])
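For illustration, here is a minimal sketch of how the reworked `ValueError`-based validation in `dynamic_filter` surfaces to `select()` callers, mirroring the tests above; it assumes a database-backed `PipestatManager` configured like the test fixtures (the config path below is hypothetical):

```python
from pipestat import PipestatManager

psm = PipestatManager(
    namespace="test",
    schema_path="tests/data/sample_output_schema.yaml",
    config="tests/data/config.yaml",  # hypothetical path to a DB config
)

# A well-formed filter condition is a (column, operator, value) triple:
matched = psm.select(filter_conditions=[("number_of_things", "eq", "1")])

# A filter that references a non-existent column now raises ValueError
# rather than a bare Exception:
try:
    psm.select(filter_conditions=[("bogus_column", "eq", "1")])
except ValueError as e:
    print(e)  # Selected filter column does not exist: bogus_column
```
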
From 906a04a99fa11ee69faae4be33dd6238f1c1206d Mon Sep 17 00:00:00 2001
From: Michal Stolarczyk
Date: Mon, 10 May 2021 13:27:34 -0400
Subject: [PATCH 37/65] update exception

---
 pipestat/helpers.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pipestat/helpers.py b/pipestat/helpers.py
index 71f60a30..34cfcdfd 100644
--- a/pipestat/helpers.py
+++ b/pipestat/helpers.py
@@ -126,11 +126,14 @@ def dynamic_filter(
 
     def _unpack_tripartite(x):
         try:
+            assert isinstance(x, Iterable) and not isinstance(x, str), TypeError(
+                "Wrong filter class, a non-string iterable is required"
+            )
             e1, e2, e3 = x
             return e1, e2, e3
-        except ValueError:
+        except Exception:
             raise ValueError(
-                f"Invalid filter value: {x}. The filters must be tripartite"
+                f"Invalid filter value: {x}. The filter must be a tripartite iterable"
             )
 
     if filter_conditions is not None:

From cdeddf0098a8035f5b77cabd8bbad9ff34c64511 Mon Sep 17 00:00:00 2001
From: Michal Stolarczyk
Date: Mon, 10 May 2021 13:30:17 -0400
Subject: [PATCH 38/65] update exception

---
 pipestat/helpers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipestat/helpers.py b/pipestat/helpers.py
index 34cfcdfd..942f9d75 100644
--- a/pipestat/helpers.py
+++ b/pipestat/helpers.py
@@ -126,8 +126,8 @@ def dynamic_filter(
 
     def _unpack_tripartite(x):
         try:
-            assert isinstance(x, Iterable) and not isinstance(x, str), TypeError(
-                "Wrong filter class, a non-string iterable is required"
+            assert isinstance(x, List) or isinstance(x, Tuple), TypeError(
+                "Wrong filter class, a List or Tuple is required"
             )
             e1, e2, e3 = x
             return e1, e2, e3

From dc03da4f54f6ae8f16383f2703c0ed33410b575c Mon Sep 17 00:00:00 2001
From: Michal Stolarczyk
Date: Tue, 11 May 2021 14:35:23 -0400
Subject: [PATCH 39/65] update docs

---
 docs/changelog.md              |  18 +++++-
 docs/config.md                 |  38 +++++------
 docs/contributing.md           |   4 +-
 docs/env_vars.md               |   3 +-
 docs/install.md                |   3 +-
 docs/pipestat_specification.md |  46 +++++--------
 docs/schema.md                 | 100 ---------------------------------
 7 files changed, 62 insertions(+), 150 deletions(-)
 delete mode 100644 docs/schema.md

diff --git a/docs/changelog.md b/docs/changelog.md
index e59f61c0..b00a5aa8 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -4,18 +4,32 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [0.1.0] - unreleased
 
+**This update introduces some backwards-incompatible changes due to database interface redesign**
+
+### Changed
+
+- database interface type from a driver to an Object–relational mapping (ORM) approach
+
+### Added
+
+- results highlighting support
+- static typing
 
 ## [0.0.4] - 2021-04-02
+
 ### Added
+
 - config validation
 - typing in code
 
-
 ## [0.0.3] - 2021-03-12
+
 ### Added
 
-- possibility to initialize the `PipestatManager` object (or use the `pipestat status` CLI) with no results schema defined for pipeline status management; [Issue #1](https://github.com/pepkit/pipestat/issues/1) 
+- possibility to initialize the `PipestatManager` object (or use the `pipestat status` CLI) with no results schema defined for pipeline status management; [Issue #1](https://github.com/pepkit/pipestat/issues/1)
 
 ## [0.0.2] - 2021-02-22
 
+### Added
+
 - initial package release
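To make the changelog's driver-to-ORM note concrete, here is a rough sketch of what the redesign means for callers, with names taken from the diffs in this series (the config path is hypothetical, and the ORM lookup is an approximation of what the reworked helpers do internally):

```python
from pipestat import PipestatManager

psm = PipestatManager(namespace="test", config="pipestat_config.yaml")  # hypothetical config

# 0.0.x (driver approach): helpers such as the removed preprocess_condition_pair
# composed raw SQL like "SELECT * FROM test WHERE id=%s" for a psycopg2 cursor.

# 0.1.0 (ORM approach): the same lookup goes through a mapped class and a
# session, which is what the new dynamic_filter-based select() does internally.
TestORM = psm.get_orm("test")
with psm.session as s:
    record = s.query(TestORM).filter(TestORM.id == 1).first()
```
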
diff --git a/docs/config.md b/docs/config.md
index 21019931..a9a28755 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -8,28 +8,30 @@ schema_path:
 record_identifier:
 results_file_path: # either "results_file_path"
 database: # or DB login credentials
-    name:
-    user:
-    password:
-    host:
-    port:
+  name:
+  user:
+  password:
+  host:
+  port:
+  dialect:
+  driver:
 ```
 
 If both `results_file_path` and DB login credentials are provided, the YAML results file is given priority.
 
-Any of the settings specified in the configuration file, apart from the database login credentials, can be overwritten with the respectively named arguments in the `PipestatManager` object constructor, which implies the fact that the configuration file is *required* only if the the intended pipestat back-end is a database. 
+Any of the settings specified in the configuration file, apart from the database login credentials, can be overwritten with the respectively named arguments in the `PipestatManager` object constructor, which means the configuration file is *required* only if the intended pipestat back-end is a database.
 
 ## Example
 
 Suppose a [PostgreSQL](https://www.postgresql.org/) instance has been started in a container with the following command:
 
 ```console
-docker run -d
-  --name pipestat-postgres \
-  -p 5432:5432 \
+docker run -d \
+  --name pipestat-postgres \
+  -p 5432:5432 \
   -e POSTGRES_PASSWORD=b4fd34f^Fshdwede \
-  -e POSTGRES_USER=john \
-  -e POSTGRES_DB=pipestat-test \
+  -e POSTGRES_USER=john \
+  -e POSTGRES_DB=pipestat-test \
   -v postgres-data:/var/lib/postgresql/data postgres
 ```
 
 The configuration file should look like this:
 
 ```yaml
 namespace: my_pipeline
 schema_path: /path/to/schema.yaml
 database:
-  name: pipestat-test
-  user: john
-  password: b4fd34f^Fshdwede
-  host: localhost
-  port: 5432
-```
\ No newline at end of file
+  name: pipestat-test
+  user: john
+  password: b4fd34f^Fshdwede
+  host: 127.0.0.1
+  port: 5432
+  dialect: postgresql
+  driver: psycopg2
+```
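As a hedged aside on the new `dialect` and `driver` keys: pipestat assembles these credentials into a database URL internally (via the new `db_url` property), presumably in the standard SQLAlchemy form sketched below; treat the exact assembly as an approximation.

```python
from urllib.parse import quote_plus

creds = dict(
    dialect="postgresql",
    driver="psycopg2",
    user="john",
    password="b4fd34f^Fshdwede",
    host="127.0.0.1",
    port=5432,
    name="pipestat-test",
)
# Special characters in the password must be percent-encoded in a URL.
url = (
    f"{creds['dialect']}+{creds['driver']}://{creds['user']}:"
    f"{quote_plus(creds['password'])}@{creds['host']}:{creds['port']}/{creds['name']}"
)
print(url)  # postgresql+psycopg2://john:b4fd34f%5EFshdwede@127.0.0.1:5432/pipestat-test
```
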
diff --git a/docs/contributing.md b/docs/contributing.md
index e53b307a..d2e6cd6b 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -1,7 +1,7 @@
-## Contributing
+# Contributing
 
 We welcome contributions from the community.
 
-## Suggestions and feedback
+# Suggestions and feedback
 
 Please open an issue on the github issue tracker with suggestions, bug reports, or other feedback.
diff --git a/docs/env_vars.md b/docs/env_vars.md
index c68fc9ab..0dc32e51 100644
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@@ -4,7 +4,6 @@ Both the command line interface (CLI) and Python API support a collection of env
 
 Here is a list of the supported environment variables:
 
-
 | Environment variable | API argument | Description |
 |----------------------------|--------------------|-----------------------------------------------------------------------|
 | **PIPESTAT_NAMESPACE** | namespace | namespace to report into. This will be the DB table name if using DB as the object back-end |
 | **PIPESTAT_RECORD_ID** | record_identifier | record identifier to report for |
 | **PIPESTAT_CONFIG** | config | path to the configuration file or a mapping with the config file content |
 | **PIPESTAT_RESULTS_FILE** | results_file_path | YAML file to report into, if file is used as the object back-end |
 | **PIPESTAT_RESULTS_SCHEMA** | schema_path | path to the output schema that formalizes the results structure |
-| **PIPESTAT_STATUS_SCHEMA** | status_schema_path | path to the status schema that formalizes the status flags structure |
\ No newline at end of file
+| **PIPESTAT_STATUS_SCHEMA** | status_schema_path | path to the status schema that formalizes the status flags structure |
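Since every one of these variables mirrors a constructor argument, a fully environment-driven setup is possible; a minimal sketch follows (the file paths are hypothetical):

```python
import os

from pipestat import PipestatManager

os.environ["PIPESTAT_NAMESPACE"] = "test"
os.environ["PIPESTAT_RESULTS_FILE"] = "/tmp/results.yaml"  # hypothetical path
os.environ["PIPESTAT_RESULTS_SCHEMA"] = "tests/data/sample_output_schema.yaml"

# With the variables set, no explicit constructor arguments are needed;
# the values are sourced from the environment.
psm = PipestatManager()
```
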
diff --git a/docs/install.md b/docs/install.md
index 6ccde5e5..9282d686 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -8,6 +8,7 @@ Install pipestat from [GitHub releases](https://github.com/pepkit/pipestat/relea
 - `pip install --upgrade pipestat`: update in virtual environment.
 
 See if your install worked by calling `pipestat -h` on the command line. If the `pipestat` executable is not in your `$PATH`, append this to your `.bashrc` or `.profile` (or `.bash_profile` on macOS):
+
 ```console
 export PATH=~/.local/bin:$PATH
-```
\ No newline at end of file
+```
diff --git a/docs/pipestat_specification.md b/docs/pipestat_specification.md
index 61169abf..f3608619 100644
--- a/docs/pipestat_specification.md
+++ b/docs/pipestat_specification.md
@@ -6,11 +6,10 @@
 
 Pipelines, or workflows, are made from a set of commands that process input data and produce results. These results may take many forms, such as simple statistics, string variables, images, or processed data files. How do pipelines structure the results they produce? There is no standard structure for results, so usually, it's done differently for each pipeline. This restricts the portability of the outputs of pipelines, and makes it difficult to write software that can process results from a variety of different pipelines. As a result, each pipeline author usually writes dedicated report functions for each pipeline.
 
-Pipestat provides a formal specification for how a pipeline should structure its results. Therefore, any pipeline that follows the pipestat specification will record results in the same way. This makes it possible to build generic report software that can work with any pipestat-compatible pipeline, offloading the task of making pretty result reports to generic tools. 
+Pipestat provides a formal specification for how a pipeline should structure its results. Therefore, any pipeline that follows the pipestat specification will record results in the same way. This makes it possible to build generic report software that can work with any pipestat-compatible pipeline, offloading the task of making pretty result reports to generic tools.
 
 This document outlines the specification for pipestat results. If your pipeline stores results like this, then downstream tools that read pipestat results will be able to build nice summaries of your pipeline runs automatically. To write results according to this specification, you can use the reference implementation (the `pipestat` python package), or you can simply write your results to this specification using whatever system you like.
 
-
 # Terminology
 
 - *result*: An element produced by a pipeline. Results have defined data types, described herein.
@@ -33,13 +32,13 @@ Each *result* reported by a pipeline must have a specified data type. Pipestat i
 Importantly, pipestat extends the jsonschema vocabulary by adding two additional types, which are common results of a pipeline: `image` and `file`. These types require reporting objects with the following attributes:
 
-- `file`: 
-    - `path`: path to the reported file
-    - `title`: human readable description of the file
-- `image`: 
-    - `path`: path to the reported image, usually PDF
-    - `thumbnail`: path to the reported thumbnail, usually PNG or JPEG
-    - `title`: human readable description of the image
+- `file`:
+  - `path`: path to the reported file
+  - `title`: human readable description of the file
+- `image`:
+  - `path`: path to the reported image, usually PDF
+  - `thumbnail`: path to the reported thumbnail, usually PNG or JPEG
+  - `title`: human readable description of the image
 
 # Pipestat schema
 
@@ -60,11 +59,10 @@ The pipestat schema is a YAML-formatted file. The top level keys are the unique
 
 ```yaml
 result_identifier:
   type:
-``` 
+```
 
 Here, `result_identifier` can be whatever name you want to use to identify this result. Here's a simple schema example that showcases most of the supported types:
 
-
 ```yaml
 number_of_things:
   type: integer
   description: "Number of things"
 percentage_of_things:
   type: number
   description: "Percentage of things"
 name_of_something:
   type: string
   description: "Name of something"
 swtich_value:
   type: boolean
   description: "Is the switch on of off"
 collection_of_things:
   type: array
   description: "This store collection of values"
 output_object:
   type: object
   description: "Object output"
 output_file:
   type: file
   description: "This a path to the output file"
 output_image:
   type: image
   description: "This a path to the output image"
-``` 
+```
 
 Here's a more complex schema example that showcases some of the more advanced jsonschema features:
 
 ```yaml
 number_of_things:
   type: integer
   description: "Number of things, min 20, multiple of 10"
   multipleOf: 10
   minimum: 20
 name_of_something:
   type: string
   description: "Name of something, min len 2 characters"
   minLength: 2
 collection_of_things:
   type: array
   items:
     type: string
   description: "This store collection of strings"
@@ -114,18 +112,18 @@ output_object:
   type: object
   properties:
     property1:
       array:
         items:
           type: integer
     property2:
       type: boolean
   required:
     - property1
   description: "Object output with required array of integers and optional boolean"
-``` 
+```
 
 ## Results highlighting
 
-The pipestat specification allows to highlight results by adding `highlight: true` attribute under result identifier in the schema file. In the example below the `log_file` result will be highlighted. 
+The pipestat specification makes it possible to highlight results by adding the `highlight: true` attribute under a result identifier in the schema file. In the example below, the `log_file` result will be highlighted.
 
 ```yaml
 number_of_things:
   type: integer
   description: "Number of things"
 log_file:
   type: file
   highlight: true
 ```
 
 The highlighted results can be later retrieved by pipestat clients via `PipestatManager.highlighted_results` property, which simply returns a list of result identifiers.
 
 # Status schema
 
 Apart from results reporting, pipestat provides a robust pipeline status management system, which can be used to report pipeline status from within the pipeline and monitor the pipeline's status in other software. Status schema file defines the possible pipeline status identifiers and provides other metadata, like `description` or `color` for display purposes.
 
@@ -148,7 +146,6 @@ Apart from results reporting pipestat provides a robust pipeline status manageme
 Here's an example of the pipestat status schema, which at the same time is the default status schema shipped with the pipestat Python package:
 
-
 ```yaml
 running:
   description: "the pipeline is running"
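The status identifiers defined in such a schema are exactly what the status API accepts; a minimal sketch with a file-backed manager follows (the results file path is hypothetical, and per the changelog a results schema is not required for status management alone):

```python
from pipestat import PipestatManager

psm = PipestatManager(namespace="test", results_file_path="/tmp/results.yaml")

# Flag the record as running, then read the status back.
psm.set_status(status_identifier="running", record_identifier="sample1")
assert psm.get_status(record_identifier="sample1") == "running"
```
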
@@ -174,7 +171,7 @@ As depicted above the top-level attributes are the status identifiers. Within ea
 
 # Backends
 
 The pipestat specification describes two backend types for storing results: a [YAML-formatted file](https://yaml.org/spec/1.2/spec.html) or a [PostgreSQL database](https://www.postgresql.org/). This flexibility makes pipestat useful for a wide variety of use cases. Some users just need a simple text file for smaller-scale needs, which is convenient and universal, requiring no database infrastructure. For larger-scale systems, a database back-end is necessary. The pipestat specification provides a layer that spans the two possibilities, so that reports can be made in the same way, regardless of which back-end is used in a particular use case.
 
 By using the `pipestat` package to write results, the pipeline author need not be concerned with database connections or dealing with race-free file writing, as these tasks are already implemented. The user who runs the pipeline will simply configure the pipestat backend as required.
 
 ## YAML file
 
@@ -190,32 +187,29 @@ For the YAML file backend, each file represents a namespace. The file always beg
 my_namespace:
   record1:
     my_result: 10
-    my_result1: 
+    my_result1:
       key: "value1"
   record2:
     my_result: 3
-    my_result1: 
+    my_result1:
       key: "value2"
-``` 
+```
 
 A more concrete example would be:
 
-
 ```yaml
 rnaseq-pipe:
   patient1:
     duplicate_rate: 10
-    genomic_distribution: 
+    genomic_distribution:
       promoter: 15
       enhancer: 85
   patient2:
     duplicate_rate: 3
-    genomic_distribution: 
+    genomic_distribution:
       promoter: 30
       enhancer: 70
-``` 
-
-
+```
 
 ## PostgreSQL database
 
diff --git a/docs/schema.md b/docs/schema.md
deleted file mode 100644
index 2e9bb4c5..00000000
--- a/docs/schema.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# Schema specification
-
-One of the *required* pipestat inputs is a schema file. **The schema specifies the results types and names that can be reported with pipestat.** As a pipeline developer, you create a schema to describe all of the important results to be recorded from your pipeline.
-
-Pipestat uses the schema as a base for creating a collection of self-contained result-specific [jsonschema schemas](https://json-schema.org/) that are used to **validate** the reported results prior to inserting into the database or saving in the YAML results file, depending on the selected backend.
-
-## Components
-
-Each schema is a YAML-formatted file composed of a set of self-contained result definitions. The top level keys are the unique result identifiers. The result definitions are jsonschema schemas. For a minimal schema, only the `type` attribute is required, which indicates the required type of the result to be reported. Please refer to the jsonschema documentation to learn more about the types and other attributes. This is an example of such component:
-
-```yaml
-result_identifier:
-  type:
-```
-
-Here, `result_identifier` can be whatever name you want to use to identify this result. Importantly, pipestat extends the jsonschema vocabulary by adding two additional types: `image` and `file`. 
These types require reporting objects with the following attributes: - -- `file`: - - `path`: path to the reported file - - `title`: human readable description of the file -- `image`: - - `path`: path to the reported image, usually PDF - - `thumbnail`: path to the reported thumbnail, usually PNG or JPEG - - `title`: human readable description of the image - -Therefore, in practice, a result of type `file` is equivalent to: - -```yaml -type: object -properties: - path: - type: string - title: - type: string -``` - -## Basic example - -Here's a simple schema example that showcases most of the supported types: - - -```yaml -number_of_things: - type: integer - description: "Number of things" -percentage_of_things: - type: number - description: "Percentage of things" -name_of_something: - type: string - description: "Name of something" -swtich_value: - type: boolean - description: "Is the switch on of off" -collection_of_things: - type: array - description: "This store collection of values" -output_object: - type: object - description: "Object output" -output_file: - type: file - description: "This a path to the output file" -output_image: - type: image - description: "This a path to the output image" -``` - -## More complex example - -Here's a more complex schema example that showcases some of the more advanced jsonschema features: - -```yaml -number_of_things: - type: integer - description: "Number of things, min 20, multiple of 10" - multipleOf: 10 - minimum: 20 -name_of_something: - type: string - description: "Name of something, min len 2 characters" - minLength: 2 -collection_of_things: - type: array - items: - type: string - description: "This store collection of strings" -output_object: - type: object - properties: - property1: - array: - items: - type: integer - property2: - type: boolean - required: - - property1 - description: "Object output with required array of integers and optional boolean" -``` From 1adfb5fe2c6df9eb2dc0f77b4a6e80c5755df6a3 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 14 May 2021 17:16:13 -0400 Subject: [PATCH 40/65] set up a system for schema-based DB column parametrization --- .gitignore | 1 + pipestat/pipestat.py | 21 ++++++++++++++++++- tests/data/sample_output_schema.yaml | 10 ++++++++- .../data/sample_output_schema_highlight.yaml | 10 ++++----- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index fcf0f3c8..b07a8870 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ __pycache__ *.py[cod] *$py.class __pycache__/ +.vscode/ # C extensions *.so diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index be23e0fe..88768690 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -277,6 +277,19 @@ def highlighted_results(self) -> List[str]: """ return self._get_attr(HIGHLIGHTED_KEY) or [] + @property + def db_column_kwargs_by_result(self) -> Dict[str, Any]: + """ + Database column key word arguments for every result, sourced from the results schema + + :return Dict[str, Any]: key word arguments for every result + """ + return { + result_id: self.schema[result_id]["db_column"] + for result_id in self.schema.keys() + if "db_column" in self.schema[result_id] + } + @property def namespace(self) -> str: """ @@ -498,7 +511,13 @@ def _auto_repr(x: Any) -> str: col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]] _LOGGER.debug(f"Adding object: {result_id} of type: {str(col_type)}") attr_dict.update( - {result_id: Column(col_type, doc=result_metadata["description"])} + { + result_id: Column( 
+ col_type, + doc=result_metadata["description"], + **self.db_column_kwargs_by_result.get(result_id, {}), + ) + } ) attr_dict.update({"__repr__": _auto_repr}) _LOGGER.debug(f"Creating '{tn}' ORM with args: {attr_dict}") diff --git a/tests/data/sample_output_schema.yaml b/tests/data/sample_output_schema.yaml index 392bb832..c7c8b79e 100644 --- a/tests/data/sample_output_schema.yaml +++ b/tests/data/sample_output_schema.yaml @@ -21,4 +21,12 @@ output_file: description: "This a path to the output file" output_image: type: image - description: "This a path to the output image" \ No newline at end of file + description: "This a path to the output image" +md5sum: + type: string + description: "MD5SUM of an object" + highlighted: true + db_column: + unique: true + nullable: false + \ No newline at end of file diff --git a/tests/data/sample_output_schema_highlight.yaml b/tests/data/sample_output_schema_highlight.yaml index e51fd0bd..3373f6e8 100644 --- a/tests/data/sample_output_schema_highlight.yaml +++ b/tests/data/sample_output_schema_highlight.yaml @@ -21,21 +21,21 @@ output_file: description: "This a path to the output file" output_image: type: image - highlight: false + highlighted: false description: "This a path to the output image" log: type: file - highlight: true + highlighted: true description: "The log file of the pipeline run" profile: type: file - highlight: true + highlighted: true description: "The profile of the pipeline run" commands: type: file - highlight: true + highlighted: true description: "The file with shell commands executed by this pipeline" version: type: string - highlight: true + highlighted: true description: "Pipeline version" \ No newline at end of file From dc554e0902ce2c497c163e7ef9705d75544e98f7 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 14 May 2021 17:19:12 -0400 Subject: [PATCH 41/65] fix keys, add const --- pipestat/const.py | 1 + pipestat/pipestat.py | 4 ++-- tests/data/sample_output_schema.yaml | 2 +- tests/data/sample_output_schema_highlight.yaml | 10 +++++----- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pipestat/const.py b/pipestat/const.py index c1e105fe..a4854b6c 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -67,6 +67,7 @@ DB_SCOPED_SESSION_KEY = "_db_scoped_session" DB_ENGINE_KEY = "_db_engine" HIGHLIGHTED_KEY = "_highlighted" +DB_COLUMN = "db_column" # schema keys SCHEMA_PROP_KEY = "properties" diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 88768690..6c9da8df 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -285,9 +285,9 @@ def db_column_kwargs_by_result(self) -> Dict[str, Any]: :return Dict[str, Any]: key word arguments for every result """ return { - result_id: self.schema[result_id]["db_column"] + result_id: self.schema[result_id][DB_COLUMN] for result_id in self.schema.keys() - if "db_column" in self.schema[result_id] + if DB_COLUMN in self.schema[result_id] } @property diff --git a/tests/data/sample_output_schema.yaml b/tests/data/sample_output_schema.yaml index c7c8b79e..b577d418 100644 --- a/tests/data/sample_output_schema.yaml +++ b/tests/data/sample_output_schema.yaml @@ -25,7 +25,7 @@ output_image: md5sum: type: string description: "MD5SUM of an object" - highlighted: true + highlight: true db_column: unique: true nullable: false diff --git a/tests/data/sample_output_schema_highlight.yaml b/tests/data/sample_output_schema_highlight.yaml index 3373f6e8..e51fd0bd 100644 --- a/tests/data/sample_output_schema_highlight.yaml +++ 
b/tests/data/sample_output_schema_highlight.yaml @@ -21,21 +21,21 @@ output_file: description: "This a path to the output file" output_image: type: image - highlighted: false + highlight: false description: "This a path to the output image" log: type: file - highlighted: true + highlight: true description: "The log file of the pipeline run" profile: type: file - highlighted: true + highlight: true description: "The profile of the pipeline run" commands: type: file - highlighted: true + highlight: true description: "The file with shell commands executed by this pipeline" version: type: string - highlighted: true + highlight: true description: "Pipeline version" \ No newline at end of file From 8f533f6026523662430d53c042e9bb915b79a336 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 17 May 2021 17:03:59 -0400 Subject: [PATCH 42/65] experimental: set up a system for schema-based relationships specification --- pipestat/const.py | 13 ++++++++- pipestat/pipestat.py | 66 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/pipestat/const.py b/pipestat/const.py index a4854b6c..b5868bfa 100644 --- a/pipestat/const.py +++ b/pipestat/const.py @@ -67,7 +67,18 @@ DB_SCOPED_SESSION_KEY = "_db_scoped_session" DB_ENGINE_KEY = "_db_engine" HIGHLIGHTED_KEY = "_highlighted" -DB_COLUMN = "db_column" +DB_COLUMN_KEY = "db_column" +DB_RELATIONSHIP_KEY = "relationship" +DB_RELATIONSHIP_NAME_KEY = "name" +DB_RELATIONSHIP_TABLE_KEY = "table" +DB_RELATIONSHIP_COL_KEY = "column" +DB_RELATIONSHIP_BACKREF_KEY = "backref" +DB_RELATIONSHIP_ELEMENTS = [ + DB_RELATIONSHIP_BACKREF_KEY, + DB_RELATIONSHIP_COL_KEY, + DB_RELATIONSHIP_NAME_KEY, + DB_RELATIONSHIP_TABLE_KEY, +] # schema keys SCHEMA_PROP_KEY = "properties" diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index 6c9da8df..79ce0787 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -7,9 +7,15 @@ import sqlalchemy.orm from attmap import PathExAttMap as PXAM from jsonschema import validate -from sqlalchemy import Column, create_engine +from sqlalchemy import Column, ForeignKey, create_engine from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import DeclarativeMeta, scoped_session, sessionmaker +from sqlalchemy.orm import ( + DeclarativeMeta, + backref, + relationship, + scoped_session, + sessionmaker, +) from ubiquerg import create_lock, remove_lock from yacman import YacAttMap @@ -280,14 +286,46 @@ def highlighted_results(self) -> List[str]: @property def db_column_kwargs_by_result(self) -> Dict[str, Any]: """ - Database column key word arguments for every result, sourced from the results schema + Database column key word arguments for every result, + sourced from the results schema in the `db_column` section :return Dict[str, Any]: key word arguments for every result """ return { - result_id: self.schema[result_id][DB_COLUMN] + result_id: self.schema[result_id][DB_COLUMN_KEY] for result_id in self.schema.keys() - if DB_COLUMN in self.schema[result_id] + if DB_COLUMN_KEY in self.schema[result_id] + } + + @property + def db_column_relationships_by_result(self) -> Dict[str, str]: + """ + Database column relationships for every result, + sourced from the results schema in the `relationship` section + + *Note: this is an experimental feature* + + :return Dict[str, Dict[str, str]]: relationships for every result + """ + + def _validate_rel_section(result_id): + if not all( + [ + k in self.schema[result_id][DB_RELATIONSHIP_KEY].keys() + for k in 
DB_RELATIONSHIP_ELEMENTS
+                ]
+            ):
+                raise PipestatDatabaseError(
+                    f"Not all required {DB_RELATIONSHIP_KEY} settings ({DB_RELATIONSHIP_ELEMENTS}) were "
+                    f"provided for result: {result_id}"
+                )
+            return True
+
+        return {
+            result_id: self.schema[result_id][DB_RELATIONSHIP_KEY]
+            for result_id in self.schema.keys()
+            if DB_RELATIONSHIP_KEY in self.schema[result_id]
+            and _validate_rel_section(result_id)
+        }
 
     @property
@@ -510,11 +548,29 @@ def _auto_repr(x: Any) -> str:
         for result_id, result_metadata in schema.items():
             col_type = SQL_CLASSES_BY_TYPE[result_metadata[SCHEMA_TYPE_KEY]]
             _LOGGER.debug(f"Adding object: {result_id} of type: {str(col_type)}")
+
+            rel_info = self.db_column_relationships_by_result.get(result_id, {})
+            col_args = []
+            # if there is a relationship defined for this result, include it
+            if rel_info:
+                attr_dict.update(
+                    {
+                        rel_info["name"]: relationship(
+                            rel_info["table"].capitalize(),
+                            backref=backref(
+                                rel_info["backref"], uselist=True, cascade="delete,all"
+                            ),
+                        )
+                    }
+                )
+                col_args = [ForeignKey(f"{rel_info['table']}.{rel_info['column']}")]
+
             attr_dict.update(
                 {
                     result_id: Column(
                         col_type,
                         doc=result_metadata["description"],
+                        *col_args,
                         **self.db_column_kwargs_by_result.get(result_id, {}),
                     )
                 }

From ee4aeed1275b54d67789520940058621bd90e839 Mon Sep 17 00:00:00 2001
From: Michal Stolarczyk
Date: Tue, 18 May 2021 09:38:13 -0400
Subject: [PATCH 43/65] add textual filter support for db

---
 pipestat/pipestat.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py
index 79ce0787..ba98f958 100644
--- a/pipestat/pipestat.py
+++ b/pipestat/pipestat.py
@@ -8,6 +8,7 @@
 from attmap import PathExAttMap as PXAM
 from jsonschema import validate
 from sqlalchemy import Column, ForeignKey, create_engine
+from sqlalchemy import text
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import (
     DeclarativeMeta,
@@ -1194,6 +1195,48 @@ def _retrieve_db(
             }
         raise PipestatDatabaseError(f"Record '{record_identifier}' not found")
 
+    def select_txt(
+        self,
+        filter_templ: Optional[str] = "",
+        filter_params: Optional[Dict[str, Any]] = {},
+        table_name: Optional[str] = None,
+        offset: Optional[int] = None,
+        limit: Optional[int] = None,
+    ) -> List[Any]:
+        """
+        Execute a query with a textual filter. Returns all results.
+
+        To retrieve all table contents, leave the filter arguments out.
+        Table name defaults to the namespace
+
+        :param str filter_templ: filter template with value placeholders,
+            formatted as follows `id<:value and name=:name`
+        :param Dict[str, Any] filter_params: a mapping of keys specified in the `filter_templ`
+            to parameters that are supposed to replace the placeholders
+        :param str table_name: name of the table to query
+        :param int offset: skip this number of rows
+        :param int limit: include this number of rows
+        :return List[Any]: a list of matched records
+        """
+        if self.file:
+            raise PipestatDatabaseError(
+                f"The {self.__class__.__name__} object is not backed by a database. "
+                f"This operation is not supported for file backend."
+ ) + table_name = table_name or self.namespace + with self.session as s: + q = ( + s.query(self._get_orm(table_name)) + .filter(text(filter_templ)) + .params(**filter_params) + ) + if isinstance(offset, int): + q = q.offset(offset) + if isinstance(limit, int): + q = q.limit(limit) + results = q.all() + return results + def assert_results_defined(self, results: List[str]) -> None: """ Assert provided list of results is defined in the schema From 0932733cca9e1fd1bfd1d8e6c45445bbcca764ff Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 2 Jun 2021 13:57:20 -0400 Subject: [PATCH 44/65] add more docs, make public --- docs/api_docs.md | 185 ++++++++++++++++++++++++---------- docs_jupyter/python_api.ipynb | 105 ++++++++++++++++++- pipestat/pipestat.py | 24 ++--- 3 files changed, 245 insertions(+), 69 deletions(-) diff --git a/docs/api_docs.md b/docs/api_docs.md index 21f94abe..1eb7bc19 100644 --- a/docs/api_docs.md +++ b/docs/api_docs.md @@ -31,11 +31,11 @@ h4 .content { # Package `pipestat` Documentation ## Class `PipestatManager` -Pipestat standardizes reporting of pipeline results and pipeline status management. It formalizes a way for pipeline developers and downstream tools developers to communicate -- results produced by a pipeline can easily and reliably become an input for downstream analyses. The object exposes API for interacting with the results and pipeline status and can be backed by either a YAML-formatted file or a PostgreSQL database. +Pipestat standardizes reporting of pipeline results and pipeline status management. It formalizes a way for pipeline developers and downstream tools developers to communicate -- results produced by a pipeline can easily and reliably become an input for downstream analyses. The object exposes API for interacting with the results and pipeline status and can be backed by either a YAML-formatted file or a database. 
```python -def __init__(self, namespace=None, record_identifier=None, schema_path=None, results_file_path=None, database_only=False, config=None, status_schema_path=None, flag_file_dir=None) +def __init__(self, namespace: Optional[str]=None, record_identifier: Optional[str]=None, schema_path: Optional[str]=None, results_file_path: Optional[str]=None, database_only: Optional[bool]=True, config: Union[str, dict, NoneType]=None, status_schema_path: Optional[str]=None, flag_file_dir: Optional[str]=None, custom_declarative_base: Optional[sqlalchemy.orm.decl_api.DeclarativeMeta]=None, show_db_logs: bool=False) ``` Initialize the object @@ -48,18 +48,19 @@ Initialize the object - `database_only` (`bool`): whether the reported data should not bestored in the memory, but only in the database - `config` (`str | dict`): path to the configuration file or a mappingwith the config file content - `status_schema_path` (`str`): path to the status schema that formalizesthe status flags structure +- `custom_declarative_base` (`sqlalchemy.orm.DeclarativeMeta`): a declarative base touse for ORMs creation a new instance will be created if not provided ```python -def assert_results_defined(self, results) +def assert_results_defined(self, results: List[str]) -> None ``` Assert provided list of results is defined in the schema #### Parameters: -- `results` (`list[str]`): list of results tocheck for existence in the schema +- `results` (`List[str]`): list of results tocheck for existence in the schema #### Raises: @@ -70,54 +71,61 @@ Assert provided list of results is defined in the schema ```python -def check_connection(self) +def check_record_exists(self, record_identifier: str, table_name: str=None) -> bool ``` -Check whether a PostgreSQL connection has been established +Check if the specified record exists in the table +#### Parameters: + +- `record_identifier` (`str`): record to check for +- `table_name` (`str`): table name to check + + #### Returns: -- `bool`: whether the connection has been established +- `bool`: whether the record exists in the table ```python -def check_record_exists(self, record_identifier=None) +def check_result_exists(self, result_identifier: str, record_identifier: str=None) -> bool ``` -Check if the record exists +Check if the result has been reported #### Parameters: - `record_identifier` (`str`): unique identifier of the record +- `result_identifier` (`str`): name of the result to check #### Returns: -- `bool`: whether the record exists +- `bool`: whether the specified result has been reported for theindicated record in current namespace ```python -def check_result_exists(self, result_identifier, record_identifier=None) +def check_which_results_exist(self, results: List[str], rid: Optional[str]=None, table_name: Optional[str]=None) -> List[str] ``` -Check if the result has been reported +Check which results have been reported #### Parameters: -- `record_identifier` (`str`): unique identifier of the record -- `result_identifier` (`str`): name of the result to check +- `rid` (`str`): unique identifier of the record +- `results` (`List[str]`): names of the results to check #### Returns: -- `bool`: whether the specified result has been reported for theindicated record in current namespace +- `List[str]`: whether the specified result has been reported for theindicated record in current namespace ```python -def clear_status(self, record_identifier=None, flag_names=None) +def clear_status(self, record_identifier: str=None, flag_names: List[str]=None) -> List[Optional[str]] ``` Remove 
status flags @@ -129,16 +137,8 @@ Remove status flags #### Returns: -- `list[str]`: Collection of names of flags removed - - - +- `List[str]`: Collection of names of flags removed -```python -def close_postgres_connection(self) -``` - -Close connection and remove client bound @@ -167,27 +167,53 @@ Data object ```python -def db_cursor(self) +def db_column_kwargs_by_result(self) ``` -Establish connection and get a PostgreSQL database cursor, commit and close the connection afterwards +Database column key word arguments for every result, sourced from the results schema in the `db_column` section #### Returns: -- `LoggingCursor`: Database cursor object +- `Dict[str, Any]`: key word arguments for every result ```python -def establish_postgres_connection(self, suppress=False) +def db_column_relationships_by_result(self) ``` -Establish PostgreSQL connection using the config data -#### Parameters: +Database column relationships for every result, sourced from the results schema in the `relationship` section -- `suppress` (`bool`): whether to suppress any connection errors +*Note: this is an experimental feature* +#### Returns: +- `Dict[str, Dict[str, str]]`: relationships for every result + + + + +```python +def db_url(self) +``` + +Database URL, generated based on config credentials +#### Returns: + +- `str`: database URL + + +#### Raises: + +- `PipestatDatabaseError`: if the object is not backed by a database + + + + +```python +def establish_db_connection(self) -> bool +``` +Establish DB connection using the config data #### Returns: - `bool`: whether the connection has been established successfully @@ -208,7 +234,24 @@ File path that the object is reporting the results into ```python -def get_status(self, record_identifier=None) +def get_orm(self, table_name: str=None) -> Any +``` + +Get an object relational mapper class +#### Parameters: + +- `table_name` (`str`): table name to get a class for + + +#### Returns: + +- `Any`: Object relational mapper class + + + + +```python +def get_status(self, record_identifier: str=None) -> Optional[str] ``` Get the current pipeline status @@ -220,7 +263,7 @@ Get the current pipeline status ```python -def get_status_flag_path(self, status_identifier, record_identifier=None) +def get_status_flag_path(self, status_identifier: str, record_identifier=None) -> str ``` Get the path to the status file flag @@ -244,7 +287,19 @@ def highlighted_results(self) Highlighted results #### Returns: -- `list[str]`: a collection of highlighted results +- `List[str]`: a collection of highlighted results + + + + +```python +def is_db_connected(self) -> bool +``` + +Check whether a DB connection has been established +#### Returns: + +- `bool`: whether the connection has been established @@ -286,7 +341,7 @@ Unique identifier of the record ```python -def remove(self, record_identifier=None, result_identifier=None) +def remove(self, record_identifier: str=None, result_identifier: str=None) -> bool ``` Remove a result. @@ -307,13 +362,13 @@ will be removed. ```python -def report(self, values, record_identifier=None, force_overwrite=False, strict_type=True, return_id=False) +def report(self, values: Dict[str, Any], record_identifier: str=None, force_overwrite: bool=False, strict_type: bool=True, return_id: bool=False) -> Union[bool, int] ``` Report a result. 
#### Parameters: -- `values` (`dict[str, any]`): dictionary of result-value pairs +- `values` (`Dict[str, any]`): dictionary of result-value pairs - `record_identifier` (`str`): unique identifier of the record, valuein 'record_identifier' column to look for to determine if the record already exists - `force_overwrite` (`bool`): whether to overwrite the existing record - `strict_type` (`bool`): whether the type of the reported values shouldremain as is. Pipestat would attempt to convert to the schema-defined one otherwise @@ -340,7 +395,7 @@ Result schema mappings ```python -def retrieve(self, record_identifier=None, result_identifier=None) +def retrieve(self, record_identifier: Optional[str]=None, result_identifier: Optional[str]=None) -> Union[Any, Dict[str, Any]] ``` Retrieve a result for a record. @@ -355,7 +410,7 @@ be returned. #### Returns: -- `any | dict[str, any]`: a single result or a mapping with all theresults reported for the record +- `any | Dict[str, any]`: a single result or a mapping with all theresults reported for the record @@ -385,28 +440,56 @@ Schema path ```python -def select(self, columns=None, condition=None, condition_val=None, offset=None, limit=None) +def select(self, table_name: Optional[str]=None, columns: Optional[List[str]]=None, filter_conditions: Optional[List[Tuple[str, str, Union[str, List[str]]]]]=None, json_filter_conditions: Optional[List[Tuple[str, str, str]]]=None, offset: Optional[int]=None, limit: Optional[int]=None) -> List[Any] ``` -Get all the contents from the selected table, possibly restricted by the provided condition. +Perform a `SELECT` on the table #### Parameters: -- `columns` (`str | list[str]`): columns to select -- `condition` (`str`): condition to restrict the resultswith, will be appended to the end of the SELECT statement and safely populated with 'condition_val', for example: `"id=%s"` -- `condition_val` (`list`): values to fill the placeholderin 'condition' with -- `offset` (`int`): number of records to be skipped -- `limit` (`int`): max number of records to be returned +- `table_name` (`str`): name of the table to SELECT from +- `columns` (`List[str]`): columns to include in the result +- `filter_conditions` (`[(key,operator,value)]`): - eq for == - lt for < - ge for >= - in for in_ - like for like +- `json_filter_conditions` (`[(col,key,value)]`): conditions for JSONB column toquery that include JSON column name, key withing the JSON object in that column and the value to check the identity against. Therefore only '==' is supported in non-nested checks, e.g. [("other", "genome", "hg38")] +- `offset` (`int`): skip this number of rows +- `limit` (`int`): include this number of rows + + + + +```python +def select_txt(self, filter_templ: Optional[str]='', filter_params: Optional[Dict[str, Any]]={}, table_name: Optional[str]=None, offset: Optional[int]=None, limit: Optional[int]=None) -> List[Any] +``` + +Execute a query with a textual filter. Returns all results. + +To retrieve all table contents, leave the filter arguments out. 
+Table name defaults to the namespace +#### Parameters: + +- `filter_templ` (`str`): filter template with value placeholders,formatted as follows `id<:value and name=:name` +- `filter_params` (`Dict[str, Any]`): a mapping keys specified in the `filter_templ`to parameters that are supposed to replace the placeholders +- `table_name` (`str`): name of the table to query +- `offset` (`int`): skip this number of rows +- `limit` (`int`): include this number of rows #### Returns: -- `list[psycopg2.extras.DictRow]`: all table contents +- `List[Any]`: a list of matched records + + + + +```python +def session(self) +``` +Provide a transactional scope around a series of query operations, no commit afterwards. ```python -def set_status(self, status_identifier, record_identifier=None) +def set_status(self, status_identifier: str, record_identifier: str=None) -> None ``` Set pipeline run status. @@ -447,7 +530,7 @@ Status schema source ```python -def validate_schema(self) +def validate_schema(self) -> None ``` Check schema for any possible issues @@ -461,4 +544,4 @@ Check schema for any possible issues -*Version Information: `pipestat` v0.0.3-dev, generated by `lucidoc` v0.4.3* +*Version Information: `pipestat` v0.1.0-dev, generated by `lucidoc` v0.4.2* diff --git a/docs_jupyter/python_api.ipynb b/docs_jupyter/python_api.ipynb index 85d972a4..3fc17a69 100644 --- a/docs_jupyter/python_api.ipynb +++ b/docs_jupyter/python_api.ipynb @@ -1105,13 +1105,103 @@ "psm_no_schema.set_status(status_identifier=\"running\", record_identifier=\"sample1\")\n", "psm_no_schema.get_status(record_identifier=\"sample1\")" ] + }, + { + "source": [ + "## Extras\n", + "\n", + "Here are some additional convenience methods \n", + "\n", + "### Methods avaialable to a database-backed `PipestatManager`\n", + "\n", + "To make it possible to benefit from the performance gain provided by the database backend you may use the following `PipestatManager` methods or properties\n", + "\n" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "- `get_orm` method\n", + "\n", + "This way you can retrieve the object-relational mapper for the `PipestatManger` instance.\n", + "\n", + "\n", + "```python\n", + "TestORM = psm.get_orm(\"test\")\n", + "```\n" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "\n", + "- `session` property\n", + "\n", + "This is a universal way to get access to a database session. The `session` property is a context manager, so it needs to be used in a `with` block.\n", + "\n", + "Database queries of any kind can be performed with the `PipestatManager.session` property. Although, it requires knowledge of [`sqlalchemy`](https://www.sqlalchemy.org/) library. If you'd like to perform simple filtered queries the `select*` methods described below may be sufficient." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "```python\n", + "# get the object-relational mapper\n", + "TestORM = psm.get_orm(\"test\")\n", + "\n", + "# query the database\n", + "with psm.session as s:\n", + " results = s.query(TestORM).filter(TestORM.number_of_things > 1).all()\n", + "```\n", + "\n", + "For more information regarding database querying and ORM concepts please refer to the [`sqlalchemy` library documentation](https://www.sqlalchemy.org/)\n", + "\n", + "- `select` method\n", + "\n", + "This way you can perform a `SELECT` query on the results database. This is the safest way of performing custom `SELECT`s. 
Here are the avaialable operators:\n", + "- `eq` for ==\n", + "- `lt` for <\n", + "- `ge` for >=\n", + "- `in` for in_\n", + "- `like` for LIKE\n", + "\n", + "The operators need to be provided in a list filter condition tuples in the following way: `[(key,operator,value)]`, for example `[(\"id\", \"eq\", 1)]` would match the record with value `1` in `id` column. Multiple filter conditions are treated as `AND`.\n", + "\n", + "Columns of `JSONB` type are treated differently. The conditions for `JSONB` columns include JSON column name, key within the JSON object in that column and the value to check the identity against. Therefore only '==' operation is supported in non-nested checks, for example `[(\"other\", \"genome\", \"hg38\")]` would match the records with value `hg38` for `genome` key in the `other` (`JSONB`) column.\n", + "\n", + "```python\n", + "select(\n", + " filter_conditions=[(\"id\", \"eq\", 1)], \n", + " json_filter_conditions=[(\"other\", \"genome\", \"hg38\")], \n", + " offset=1, \n", + " limit=10,\n", + ")\n", + "```\n", + "\n", + "- `select_txt` method\n", + "\n", + "This way you can perform a more flexible, text based query `SELECT`. Please note the security shortcomings related to this method and do not pass the input text from external sources, which may result in SQL injections.\n", + "\n", + "```python\n", + "select_txt(\n", + " filter_templ=\"id<:value and name=:name\", \n", + " filter_params={\"value\": 10, \"name\": \"test\"}, \n", + " offset=1, \n", + " limit=10,\n", + ")\n", + "```" + ], + "cell_type": "markdown", + "metadata": {} } ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python395jvsc74a57bd0aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49", + "display_name": "Python 3.9.5 64-bit" }, "language_info": { "codemirror_mode": { @@ -1123,9 +1213,14 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.4" + "version": "3.9.5" + }, + "metadata": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/pipestat/pipestat.py b/pipestat/pipestat.py index ba98f958..ca195460 100644 --- a/pipestat/pipestat.py +++ b/pipestat/pipestat.py @@ -7,8 +7,7 @@ import sqlalchemy.orm from attmap import PathExAttMap as PXAM from jsonschema import validate -from sqlalchemy import Column, ForeignKey, create_engine -from sqlalchemy import text +from sqlalchemy import Column, ForeignKey, create_engine, text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import ( DeclarativeMeta, @@ -886,7 +885,7 @@ def _table_to_dict(self) -> None: Create a dictionary from the database table data """ with self.session as s: - records = s.query(self._get_orm(self.namespace)).all() + records = s.query(self.get_orm(self.namespace)).all() _LOGGER.debug(f"Reading data from database for '{self.namespace}' namespace") for record in records: record_id = getattr(record, RECORD_ID) @@ -941,7 +940,7 @@ def _count_rows(self, table_name: str) -> int: with self.session as s: return s.query(self[DB_ORMS_KEY][table_name].id).count() - def _get_orm(self, table_name: str = None) -> Any: + def get_orm(self, table_name: str = None) -> Any: """ Get an object relational mapper class @@ -974,7 +973,7 @@ def check_record_exists( if self.file is None: with self.session as s: return ( - s.query(self._get_orm(table_name).id) + 
s.query(self.get_orm(table_name).id) .filter_by(record_identifier=record_identifier) .first() is not None @@ -1032,7 +1031,7 @@ def _check_which_results_exist_db( rid = self._strict_record_id(rid) with self.session as s: record = ( - s.query(self._get_orm(table_name)) + s.query(self.get_orm(table_name)) .filter_by(record_identifier=rid) .first() ) @@ -1074,7 +1073,7 @@ def select( limit: Optional[int] = None, ) -> List[Any]: """ - Perform a SELECT on the table + Perform a `SELECT` on the table :param str table_name: name of the table to SELECT from :param List[str] columns: columns to include in the result @@ -1092,7 +1091,7 @@ def select( :param int limit: include this number of rows """ - ORM = self._get_orm(table_name or self.namespace) + ORM = self.get_orm(table_name or self.namespace) with self.session as s: if columns is not None: query = s.query(*[getattr(ORM, column) for column in columns]) @@ -1180,7 +1179,7 @@ def _retrieve_db( with self.session as s: record = ( - s.query(self._get_orm(table_name)) + s.query(self.get_orm(table_name)) .filter_by(record_identifier=record_identifier) .first() ) @@ -1223,10 +1222,9 @@ def select_txt( f"The {self.__class__.__name__} object is not backed by a database. " f"This operation is not supported for file backend." ) - table_name = table_name or self.namespace with self.session as s: q = ( - s.query(self._get_orm(table_name)) + s.query(self.get_orm(table_name or self.namespace)) .filter(text(filter_templ)) .params(**filter_params) ) @@ -1344,7 +1342,7 @@ def _report_db( :return int: updated/inserted row """ record_identifier = self._strict_record_id(record_identifier) - ORMClass = self._get_orm(table_name) + ORMClass = self.get_orm(table_name) values.update({RECORD_ID: record_identifier}) if not self.check_record_exists( record_identifier=record_identifier, table_name=table_name @@ -1469,7 +1467,7 @@ def _remove_db( """ table_name = table_name or self.namespace record_identifier = self._strict_record_id(record_identifier) - ORMClass = self._get_orm(table_name=table_name) + ORMClass = self.get_orm(table_name=table_name) if self.check_record_exists( record_identifier=record_identifier, table_name=table_name ): From fe7e4ed0c9ca2363af78835437ddbe3708f66e1f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 3 Jun 2021 16:31:13 -0400 Subject: [PATCH 45/65] document db_column section in the result schema --- docs/pipestat_specification.md | 16 +++ docs_jupyter/python_api.ipynb | 220 ++++++++++++++++----------------- 2 files changed, 122 insertions(+), 114 deletions(-) diff --git a/docs/pipestat_specification.md b/docs/pipestat_specification.md index f3608619..a1a987e2 100644 --- a/docs/pipestat_specification.md +++ b/docs/pipestat_specification.md @@ -140,6 +140,22 @@ log_file: The highlighted results can be later retrieved by pipestat clients via `PipestatManager.highlighted_results` property, which simply returns a list of result identifiers. +## Database columns configuration (DB backend only) +If the `PipestatManager` object is backed by a database, the database columns can be easily configured using the results schema via `db_column` section. For example: + +```yaml +important_numeric_id: + type: integer + description: "An important ID that must be unique and always exist" + db_column: + unique: true + nullable: false +``` + +The values provided in the `db_column` section are passed to the `sqlalchemy.schema.Column` constructor. 
Therefore, please refer to [`sqlalchemy.Column` class constructor documentation](https://docs.sqlalchemy.org/en/14/core/metadata.html?highlight=column#sqlalchemy.schema.Column.__init__) to learn more about the keys that can be specified in this section. + +In the above example, the `important_numeric_id` result reported with the `PipestatManager` instance initialized with that schema will be forced to be always provided and unique across all records. + # Status schema Apart from results reporting pipestat provides a robust pipeline status management system, which can be used to report pipeline status from within the pipeline and monitor pipeline's status in other software. Status schema file defines the possible pipeline status identifiers and provides other metadata, like `description` or `color` for display purposes. diff --git a/docs_jupyter/python_api.ipynb b/docs_jupyter/python_api.ipynb index 3fc17a69..6907c65f 100644 --- a/docs_jupyter/python_api.ipynb +++ b/docs_jupyter/python_api.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -51,14 +51,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 70, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmpylsi8onv.yaml\n" + "/var/folders/h8/8npwnh2s4rb8lr6hsy2ydrsh0000gp/T/tmpx1zhr9bi.yaml\n" ] } ], @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -99,18 +99,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 72, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "'test'" ] }, - "execution_count": 4, "metadata": {}, - "output_type": "execute_result" + "execution_count": 72 } ], "source": [ @@ -126,18 +126,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 73, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "'sample1'" ] }, - "execution_count": 5, "metadata": {}, - "output_type": "execute_result" + "execution_count": 73 } ], "source": [ @@ -153,18 +153,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 74, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "YacAttMap: {}" ] }, - "execution_count": 6, "metadata": {}, - "output_type": "execute_result" + "execution_count": 74 } ], "source": [ @@ -194,10 +194,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 75, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'number_of_things': {'type': 'integer', 'description': 'Number of things'},\n", @@ -211,12 +212,15 @@ " 'output_file': {'type': 'file',\n", " 'description': 'This a path to the output file'},\n", " 'output_image': {'type': 'image',\n", - " 'description': 'This a path to the output image'}}" + " 'description': 'This a path to the output image'},\n", + " 'md5sum': {'type': 'string',\n", + " 'description': 'MD5SUM of an object',\n", + " 'highlight': True,\n", + " 'db_column': {'unique': True, 'nullable': False}}}" ] }, - "execution_count": 7, "metadata": {}, - "output_type": "execute_result" + "execution_count": 75 } ], "source": [ @@ -232,10 +236,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 76, "metadata": {}, "outputs": [ { + 
"output_type": "execute_result", "data": { "text/plain": [ "{'type': 'object',\n", @@ -244,9 +249,8 @@ " 'required': ['path', 'title']}" ] }, - "execution_count": 8, "metadata": {}, - "output_type": "execute_result" + "execution_count": 76 } ], "source": [ @@ -263,24 +267,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 77, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "'title' is a required property\n", - "\n", - "Failed validating 'required' in schema:\n", - " {'description': 'This a path to the output file',\n", - " 'properties': {'path': {'type': 'string'},\n", - " 'title': {'type': 'string'}},\n", - " 'required': ['path', 'title'],\n", - " 'type': 'object'}\n", - "\n", - "On instance:\n", - " {'path': '/home/user/path.csv'}\n" + "'title' is a required property\n\nFailed validating 'required' in schema:\n {'description': 'This a path to the output file',\n 'properties': {'path': {'type': 'string'},\n 'title': {'type': 'string'}},\n 'required': ['path', 'title'],\n 'type': 'object'}\n\nOn instance:\n {'path': '/home/user/path.csv'}\n" ] } ], @@ -300,26 +294,25 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 78, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "Reported records for 'sample1' in 'test' namespace:\n", - " - output_file: {'path': '/home/user/path.csv', 'title': 'CSV file with some data'}\n" + "Reported records for 'sample1' in 'test' namespace:\n - output_file: {'path': '/home/user/path.csv', 'title': 'CSV file with some data'}\n" ] }, { + "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, - "execution_count": 10, "metadata": {}, - "output_type": "execute_result" + "execution_count": 78 } ], "source": [ @@ -342,10 +335,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 79, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "test:\n", @@ -355,9 +349,8 @@ " title: CSV file with some data" ] }, - "execution_count": 11, "metadata": {}, - "output_type": "execute_result" + "execution_count": 79 } ], "source": [ @@ -373,25 +366,25 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 80, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "These results exist for 'sample1': output_file\n" ] }, { + "output_type": "execute_result", "data": { "text/plain": [ "False" ] }, - "execution_count": 12, "metadata": {}, - "output_type": "execute_result" + "execution_count": 80 } ], "source": [ @@ -407,12 +400,12 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 81, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "These results exist for 'sample1': output_file\n", "Overwriting existing results: output_file\n", @@ -421,6 +414,7 @@ ] }, { + "output_type": "execute_result", "data": { "text/plain": [ "test:\n", @@ -430,9 +424,8 @@ " title: new CSV file with some data" ] }, - "execution_count": 13, "metadata": {}, - "output_type": "execute_result" + "execution_count": 81 } ], "source": [ @@ -457,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -471,10 +464,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 83, "metadata": {}, "outputs": [ { + "output_type": "execute_result", 
"data": { "text/plain": [ "test:\n", @@ -484,9 +478,8 @@ " title: new CSV file with some data" ] }, - "execution_count": 15, "metadata": {}, - "output_type": "execute_result" + "execution_count": 83 } ], "source": [ @@ -502,14 +495,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 84, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmpylsi8onv.yaml\n", + "/var/folders/h8/8npwnh2s4rb8lr6hsy2ydrsh0000gp/T/tmpx1zhr9bi.yaml\n", "test:\n", " sample1:\n", " output_file:\n", @@ -538,18 +531,18 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 85, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'type': 'integer', 'description': 'Number of things'}" ] }, - "execution_count": 17, "metadata": {}, - "output_type": "execute_result" + "execution_count": 85 } ], "source": [ @@ -558,26 +551,25 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 86, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "Reported records for 'sample1' in 'test' namespace:\n", - " - number_of_things: 10\n" + "Reported records for 'sample1' in 'test' namespace:\n - number_of_things: 10\n" ] }, { + "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, - "execution_count": 18, "metadata": {}, - "output_type": "execute_result" + "execution_count": 86 } ], "source": [ @@ -593,12 +585,12 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 87, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "int() argument must be a string, a bytes-like object or a number, not 'list'\n" ] @@ -622,10 +614,11 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 88, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "test:\n", @@ -636,9 +629,8 @@ " number_of_things: 10" ] }, - "execution_count": 20, "metadata": {}, - "output_type": "execute_result" + "execution_count": 88 } ], "source": [ @@ -658,18 +650,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 89, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "'10'" ] }, - "execution_count": 21, "metadata": {}, - "output_type": "execute_result" + "execution_count": 89 } ], "source": [ @@ -685,10 +677,11 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 90, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'output_file': {'path': '/home/user/path_new.csv',\n", @@ -696,9 +689,8 @@ " 'number_of_things': '10'}" ] }, - "execution_count": 22, "metadata": {}, - "output_type": "execute_result" + "execution_count": 90 } ], "source": [ @@ -721,25 +713,25 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 91, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "Removed result 'number_of_things' for record 'sample1' from 'test' namespace\n" ] }, { + "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, - "execution_count": 23, "metadata": {}, - "output_type": "execute_result" + "execution_count": 91 } ], "source": [ @@ -755,25 +747,25 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 92, "metadata": {}, "outputs": [ { - 
"name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "Removing 'sample1' record\n" ] }, { + "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, - "execution_count": 24, "metadata": {}, - "output_type": "execute_result" + "execution_count": 92 } ], "source": [ @@ -789,7 +781,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 93, "metadata": { "pycharm": { "name": "#%%\n" @@ -797,14 +789,14 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "test: OrderedDict()" ] }, - "execution_count": 25, "metadata": {}, - "output_type": "execute_result" + "execution_count": 93 } ], "source": [ @@ -822,14 +814,14 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 94, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmplfz72deu.yaml\n" + "/var/folders/h8/8npwnh2s4rb8lr6hsy2ydrsh0000gp/T/tmp6gz01fu4.yaml\n" ] } ], @@ -856,10 +848,11 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 95, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'number_of_things': {'type': 'integer', 'description': 'Number of things'},\n", @@ -889,9 +882,8 @@ " 'description': 'Pipeline version'}}" ] }, - "execution_count": 27, "metadata": {}, - "output_type": "execute_result" + "execution_count": 95 } ], "source": [ @@ -907,18 +899,18 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 96, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "['log', 'profile', 'commands', 'version']" ] }, - "execution_count": 28, "metadata": {}, - "output_type": "execute_result" + "execution_count": 96 } ], "source": [ @@ -938,7 +930,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -954,18 +946,18 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 98, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "'running'" ] }, - "execution_count": 30, "metadata": {}, - "output_type": "execute_result" + "execution_count": 98 } ], "source": [ @@ -981,10 +973,11 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 99, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'running': {'description': 'the pipeline is running',\n", @@ -998,9 +991,8 @@ " 'color': [169, 169, 169]}}" ] }, - "execution_count": 31, "metadata": {}, - "output_type": "execute_result" + "execution_count": 99 } ], "source": [ @@ -1029,14 +1021,14 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 100, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "/var/folders/z_/bkm4rp412zjctjm28b54tbvr0000gn/T/tmp8q7552ae.yaml\n" + "/var/folders/h8/8npwnh2s4rb8lr6hsy2ydrsh0000gp/T/tmpm44tb0h2.yaml\n" ] } ], @@ -1060,12 +1052,12 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 101, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Results schema not found. The schema is required to report results. 
It needs to be supplied to the object constructor.\n" ] @@ -1087,18 +1079,18 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 102, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "'running'" ] }, - "execution_count": 34, "metadata": {}, - "output_type": "execute_result" + "execution_count": 102 } ], "source": [ @@ -1223,4 +1215,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} From 20d9eda9e50f14f730190c8e61318106603fa85d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 4 Jun 2021 14:58:49 -0400 Subject: [PATCH 46/65] reformat, remove db column params from test schema --- .github/workflows/run-pytest.yml | 2 +- .pre-commit-config.yaml | 6 +-- MANIFEST.in | 2 +- README.md | 4 +- codecov.yml | 2 +- docs/README.md | 4 +- docs/api_docs.md | 4 +- docs/autodoc_build/pipestat.md | 6 +-- docs/changelog.md | 1 + docs_jupyter/cli.md | 41 +++++++++---------- pipestat/pipestat.py | 2 +- pipestat/schemas/status_schema.yaml | 2 +- requirements/requirements-doc.txt | 6 +-- requirements/requirements-test.txt | 2 +- tests/data/custom_status_schema.yaml | 2 +- tests/data/sample_output_schema.yaml | 4 -- .../data/sample_output_schema_highlight.yaml | 2 +- update_api_docs.sh | 2 +- 18 files changed, 43 insertions(+), 51 deletions(-) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 216adf26..7fe43476 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -47,4 +47,4 @@ jobs: uses: codecov/codecov-action@v1 with: file: ./coverage.xml - name: py-${{ matrix.python-version }}-${{ matrix.os }} \ No newline at end of file + name: py-${{ matrix.python-version }}-${{ matrix.os }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ab5489e2..de8f7bdf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 + rev: v4.0.1 hooks: - id: trailing-whitespace - id: check-yaml @@ -9,12 +9,12 @@ repos: - id: trailing-whitespace - repo: https://github.com/PyCQA/isort - rev: 5.7.0 + rev: 5.8.0 hooks: - id: isort args: ["--profile", "black"] - repo: https://github.com/psf/black - rev: 20.8b1 + rev: 21.5b2 hooks: - id: black diff --git a/MANIFEST.in b/MANIFEST.in index 19044bd6..be6e936b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ include requirements/* include README.md -include pipestat/schemas/* \ No newline at end of file +include pipestat/schemas/* diff --git a/README.md b/README.md index ea931792..cec9c068 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ import pipestat psm = pipestat.PipestatManager() psm.report(values={"result_name": 1.1}) ``` - + ## Retrieve a result From command line: @@ -63,5 +63,3 @@ import pipestat psm = pipestat.PipestatManager() psm.retrieve(result_identifier="result_name") ``` - - diff --git a/codecov.yml b/codecov.yml index bc5f12fb..be72bc25 100644 --- a/codecov.yml +++ b/codecov.yml @@ -2,4 +2,4 @@ ignore: - "*/argparser.py" - "*/cli.py" - "*/__main__.py" - - "setup.py" \ No newline at end of file + - "setup.py" diff --git a/docs/README.md b/docs/README.md index ea931792..cec9c068 100644 --- a/docs/README.md +++ b/docs/README.md @@ -46,7 +46,7 @@ import pipestat psm = pipestat.PipestatManager() psm.report(values={"result_name": 1.1}) ``` - + ## Retrieve a result From command line: @@ -63,5 +63,3 @@ import pipestat psm = pipestat.PipestatManager() 
psm.retrieve(result_identifier="result_name") ``` - - diff --git a/docs/api_docs.md b/docs/api_docs.md index 1eb7bc19..005e2bb7 100644 --- a/docs/api_docs.md +++ b/docs/api_docs.md @@ -8,7 +8,7 @@ document.addEventListener('DOMContentLoaded', (event) => {