From a8d86040e0b54fb0b5f29e8a6235b22f898e3ab4 Mon Sep 17 00:00:00 2001 From: Ben Knoll Date: Wed, 19 Apr 2023 14:22:31 -0400 Subject: [PATCH] release/v3.2.1 (#174) * Small changes, documentation updates * Fixed print_all_processors_metadata importing modules at cli initialization --- .github/workflows/ci.yml | 4 +- .../biomedicus_client/{pipeline => }/_run.py | 4 +- .../src/biomedicus_client/cli.py | 15 +++++--- .../{pipeline => }/default_pipeline.py | 38 +++++++++---------- .../pipeline_confs/__init__.py | 21 ++++++++++ .../biomedicus_default_pipeline.yml | 0 .../rtf_to_text_pipeline.yml | 0 .../scaleout_pipeline.yml} | 0 .../{pipeline => }/rtf_to_text.py | 25 ++++++------ .../{pipeline => }/sources.py | 10 ++++- pyproject.toml | 6 ++- python/biomedicus/cli.py | 12 ++++-- .../biomedicus/deployment/confs}/__init__.py | 11 ++++-- .../biomedicus_deploy.yml} | 0 .../rtf_to_text_deploy.yml} | 0 .../scaleout_deploy.yml} | 0 .../deployment/default_deployment.py | 28 +++++++------- python/biomedicus/deployment/rtf_to_text.py | 29 +++++++------- .../biomedicus/examples/tutorial/__init__.py | 0 .../examples/tutorial/medications.py | 20 ++++++++++ .../examples/{ => tutorial}/sql_pipeline.py | 5 ++- .../{ => tutorial}/sql_pipeline_rtf.py | 30 ++++++++------- .../{ => tutorial}/sql_pipeline_rtf_only.py | 32 +++++++++------- .../print_all_processors_metadata.py | 6 +-- python/tests/scaleout/test_scaleout.py | 4 +- tools/docker/Dockerfile | 2 +- 26 files changed, 191 insertions(+), 111 deletions(-) rename biomedicus_client/src/biomedicus_client/{pipeline => }/_run.py (95%) rename biomedicus_client/src/biomedicus_client/{pipeline => }/default_pipeline.py (76%) create mode 100644 biomedicus_client/src/biomedicus_client/pipeline_confs/__init__.py rename biomedicus_client/src/biomedicus_client/{pipeline => pipeline_confs}/biomedicus_default_pipeline.yml (100%) rename biomedicus_client/src/biomedicus_client/{pipeline => pipeline_confs}/rtf_to_text_pipeline.yml (100%) rename biomedicus_client/src/biomedicus_client/{pipeline/scaleout_pipeline_config.yml => pipeline_confs/scaleout_pipeline.yml} (100%) rename biomedicus_client/src/biomedicus_client/{pipeline => }/rtf_to_text.py (83%) rename biomedicus_client/src/biomedicus_client/{pipeline => }/sources.py (95%) rename {biomedicus_client/src/biomedicus_client/pipeline => python/biomedicus/deployment/confs}/__init__.py (56%) rename python/biomedicus/deployment/{biomedicus_deploy_config.yml => confs/biomedicus_deploy.yml} (100%) rename python/biomedicus/deployment/{rtf_to_text_deploy_config.yml => confs/rtf_to_text_deploy.yml} (100%) rename python/biomedicus/deployment/{scaleout_deploy_config.yml => confs/scaleout_deploy.yml} (100%) create mode 100644 python/biomedicus/examples/tutorial/__init__.py create mode 100644 python/biomedicus/examples/tutorial/medications.py rename python/biomedicus/examples/{ => tutorial}/sql_pipeline.py (96%) rename python/biomedicus/examples/{ => tutorial}/sql_pipeline_rtf.py (66%) rename python/biomedicus/examples/{ => tutorial}/sql_pipeline_rtf_only.py (66%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 456a49a4..26d7fde5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,14 +44,14 @@ jobs: pip install flake8 pytest pip install git+https://github.com/nlpie/mtap@main#egg=mtap pip install ./biomedicus_client - pip install .[test] --extra-index-url https://download.pytorch.org/whl/cpu + pip install .[test,stanza] --extra-index-url https://download.pytorch.org/whl/cpu - name: Install dependencies (release) if: ${{ startsWith(github.head_ref, 'release') && success() }} run: | python -m pip install --upgrade pip setuptools wheel pip install flake8 pytest SETUPTOOLS_SCM_PRETEND_VERSION=${GITHUB_HEAD_REF##*/} pip install ./biomedicus_client - pip install .[test] --extra-index-url https://download.pytorch.org/whl/cpu + pip install .[test,stanza] --extra-index-url https://download.pytorch.org/whl/cpu - name: Lint with flake8 run: | pip install flake8 diff --git a/biomedicus_client/src/biomedicus_client/pipeline/_run.py b/biomedicus_client/src/biomedicus_client/_run.py similarity index 95% rename from biomedicus_client/src/biomedicus_client/pipeline/_run.py rename to biomedicus_client/src/biomedicus_client/_run.py index 2eaf7294..dc4dc842 100644 --- a/biomedicus_client/src/biomedicus_client/pipeline/_run.py +++ b/biomedicus_client/src/biomedicus_client/_run.py @@ -16,12 +16,12 @@ from argparse import ArgumentParser from typing import List +from biomedicus_client.sources import WatcherSource, rtf_source, RtfHandler, TxtHandler from mtap import events_client from mtap.pipeline import FilesInDirectoryProcessingSource +from biomedicus_client import default_pipeline from biomedicus_client.cli_tools import Command -from biomedicus_client.pipeline import default_pipeline -from biomedicus_client.pipeline.sources import WatcherSource, RtfHandler, rtf_source, TxtHandler class RunCommand(Command): diff --git a/biomedicus_client/src/biomedicus_client/cli.py b/biomedicus_client/src/biomedicus_client/cli.py index 890b2051..3168bdb9 100644 --- a/biomedicus_client/src/biomedicus_client/cli.py +++ b/biomedicus_client/src/biomedicus_client/cli.py @@ -15,13 +15,18 @@ import logging +from biomedicus_client import pipeline_confs +from biomedicus_client._run import RunCommand from biomedicus_client.cli_tools import create_parser, WriteConfigsCommand -from biomedicus_client.pipeline import RunCommand, default_pipeline, rtf_to_text +from biomedicus_client.rtf_to_text import RunRtfToTextCommand + +__all__ = ('main',) + CLIENT_CONFIGS = { - 'pipeline': default_pipeline.default_pipeline_config, - 'scaleout_pipeline': default_pipeline.scaleout_pipeline_config, - 'rtf_only_pipeline': rtf_to_text.default_rtf_to_text_pipeline_config + 'pipeline': pipeline_confs.DEFAULT, + 'scaleout_pipeline': pipeline_confs.SCALEOUT, + 'rtf_only_pipeline': pipeline_confs.RTF_TO_TEXT } @@ -29,7 +34,7 @@ def main(args=None): parser = create_parser( WriteConfigsCommand(CLIENT_CONFIGS), RunCommand(), - rtf_to_text.RunRtfToTextCommand() + RunRtfToTextCommand() ) conf = parser.parse_args(args) logging.basicConfig(level=conf.log_level) diff --git a/biomedicus_client/src/biomedicus_client/pipeline/default_pipeline.py b/biomedicus_client/src/biomedicus_client/default_pipeline.py similarity index 76% rename from biomedicus_client/src/biomedicus_client/pipeline/default_pipeline.py rename to biomedicus_client/src/biomedicus_client/default_pipeline.py index b3a36723..66074f26 100644 --- a/biomedicus_client/src/biomedicus_client/pipeline/default_pipeline.py +++ b/biomedicus_client/src/biomedicus_client/default_pipeline.py @@ -17,17 +17,16 @@ from pathlib import Path from typing import Optional, Union -from importlib_resources import files +from importlib_resources import as_file from mtap import Pipeline, LocalProcessor, RemoteProcessor from mtap.serialization import SerializerRegistry, SerializationProcessor -__all__ = ['default_pipeline_config', 'scaleout_pipeline_config', 'create', 'from_args', 'argument_parser'] +__all__ = ['create', 'from_args', 'argument_parser'] -default_pipeline_config = files('biomedicus_client.pipeline').joinpath('biomedicus_default_pipeline.yml') -scaleout_pipeline_config = files('biomedicus_client.pipeline').joinpath('scaleout_pipeline_config.yml') +from biomedicus_client import pipeline_confs -def create(config: Optional[Union[str, Path]] = None, +def create(config: Optional[Union[str, bytes, Path]] = None, *, events_addresses: Optional[str] = None, rtf: bool = False, rtf_address: str = "localhost:50200", @@ -39,27 +38,26 @@ def create(config: Optional[Union[str, Path]] = None, """The biomedicus default pipeline for processing clinical documents. Args - config (Optional[Union[str, Path]]): A path to an MTAP pipeline configuration YAML file to - use instead of the default. + config: A path to an MTAP pipeline configuration YAML file to use instead of the default. Keyword Args - events_addresses (Optional[str]): The address (or addresses, comma separated) for the - events service. - rtf (bool): Whether to include the rtf processor at the start of the pipeline. The rtf - processor will convert RTF data stored in the "rtf" Binary on the event to the - "plaintext" Document. - rtf_address (str): The address of the remote rtf processor. - serializer (Optional[str]): An optional serializer (examples: 'json', 'yml', 'pickle'). - output_directory (Optional[Path]): Where the serializer should output the serialized files. - address (Optional[str]): An optional address to use for all processors. + events_addresses: The address (or addresses, comma separated) for the events service. + rtf: Whether to include the rtf processor at the start of the pipeline. The rtf processor will convert RTF data + stored in the "rtf" Binary on the event to the "plaintext" Document. + rtf_address: The address of the remote rtf processor. + serializer: An optional serializer (examples: 'json', 'yml', 'pickle'). + output_directory: Where the serializer should output the serialized files. + address: An optional address to use for all processors. Returns Pipeline """ if config is None: - config = default_pipeline_config - pipeline = Pipeline.from_yaml_file(config) + with as_file(pipeline_confs.DEFAULT) as config: + pipeline = Pipeline.from_yaml_file(config) + else: + pipeline = Pipeline.from_yaml_file(config) if events_addresses is not None: pipeline.events_address = events_addresses @@ -73,6 +71,9 @@ def create(config: Optional[Union[str, Path]] = None, pipeline.append(ser_comp) if rtf: + if rtf_address is None: + rtf_address = 'localhost:50200' + rtf_processor = RemoteProcessor(processor_name='biomedicus-rtf', address=rtf_address, params={'output_document_name': 'plaintext'}) @@ -126,7 +127,6 @@ def argument_parser() -> ArgumentParser: ) parser.add_argument( '--rtf-address', - default="localhost:50200", help="The address (or addresses, comma separated) for the rtf to text converter processor." ) parser.add_argument( diff --git a/biomedicus_client/src/biomedicus_client/pipeline_confs/__init__.py b/biomedicus_client/src/biomedicus_client/pipeline_confs/__init__.py new file mode 100644 index 00000000..6b7c7fd7 --- /dev/null +++ b/biomedicus_client/src/biomedicus_client/pipeline_confs/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2023 Regents of the University of Minnesota. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provides importlib_resources Traversable objects for built-in pipeline configuration files.""" +from importlib_resources import files + +__all__ = ('DEFAULT', 'SCALEOUT', 'RTF_TO_TEXT') + +DEFAULT = files(__name__).joinpath("biomedicus_default_pipeline.yml") +SCALEOUT = files(__name__).joinpath("scaleout_pipeline.yml") +RTF_TO_TEXT = files(__name__).joinpath("rtf_to_text_pipeline.yml") diff --git a/biomedicus_client/src/biomedicus_client/pipeline/biomedicus_default_pipeline.yml b/biomedicus_client/src/biomedicus_client/pipeline_confs/biomedicus_default_pipeline.yml similarity index 100% rename from biomedicus_client/src/biomedicus_client/pipeline/biomedicus_default_pipeline.yml rename to biomedicus_client/src/biomedicus_client/pipeline_confs/biomedicus_default_pipeline.yml diff --git a/biomedicus_client/src/biomedicus_client/pipeline/rtf_to_text_pipeline.yml b/biomedicus_client/src/biomedicus_client/pipeline_confs/rtf_to_text_pipeline.yml similarity index 100% rename from biomedicus_client/src/biomedicus_client/pipeline/rtf_to_text_pipeline.yml rename to biomedicus_client/src/biomedicus_client/pipeline_confs/rtf_to_text_pipeline.yml diff --git a/biomedicus_client/src/biomedicus_client/pipeline/scaleout_pipeline_config.yml b/biomedicus_client/src/biomedicus_client/pipeline_confs/scaleout_pipeline.yml similarity index 100% rename from biomedicus_client/src/biomedicus_client/pipeline/scaleout_pipeline_config.yml rename to biomedicus_client/src/biomedicus_client/pipeline_confs/scaleout_pipeline.yml diff --git a/biomedicus_client/src/biomedicus_client/pipeline/rtf_to_text.py b/biomedicus_client/src/biomedicus_client/rtf_to_text.py similarity index 83% rename from biomedicus_client/src/biomedicus_client/pipeline/rtf_to_text.py rename to biomedicus_client/src/biomedicus_client/rtf_to_text.py index 05a77c95..04ad18b6 100644 --- a/biomedicus_client/src/biomedicus_client/pipeline/rtf_to_text.py +++ b/biomedicus_client/src/biomedicus_client/rtf_to_text.py @@ -15,19 +15,17 @@ from argparse import ArgumentParser, Namespace from os import PathLike - -from importlib_resources import files from pathlib import Path from typing import Union, Optional, List +from biomedicus_client.sources import rtf_source +from importlib_resources import as_file from mtap import Pipeline, LocalProcessor, EventProcessor, processor, events_client +from biomedicus_client import pipeline_confs from biomedicus_client.cli_tools import Command -from biomedicus_client.pipeline.sources import rtf_source - -__all__ = ['default_rtf_to_text_pipeline_config', 'create', 'from_args', 'argument_parser', 'RunRtfToTextCommand'] -default_rtf_to_text_pipeline_config = files('biomedicus_client.pipeline').joinpath('rtf_to_text_pipeline.yml') +__all__ = ['create', 'from_args', 'argument_parser', 'RunRtfToTextCommand'] @processor('write-plaintext') @@ -57,8 +55,10 @@ def create(config: Optional[Union[str, PathLike]] = None, """ if config is None: - config = default_rtf_to_text_pipeline_config - pipeline = Pipeline.from_yaml_file(config) + with as_file(pipeline_confs.RTF_TO_TEXT) as config: + pipeline = Pipeline.from_yaml_file(config) + else: + pipeline = Pipeline.from_yaml_file(config) if events_addresses is not None: pipeline.events_address = events_addresses @@ -78,18 +78,15 @@ def argument_parser(): """ parser = ArgumentParser(add_help=False) - parser.add_argument('--config', default=None, - help='Path to the pipeline configuration file.') + parser.add_argument('--config', default=None, help='Path to the pipeline configuration file.') parser.add_argument('--output_directory', '-o', default='output', help="The output directory to write txt out.") - parser.add_argument('--events-addresses', default=None, - help="The address for the events service.") + parser.add_argument('--events-addresses', default=None, help="The address for the events service.") return parser def from_args(args: Namespace) -> Pipeline: if not isinstance(args, Namespace): - raise ValueError('"args" parameter should be the parsed arguments from ' - '"rtf_to_text.argument_parser()"') + raise ValueError('"args" parameter should be the parsed arguments from "rtf_to_text.argument_parser()"') return create(**vars(args)) diff --git a/biomedicus_client/src/biomedicus_client/pipeline/sources.py b/biomedicus_client/src/biomedicus_client/sources.py similarity index 95% rename from biomedicus_client/src/biomedicus_client/pipeline/sources.py rename to biomedicus_client/src/biomedicus_client/sources.py index e0d26b0a..30aec765 100644 --- a/biomedicus_client/src/biomedicus_client/pipeline/sources.py +++ b/biomedicus_client/src/biomedicus_client/sources.py @@ -16,14 +16,22 @@ import fnmatch import time from pathlib import Path +from typing import Generator, Iterator from mtap import Event from mtap.pipeline import ProcessingSource from mtap.types import EventsClient from watchdog.events import FileSystemEventHandler, FileSystemEvent +__all__ = [ + 'rtf_source', + 'RtfHandler', + 'TxtHandler', + 'WatcherSource' +] -def rtf_source(input_directory: Path, extension_glob: str, events_client: EventsClient): + +def rtf_source(input_directory: Path, extension_glob: str, events_client: EventsClient) -> Iterator[Event]: input_directory = Path(input_directory) for path in input_directory.rglob(extension_glob): with path.open('rb', errors=None) as f: diff --git a/pyproject.toml b/pyproject.toml index 4a0ad383..ba9db3a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,12 +34,11 @@ classifiers = [ 'Topic :: Text Processing :: Linguistic' ] dependencies = [ - "biomedicus_client==3.2.0", # We get mtap, tqdm, and importlib_resources transitively from the client + "biomedicus_client==3.2.1", # We get mtap, tqdm, and importlib_resources transitively from the client "numpy==1.24.2", "pyyaml==6.0", "regex==2023.3.23", "torch==2.0.0", - "stanza==1.5.0", "requests==2.28.2", "watchdog==3.0.0", ] @@ -52,6 +51,9 @@ test = [ docs = [ "sphinx==6.1.3", ] +stanza = [ + "stanza==1.5.0", +] [project.scripts] b9 = "biomedicus.cli:main" diff --git a/python/biomedicus/cli.py b/python/biomedicus/cli.py index f53a2407..e46175b5 100644 --- a/python/biomedicus/cli.py +++ b/python/biomedicus/cli.py @@ -16,8 +16,9 @@ import logging from biomedicus.deployment import ( - default_deployment, DownloadDataCommand, + confs as deployment_confs, + default_deployment, rtf_to_text ) from biomedicus.java_support import RunJavaCommand @@ -25,10 +26,13 @@ from biomedicus_client import cli_tools from biomedicus_client.cli_tools import WriteConfigsCommand +__all__ = ('main',) + + SERVER_CONFIGS = { - 'deploy': default_deployment.deployment_config, - 'scaleout_deploy': default_deployment.scaleout_deploy_config, - 'rtf_to_text': rtf_to_text.deployment_config + 'deploy': deployment_confs.DEFAULT, + 'scaleout_deploy': deployment_confs.SCALEOUT, + 'rtf_to_text': deployment_confs.RTF_TO_TEXT } diff --git a/biomedicus_client/src/biomedicus_client/pipeline/__init__.py b/python/biomedicus/deployment/confs/__init__.py similarity index 56% rename from biomedicus_client/src/biomedicus_client/pipeline/__init__.py rename to python/biomedicus/deployment/confs/__init__.py index b98b7ae1..88be1016 100644 --- a/biomedicus_client/src/biomedicus_client/pipeline/__init__.py +++ b/python/biomedicus/deployment/confs/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 Regents of the University of Minnesota. +# Copyright 2023 Regents of the University of Minnesota. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Support for running biomedicus pipelines.""" +"""Provides importlib_resources Traversable objects for built-in deployment configuration files.""" +from importlib_resources import files -from biomedicus_client.pipeline._run import RunCommand +__all__ = ('DEFAULT', 'SCALEOUT', 'RTF_TO_TEXT') + +DEFAULT = files(__name__).joinpath("biomedicus_deploy.yml") +SCALEOUT = files(__name__).joinpath("scaleout_deploy.yml") +RTF_TO_TEXT = files(__name__).joinpath("rtf_to_text_deploy.yml") diff --git a/python/biomedicus/deployment/biomedicus_deploy_config.yml b/python/biomedicus/deployment/confs/biomedicus_deploy.yml similarity index 100% rename from python/biomedicus/deployment/biomedicus_deploy_config.yml rename to python/biomedicus/deployment/confs/biomedicus_deploy.yml diff --git a/python/biomedicus/deployment/rtf_to_text_deploy_config.yml b/python/biomedicus/deployment/confs/rtf_to_text_deploy.yml similarity index 100% rename from python/biomedicus/deployment/rtf_to_text_deploy_config.yml rename to python/biomedicus/deployment/confs/rtf_to_text_deploy.yml diff --git a/python/biomedicus/deployment/scaleout_deploy_config.yml b/python/biomedicus/deployment/confs/scaleout_deploy.yml similarity index 100% rename from python/biomedicus/deployment/scaleout_deploy_config.yml rename to python/biomedicus/deployment/confs/scaleout_deploy.yml diff --git a/python/biomedicus/deployment/default_deployment.py b/python/biomedicus/deployment/default_deployment.py index 985b8abb..af94a7e5 100644 --- a/python/biomedicus/deployment/default_deployment.py +++ b/python/biomedicus/deployment/default_deployment.py @@ -12,45 +12,48 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging from argparse import ArgumentParser from contextlib import contextmanager from typing import List, Optional, ContextManager -from importlib_resources import files +from importlib_resources import as_file from mtap.deployment import Deployment +from biomedicus.deployment import confs from biomedicus.deployment._data_downloading import check_data from biomedicus.java_support import attach_biomedicus_jar from biomedicus_client.cli_tools import Command -logger = logging.getLogger(__name__) - -deployment_config = files('biomedicus.deployment').joinpath('biomedicus_deploy_config.yml') -scaleout_deploy_config = files('biomedicus.deployment').joinpath('scaleout_deploy_config.yml') - @contextmanager -def create_deployment(offline: bool = False, +def create_deployment(config: Optional[str] = None, + offline: bool = False, download_data: bool = False, noninteractive: bool = False, - config: Optional[str] = None, log_level: Optional[str] = None, jvm_classpath: Optional[str] = None, rtf: bool = False, host: Optional[str] = None, startup_timeout: Optional[float] = None, **_) -> ContextManager[Deployment]: - config = config if config is not None else deployment_config - log_level = log_level if log_level is not None else 'INFO' if not offline: check_data(download_data, noninteractive=noninteractive) - deployment = Deployment.from_yaml_file(config) + + if config is None: + with as_file(confs.DEFAULT) as config: + deployment = Deployment.from_yaml_file(config) + else: + deployment = Deployment.from_yaml_file(config) + if host is not None: deployment.global_settings.host = host + + log_level = log_level if log_level is not None else 'INFO' deployment.global_settings.log_level = log_level + startup_timeout = startup_timeout or 30 deployment.shared_processor_config.startup_timeout = startup_timeout + with attach_biomedicus_jar( deployment.shared_processor_config.java_classpath, jvm_classpath @@ -72,7 +75,6 @@ def argument_parser(): parser = ArgumentParser(add_help=False) parser.add_argument( '--config', - default=deployment_config, help='A path to a deployment configuration file to use instead of the' 'default deployment configuration.' ) diff --git a/python/biomedicus/deployment/rtf_to_text.py b/python/biomedicus/deployment/rtf_to_text.py index a5fe7ea7..aa8a19dc 100644 --- a/python/biomedicus/deployment/rtf_to_text.py +++ b/python/biomedicus/deployment/rtf_to_text.py @@ -13,31 +13,34 @@ # limitations under the License. from argparse import ArgumentParser, Namespace from contextlib import contextmanager +from importlib_resources import as_file from typing import Optional, List, ContextManager -from importlib_resources import files from mtap.deployment import Deployment +from biomedicus.deployment import confs from biomedicus.java_support import attach_biomedicus_jar from biomedicus_client.cli_tools import Command -deployment_config = files('biomedicus.deployment').joinpath('rtf_to_text_deploy_config.yml') - @contextmanager -def create_deployment(config_file: Optional[str] = None, +def create_deployment(config: Optional[str] = None, jvm_classpath: Optional[str] = None, log_level: Optional[str] = None, startup_timeout: Optional[float] = None, **_) -> ContextManager[Deployment]: - if config_file is None: - config_file = deployment_config - if log_level is None: - log_level = 'INFO' - deployment = Deployment.from_yaml_file(config_file) + if config is None: + with as_file(confs.RTF_TO_TEXT) as config: + deployment = Deployment.from_yaml_file(config) + else: + deployment = Deployment.from_yaml_file(config) + + log_level = 'INFO' if log_level is None else log_level deployment.global_settings.log_level = log_level - if startup_timeout is not None: - deployment.shared_processor_config.startup_timeout = startup_timeout + + startup_timeout = startup_timeout or 30 + deployment.shared_processor_config.startup_timeout = startup_timeout + with attach_biomedicus_jar( deployment.shared_processor_config.java_classpath, jvm_classpath @@ -56,7 +59,6 @@ def argument_parser() -> ArgumentParser: '--config', help='A path to a deployment configuration file to use instead of the' 'default deployment configuration.', - default=deployment_config ) parser.add_argument( '--jvm-classpath', @@ -67,7 +69,8 @@ def argument_parser() -> ArgumentParser: help="The log level for pipeline runners." ) parser.add_argument( - '--startup-timeout', type=float, default=10, + '--startup-timeout', + type=float, help="The timeout (in seconds) for individual processor services to deploy before failure." ) return parser diff --git a/python/biomedicus/examples/tutorial/__init__.py b/python/biomedicus/examples/tutorial/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/biomedicus/examples/tutorial/medications.py b/python/biomedicus/examples/tutorial/medications.py new file mode 100644 index 00000000..80af6ae6 --- /dev/null +++ b/python/biomedicus/examples/tutorial/medications.py @@ -0,0 +1,20 @@ +from mtap import DocumentProcessor, run_processor + + +class MedicationsProcessor(DocumentProcessor): + def process_document(self, document, params): + sentences = document.labels['sentences'] + umls_concepts = document.labels['umls_concepts'] + with document.get_labeler('medication_sentences') as MedicationSentence: + for sentence in sentences: + medication_concepts = [] + for concept in umls_concepts.inside(sentence): + if concept.tui == 'T121': + medication_concepts.append(concept) + if len(medication_concepts) > 0: + MedicationSentence(sentence.start_index, sentence.end_index, + concepts=medication_concepts) + + +if __name__ == '__main__': + run_processor(MedicationsProcessor()) diff --git a/python/biomedicus/examples/sql_pipeline.py b/python/biomedicus/examples/tutorial/sql_pipeline.py similarity index 96% rename from python/biomedicus/examples/sql_pipeline.py rename to python/biomedicus/examples/tutorial/sql_pipeline.py index 7f70756b..2cbb1e4a 100644 --- a/python/biomedicus/examples/sql_pipeline.py +++ b/python/biomedicus/examples/tutorial/sql_pipeline.py @@ -18,9 +18,10 @@ from argparse import ArgumentParser import sqlite3 -from biomedicus_client.pipeline import default_pipeline from mtap import Event, events_client +from biomedicus_client import default_pipeline + if __name__ == '__main__': parser = ArgumentParser(add_help=True, parents=[default_pipeline.argument_parser()]) parser.add_argument('input_file') @@ -31,12 +32,14 @@ con = sqlite3.connect(args.input_file) cur = con.cursor() + def source(): for name, text in cur.execute("SELECT NAME, TEXT FROM DOCUMENTS"): with Event(event_id=name, client=events) as e: doc = e.create_document('plaintext', text) yield doc + count, = next(cur.execute("SELECT COUNT(*) FROM DOCUMENTS")) times = pipeline.run_multithread(source(), total=count) times.print() diff --git a/python/biomedicus/examples/sql_pipeline_rtf.py b/python/biomedicus/examples/tutorial/sql_pipeline_rtf.py similarity index 66% rename from python/biomedicus/examples/sql_pipeline_rtf.py rename to python/biomedicus/examples/tutorial/sql_pipeline_rtf.py index e9abe350..95048e75 100644 --- a/python/biomedicus/examples/sql_pipeline_rtf.py +++ b/python/biomedicus/examples/tutorial/sql_pipeline_rtf.py @@ -18,37 +18,41 @@ from argparse import ArgumentParser import sqlite3 -from biomedicus_client.pipeline import default_pipeline from mtap import Event, events_client +from biomedicus_client import default_pipeline + if __name__ == '__main__': parser = ArgumentParser(add_help=True, parents=[default_pipeline.argument_parser()]) parser.add_argument('input_file') args = parser.parse_args() args.rtf = True # Toggles --rtf flag always on. - # Can also skip parsing arguments and programmatically create the pipeline, see :func:`default_pipeline.create`. + # Can also skip parsing arguments and programmatically create the pipeline, + # see :func:`rtf_to_text.create`. pipeline = default_pipeline.from_args(args) with events_client(pipeline.events_address) as events: con = sqlite3.connect(args.input_file) cur = con.cursor() def source(): - # Note I recommended that RTF documents be stored as BLOBs since most databases do not support - # storing text in the standard Windows-1252 encoding of rtf documents. - # (RTF documents can actually use different encodings specified by a keyword like \ansicpg1252 - # at the beginning of the document, but this is uncommon). - # If you are storing RTF documents ensure that they are initially read from file using the correct - # encoding [i.e. open('file.rtf', 'r', encoding='cp1252')] before storing in the database, - # so that special characters are preserved. + # Note I recommended that RTF documents be stored as BLOBs since most + # databases do not support storing text in the standard Windows-1252 + # encoding of rtf documents. (RTF documents can actually use different + # encodings specified by a keyword like \ansicpg1252 at the beginning of + # the document, but this is uncommon). + # If you are storing RTF documents ensure that they are initially read from + # file using the correct encoding [i.e. open('file.rtf', 'r', encoding='cp1252')] + # before storing in the database, so that special characters are preserved. for name, text in cur.execute("SELECT NAME, TEXT FROM DOCUMENTS"): with Event(event_id=name, client=events) as e: - e.binaries['rtf'] = text # or "e.binaries['rtf'] = text.encode('cp1252')" in TEXT column case + e.binaries['rtf'] = text + # or "e.binaries['rtf'] = text.encode('cp1252')" in TEXT column case yield e count, = next(cur.execute("SELECT COUNT(*) FROM DOCUMENTS")) - # Here we're adding the params since we're calling the pipeline with a source that provides Events rather - # than documents. This param will tell DocumentProcessors which document they need to process after the - # rtf converter creates that document. + # Here we're adding the params since we're calling the pipeline with a source that + # provides Events rather than documents. This param will tell DocumentProcessors + # which document they need to process after the rtf converter creates that document. times = pipeline.run_multithread(source(), params={'document_name': 'plaintext'}, total=count) times.print() con.close() diff --git a/python/biomedicus/examples/sql_pipeline_rtf_only.py b/python/biomedicus/examples/tutorial/sql_pipeline_rtf_only.py similarity index 66% rename from python/biomedicus/examples/sql_pipeline_rtf_only.py rename to python/biomedicus/examples/tutorial/sql_pipeline_rtf_only.py index 39d15cc0..25362d65 100644 --- a/python/biomedicus/examples/sql_pipeline_rtf_only.py +++ b/python/biomedicus/examples/tutorial/sql_pipeline_rtf_only.py @@ -18,37 +18,43 @@ from argparse import ArgumentParser import sqlite3 -from biomedicus_client.pipeline import rtf_to_text from mtap import Event, events_client +from biomedicus_client import rtf_to_text + if __name__ == '__main__': parser = ArgumentParser(add_help=True, parents=[rtf_to_text.argument_parser()]) parser.add_argument('input_file') args = parser.parse_args() args.rtf = True # Toggles --rtf flag always on. - # Can also skip parsing arguments and programmatically create the pipeline, see :func:`rtf_to_text.create`. + # Can also skip parsing arguments and programmatically create the pipeline, + # see :func:`rtf_to_text.create`. pipeline = rtf_to_text.from_args(args) with events_client(pipeline.events_address) as events: con = sqlite3.connect(args.input_file) cur = con.cursor() + def source(): - # Note I recommended that RTF documents be stored as BLOBs since most databases do not support - # storing text in the standard Windows-1252 encoding of rtf documents. - # (RTF documents can actually use different encodings specified by a keyword like \ansicpg1252 - # at the beginning of the document, but this is uncommon). - # If you are storing RTF documents ensure that they are initially read from file using the correct - # encoding [i.e. open('file.rtf', 'r', encoding='cp1252')] before storing in the database, - # so that special characters are preserved. + # Note I recommended that RTF documents be stored as BLOBs since most + # databases do not support storing text in the standard Windows-1252 + # encoding of rtf documents. (RTF documents can actually use different + # encodings specified by a keyword like \ansicpg1252 at the beginning of + # the document, but this is uncommon). + # If you are storing RTF documents ensure that they are initially read from + # file using the correct encoding [i.e. open('file.rtf', 'r', encoding='cp1252')] + # before storing in the database, so that special characters are preserved. for name, text in cur.execute("SELECT NAME, TEXT FROM DOCUMENTS"): with Event(event_id=name, client=events) as e: - e.binaries['rtf'] = text # or "e.binaries['rtf'] = text.encode('cp1252')" in TEXT column case + e.binaries['rtf'] = text + # or "e.binaries['rtf'] = text.encode('cp1252')" in TEXT column case yield e + count, = next(cur.execute("SELECT COUNT(*) FROM DOCUMENTS")) - # Here we're adding the params since we're calling the pipeline with a source that provides Events rather - # than documents. This param will tell DocumentProcessors which document they need to process after the - # rtf converter creates that document. + # Here we're adding the params since we're calling the pipeline with a source that + # provides Events rather than documents. This param will tell DocumentProcessors + # which document they need to process after the rtf converter creates that document. times = pipeline.run_multithread(source(), params={'document_name': 'plaintext'}, total=count) times.print() con.close() diff --git a/python/biomedicus/utilities/print_all_processors_metadata.py b/python/biomedicus/utilities/print_all_processors_metadata.py index 12a74f8b..1ba3b9bf 100644 --- a/python/biomedicus/utilities/print_all_processors_metadata.py +++ b/python/biomedicus/utilities/print_all_processors_metadata.py @@ -18,14 +18,14 @@ from tempfile import NamedTemporaryFile from typing import Optional -from biomedicus.dependencies.stanza_parser import StanzaParser from biomedicus.java_support import run_java -from biomedicus.negation.negex import NegexProcessor -from biomedicus.sentences.bi_lstm import SentenceProcessor from biomedicus_client.cli_tools import Command def print_processor_meta(output_file: Optional[str] = None): + from biomedicus.dependencies.stanza_parser import StanzaParser + from biomedicus.negation.negex import NegexProcessor + from biomedicus.sentences.bi_lstm import SentenceProcessor if output_file is None: output_file = "processors.yaml" if os.path.isdir(output_file): diff --git a/python/tests/scaleout/test_scaleout.py b/python/tests/scaleout/test_scaleout.py index 5bafd32b..bb9d507d 100644 --- a/python/tests/scaleout/test_scaleout.py +++ b/python/tests/scaleout/test_scaleout.py @@ -35,7 +35,7 @@ def listen(p, e=None): deploy = Popen([sys.executable, "-m", "biomedicus", "deploy", "--noninteractive", "--log-level", "DEBUG", - "--config", os.path.join(tmpdir, "scaleout_deploy_config.yml"), + "--config", os.path.join(tmpdir, "scaleout_deploy.yml"), "--startup-timeout", str(processor_timeout)], stdout=PIPE, stderr=STDOUT) deploy_event = Event() @@ -49,7 +49,7 @@ def listen(p, e=None): output_folder = os.path.join(tmpdir, "output") process = run([sys.executable, "-m", "biomedicus_client", "run", "--log-level", "DEBUG", - "--config", os.path.join(tmpdir, "scaleout_pipeline_config.yml"), + "--config", os.path.join(tmpdir, "scaleout_pipeline.yml"), os.fspath(input_folder), "-o", output_folder]) assert process.returncode == 0 diff --git a/tools/docker/Dockerfile b/tools/docker/Dockerfile index 5cb9960d..31cd9aa0 100644 --- a/tools/docker/Dockerfile +++ b/tools/docker/Dockerfile @@ -22,7 +22,7 @@ RUN useradd -d /biomedicus -ms /bin/bash biomedicus USER biomedicus WORKDIR /biomedicus -RUN b9 download-data --with-stanza +RUN b9 download-data EXPOSE 50000-51000 ENTRYPOINT ["b9", "deploy", "--host", "0.0.0.0"]