From e46d9e31133a233d10de1894a04c26b581dffa1d Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Wed, 2 Oct 2024 11:43:37 +0000 Subject: [PATCH 01/12] Deprecate KFP v1 SDK support --- build/BUILD | 1 - tfx/dependencies.py | 18 +- .../penguin/penguin_pipeline_kubeflow.py | 30 +- .../penguin/penguin_pipeline_kubeflow_test.py | 25 +- .../templates/taxi/kubeflow_runner.py | 100 ---- tfx/orchestration/data_types.py | 2 +- tfx/orchestration/kubeflow/base_component.py | 166 ------ .../kubeflow/base_component_test.py | 209 -------- .../kubeflow/kubeflow_dag_runner.py | 471 ------------------ .../kubeflow/kubeflow_dag_runner_test.py | 324 ------------ tfx/orchestration/kubeflow/proto/BUILD | 25 - .../kubeflow/proto/kubeflow.proto | 52 -- tfx/orchestration/pipeline.py | 2 +- .../handler/kubeflow_dag_runner_patcher.py | 86 ---- .../kubeflow_dag_runner_patcher_test.py | 66 --- tfx/v1/orchestration/experimental/__init__.py | 17 - tfx/v1/proto/__init__.py | 2 +- 17 files changed, 21 insertions(+), 1575 deletions(-) delete mode 100644 tfx/experimental/templates/taxi/kubeflow_runner.py delete mode 100644 tfx/orchestration/kubeflow/base_component.py delete mode 100644 tfx/orchestration/kubeflow/base_component_test.py delete mode 100644 tfx/orchestration/kubeflow/kubeflow_dag_runner.py delete mode 100644 tfx/orchestration/kubeflow/kubeflow_dag_runner_test.py delete mode 100644 tfx/orchestration/kubeflow/proto/BUILD delete mode 100644 tfx/orchestration/kubeflow/proto/kubeflow.proto delete mode 100644 tfx/tools/cli/handler/kubeflow_dag_runner_patcher.py delete mode 100644 tfx/tools/cli/handler/kubeflow_dag_runner_patcher_test.py diff --git a/build/BUILD b/build/BUILD index 4d596ef5b2..60607e96b3 100644 --- a/build/BUILD +++ b/build/BUILD @@ -25,7 +25,6 @@ sh_binary( "//tfx/extensions/experimental/kfp_compatibility/proto:kfp_component_spec_pb2.py", "//tfx/extensions/google_cloud_big_query/experimental/elwc_example_gen/proto:elwc_config_pb2.py", 
"//tfx/orchestration/experimental/core:component_generated_alert_pb2.py", - "//tfx/orchestration/kubeflow/proto:kubeflow_pb2.py", "//tfx/proto:bulk_inferrer_pb2.py", "//tfx/proto:distribution_validator_pb2.py", "//tfx/proto:evaluator_pb2.py", diff --git a/tfx/dependencies.py b/tfx/dependencies.py index 7cb051c75c..181b9aa020 100644 --- a/tfx/dependencies.py +++ b/tfx/dependencies.py @@ -71,10 +71,8 @@ def make_pipeline_sdk_required_install_packages(): "google-api-python-client>=1.8,<2", # TODO(b/176812386): Deprecate usage of jinja2 for placeholders. "jinja2>=2.7.3,<4", - # typing-extensions allows consistent & future-proof interface for typing. - # Since kfp<2 uses typing-extensions<4, lower bound is the latest 3.x, and - # upper bound is <5 as the semver started from 4.0 according to their doc. - "typing-extensions>=3.10.0.2,<5", + # Upper bound is <5 as the semver started from 4.0 according to their doc. + "typing-extensions<5", ] @@ -90,7 +88,7 @@ def make_required_install_packages(): "google-cloud-bigquery>=3,<4", "grpcio>=1.28.1,<2", "keras-tuner>=1.0.4,<2,!=1.4.0,!=1.4.1", - "kubernetes>=10.0.1,<13", + "kubernetes>=10.0.1,<27", "numpy>=1.16,<2", "pyarrow>=10,<11", # TODO: b/358471141 - Orjson 3.10.7 breaks TFX OSS tests. @@ -148,9 +146,8 @@ def make_extra_packages_airflow(): def make_extra_packages_kfp(): """Prepare extra packages needed for Kubeflow Pipelines orchestrator.""" return [ - # TODO(b/304892416): Migrate from KFP SDK v1 to v2. - "kfp>=1.8.14,<2", - "kfp-pipeline-spec>0.1.13,<0.2", + "kfp>=2", + "kfp-pipeline-spec>=0.3.0", ] @@ -171,9 +168,8 @@ def make_extra_packages_test(): def make_extra_packages_docker_image(): # Packages needed for tfx docker image. return [ - # TODO(b/304892416): Migrate from KFP SDK v1 to v2. 
- "kfp>=1.8.14,<2", - "kfp-pipeline-spec>0.1.13,<0.2", + "kfp>=2", + "kfp-pipeline-spec>=0.3.0", "mmh>=2.2,<3", "python-snappy>=0.5,<0.6", # Required for tfx/examples/penguin/penguin_utils_cloud_tuner.py diff --git a/tfx/examples/penguin/penguin_pipeline_kubeflow.py b/tfx/examples/penguin/penguin_pipeline_kubeflow.py index 26c82cc02e..ccb6b35f01 100644 --- a/tfx/examples/penguin/penguin_pipeline_kubeflow.py +++ b/tfx/examples/penguin/penguin_pipeline_kubeflow.py @@ -501,33 +501,9 @@ def main(): else: beam_pipeline_args = _beam_pipeline_args_by_runner['DirectRunner'] - if use_vertex: - dag_runner = tfx.orchestration.experimental.KubeflowV2DagRunner( - config=tfx.orchestration.experimental.KubeflowV2DagRunnerConfig(), - output_filename=_pipeline_definition_file) - else: - dag_runner = tfx.orchestration.experimental.KubeflowDagRunner( - config=tfx.orchestration.experimental.KubeflowDagRunnerConfig( - kubeflow_metadata_config=tfx.orchestration.experimental - .get_default_kubeflow_metadata_config())) - - dag_runner.run( - create_pipeline( - pipeline_name=_pipeline_name, - pipeline_root=_pipeline_root, - data_root=_data_root, - module_file=_module_file, - enable_tuning=False, - enable_cache=True, - user_provided_schema_path=_user_provided_schema, - ai_platform_training_args=_ai_platform_training_args, - ai_platform_serving_args=_ai_platform_serving_args, - beam_pipeline_args=beam_pipeline_args, - use_cloud_component=use_cloud_component, - use_aip=use_aip, - use_vertex=use_vertex, - serving_model_dir=_serving_model_dir, - )) + dag_runner = tfx.orchestration.experimental.KubeflowV2DagRunner( + config=tfx.orchestration.experimental.KubeflowV2DagRunnerConfig(), + output_filename=_pipeline_definition_file) # To compile the pipeline: diff --git a/tfx/examples/penguin/penguin_pipeline_kubeflow_test.py b/tfx/examples/penguin/penguin_pipeline_kubeflow_test.py index 2e519f1a7b..5575132edc 100644 --- a/tfx/examples/penguin/penguin_pipeline_kubeflow_test.py +++ 
b/tfx/examples/penguin/penguin_pipeline_kubeflow_test.py @@ -63,20 +63,11 @@ def testPenguinPipelineConstructionAndDefinitionFileExists( serving_model_dir=penguin_pipeline_kubeflow._serving_model_dir) self.assertLen(kubeflow_pipeline.components, 9) - if use_vertex: - v2_dag_runner = orchestration.experimental.KubeflowV2DagRunner( - config=orchestration.experimental.KubeflowV2DagRunnerConfig(), - output_dir=self.tmp_dir, - output_filename=penguin_pipeline_kubeflow._pipeline_definition_file) - v2_dag_runner.run(kubeflow_pipeline) - file_path = os.path.join( - self.tmp_dir, penguin_pipeline_kubeflow._pipeline_definition_file) - self.assertTrue(fileio.exists(file_path)) - else: - v1_dag_runner = orchestration.experimental.KubeflowDagRunner( - config=orchestration.experimental.KubeflowDagRunnerConfig( - kubeflow_metadata_config=orchestration.experimental - .get_default_kubeflow_metadata_config())) - v1_dag_runner.run(kubeflow_pipeline) - file_path = os.path.join(self.tmp_dir, 'penguin-kubeflow.tar.gz') - self.assertTrue(fileio.exists(file_path)) + v2_dag_runner = orchestration.experimental.KubeflowV2DagRunner( + config=orchestration.experimental.KubeflowV2DagRunnerConfig(), + output_dir=self.tmp_dir, + output_filename=penguin_pipeline_kubeflow._pipeline_definition_file) + v2_dag_runner.run(kubeflow_pipeline) + file_path = os.path.join( + self.tmp_dir, penguin_pipeline_kubeflow._pipeline_definition_file) + self.assertTrue(fileio.exists(file_path)) diff --git a/tfx/experimental/templates/taxi/kubeflow_runner.py b/tfx/experimental/templates/taxi/kubeflow_runner.py deleted file mode 100644 index 74d873f0f7..0000000000 --- a/tfx/experimental/templates/taxi/kubeflow_runner.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2020 Google LLC. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Define KubeflowDagRunner to run the pipeline using Kubeflow.""" - -import os -from absl import logging - -from tfx import v1 as tfx -from tfx.experimental.templates.taxi.pipeline import configs -from tfx.experimental.templates.taxi.pipeline import pipeline - -# TFX pipeline produces many output files and metadata. All output data will be -# stored under this OUTPUT_DIR. -OUTPUT_DIR = os.path.join('gs://', configs.GCS_BUCKET_NAME) - -# TFX produces two types of outputs, files and metadata. -# - Files will be created under PIPELINE_ROOT directory. -PIPELINE_ROOT = os.path.join(OUTPUT_DIR, 'tfx_pipeline_output', - configs.PIPELINE_NAME) - -# The last component of the pipeline, "Pusher" will produce serving model under -# SERVING_MODEL_DIR. -SERVING_MODEL_DIR = os.path.join(PIPELINE_ROOT, 'serving_model') - -# Specifies data file directory. DATA_PATH should be a directory containing CSV -# files for CsvExampleGen in this example. By default, data files are in the -# GCS path: `gs://{GCS_BUCKET_NAME}/tfx-template/data/`. Using a GCS path is -# recommended for KFP. -# -# One can optionally choose to use a data source located inside of the container -# built by the template, by specifying -# DATA_PATH = 'data'. Note that Dataflow does not support use container as a -# dependency currently, so this means CsvExampleGen cannot be used with Dataflow -# (step 8 in the template notebook). - -DATA_PATH = 'gs://{}/tfx-template/data/taxi/'.format(configs.GCS_BUCKET_NAME) - - -def run(): - """Define a kubeflow pipeline.""" - - # Metadata config. 
The defaults works work with the installation of - # KF Pipelines using Kubeflow. If installing KF Pipelines using the - # lightweight deployment option, you may need to override the defaults. - # If you use Kubeflow, metadata will be written to MySQL database inside - # Kubeflow cluster. - metadata_config = tfx.orchestration.experimental.get_default_kubeflow_metadata_config( - ) - - runner_config = tfx.orchestration.experimental.KubeflowDagRunnerConfig( - kubeflow_metadata_config=metadata_config, - tfx_image=configs.PIPELINE_IMAGE) - pod_labels = { - 'add-pod-env': 'true', - tfx.orchestration.experimental.LABEL_KFP_SDK_ENV: 'tfx-template' - } - tfx.orchestration.experimental.KubeflowDagRunner( - config=runner_config, pod_labels_to_attach=pod_labels - ).run( - pipeline.create_pipeline( - pipeline_name=configs.PIPELINE_NAME, - pipeline_root=PIPELINE_ROOT, - data_path=DATA_PATH, - # TODO(step 7): (Optional) Uncomment below to use BigQueryExampleGen. - # query=configs.BIG_QUERY_QUERY, - # TODO(step 5): (Optional) Set the path of the customized schema. - # schema_path=generated_schema_path, - preprocessing_fn=configs.PREPROCESSING_FN, - run_fn=configs.RUN_FN, - train_args=tfx.proto.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS), - eval_args=tfx.proto.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), - eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, - serving_model_dir=SERVING_MODEL_DIR, - # TODO(step 7): (Optional) Uncomment below to use provide GCP related - # config for BigQuery with Beam DirectRunner. - # beam_pipeline_args=configs - # .BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS, - # TODO(step 8): (Optional) Uncomment below to use Dataflow. - # beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS, - # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. - # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS, - # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. 
- # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS, - )) - - -if __name__ == '__main__': - logging.set_verbosity(logging.INFO) - run() diff --git a/tfx/orchestration/data_types.py b/tfx/orchestration/data_types.py index aa4bb12c4b..10e88ec696 100644 --- a/tfx/orchestration/data_types.py +++ b/tfx/orchestration/data_types.py @@ -145,7 +145,7 @@ def component_run_context_name(self) -> str: class RuntimeParameter(json_utils.Jsonable): """Runtime parameter. - Currently only supported on KubeflowDagRunner. + Currently only supported on KubeflowV2DagRunner. For protos, use text type RuntimeParameter, which holds the proto json string, e.g., `'{"num_steps": 5}'` for TrainArgs proto. diff --git a/tfx/orchestration/kubeflow/base_component.py b/tfx/orchestration/kubeflow/base_component.py deleted file mode 100644 index 11eeb34a87..0000000000 --- a/tfx/orchestration/kubeflow/base_component.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Kubeflow Pipelines based implementation of TFX components. - -These components are lightweight wrappers around the KFP DSL's ContainerOp, -and ensure that the container gets called with the right set of input -arguments. It also ensures that each component exports named output -attributes that are consistent with those provided by the native TFX -components, thus ensuring that both types of pipeline definitions are -compatible. 
-Note: This requires Kubeflow Pipelines SDK to be installed. -""" - -from typing import Dict, List, Set - -from absl import logging -from kfp import dsl -from kubernetes import client as k8s_client -from tfx.dsl.components.base import base_node as tfx_base_node -from tfx.orchestration import data_types -from tfx.orchestration import pipeline as tfx_pipeline -from tfx.orchestration.kubeflow.proto import kubeflow_pb2 -from tfx.proto.orchestration import pipeline_pb2 - -from google.protobuf import json_format - -# TODO(b/166202742): Consolidate container entrypoint with TFX image's default. -_COMMAND = ['python', '-m', 'tfx.orchestration.kubeflow.container_entrypoint'] - -_WORKFLOW_ID_KEY = 'WORKFLOW_ID' - - -def _encode_runtime_parameter(param: data_types.RuntimeParameter) -> str: - """Encode a runtime parameter into a placeholder for value substitution.""" - if param.ptype is int: - type_enum = pipeline_pb2.RuntimeParameter.INT - elif param.ptype is float: - type_enum = pipeline_pb2.RuntimeParameter.DOUBLE - else: - type_enum = pipeline_pb2.RuntimeParameter.STRING - type_str = pipeline_pb2.RuntimeParameter.Type.Name(type_enum) - return f'{param.name}={type_str}:{str(dsl.PipelineParam(name=param.name))}' - - -def _replace_placeholder(component: tfx_base_node.BaseNode) -> None: - """Replaces the RuntimeParameter placeholders with kfp.dsl.PipelineParam.""" - keys = list(component.exec_properties.keys()) - for key in keys: - exec_property = component.exec_properties[key] - if not isinstance(exec_property, data_types.RuntimeParameter): - continue - component.exec_properties[key] = str( - dsl.PipelineParam(name=exec_property.name)) - - -# TODO(hongyes): renaming the name to KubeflowComponent. -class BaseComponent: - """Base component for all Kubeflow pipelines TFX components. - - Returns a wrapper around a KFP DSL ContainerOp class, and adds named output - attributes that match the output names for the corresponding native TFX - components. 
- """ - - def __init__(self, - component: tfx_base_node.BaseNode, - depends_on: Set[dsl.ContainerOp], - pipeline: tfx_pipeline.Pipeline, - pipeline_root: dsl.PipelineParam, - tfx_image: str, - kubeflow_metadata_config: kubeflow_pb2.KubeflowMetadataConfig, - tfx_ir: pipeline_pb2.Pipeline, - pod_labels_to_attach: Dict[str, str], - runtime_parameters: List[data_types.RuntimeParameter], - metadata_ui_path: str = '/mlpipeline-ui-metadata.json'): - """Creates a new Kubeflow-based component. - - This class essentially wraps a dsl.ContainerOp construct in Kubeflow - Pipelines. - - Args: - component: The logical TFX component to wrap. - depends_on: The set of upstream KFP ContainerOp components that this - component will depend on. - pipeline: The logical TFX pipeline to which this component belongs. - pipeline_root: The pipeline root specified, as a dsl.PipelineParam - tfx_image: The container image to use for this component. - kubeflow_metadata_config: Configuration settings for connecting to the - MLMD store in a Kubeflow cluster. - tfx_ir: The TFX intermedia representation of the pipeline. - pod_labels_to_attach: Dict of pod labels to attach to the GKE pod. - runtime_parameters: Runtime parameters of the pipeline. - metadata_ui_path: File location for metadata-ui-metadata.json file. - """ - - _replace_placeholder(component) - - arguments = [ - '--pipeline_root', - pipeline_root, - '--kubeflow_metadata_config', - json_format.MessageToJson( - message=kubeflow_metadata_config, preserving_proto_field_name=True), - '--node_id', - component.id, - # TODO(b/182220464): write IR to pipeline_root and let - # container_entrypoint.py read it back to avoid future issue that IR - # exeeds the flag size limit. 
- '--tfx_ir', - json_format.MessageToJson(tfx_ir), - '--metadata_ui_path', - metadata_ui_path, - ] - - for param in runtime_parameters: - arguments.append('--runtime_parameter') - arguments.append(_encode_runtime_parameter(param)) - - self.container_op = dsl.ContainerOp( - name=component.id, - command=_COMMAND, - image=tfx_image, - arguments=arguments, - output_artifact_paths={ - 'mlpipeline-ui-metadata': metadata_ui_path, - }, - ) - - logging.info('Adding upstream dependencies for component %s', - self.container_op.name) - for op in depends_on: - logging.info(' -> Component: %s', op.name) - self.container_op.after(op) - - # TODO(b/140172100): Document the use of additional_pipeline_args. - if _WORKFLOW_ID_KEY in pipeline.additional_pipeline_args: - # Allow overriding pipeline's run_id externally, primarily for testing. - self.container_op.container.add_env_variable( - k8s_client.V1EnvVar( - name=_WORKFLOW_ID_KEY, - value=pipeline.additional_pipeline_args[_WORKFLOW_ID_KEY])) - else: - # Add the Argo workflow ID to the container's environment variable so it - # can be used to uniquely place pipeline outputs under the pipeline_root. - field_path = "metadata.labels['workflows.argoproj.io/workflow']" - self.container_op.container.add_env_variable( - k8s_client.V1EnvVar( - name=_WORKFLOW_ID_KEY, - value_from=k8s_client.V1EnvVarSource( - field_ref=k8s_client.V1ObjectFieldSelector( - field_path=field_path)))) - - if pod_labels_to_attach: - for k, v in pod_labels_to_attach.items(): - self.container_op.add_pod_label(k, v) diff --git a/tfx/orchestration/kubeflow/base_component_test.py b/tfx/orchestration/kubeflow/base_component_test.py deleted file mode 100644 index 6171d6fbdd..0000000000 --- a/tfx/orchestration/kubeflow/base_component_test.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for tfx.orchestration.kubeflow.base_component.""" - -import json - -from absl import logging -from kfp import dsl -import tensorflow as tf -from tfx.components.example_gen.csv_example_gen import component as csv_example_gen_component -from tfx.components.statistics_gen import component as statistics_gen_component -from tfx.orchestration import data_types -from tfx.orchestration import pipeline as tfx_pipeline -from tfx.orchestration.kubeflow import base_component -from tfx.orchestration.kubeflow.proto import kubeflow_pb2 -from tfx.proto.orchestration import pipeline_pb2 - -from ml_metadata.proto import metadata_store_pb2 - - -class BaseComponentTest(tf.test.TestCase): - maxDiff = None # pylint: disable=invalid-name - _test_pipeline_name = 'test_pipeline' - - def setUp(self): - super().setUp() - example_gen = csv_example_gen_component.CsvExampleGen( - input_base='data_input') - statistics_gen = statistics_gen_component.StatisticsGen( - examples=example_gen.outputs['examples']).with_id('foo') - - pipeline = tfx_pipeline.Pipeline( - pipeline_name=self._test_pipeline_name, - pipeline_root='test_pipeline_root', - metadata_connection_config=metadata_store_pb2.ConnectionConfig(), - components=[example_gen, statistics_gen], - ) - - test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param') - - self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig() - self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST' - self._tfx_ir = pipeline_pb2.Pipeline() - with dsl.Pipeline('test_pipeline'): - self.component = 
base_component.BaseComponent( - component=statistics_gen, - depends_on=set(), - pipeline=pipeline, - pipeline_root=test_pipeline_root, - tfx_image='container_image', - kubeflow_metadata_config=self._metadata_config, - tfx_ir=self._tfx_ir, - pod_labels_to_attach={}, - runtime_parameters=[] - ) - self.tfx_component = statistics_gen - - def testContainerOpArguments(self): - expected_args = [ - '--pipeline_root', - '{{pipelineparam:op=;name=pipeline-root-param}}', - '--kubeflow_metadata_config', - '{\n' - ' "mysql_db_service_host": {\n' - ' "environment_variable": "MYSQL_SERVICE_HOST"\n' - ' }\n' - '}', - '--node_id', - 'foo', - ] - try: - self.assertEqual( - self.component.container_op.arguments[:len(expected_args)], - expected_args) - - except AssertionError: - # Print out full arguments for debugging. - logging.error('==== BEGIN CONTAINER OP ARGUMENT DUMP ====') - logging.error(json.dumps(self.component.container_op.arguments, indent=2)) - logging.error('==== END CONTAINER OP ARGUMENT DUMP ====') - raise - - def testContainerOpName(self): - self.assertEqual('foo', self.tfx_component.id) - self.assertEqual('foo', self.component.container_op.name) - - -class BaseComponentWithPipelineParamTest(tf.test.TestCase): - """Test the usage of RuntimeParameter.""" - maxDiff = None # pylint: disable=invalid-name - _test_pipeline_name = 'test_pipeline' - - def setUp(self): - super().setUp() - - example_gen_output_config = data_types.RuntimeParameter( - name='example-gen-output-config', ptype=str) - - example_gen = csv_example_gen_component.CsvExampleGen( - input_base='data_root', output_config=example_gen_output_config) - statistics_gen = statistics_gen_component.StatisticsGen( - examples=example_gen.outputs['examples']).with_id('foo') - - test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param') - pipeline = tfx_pipeline.Pipeline( - pipeline_name=self._test_pipeline_name, - pipeline_root='test_pipeline_root', - 
metadata_connection_config=metadata_store_pb2.ConnectionConfig(), - components=[example_gen, statistics_gen], - ) - - self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig() - self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST' - self._tfx_ir = pipeline_pb2.Pipeline() - with dsl.Pipeline('test_pipeline'): - self.example_gen = base_component.BaseComponent( - component=example_gen, - depends_on=set(), - pipeline=pipeline, - pipeline_root=test_pipeline_root, - tfx_image='container_image', - kubeflow_metadata_config=self._metadata_config, - tfx_ir=self._tfx_ir, - pod_labels_to_attach={}, - runtime_parameters=[example_gen_output_config]) - self.statistics_gen = base_component.BaseComponent( - component=statistics_gen, - depends_on=set(), - pipeline=pipeline, - pipeline_root=test_pipeline_root, - tfx_image='container_image', - kubeflow_metadata_config=self._metadata_config, - tfx_ir=self._tfx_ir, - pod_labels_to_attach={}, - runtime_parameters=[] - ) - - self.tfx_example_gen = example_gen - self.tfx_statistics_gen = statistics_gen - - def testContainerOpArguments(self): - statistics_gen_expected_args = [ - '--pipeline_root', - '{{pipelineparam:op=;name=pipeline-root-param}}', - '--kubeflow_metadata_config', - '{\n' - ' "mysql_db_service_host": {\n' - ' "environment_variable": "MYSQL_SERVICE_HOST"\n' - ' }\n' - '}', - '--node_id', - 'foo', - '--tfx_ir', - '{}', - '--metadata_ui_path', - '/mlpipeline-ui-metadata.json', - ] - example_gen_expected_args = [ - '--pipeline_root', - '{{pipelineparam:op=;name=pipeline-root-param}}', - '--kubeflow_metadata_config', - '{\n' - ' "mysql_db_service_host": {\n' - ' "environment_variable": "MYSQL_SERVICE_HOST"\n' - ' }\n' - '}', - '--node_id', - 'CsvExampleGen', - '--tfx_ir', - '{}', - '--metadata_ui_path', - '/mlpipeline-ui-metadata.json', - '--runtime_parameter', - 'example-gen-output-config=STRING:{{pipelineparam:op=;name=example-gen-output-config}}', - ] - try: - self.assertEqual( - 
self.statistics_gen.container_op - .arguments, - statistics_gen_expected_args) - self.assertEqual( - self.example_gen.container_op.arguments, - example_gen_expected_args) - except AssertionError: - # Print out full arguments for debugging. - logging.error('==== BEGIN STATISTICSGEN CONTAINER OP ARGUMENT DUMP ====') - logging.error( - json.dumps(self.statistics_gen.container_op.arguments, indent=2)) - logging.error('==== END STATISTICSGEN CONTAINER OP ARGUMENT DUMP ====') - logging.error('==== BEGIN EXAMPLEGEN CONTAINER OP ARGUMENT DUMP ====') - logging.error( - json.dumps(self.example_gen.container_op.arguments, indent=2)) - logging.error('==== END EXAMPLEGEN CONTAINER OP ARGUMENT DUMP ====') - raise - - def testContainerOpName(self): - self.assertEqual('foo', self.tfx_statistics_gen.id) - self.assertEqual('foo', self.statistics_gen.container_op.name) diff --git a/tfx/orchestration/kubeflow/kubeflow_dag_runner.py b/tfx/orchestration/kubeflow/kubeflow_dag_runner.py deleted file mode 100644 index 1d320aeaf5..0000000000 --- a/tfx/orchestration/kubeflow/kubeflow_dag_runner.py +++ /dev/null @@ -1,471 +0,0 @@ -# Copyright 2019 Google LLC. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""TFX runner for Kubeflow.""" - -import collections -import copy -import os -from typing import Any, Callable, Dict, List, Optional, Type, cast, MutableMapping -from absl import logging - -from kfp import compiler -from kfp import dsl -from kfp import gcp -from kubernetes import client as k8s_client -from tfx import version -from tfx.dsl.compiler import compiler as tfx_compiler -from tfx.dsl.components.base import base_component as tfx_base_component -from tfx.dsl.components.base import base_node -from tfx.orchestration import data_types -from tfx.orchestration import pipeline as tfx_pipeline -from tfx.orchestration import tfx_runner -from tfx.orchestration.config import pipeline_config -from tfx.orchestration.kubeflow import base_component -from tfx.orchestration.kubeflow import utils -from tfx.orchestration.kubeflow.proto import kubeflow_pb2 -from tfx.orchestration.launcher import base_component_launcher -from tfx.orchestration.launcher import in_process_component_launcher -from tfx.orchestration.launcher import kubernetes_component_launcher -from tfx.proto.orchestration import pipeline_pb2 -from tfx.utils import telemetry_utils - - -# OpFunc represents the type of a function that takes as input a -# dsl.ContainerOp and returns the same object. Common operations such as adding -# k8s secrets, mounting volumes, specifying the use of TPUs and so on can be -# specified as an OpFunc. -# See example usage here: -# https://github.com/kubeflow/pipelines/blob/master/sdk/python/kfp/gcp.py -OpFunc = Callable[[dsl.ContainerOp], dsl.ContainerOp] - -# Default secret name for GCP credentials. This secret is installed as part of -# a typical Kubeflow installation when the component is GKE. -_KUBEFLOW_GCP_SECRET_NAME = 'user-gcp-sa' - -# Default TFX container image to use in KubeflowDagRunner. 
-DEFAULT_KUBEFLOW_TFX_IMAGE = 'tensorflow/tfx:%s' % (version.__version__,) - - -def _mount_config_map_op(config_map_name: str) -> OpFunc: - """Mounts all key-value pairs found in the named Kubernetes ConfigMap. - - All key-value pairs in the ConfigMap are mounted as environment variables. - - Args: - config_map_name: The name of the ConfigMap resource. - - Returns: - An OpFunc for mounting the ConfigMap. - """ - - def mount_config_map(container_op: dsl.ContainerOp): - config_map_ref = k8s_client.V1ConfigMapEnvSource( - name=config_map_name, optional=True) - container_op.container.add_env_from( - k8s_client.V1EnvFromSource(config_map_ref=config_map_ref)) - - return mount_config_map - - -def _mount_secret_op(secret_name: str) -> OpFunc: - """Mounts all key-value pairs found in the named Kubernetes Secret. - - All key-value pairs in the Secret are mounted as environment variables. - - Args: - secret_name: The name of the Secret resource. - - Returns: - An OpFunc for mounting the Secret. - """ - - def mount_secret(container_op: dsl.ContainerOp): - secret_ref = k8s_client.V1ConfigMapEnvSource( - name=secret_name, optional=True) - - container_op.container.add_env_from( - k8s_client.V1EnvFromSource(secret_ref=secret_ref)) - - return mount_secret - - -def get_default_pipeline_operator_funcs( - use_gcp_sa: bool = False) -> List[OpFunc]: - """Returns a default list of pipeline operator functions. - - Args: - use_gcp_sa: If true, mount a GCP service account secret to each pod, with - the name _KUBEFLOW_GCP_SECRET_NAME. - - Returns: - A list of functions with type OpFunc. - """ - # Enables authentication for GCP services if needed. - gcp_secret_op = gcp.use_gcp_secret(_KUBEFLOW_GCP_SECRET_NAME) - - # Mounts configmap containing Metadata gRPC server configuration. 
- mount_config_map_op = _mount_config_map_op('metadata-grpc-configmap') - if use_gcp_sa: - return [gcp_secret_op, mount_config_map_op] - else: - return [mount_config_map_op] - - -def get_default_kubeflow_metadata_config( -) -> kubeflow_pb2.KubeflowMetadataConfig: - """Returns the default metadata connection config for Kubeflow. - - Returns: - A config proto that will be serialized as JSON and passed to the running - container so the TFX component driver is able to communicate with MLMD in - a Kubeflow cluster. - """ - # The default metadata configuration for a Kubeflow Pipelines cluster is - # codified as a Kubernetes ConfigMap - # https://github.com/kubeflow/pipelines/blob/master/manifests/kustomize/base/metadata/metadata-grpc-configmap.yaml - - config = kubeflow_pb2.KubeflowMetadataConfig() - # The environment variable to use to obtain the Metadata gRPC service host in - # the cluster that is backing Kubeflow Metadata. Note that the key in the - # config map and therefore environment variable used, are lower-cased. - config.grpc_config.grpc_service_host.environment_variable = 'METADATA_GRPC_SERVICE_HOST' - # The environment variable to use to obtain the Metadata grpc service port in - # the cluster that is backing Kubeflow Metadata. 
- config.grpc_config.grpc_service_port.environment_variable = 'METADATA_GRPC_SERVICE_PORT' - - return config - - -def get_default_pod_labels() -> Dict[str, str]: - """Returns the default pod label dict for Kubeflow.""" - # KFP default transformers add pod env: - # https://github.com/kubeflow/pipelines/blob/0.1.32/sdk/python/kfp/compiler/_default_transformers.py - result = { - 'add-pod-env': 'true', - telemetry_utils.LABEL_KFP_SDK_ENV: 'tfx' - } - return result - - -def get_default_output_filename(pipeline_name: str) -> str: - return pipeline_name + '.tar.gz' - - -class KubeflowDagRunnerConfig(pipeline_config.PipelineConfig): - """Runtime configuration parameters specific to execution on Kubeflow.""" - - def __init__( - self, - pipeline_operator_funcs: Optional[List[OpFunc]] = None, - tfx_image: Optional[str] = None, - kubeflow_metadata_config: Optional[ - kubeflow_pb2.KubeflowMetadataConfig] = None, - # TODO(b/143883035): Figure out the best practice to put the - # SUPPORTED_LAUNCHER_CLASSES - supported_launcher_classes: Optional[List[Type[ - base_component_launcher.BaseComponentLauncher]]] = None, - metadata_ui_path: str = '/mlpipeline-ui-metadata.json', - **kwargs): - """Creates a KubeflowDagRunnerConfig object. - - The user can use pipeline_operator_funcs to apply modifications to - ContainerOps used in the pipeline. For example, to ensure the pipeline - steps mount a GCP secret, and a Persistent Volume, one can create config - object like so: - - from kfp import gcp, onprem - mount_secret_op = gcp.use_secret('my-secret-name) - mount_volume_op = onprem.mount_pvc( - "my-persistent-volume-claim", - "my-volume-name", - "/mnt/volume-mount-path") - - config = KubeflowDagRunnerConfig( - pipeline_operator_funcs=[mount_secret_op, mount_volume_op] - ) - - Args: - pipeline_operator_funcs: A list of ContainerOp modifying functions that - will be applied to every container step in the pipeline. - tfx_image: The TFX container image to use in the pipeline. 
- kubeflow_metadata_config: Runtime configuration to use to connect to - Kubeflow metadata. - supported_launcher_classes: A list of component launcher classes that are - supported by the current pipeline. List sequence determines the order in - which launchers are chosen for each component being run. - metadata_ui_path: File location for metadata-ui-metadata.json file. - **kwargs: keyword args for PipelineConfig. - """ - supported_launcher_classes = supported_launcher_classes or [ - in_process_component_launcher.InProcessComponentLauncher, - kubernetes_component_launcher.KubernetesComponentLauncher, - ] - super().__init__( - supported_launcher_classes=supported_launcher_classes, **kwargs) - self.pipeline_operator_funcs = ( - pipeline_operator_funcs or get_default_pipeline_operator_funcs()) - self.tfx_image = tfx_image or DEFAULT_KUBEFLOW_TFX_IMAGE - self.kubeflow_metadata_config = ( - kubeflow_metadata_config or get_default_kubeflow_metadata_config()) - self.metadata_ui_path = metadata_ui_path - - -class KubeflowDagRunner(tfx_runner.TfxRunner): - """Kubeflow Pipelines runner. - - Constructs a pipeline definition YAML file based on the TFX logical pipeline. - """ - - def __init__(self, - output_dir: Optional[str] = None, - output_filename: Optional[str] = None, - config: Optional[KubeflowDagRunnerConfig] = None, - pod_labels_to_attach: Optional[Dict[str, str]] = None): - """Initializes KubeflowDagRunner for compiling a Kubeflow Pipeline. - - Args: - output_dir: An optional output directory into which to output the pipeline - definition files. Defaults to the current working directory. - output_filename: An optional output file name for the pipeline definition - file. Defaults to pipeline_name.tar.gz when compiling a TFX pipeline. - Currently supports .tar.gz, .tgz, .zip, .yaml, .yml formats. See - https://github.com/kubeflow/pipelines/blob/181de66cf9fa87bcd0fe9291926790c400140783/sdk/python/kfp/compiler/compiler.py#L851 - for format restriction. 
- config: An optional KubeflowDagRunnerConfig object to specify runtime - configuration when running the pipeline under Kubeflow. - pod_labels_to_attach: Optional set of pod labels to attach to GKE pod - spinned up for this pipeline. Default to the 3 labels: - 1. add-pod-env: true, - 2. pipeline SDK type, - 3. pipeline unique ID, - where 2 and 3 are instrumentation of usage tracking. - """ - if config and not isinstance(config, KubeflowDagRunnerConfig): - raise TypeError('config must be type of KubeflowDagRunnerConfig.') - super().__init__(config or KubeflowDagRunnerConfig()) - self._config = cast(KubeflowDagRunnerConfig, self._config) - self._output_dir = output_dir or os.getcwd() - self._output_filename = output_filename - self._compiler = compiler.Compiler() - self._tfx_compiler = tfx_compiler.Compiler() - self._params = [] # List of dsl.PipelineParam used in this pipeline. - self._params_by_component_id = collections.defaultdict(list) - self._deduped_parameter_names = set() # Set of unique param names used. - self._exit_handler = None - if pod_labels_to_attach is None: - self._pod_labels_to_attach = get_default_pod_labels() - else: - self._pod_labels_to_attach = pod_labels_to_attach - - def _parse_parameter_from_component( - self, component: tfx_base_component.BaseComponent) -> None: - """Extract embedded RuntimeParameter placeholders from a component. - - Extract embedded RuntimeParameter placeholders from a component, then append - the corresponding dsl.PipelineParam to KubeflowDagRunner. - - Args: - component: a TFX component. - """ - - deduped_parameter_names_for_component = set() - for parameter in component.exec_properties.values(): - if not isinstance(parameter, data_types.RuntimeParameter): - continue - # Ignore pipeline root because it will be added later. 
- if parameter.name == tfx_pipeline.ROOT_PARAMETER.name: - continue - if parameter.name in deduped_parameter_names_for_component: - continue - - deduped_parameter_names_for_component.add(parameter.name) - self._params_by_component_id[component.id].append(parameter) - if parameter.name not in self._deduped_parameter_names: - self._deduped_parameter_names.add(parameter.name) - # TODO(b/178436919): Create a test to cover default value rendering - # and move the external code reference over there. - # The default needs to be serialized then passed to dsl.PipelineParam. - # See - # https://github.com/kubeflow/pipelines/blob/f65391309650fdc967586529e79af178241b4c2c/sdk/python/kfp/dsl/_pipeline_param.py#L154 - dsl_parameter = dsl.PipelineParam( - name=parameter.name, value=str(parameter.default)) - self._params.append(dsl_parameter) - - def _parse_parameter_from_pipeline(self, - pipeline: tfx_pipeline.Pipeline) -> None: - """Extract all the RuntimeParameter placeholders from the pipeline.""" - - for component in pipeline.components: - self._parse_parameter_from_component(component) - - def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline, - pipeline_root: dsl.PipelineParam): - """Constructs a Kubeflow Pipeline graph. - - Args: - pipeline: The logical TFX pipeline to base the construction on. - pipeline_root: dsl.PipelineParam representing the pipeline root. - """ - component_to_kfp_op = {} - - for component in pipeline.components: - utils.replace_exec_properties(component) - tfx_ir = self._generate_tfx_ir(pipeline) - - # Assumption: There is a partial ordering of components in the list, i.e., - # if component A depends on component B and C, then A appears after B and C - # in the list. - for component in pipeline.components: - # Keep track of the set of upstream dsl.ContainerOps for this component. 
- depends_on = set() - - for upstream_component in component.upstream_nodes: - depends_on.add(component_to_kfp_op[upstream_component]) - - # remove the extra pipeline node information - tfx_node_ir = self._dehydrate_tfx_ir(tfx_ir, component.id) - - # Disable cache for exit_handler - if self._exit_handler and component.id == self._exit_handler.id: - tfx_node_ir.nodes[ - 0].pipeline_node.execution_options.caching_options.enable_cache = False - - kfp_component = base_component.BaseComponent( - component=component, - depends_on=depends_on, - pipeline=pipeline, - pipeline_root=pipeline_root, - tfx_image=self._config.tfx_image, - kubeflow_metadata_config=self._config.kubeflow_metadata_config, - pod_labels_to_attach=self._pod_labels_to_attach, - tfx_ir=tfx_node_ir, - metadata_ui_path=self._config.metadata_ui_path, - runtime_parameters=(self._params_by_component_id[component.id] + - [tfx_pipeline.ROOT_PARAMETER])) - - for operator in self._config.pipeline_operator_funcs: - kfp_component.container_op.apply(operator) - - component_to_kfp_op[component] = kfp_component.container_op - - # If exit handler defined create an exit handler and add all ops to it. - if self._exit_handler: - exit_op = component_to_kfp_op[self._exit_handler] - with dsl.ExitHandler(exit_op) as exit_handler_group: - exit_handler_group.name = utils.TFX_DAG_NAME - # KFP get_default_pipeline should have the pipeline object when invoked - # while compiling. This allows us to retrieve all ops from pipeline - # group (should be the only group in the pipeline). - pipeline_group = dsl.Pipeline.get_default_pipeline().groups[0] - - # Transfer all ops to exit_handler_group which will now contain all ops. - exit_handler_group.ops = pipeline_group.ops - # remove all ops from pipeline_group. 
Otherwise compiler fails in - # https://github.com/kubeflow/pipelines/blob/8aee62142aa13ae42b2dd18257d7e034861b7e5e/sdk/python/kfp/compiler/compiler.py#L893 - pipeline_group.ops = [] - - def _del_unused_field(self, node_id: str, message_dict: MutableMapping[str, - Any]): - for item in list(message_dict.keys()): - if item != node_id: - del message_dict[item] - - def _dehydrate_tfx_ir(self, original_pipeline: pipeline_pb2.Pipeline, - node_id: str) -> pipeline_pb2.Pipeline: - pipeline = copy.deepcopy(original_pipeline) - for node in pipeline.nodes: - if (node.WhichOneof('node') == 'pipeline_node' and - node.pipeline_node.node_info.id == node_id): - del pipeline.nodes[:] - pipeline.nodes.extend([node]) - break - - deployment_config = pipeline_pb2.IntermediateDeploymentConfig() - pipeline.deployment_config.Unpack(deployment_config) - self._del_unused_field(node_id, deployment_config.executor_specs) - self._del_unused_field(node_id, deployment_config.custom_driver_specs) - self._del_unused_field(node_id, - deployment_config.node_level_platform_configs) - pipeline.deployment_config.Pack(deployment_config) - return pipeline - - def _generate_tfx_ir( - self, pipeline: tfx_pipeline.Pipeline) -> Optional[pipeline_pb2.Pipeline]: - result = self._tfx_compiler.compile(pipeline) - return result - - def run(self, pipeline: tfx_pipeline.Pipeline): - """Compiles and outputs a Kubeflow Pipeline YAML definition file. - - Args: - pipeline: The logical TFX pipeline to use when building the Kubeflow - pipeline. - """ - # If exit handler is defined, append to existing pipeline components. - if self._exit_handler: - original_pipeline = pipeline - pipeline = copy.copy(original_pipeline) - pipeline.components = [*pipeline.components, self._exit_handler] - - for component in pipeline.components: - # TODO(b/187122662): Pass through pip dependencies as a first-class - # component flag. 
- if isinstance(component, tfx_base_component.BaseComponent): - component._resolve_pip_dependencies( # pylint: disable=protected-access - pipeline.pipeline_info.pipeline_root) - - # KFP DSL representation of pipeline root parameter. - dsl_pipeline_root = dsl.PipelineParam( - name=tfx_pipeline.ROOT_PARAMETER.name, - value=pipeline.pipeline_info.pipeline_root) - self._params.append(dsl_pipeline_root) - - def _construct_pipeline(): - """Constructs a Kubeflow pipeline. - - Creates Kubeflow ContainerOps for each TFX component encountered in the - logical pipeline definition. - """ - self._construct_pipeline_graph(pipeline, dsl_pipeline_root) - - # Need to run this first to get self._params populated. Then KFP compiler - # can correctly match default value with PipelineParam. - self._parse_parameter_from_pipeline(pipeline) - - file_name = self._output_filename or get_default_output_filename( - pipeline.pipeline_info.pipeline_name) - # Create workflow spec and write out to package. - self._compiler._create_and_write_workflow( # pylint: disable=protected-access - pipeline_func=_construct_pipeline, - pipeline_name=pipeline.pipeline_info.pipeline_name, - params_list=self._params, - package_path=os.path.join(self._output_dir, file_name)) - - def set_exit_handler(self, exit_handler: base_node.BaseNode): - """Set exit handler components for the Kubeflow dag runner. - - This feature is currently experimental without backward compatibility - gaurantee. - - Args: - exit_handler: exit handler component. 
- """ - if not exit_handler: - logging.error('Setting empty exit handler is not allowed.') - return - assert not exit_handler.downstream_nodes, ('Exit handler should not depend ' - 'on any other node.') - assert not exit_handler.upstream_nodes, ('Exit handler should not depend on' - ' any other node.') - self._exit_handler = exit_handler diff --git a/tfx/orchestration/kubeflow/kubeflow_dag_runner_test.py b/tfx/orchestration/kubeflow/kubeflow_dag_runner_test.py deleted file mode 100644 index 2d43dfad54..0000000000 --- a/tfx/orchestration/kubeflow/kubeflow_dag_runner_test.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tests for tfx.orchestration.kubeflow.kubeflow_dag_runner.""" - -import json -import os -import tarfile -from typing import List - -from kfp import onprem -from tfx.components.statistics_gen import component as statistics_gen_component -from tfx.dsl.component.experimental import executor_specs -from tfx.dsl.component.experimental.annotations import Parameter -from tfx.dsl.component.experimental.decorators import component -from tfx.dsl.components.base import base_component -from tfx.dsl.io import fileio -from tfx.extensions.google_cloud_big_query.example_gen import component as big_query_example_gen_component -from tfx.orchestration import data_types -from tfx.orchestration import pipeline as tfx_pipeline -from tfx.orchestration.kubeflow import kubeflow_dag_runner -from tfx.orchestration.kubeflow.decorators import FinalStatusStr -from tfx.proto import example_gen_pb2 -from tfx.types import component_spec -from tfx.utils import telemetry_utils -from tfx.utils import test_case_utils -import yaml - -from ml_metadata.proto import metadata_store_pb2 - - -@component -def say_hi(status: Parameter[str]): - print(status) - - -# 2-step pipeline under test. 
-def _two_step_pipeline() -> tfx_pipeline.Pipeline: - default_input_config = json.dumps({ - 'splits': [{ - 'name': 'single_split', - 'pattern': 'SELECT * FROM default-table' - }] - }) - input_config = data_types.RuntimeParameter( - name='input_config', ptype=str, default=default_input_config) - example_gen = big_query_example_gen_component.BigQueryExampleGen( - input_config=input_config, output_config=example_gen_pb2.Output()) - statistics_gen = statistics_gen_component.StatisticsGen( - examples=example_gen.outputs['examples']) - return tfx_pipeline.Pipeline( - pipeline_name='two_step_pipeline', - pipeline_root='pipeline_root', - metadata_connection_config=metadata_store_pb2.ConnectionConfig(), - components=[example_gen, statistics_gen], - ) - - -class _DummySpec(component_spec.ComponentSpec): - INPUTS = {} - OUTPUTS = {} - PARAMETERS = {} - - -class _DummyComponent(base_component.BaseComponent): - SPEC_CLASS = _DummySpec - EXECUTOR_SPEC = executor_specs.TemplatedExecutorContainerSpec( - image='dummy:latest', command=['ls']) - - def __init__(self): - super().__init__(_DummySpec()) - - -def _container_component_pipeline() -> tfx_pipeline.Pipeline: - return tfx_pipeline.Pipeline( - pipeline_name='container_component_pipeline', - pipeline_root='pipeline_root', - metadata_connection_config=metadata_store_pb2.ConnectionConfig(), - components=[_DummyComponent()], - ) - - -class KubeflowDagRunnerTest(test_case_utils.TfxTest): - - def setUp(self): - super().setUp() - self._source_data_dir = os.path.join( - os.path.dirname(os.path.abspath(__file__)), 'testdata') - self.enter_context(test_case_utils.change_working_dir(self.tmp_dir)) - - def _compare_tfx_ir_against_testdata(self, args: List[str], golden_file: str): - index_of_tfx_ir_flag = args.index('--tfx_ir') - self.assertAllGreater(len(args), index_of_tfx_ir_flag) - real_tfx_ir = json.loads(args[index_of_tfx_ir_flag + 1]) - real_tfx_ir_str = json.dumps(real_tfx_ir, sort_keys=True) - with 
open(os.path.join(self._source_data_dir, - golden_file)) as tfx_ir_json_file: - formatted_tfx_ir = json.dumps(json.load(tfx_ir_json_file), sort_keys=True) - self.assertEqual(real_tfx_ir_str, formatted_tfx_ir) - - def testTwoStepPipeline(self): - """Sanity-checks the construction and dependencies for a 2-step pipeline.""" - kubeflow_dag_runner.KubeflowDagRunner().run(_two_step_pipeline()) - file_path = os.path.join(self.tmp_dir, 'two_step_pipeline.tar.gz') - self.assertTrue(fileio.exists(file_path)) - - with tarfile.TarFile.open(file_path).extractfile( - 'pipeline.yaml') as pipeline_file: - self.assertIsNotNone(pipeline_file) - pipeline = yaml.safe_load(pipeline_file) - - containers = [ - c for c in pipeline['spec']['templates'] if 'container' in c - ] - self.assertEqual(2, len(containers)) - - big_query_container = [ - c for c in containers if c['name'] == 'bigqueryexamplegen' - ] - self.assertEqual(1, len(big_query_container)) - self.assertEqual([ - 'python', - '-m', - 'tfx.orchestration.kubeflow.container_entrypoint', - ], big_query_container[0]['container']['command']) - self.assertIn('--tfx_ir', big_query_container[0]['container']['args']) - self.assertIn('--node_id', big_query_container[0]['container']['args']) - self._compare_tfx_ir_against_testdata( - big_query_container[0]['container']['args'], - 'two_step_pipeline_post_dehydrate_ir.json') - - statistics_gen_container = [ - c for c in containers if c['name'] == 'statisticsgen' - ] - self.assertEqual(1, len(statistics_gen_container)) - - # Ensure the pod labels are correctly appended. - metadata = [ - c['metadata'] for c in pipeline['spec']['templates'] if 'dag' not in c - ] - for m in metadata: - self.assertEqual('tfx', m['labels'][telemetry_utils.LABEL_KFP_SDK_ENV]) - - # Ensure dependencies between components are captured. 
- dag = [c for c in pipeline['spec']['templates'] if 'dag' in c] - self.assertEqual(1, len(dag)) - - self.assertEqual( - { - 'tasks': [{ - 'name': 'bigqueryexamplegen', - 'template': 'bigqueryexamplegen', - 'arguments': { - 'parameters': [{ - 'name': 'input_config', - 'value': '{{inputs.parameters.input_config}}' - }, { - 'name': 'pipeline-root', - 'value': '{{inputs.parameters.pipeline-root}}' - }] - } - }, { - 'name': 'statisticsgen', - 'template': 'statisticsgen', - 'arguments': { - 'parameters': [{ - 'name': 'pipeline-root', - 'value': '{{inputs.parameters.pipeline-root}}' - }] - }, - 'dependencies': ['bigqueryexamplegen'], - }] - }, dag[0]['dag']) - - def testDefaultPipelineOperatorFuncs(self): - kubeflow_dag_runner.KubeflowDagRunner().run(_two_step_pipeline()) - file_path = 'two_step_pipeline.tar.gz' - self.assertTrue(fileio.exists(file_path)) - - with tarfile.TarFile.open(file_path).extractfile( - 'pipeline.yaml') as pipeline_file: - self.assertIsNotNone(pipeline_file) - pipeline = yaml.safe_load(pipeline_file) - - containers = [ - c for c in pipeline['spec']['templates'] if 'container' in c - ] - self.assertEqual(2, len(containers)) - - def testMountGcpServiceAccount(self): - kubeflow_dag_runner.KubeflowDagRunner( - config=kubeflow_dag_runner.KubeflowDagRunnerConfig( - pipeline_operator_funcs=kubeflow_dag_runner - .get_default_pipeline_operator_funcs(use_gcp_sa=True))).run( - _two_step_pipeline()) - file_path = 'two_step_pipeline.tar.gz' - self.assertTrue(fileio.exists(file_path)) - - with tarfile.TarFile.open(file_path).extractfile( - 'pipeline.yaml') as pipeline_file: - self.assertIsNotNone(pipeline_file) - pipeline = yaml.safe_load(pipeline_file) - - containers = [ - c for c in pipeline['spec']['templates'] if 'container' in c - ] - self.assertEqual(2, len(containers)) - - # Check that each container has default GCP credentials. 
- - container_0 = containers[0] - env = [ - env for env in container_0['container']['env'] - if env['name'] == 'GOOGLE_APPLICATION_CREDENTIALS' - ] - self.assertEqual(1, len(env)) - self.assertEqual('/secret/gcp-credentials/user-gcp-sa.json', - env[0]['value']) - - container_1 = containers[0] - env = [ - env for env in container_1['container']['env'] - if env['name'] == 'GOOGLE_APPLICATION_CREDENTIALS' - ] - self.assertEqual(1, len(env)) - self.assertEqual('/secret/gcp-credentials/user-gcp-sa.json', - env[0]['value']) - - def testVolumeMountingPipelineOperatorFuncs(self): - mount_volume_op = onprem.mount_pvc('my-persistent-volume-claim', - 'my-volume-name', - '/mnt/volume-mount-path') - config = kubeflow_dag_runner.KubeflowDagRunnerConfig( - pipeline_operator_funcs=[mount_volume_op]) - - kubeflow_dag_runner.KubeflowDagRunner(config=config).run( - _two_step_pipeline()) - file_path = 'two_step_pipeline.tar.gz' - self.assertTrue(fileio.exists(file_path)) - - with tarfile.TarFile.open(file_path).extractfile( - 'pipeline.yaml') as pipeline_file: - self.assertIsNotNone(pipeline_file) - pipeline = yaml.safe_load(pipeline_file) - - container_templates = [ - c for c in pipeline['spec']['templates'] if 'container' in c - ] - self.assertEqual(2, len(container_templates)) - - volumes = [{ - 'name': 'my-volume-name', - 'persistentVolumeClaim': { - 'claimName': 'my-persistent-volume-claim' - } - }] - - # Check that the PVC is specified for kfp<=0.1.31.1. - if 'volumes' in pipeline['spec']: - self.assertEqual(volumes, pipeline['spec']['volumes']) - - for template in container_templates: - # Check that each container has the volume mounted. - self.assertEqual([{ - 'name': 'my-volume-name', - 'mountPath': '/mnt/volume-mount-path' - }], template['container']['volumeMounts']) - - # Check that each template has the PVC specified for kfp>=0.1.31.2. 
- if 'volumes' in template: - self.assertEqual(volumes, template['volumes']) - - def testContainerComponent(self): - kubeflow_dag_runner.KubeflowDagRunner().run(_container_component_pipeline()) - file_path = os.path.join(self.tmp_dir, - 'container_component_pipeline.tar.gz') - self.assertTrue(fileio.exists(file_path)) - - with tarfile.TarFile.open(file_path).extractfile( - 'pipeline.yaml') as pipeline_file: - self.assertIsNotNone(pipeline_file) - pipeline = yaml.safe_load(pipeline_file) - containers = [ - c for c in pipeline['spec']['templates'] if 'container' in c - ] - self.assertLen(containers, 1) - component_args = containers[0]['container']['args'] - self.assertIn('--node_id', component_args) - - def testExitHandler(self): - dag_runner = kubeflow_dag_runner.KubeflowDagRunner() - dag_runner.set_exit_handler(say_hi(status=FinalStatusStr())) - pipeline = _container_component_pipeline() - pipeline.enable_cache = True - dag_runner.run(pipeline) - file_path = os.path.join(self.tmp_dir, - 'container_component_pipeline.tar.gz') - self.assertTrue(fileio.exists(file_path)) - - with tarfile.TarFile.open(file_path).extractfile( - 'pipeline.yaml') as pipeline_file: - self.assertIsNotNone(pipeline_file) - pipeline = yaml.safe_load(pipeline_file) - self.assertIn('onExit', pipeline['spec']) - containers = [ - c for c in pipeline['spec']['templates'] if 'container' in c - ] - self.assertLen(containers, 2) - exit_component_args = ' '.join(containers[1]['container']['args']) - self.assertIn('{{workflow.status}}', exit_component_args) - self.assertNotIn('enableCache', exit_component_args) - first_component_args = ' '.join(containers[0]['container']['args']) - self.assertNotIn('{{workflow.status}}', first_component_args) - self.assertIn('enableCache', first_component_args) diff --git a/tfx/orchestration/kubeflow/proto/BUILD b/tfx/orchestration/kubeflow/proto/BUILD deleted file mode 100644 index b0ee822ee6..0000000000 --- a/tfx/orchestration/kubeflow/proto/BUILD +++ /dev/null @@ 
-1,25 +0,0 @@ -load("//tfx:tfx.bzl", "tfx_py_proto_library") - -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -tfx_py_proto_library( - name = "kubeflow_proto_py_pb2", - srcs = ["kubeflow.proto"], -) diff --git a/tfx/orchestration/kubeflow/proto/kubeflow.proto b/tfx/orchestration/kubeflow/proto/kubeflow.proto deleted file mode 100644 index bab34bdc69..0000000000 --- a/tfx/orchestration/kubeflow/proto/kubeflow.proto +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -syntax = "proto3"; - -package tfx.orchestration.kubeflow.proto; - -// ConfigValue specifies how Kubeflow components should obtain a runtime -// configuration parameter value. -message ConfigValue { - oneof value_from { - // Specifies a literal value to use. 
- string value = 1; - // Specifies that the parameter value should be obtained from the - // environment variable with this specified value. - string environment_variable = 2; - } -} - -// Message to specify the gRPC server configuration. -message KubeflowGrpcMetadataConfig { - // ML Metadata gRPC service host in the cluster. - ConfigValue grpc_service_host = 1; - // ML Metadata gRPC service port in the cluster. - ConfigValue grpc_service_port = 2; -} - -// Message to specify Metadata configuration. -message KubeflowMetadataConfig { - // Following mysql connection configuration fields will be deprecated soon in - // favor of oneof connection_config. - - ConfigValue mysql_db_service_host = 1 [deprecated = true]; - ConfigValue mysql_db_service_port = 2 [deprecated = true]; - ConfigValue mysql_db_name = 3 [deprecated = true]; - ConfigValue mysql_db_user = 4 [deprecated = true]; - ConfigValue mysql_db_password = 5 [deprecated = true]; - - oneof connection_config { - KubeflowGrpcMetadataConfig grpc_config = 7; - } -} diff --git a/tfx/orchestration/pipeline.py b/tfx/orchestration/pipeline.py index dd8e4984a1..cd7e88cea7 100644 --- a/tfx/orchestration/pipeline.py +++ b/tfx/orchestration/pipeline.py @@ -40,7 +40,7 @@ _MAX_PIPELINE_NAME_LENGTH = 63 # Pipeline root is by default specified as a RuntimeParameter when runnning on -# KubeflowDagRunner. This constant offers users an easy access to the pipeline +# KubeflowV2DagRunner. This constant offers users an easy access to the pipeline # root placeholder when defining a pipeline. For example, # # pusher = Pusher( diff --git a/tfx/tools/cli/handler/kubeflow_dag_runner_patcher.py b/tfx/tools/cli/handler/kubeflow_dag_runner_patcher.py deleted file mode 100644 index 01ea50d940..0000000000 --- a/tfx/tools/cli/handler/kubeflow_dag_runner_patcher.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2021 Google LLC. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Patches KubeflowDagRunner to read and update argument during compilation.""" - -import os -import tempfile -import typing -from typing import Any, Callable, MutableMapping, Optional, Type - -from tfx.orchestration import pipeline as tfx_pipeline -from tfx.orchestration import tfx_runner -from tfx.orchestration.kubeflow import kubeflow_dag_runner -from tfx.tools.cli.handler import dag_runner_patcher - - -def _get_temporary_package_filename(pipeline_name: str, directory: str) -> str: - # mkstemp will create and open a file named 'temp_xxxxx.tar.gz'. - fd, path = tempfile.mkstemp('.tar.gz', f'temp_{pipeline_name}', directory) - os.close(fd) - return os.path.basename(path) - - -class KubeflowDagRunnerPatcher(dag_runner_patcher.DagRunnerPatcher): - """Patches KubeflowDagRunner.run() with several customizations for CLI.""" - - USE_TEMPORARY_OUTPUT_FILE = 'use_temporary_output_file' - OUTPUT_FILE_PATH = 'output_file_path' - - def __init__(self, - call_real_run: bool, - use_temporary_output_file: bool = False, - build_image_fn: Optional[Callable[[str], str]] = None): - """Initialize KubeflowDagRunnerPatcher. - - Args: - call_real_run: Specify KubeflowDagRunner.run() should be called. - use_temporary_output_file: If True, we will override the default value of - the pipeline package output path. Even if it is set to True, if users - specified a path in KubeflowDagRunner then this option will be ignored. 
- build_image_fn: If specified, call the function with the configured - tfx_image in the pipeline. The result of the function will be - substituted as a new tfx_image of the pipeline. - """ - super().__init__(call_real_run) - self._build_image_fn = build_image_fn - self._use_temporary_output_file = use_temporary_output_file - - def _before_run(self, runner: tfx_runner.TfxRunner, - pipeline: tfx_pipeline.Pipeline, - context: MutableMapping[str, Any]) -> None: - runner = typing.cast(kubeflow_dag_runner.KubeflowDagRunner, runner) - runner_config = typing.cast(kubeflow_dag_runner.KubeflowDagRunnerConfig, - runner.config) - if self._build_image_fn is not None: - # Replace the image for the pipeline with the newly built image name. - # This new image name will include the sha256 image id. - runner_config.tfx_image = self._build_image_fn(runner_config.tfx_image) - - # pylint: disable=protected-access - context[self.USE_TEMPORARY_OUTPUT_FILE] = ( - runner._output_filename is None and self._use_temporary_output_file) - if context[self.USE_TEMPORARY_OUTPUT_FILE]: - # Replace the output of the kfp compile to a temporary file. - # This file will be deleted after job submission in kubeflow_handler.py - runner._output_filename = _get_temporary_package_filename( - context[self.PIPELINE_NAME], runner._output_dir) - output_filename = ( - runner._output_filename or - kubeflow_dag_runner.get_default_output_filename( - context[self.PIPELINE_NAME])) - context[self.OUTPUT_FILE_PATH] = os.path.join(runner._output_dir, - output_filename) - - def get_runner_class(self) -> Type[tfx_runner.TfxRunner]: - return kubeflow_dag_runner.KubeflowDagRunner diff --git a/tfx/tools/cli/handler/kubeflow_dag_runner_patcher_test.py b/tfx/tools/cli/handler/kubeflow_dag_runner_patcher_test.py deleted file mode 100644 index ef653b5b83..0000000000 --- a/tfx/tools/cli/handler/kubeflow_dag_runner_patcher_test.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2021 Google LLC. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for tfx.tools.cli.handler.kubeflow_dag_runner_patcher.""" - -import os -from unittest import mock - -from tfx.orchestration import pipeline as tfx_pipeline -from tfx.orchestration.kubeflow import kubeflow_dag_runner -from tfx.tools.cli.handler import kubeflow_dag_runner_patcher -from tfx.utils import test_case_utils - - -class KubeflowDagRunnerPatcherTest(test_case_utils.TfxTest): - - def setUp(self): - super().setUp() - self.enter_context(test_case_utils.change_working_dir(self.tmp_dir)) - - def testPatcher(self): - given_image_name = 'foo/bar' - built_image_name = 'foo/bar@sha256:1234567890' - - mock_build_image_fn = mock.MagicMock(return_value=built_image_name) - patcher = kubeflow_dag_runner_patcher.KubeflowDagRunnerPatcher( - call_real_run=True, - build_image_fn=mock_build_image_fn, - use_temporary_output_file=True) - runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( - tfx_image=given_image_name) - runner = kubeflow_dag_runner.KubeflowDagRunner(config=runner_config) - pipeline = tfx_pipeline.Pipeline('dummy', 'dummy_root') - with patcher.patch() as context: - runner.run(pipeline) - self.assertTrue(context[patcher.USE_TEMPORARY_OUTPUT_FILE]) - self.assertIn(patcher.OUTPUT_FILE_PATH, context) - - mock_build_image_fn.assert_called_once_with(given_image_name) - self.assertEqual(runner_config.tfx_image, built_image_name) - - def testPatcherWithOutputFile(self): - output_filename = 'foo.tar.gz' - 
patcher = kubeflow_dag_runner_patcher.KubeflowDagRunnerPatcher( - call_real_run=False, - build_image_fn=None, - use_temporary_output_file=True) - runner = kubeflow_dag_runner.KubeflowDagRunner( - output_filename=output_filename) - pipeline = tfx_pipeline.Pipeline('dummy', 'dummy_root') - with patcher.patch() as context: - runner.run(pipeline) - self.assertFalse(context[patcher.USE_TEMPORARY_OUTPUT_FILE]) - self.assertEqual( - os.path.basename(context[patcher.OUTPUT_FILE_PATH]), output_filename) - self.assertEqual(runner._output_filename, output_filename) diff --git a/tfx/v1/orchestration/experimental/__init__.py b/tfx/v1/orchestration/experimental/__init__.py index df82230e4e..7da280b36e 100644 --- a/tfx/v1/orchestration/experimental/__init__.py +++ b/tfx/v1/orchestration/experimental/__init__.py @@ -13,23 +13,6 @@ # limitations under the License. """TFX orchestration.experimental module.""" -try: - from tfx.orchestration.kubeflow.kubeflow_dag_runner import ( - KubeflowDagRunner, - KubeflowDagRunnerConfig, - get_default_kubeflow_metadata_config, - ) - from tfx.orchestration.kubeflow.decorators import ( - exit_handler, - ) - from tfx.orchestration.kubeflow.decorators import ( - FinalStatusStr, - ) - from tfx.utils.telemetry_utils import LABEL_KFP_SDK_ENV - -except ImportError: # Import will fail without kfp package. - pass - try: from tfx.orchestration.kubeflow.v2.kubeflow_v2_dag_runner import ( KubeflowV2DagRunner, diff --git a/tfx/v1/proto/__init__.py b/tfx/v1/proto/__init__.py index 47eebef596..89a2f60b5c 100644 --- a/tfx/v1/proto/__init__.py +++ b/tfx/v1/proto/__init__.py @@ -140,7 +140,7 @@ """ KubernetesConfig.__doc__ = """ -Kubernetes configuration. We currently only support the use case when infra validator is run by `orchestration.KubeflowDagRunner`. +Kubernetes configuration. Model server will be launched in the same namespace KFP is running on, as well as same service account will be used (unless specified). 
Model server will have `ownerReferences` to the infra validator, which delegates the strict cleanup guarantee to the kubernetes cluster. """ From 0159096ca72af3e8edb151728416767ddd479971 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Wed, 2 Oct 2024 22:02:34 +0000 Subject: [PATCH 02/12] update examples --- .../penguin/penguin_pipeline_kubeflow.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tfx/examples/penguin/penguin_pipeline_kubeflow.py b/tfx/examples/penguin/penguin_pipeline_kubeflow.py index ccb6b35f01..5a59b294bf 100644 --- a/tfx/examples/penguin/penguin_pipeline_kubeflow.py +++ b/tfx/examples/penguin/penguin_pipeline_kubeflow.py @@ -505,6 +505,24 @@ def main(): config=tfx.orchestration.experimental.KubeflowV2DagRunnerConfig(), output_filename=_pipeline_definition_file) + dag_runner.run( + create_pipeline( + pipeline_name=_pipeline_name, + pipeline_root=_pipeline_root, + data_root=_data_root, + module_file=_module_file, + enable_tuning=False, + enable_cache=True, + user_provided_schema_path=_user_provided_schema, + ai_platform_training_args=_ai_platform_training_args, + ai_platform_serving_args=_ai_platform_serving_args, + beam_pipeline_args=beam_pipeline_args, + use_cloud_component=use_cloud_component, + use_aip=use_aip, + use_vertex=use_vertex, + serving_model_dir=_serving_model_dir, + )) + # To compile the pipeline: # python penguin_pipeline_kubeflow.py --use_aip=True or False --use_vertex=True From ec6f6c8bf1a784a464aa4bdef59f21b61cebf7bd Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Wed, 2 Oct 2024 22:33:18 +0000 Subject: [PATCH 03/12] loosen the version constraints for the kfp pipeline spec --- tfx/dependencies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tfx/dependencies.py b/tfx/dependencies.py index 181b9aa020..4740967629 100644 --- a/tfx/dependencies.py +++ b/tfx/dependencies.py @@ -147,7 +147,7 @@ def make_extra_packages_kfp(): """Prepare extra packages needed for Kubeflow Pipelines 
orchestrator.""" return [ "kfp>=2", - "kfp-pipeline-spec>=0.3.0", + "kfp-pipeline-spec>=0.2.2", ] @@ -169,7 +169,7 @@ def make_extra_packages_docker_image(): # Packages needed for tfx docker image. return [ "kfp>=2", - "kfp-pipeline-spec>=0.3.0", + "kfp-pipeline-spec>=0.2.2", "mmh>=2.2,<3", "python-snappy>=0.5,<0.6", # Required for tfx/examples/penguin/penguin_utils_cloud_tuner.py From 54f78fa8e6b1be1cd938f42e92d4c169147d433c Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Wed, 2 Oct 2024 23:34:17 +0000 Subject: [PATCH 04/12] update test constratins to fix conflict --- test_constraints.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_constraints.txt b/test_constraints.txt index 131727aa28..aa1144c6f3 100644 --- a/test_constraints.txt +++ b/test_constraints.txt @@ -13,4 +13,4 @@ Flask-session<0.6.0 #TODO(b/329181965): Remove once we migrate TFX to 2.16. tensorflow<2.16 -tensorflow-text<2.16 \ No newline at end of file +tensorflow-text>=2.15.1,<2.16 From 06fd564dae418be46683227f44a0f2d051ef6267 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Wed, 2 Oct 2024 23:36:43 +0000 Subject: [PATCH 05/12] update test constratins to fix conflict --- test_constraints.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_constraints.txt b/test_constraints.txt index aa1144c6f3..e6b443b414 100644 --- a/test_constraints.txt +++ b/test_constraints.txt @@ -12,5 +12,5 @@ Flask-session<0.6.0 #TODO(b/329181965): Remove once we migrate TFX to 2.16. 
-tensorflow<2.16 -tensorflow-text>=2.15.1,<2.16 +tensorflow>=2.15.1,<2.16 +tensorflow-text<2.16 From f5c57c8739fecf16a413d1244b1a0e3f57d79617 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Thu, 3 Oct 2024 02:49:26 +0000 Subject: [PATCH 06/12] Clean up KFP tests --- .../penguin_pipeline_kubeflow_e2e_test.py | 51 --- .../templates/container_based_test_case.py | 139 -------- .../penguin/e2e_tests/kubeflow_e2e_test.py | 52 --- .../taxi/e2e_tests/kubeflow_e2e_test.py | 119 ------- .../kubeflow/container_entrypoint_test.py | 244 --------------- .../e2e_tests/kubeflow_gcp_perf_test.py | 266 ---------------- tfx/orchestration/kubeflow/test_utils.py | 1 - .../cli/handler/kubeflow_handler_test.py | 296 ------------------ tfx/tools/cli/handler/vertex_handler.py | 9 +- 9 files changed, 7 insertions(+), 1170 deletions(-) delete mode 100644 tfx/experimental/templates/penguin/e2e_tests/kubeflow_e2e_test.py delete mode 100644 tfx/experimental/templates/taxi/e2e_tests/kubeflow_e2e_test.py delete mode 100644 tfx/orchestration/kubeflow/container_entrypoint_test.py delete mode 100644 tfx/orchestration/kubeflow/e2e_tests/kubeflow_gcp_perf_test.py delete mode 100644 tfx/tools/cli/handler/kubeflow_handler_test.py diff --git a/tfx/examples/penguin/penguin_pipeline_kubeflow_e2e_test.py b/tfx/examples/penguin/penguin_pipeline_kubeflow_e2e_test.py index 32453d38fb..0a7932e0e7 100644 --- a/tfx/examples/penguin/penguin_pipeline_kubeflow_e2e_test.py +++ b/tfx/examples/penguin/penguin_pipeline_kubeflow_e2e_test.py @@ -18,7 +18,6 @@ from absl.testing import parameterized from tfx.dsl.io import fileio from tfx.examples.penguin import penguin_pipeline_kubeflow -from tfx.orchestration.kubeflow import test_utils as kubeflow_test_utils from tfx.orchestration.kubeflow.v2.e2e_tests import base_test_case from tfx.utils import io_utils @@ -80,53 +79,3 @@ def testEndToEndPipelineRun(self, use_pipeline_spec_2_1): use_pipeline_spec_2_1=use_pipeline_spec_2_1, ) 
self.assertTrue(fileio.exists(self._serving_model_dir)) - -@pytest.mark.e2e -class PenguinPipelineKubeflowTest(kubeflow_test_utils.BaseKubeflowTest): - - def setUp(self): - super().setUp() - penguin_examples_dir = os.path.join(self._REPO_BASE, 'tfx', 'examples', - 'penguin') - penguin_test_data_root = os.path.join(penguin_examples_dir, 'data') - penguin_test_schema_file = os.path.join(penguin_examples_dir, 'schema', - 'user_provided', 'schema.pbtxt') - self._penguin_module_file = os.path.join(penguin_examples_dir, - 'penguin_utils_cloud_tuner.py') - self._penguin_data_root = os.path.join(self._test_data_dir, 'data') - self._penguin_schema_file = os.path.join(self._test_data_dir, - 'schema.pbtxt') - - io_utils.copy_dir(penguin_test_data_root, self._penguin_data_root) - io_utils.copy_file( - penguin_test_schema_file, self._penguin_schema_file, overwrite=True) - - def testEndToEndPipelineRun(self): - """End-to-end test for pipeline with RuntimeParameter.""" - pipeline_name = 'kubeflow-v1-e2e-test-{}'.format(self._test_id) - kubeflow_pipeline = penguin_pipeline_kubeflow.create_pipeline( - pipeline_name=pipeline_name, - pipeline_root=self._pipeline_root(pipeline_name), - data_root=self._penguin_data_root, - module_file=self._penguin_module_file, - enable_tuning=False, - enable_cache=True, - user_provided_schema_path=self._penguin_schema_file, - ai_platform_training_args=penguin_pipeline_kubeflow - ._ai_platform_training_args, - ai_platform_serving_args=penguin_pipeline_kubeflow - ._ai_platform_serving_args, - beam_pipeline_args=penguin_pipeline_kubeflow - ._beam_pipeline_args_by_runner['DirectRunner'], - use_cloud_component=False, - use_aip=False, - use_vertex=False, - serving_model_dir=self._serving_model_dir) - - parameters = { - 'train-args': '{"num_steps": 100}', - 'eval-args': '{"num_steps": 50}', - } - self._compile_and_run_pipeline( - pipeline=kubeflow_pipeline, parameters=parameters) - self.assertTrue(fileio.exists(self._serving_model_dir)) diff --git 
a/tfx/experimental/templates/container_based_test_case.py b/tfx/experimental/templates/container_based_test_case.py index dce5f4cbab..3e7733c5d9 100644 --- a/tfx/experimental/templates/container_based_test_case.py +++ b/tfx/experimental/templates/container_based_test_case.py @@ -24,7 +24,6 @@ from tfx.dsl.io import fileio from tfx.experimental.templates import test_utils from tfx.orchestration import test_utils as orchestration_test_utils -from tfx.orchestration.kubeflow import test_utils as kubeflow_test_utils from tfx.orchestration.kubeflow.v2 import vertex_client_utils from tfx.utils import docker_utils from tfx.utils import io_utils @@ -111,144 +110,6 @@ def _delete_target_container_image(self): docker_utils.delete_image(self._target_container_image) -class BaseKubeflowEndToEndTest(BaseContainerBasedEndToEndTest): - """Common utilities for kubeflow engine.""" - - _RETRY_LIMIT = 3 - - # This default bucket name is valid for KFP marketplace deployment since KFP - # version 0.5.0. - _BUCKET_NAME = ( - BaseContainerBasedEndToEndTest._GCP_PROJECT_ID + - '-kubeflowpipelines-default') - - def setUp(self): - super().setUp() - self._namespace = 'kubeflow' - self._endpoint = self._get_endpoint(self._namespace) - self._kfp_client = kfp.Client(host=self._endpoint) - logging.info('ENDPOINT: %s', self._endpoint) - self.enter_context( - test_case_utils.override_env_var( - 'KUBEFLOW_HOME', os.path.join(self._temp_dir, 'kubeflow'))) - - def tearDown(self): - super().tearDown() - self._delete_runs() - self._delete_pipeline() - - def _get_endpoint(self, namespace): - cmd = 'kubectl describe configmap inverse-proxy-config -n {}'.format( - namespace) - output = subprocess.check_output(cmd.split()) - for line in output.decode('utf-8').split('\n'): - if line.endswith('googleusercontent.com'): - return line - - def _get_kfp_runs(self): - # CLI uses experiment_name which is the same as pipeline_name. 
- experiment_id = self._kfp_client.get_experiment( - experiment_name=self._pipeline_name).id - response = self._kfp_client.list_runs(experiment_id=experiment_id) - return response.runs - - @retry.retry(ignore_eventual_failure=True) - def _delete_runs(self): - for run in self._get_kfp_runs(): - self._kfp_client._run_api.delete_run(id=run.id) # pylint: disable=protected-access - - @retry.retry(ignore_eventual_failure=True) - def _delete_pipeline(self): - self._runCli([ - 'pipeline', 'delete', '--engine', 'kubeflow', '--pipeline_name', - self._pipeline_name - ]) - - def _parse_run_id(self, output: str): - run_id_lines = [ - line for line in output.split('\n') - if '| {} |'.format(self._pipeline_name) in line - ] - self.assertLen(run_id_lines, 1) - return run_id_lines[0].split('|')[2].strip() - - def _wait_until_completed(self, run_id: str): - end_state = kubeflow_test_utils.poll_kfp_with_retry( - self._endpoint, run_id, self._RETRY_LIMIT, self._TIME_OUT, - self._POLLING_INTERVAL_IN_SECONDS) - self.assertEqual(end_state.lower(), kubeflow_test_utils.KFP_SUCCESS_STATUS) - - def _create_pipeline(self): - self._runCli([ - 'pipeline', - 'create', - '--engine', - 'kubeflow', - '--pipeline_path', - 'kubeflow_runner.py', - '--endpoint', - self._endpoint, - '--build-image', - '--build-base-image', - self._base_container_image, - ]) - - def _compile_pipeline(self): - self._runCli([ - 'pipeline', - 'compile', - '--engine', - 'kubeflow', - '--pipeline_path', - 'kubeflow_runner.py', - ]) - - def _update_pipeline(self): - self._runCli([ - 'pipeline', - 'update', - '--engine', - 'kubeflow', - '--pipeline_path', - 'kubeflow_runner.py', - '--endpoint', - self._endpoint, - '--build-image', - ]) - - def _run_pipeline(self): - result = self._runCli([ - 'run', - 'create', - '--engine', - 'kubeflow', - '--pipeline_name', - self._pipeline_name, - '--endpoint', - self._endpoint, - ]) - run_id = self._parse_run_id(result) - self._wait_until_completed(run_id) - 
kubeflow_test_utils.print_failure_log_for_run(self._endpoint, run_id, - self._namespace) - - def _check_telemetry_label(self): - file_path = os.path.join(self._project_dir, - '{}.tar.gz'.format(self._pipeline_name)) - self.assertTrue(fileio.exists(file_path)) - - with tarfile.TarFile.open(file_path).extractfile( - 'pipeline.yaml') as pipeline_file: - self.assertIsNotNone(pipeline_file) - pipeline = yaml.safe_load(pipeline_file) - metadata = [ - c['metadata'] for c in pipeline['spec']['templates'] if 'dag' not in c - ] - for m in metadata: - self.assertEqual('tfx-template', - m['labels'][telemetry_utils.LABEL_KFP_SDK_ENV]) - - class BaseVertexEndToEndTest(BaseContainerBasedEndToEndTest): """Common utilities for vertex engine.""" diff --git a/tfx/experimental/templates/penguin/e2e_tests/kubeflow_e2e_test.py b/tfx/experimental/templates/penguin/e2e_tests/kubeflow_e2e_test.py deleted file mode 100644 index 25623538df..0000000000 --- a/tfx/experimental/templates/penguin/e2e_tests/kubeflow_e2e_test.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2020 Google LLC. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""E2E test using kubeflow orchestrator for penguin template.""" - -from absl import logging -from tfx.experimental.templates import container_based_test_case - -import pytest - - -@pytest.mark.e2e -class PenguinTemplateKubeflowE2ETest( - container_based_test_case.BaseKubeflowEndToEndTest): - - def _generate_pipeline_name(self, random_id: str): - return f'penguin-template-kubeflow-e2e-test-{random_id}' - - def testPipeline(self): - self._copyTemplate('penguin') - - # Prepare data - self._prepare_data() - self._replaceFileContent('kubeflow_runner.py', [ - ('DATA_PATH = \'gs://{}/tfx-template/data/penguin/\'.format(configs.GCS_BUCKET_NAME)', - 'DATA_PATH = \'gs://{{}}/{}/{}\'.format(configs.GCS_BUCKET_NAME)' - .format(self._DATA_DIRECTORY_NAME, self._pipeline_name)), - ]) - - self._compile_pipeline() - self._check_telemetry_label() - - # Create a pipeline with only one component. - self._create_pipeline() - self._run_pipeline() - - # Update the pipeline to include all components. - updated_pipeline_file = self._addAllComponents() - logging.info('Updated %s to add all components to the pipeline.', - updated_pipeline_file) - self._update_pipeline() - self._run_pipeline() diff --git a/tfx/experimental/templates/taxi/e2e_tests/kubeflow_e2e_test.py b/tfx/experimental/templates/taxi/e2e_tests/kubeflow_e2e_test.py deleted file mode 100644 index d65421e210..0000000000 --- a/tfx/experimental/templates/taxi/e2e_tests/kubeflow_e2e_test.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright 2020 Google LLC. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""E2E test using kubeflow orchestrator for taxi template.""" - -import os - -from absl import logging -from tfx.experimental.templates import container_based_test_case -from tfx.orchestration.kubeflow import test_utils as kubeflow_test_utils - -import pytest - - -@pytest.mark.e2e -class TaxiTemplateKubeflowE2ETest( - container_based_test_case.BaseKubeflowEndToEndTest): - - def tearDown(self): - super().tearDown() - self._delete_caip_model() - - def _generate_pipeline_name(self, random_id: str): - return f'taxi-template-kubeflow-e2e-test-{random_id}' - - # retry is handled by kubeflow_test_utils.delete_ai_platform_model(). - def _delete_caip_model(self): - model_name = self._pipeline_name.replace('-', '_') - kubeflow_test_utils.delete_ai_platform_model(model_name) - - def testPipeline(self): - self._copyTemplate('taxi') - - # Uncomment all variables in config. - self._uncommentMultiLineVariables( - os.path.join('pipeline', 'configs.py'), [ - 'GOOGLE_CLOUD_REGION', - 'BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS', - 'BIG_QUERY_QUERY', 'DATAFLOW_BEAM_PIPELINE_ARGS', - 'GCP_AI_PLATFORM_TRAINING_ARGS', 'GCP_AI_PLATFORM_SERVING_ARGS' - ]) - self._replaceFileContent( - os.path.join('pipeline', 'configs.py'), [ - ('GOOGLE_CLOUD_REGION = \'\'', - 'GOOGLE_CLOUD_REGION = \'{}\''.format(self._GCP_REGION)), - ]) - - # Prepare data - self._prepare_data() - self._replaceFileContent('kubeflow_runner.py', [ - ('DATA_PATH = \'gs://{}/tfx-template/data/taxi/\'.format(configs.GCS_BUCKET_NAME)', - 'DATA_PATH = \'gs://{{}}/{}/{}\'.format(configs.GCS_BUCKET_NAME)' - .format(self._DATA_DIRECTORY_NAME, self._pipeline_name)), - ]) - - self._compile_pipeline() - self._check_telemetry_label() - - # Create a pipeline with only one component. - self._create_pipeline() - self._run_pipeline() - - # Update the pipeline to include all components. 
- updated_pipeline_file = self._addAllComponents() - logging.info('Updated %s to add all components to the pipeline.', - updated_pipeline_file) - self._update_pipeline() - self._run_pipeline() - - # Enable BigQuery - self._uncomment( - os.path.join('pipeline', 'pipeline.py'), [ - 'query: str,', - 'example_gen = tfx.extensions.google_cloud_big_query.BigQueryExampleGen(', - ' query=query)' - ]) - self._uncomment('kubeflow_runner.py', [ - 'query=configs.BIG_QUERY_QUERY', - 'beam_pipeline_args=configs\n', - '.BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS,', - ]) - logging.info('Added BigQueryExampleGen to pipeline.') - self._update_pipeline() - self._run_pipeline() - - # TODO(b/173065862) Re-enable Dataflow tests after timeout is resolved. - # # Enable Dataflow - # self._comment('kubeflow_runner.py', [ - # 'beam_pipeline_args=configs\n', - # '.BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS', - # ]) - # self._uncomment('kubeflow_runner.py', [ - # 'beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS', - # ]) - # logging.info('Added Dataflow to pipeline.') - # self._update_pipeline() - # self._run_pipeline() - - # # Enable CAIP extension. - # self._comment('kubeflow_runner.py', [ - # 'beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS', - # ]) - self._uncomment('kubeflow_runner.py', [ - 'ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS,', - 'ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS,', - ]) - logging.info('Using CAIP trainer and pusher.') - self._update_pipeline() - self._run_pipeline() diff --git a/tfx/orchestration/kubeflow/container_entrypoint_test.py b/tfx/orchestration/kubeflow/container_entrypoint_test.py deleted file mode 100644 index edad32ae4d..0000000000 --- a/tfx/orchestration/kubeflow/container_entrypoint_test.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for tfx.orchestration.kubeflow.container_entrypoint.""" - - -import pytest -import json -import os -from unittest import mock - -from tfx.dsl.io import fileio -from tfx.orchestration import metadata -from tfx.orchestration.kubeflow import container_entrypoint -from tfx.orchestration.kubeflow import kubeflow_dag_runner -from tfx.orchestration.kubeflow.proto import kubeflow_pb2 -from tfx.orchestration.portable import beam_executor_operator -from tfx.orchestration.portable import data_types -from tfx.orchestration.portable import execution_publish_utils -from tfx.orchestration.portable import launcher -from tfx.orchestration.portable import outputs_utils -from tfx.orchestration.portable import python_driver_operator -from tfx.orchestration.portable.mlmd import execution_lib -from tfx.proto.orchestration import driver_output_pb2 -from tfx.proto.orchestration import execution_result_pb2 -from tfx.proto.orchestration import pipeline_pb2 -from tfx.types import standard_artifacts -from tfx.utils import io_utils -from tfx.utils import test_case_utils - -from google.protobuf import json_format -from ml_metadata.proto import metadata_store_pb2 - - -class MLMDConfigTest(test_case_utils.TfxTest): - - def _set_required_env_vars(self, env_vars): - for k, v in env_vars.items(): - self.enter_context(test_case_utils.override_env_var(k, v)) - - def testDeprecatedMysqlMetadataConnectionConfig(self): - self._set_required_env_vars({ - 'mysql_host': 'mysql', - 'mysql_port': '3306', - 'mysql_database': 'metadb', - 'mysql_user_name': 'root', - 'mysql_user_password': 
'test' - }) - - metadata_config = kubeflow_pb2.KubeflowMetadataConfig() - metadata_config.mysql_db_service_host.environment_variable = 'mysql_host' - metadata_config.mysql_db_service_port.environment_variable = 'mysql_port' - metadata_config.mysql_db_name.environment_variable = 'mysql_database' - metadata_config.mysql_db_user.environment_variable = 'mysql_user_name' - metadata_config.mysql_db_password.environment_variable = 'mysql_user_password' - - ml_metadata_config = container_entrypoint._get_metadata_connection_config( - metadata_config) - self.assertIsInstance(ml_metadata_config, - metadata_store_pb2.ConnectionConfig) - self.assertEqual(ml_metadata_config.mysql.host, 'mysql') - self.assertEqual(ml_metadata_config.mysql.port, 3306) - self.assertEqual(ml_metadata_config.mysql.database, 'metadb') - self.assertEqual(ml_metadata_config.mysql.user, 'root') - self.assertEqual(ml_metadata_config.mysql.password, 'test') - - def testGrpcMetadataConnectionConfig(self): - self._set_required_env_vars({ - 'METADATA_GRPC_SERVICE_HOST': 'metadata-grpc', - 'METADATA_GRPC_SERVICE_PORT': '8080', - }) - - grpc_config = kubeflow_pb2.KubeflowGrpcMetadataConfig() - grpc_config.grpc_service_host.environment_variable = 'METADATA_GRPC_SERVICE_HOST' - grpc_config.grpc_service_port.environment_variable = 'METADATA_GRPC_SERVICE_PORT' - metadata_config = kubeflow_pb2.KubeflowMetadataConfig() - metadata_config.grpc_config.CopyFrom(grpc_config) - - ml_metadata_config = container_entrypoint._get_metadata_connection_config( - metadata_config) - self.assertIsInstance(ml_metadata_config, - metadata_store_pb2.MetadataStoreClientConfig) - self.assertEqual(ml_metadata_config.host, 'metadata-grpc') - self.assertEqual(ml_metadata_config.port, 8080) - - def testDumpUiMetadata(self): - trainer = pipeline_pb2.PipelineNode() - trainer.node_info.type.name = 'tfx.components.trainer.component.Trainer' - model_run_out_spec = pipeline_pb2.OutputSpec( - artifact_spec=pipeline_pb2.OutputSpec.ArtifactSpec( - 
type=metadata_store_pb2.ArtifactType( - name=standard_artifacts.ModelRun.TYPE_NAME))) - trainer.outputs.outputs['model_run'].CopyFrom(model_run_out_spec) - - model_run = standard_artifacts.ModelRun() - model_run.uri = 'model_run_uri' - exec_info = data_types.ExecutionInfo( - input_dict={}, - output_dict={'model_run': [model_run]}, - exec_properties={}, - execution_id='id') - ui_metadata_path = os.path.join( - os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()), - self._testMethodName, 'json') - fileio.makedirs(os.path.dirname(ui_metadata_path)) - container_entrypoint._dump_ui_metadata( - trainer, exec_info, ui_metadata_path) - with open(ui_metadata_path) as f: - ui_metadata = json.load(f) - self.assertEqual('tensorboard', ui_metadata['outputs'][-1]['type']) - self.assertEqual('model_run_uri', ui_metadata['outputs'][-1]['source']) - - def testDumpUiMetadataWithPreExistingFile(self): - dummy_node = pipeline_pb2.PipelineNode() - dummy_node.node_info.type.name = 'class_path_for_dummy_node.DummyComponent' - exec_info = data_types.ExecutionInfo( - input_dict={}, output_dict={}, exec_properties={}, execution_id='id') - - ui_metadata_path = os.path.join( - os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()), - self._testMethodName, 'json') - fileio.makedirs(os.path.dirname(ui_metadata_path)) - - # Check with valid file - example_ui_metadata_item = { - 'type': 'table', - 'storage': 'gcs', - 'format': 'csv', - 'header': ['example-header1', 'example-header2'], - 'source': 'gs://example-data-source/example.csv', - } - with fileio.open(ui_metadata_path, 'w') as f: - f.write(json.dumps({'outputs': [example_ui_metadata_item]})) - - container_entrypoint._dump_ui_metadata(dummy_node, exec_info, - ui_metadata_path) - - with open(ui_metadata_path) as f: - ui_metadata = json.load(f) - self.assertLen(ui_metadata['outputs'], 2) - self.assertTrue( - any('markdown' == item['type'] for item in ui_metadata['outputs'])) - self.assertTrue( - any('table' == 
item['type'] for item in ui_metadata['outputs'])) - - # Check with invalid file - invalid_contents = [ - json.dumps({'wrong_key': [{ - 'foo': 1 - }]}), - json.dumps({'outputs': [1]}), # not a dictionary item - 'not a json', - ] - for content in invalid_contents: - with fileio.open(ui_metadata_path, 'w') as f: - f.write(content) - - container_entrypoint._dump_ui_metadata(dummy_node, exec_info, - ui_metadata_path) - - with open(ui_metadata_path) as f: - ui_metadata = json.load(f) - self.assertLen(ui_metadata['outputs'], 1) - self.assertEqual('markdown', ui_metadata['outputs'][0]['type']) - - @pytest.mark.xfail(run=False, reason="PR 6889 This test fails and needs to be fixed. " -"If this test passes, please remove this mark.", strict=True) - def testOverrideRegisterExecution(self): - # Mock all real operations of driver / executor / MLMD accesses. - mock_targets = ( # (cls, method, return_value) - (beam_executor_operator.BeamExecutorOperator, '__init__', None), - (beam_executor_operator.BeamExecutorOperator, 'run_executor', - execution_result_pb2.ExecutorOutput()), - (python_driver_operator.PythonDriverOperator, '__init__', None), - (python_driver_operator.PythonDriverOperator, 'run_driver', - driver_output_pb2.DriverOutput()), - (metadata.Metadata, '__init__', None), - (metadata.Metadata, '__exit__', None), - (launcher.Launcher, '_publish_successful_execution', None), - (launcher.Launcher, '_clean_up_stateless_execution_info', None), - (launcher.Launcher, '_clean_up_stateful_execution_info', None), - (outputs_utils, 'OutputsResolver', mock.MagicMock()), - (execution_lib, 'get_executions_associated_with_all_contexts', []), - (container_entrypoint, '_dump_ui_metadata', None), - ) - for cls, method, return_value in mock_targets: - self.enter_context( - mock.patch.object( - cls, method, autospec=True, return_value=return_value)) - - mock_mlmd = self.enter_context( - mock.patch.object(metadata.Metadata, '__enter__', - autospec=True)).return_value - 
mock_mlmd.store.return_value.get_executions_by_id.return_value = [ - metadata_store_pb2.Execution() - ] - - self._set_required_env_vars({ - 'WORKFLOW_ID': 'workflow-id-42', - 'METADATA_GRPC_SERVICE_HOST': 'metadata-grpc', - 'METADATA_GRPC_SERVICE_PORT': '8080', - container_entrypoint._KFP_POD_NAME_ENV_KEY: 'test_pod_name' - }) - - mock_register_execution = self.enter_context( - mock.patch.object( - execution_publish_utils, 'register_execution', - autospec=True)) - - test_ir_file = os.path.join( - os.path.dirname(os.path.abspath(__file__)), 'testdata', - 'two_step_pipeline_post_dehydrate_ir.json') - test_ir = io_utils.read_string_file(test_ir_file) - - argv = [ - '--pipeline_root', - 'dummy', - '--kubeflow_metadata_config', - json_format.MessageToJson( - kubeflow_dag_runner.get_default_kubeflow_metadata_config()), - '--tfx_ir', - test_ir, - '--node_id', - 'BigQueryExampleGen', - '--runtime_parameter', - 'pipeline-run-id=STRING:my-run-id', - ] - container_entrypoint.main(argv) - - mock_register_execution.assert_called_once() - kwargs = mock_register_execution.call_args[1] - self.assertEqual( - kwargs['exec_properties'][ - container_entrypoint._KFP_POD_NAME_PROPERTY_KEY], 'test_pod_name') diff --git a/tfx/orchestration/kubeflow/e2e_tests/kubeflow_gcp_perf_test.py b/tfx/orchestration/kubeflow/e2e_tests/kubeflow_gcp_perf_test.py deleted file mode 100644 index 493cd6f62c..0000000000 --- a/tfx/orchestration/kubeflow/e2e_tests/kubeflow_gcp_perf_test.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright 2020 Google LLC. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Integration tests for TFX-on-KFP and GCP services.""" - -import datetime -import os -import subprocess - -from absl import logging -import kfp - -from tfx.dsl.io import fileio -from tfx.examples.penguin import penguin_pipeline_kubeflow -from tfx.orchestration import data_types -from tfx.orchestration import pipeline as tfx_pipeline -from tfx.orchestration import test_utils -from tfx.orchestration.kubeflow import kubeflow_dag_runner -from tfx.orchestration.kubeflow import test_utils as kubeflow_test_utils - -import pytest - - -@pytest.mark.perf -@pytest.mark.e2e -class KubeflowGcpPerfTest(kubeflow_test_utils.BaseKubeflowTest): - - # The endpoint of the KFP instance. - # This test fixture assumes an established KFP instance authenticated via - # inverse proxy. - _KFP_ENDPOINT = os.environ['KFP_E2E_ENDPOINT'] - - # The namespace where KFP is deployed. - _KFP_NAMESPACE = 'kubeflow' - - # Timeout for a single pipeline run. Set to 6 hours. - # TODO(b/158009615): Tune this timeout to align with our observation. - # Note: the Chicago Taxi dataset is a dataset growing with time. The 6 hour - # timeout here was calibrated according to our empirical study in - # b/150222976. This might need to be adjusted occasionally. - _TIME_OUT = datetime.timedelta(hours=6) - - # KFP client polling interval, in seconds - _POLLING_INTERVAL = 60 - - # TODO(b/156784019): temporary workaround. - # Number of retries when `get_run` returns remote error. - _N_RETRIES = 5 - - # The base container image name to use when building the image used in tests. 
- _BASE_CONTAINER_IMAGE = os.environ['KFP_E2E_BASE_CONTAINER_IMAGE'] - - # The project id to use to run tests. - _GCP_PROJECT_ID = os.environ['KFP_E2E_GCP_PROJECT_ID'] - - # The GCP region in which the end-to-end test is run. - _GCP_REGION = os.environ['KFP_E2E_GCP_REGION'] - - # The GCP zone in which the cluster is created. - _GCP_ZONE = os.environ['KFP_E2E_GCP_ZONE'] - - # The GCP bucket to use to write output artifacts. - _BUCKET_NAME = os.environ['KFP_E2E_BUCKET_NAME'] - - # The GCP GKE cluster name where the KFP deployment is installed. - _CLUSTER_NAME = os.environ['KFP_E2E_CLUSTER_NAME'] - - # The location of test user module file. - # It is retrieved from inside the container subject to testing. - # This location depends on install path of TFX in the docker image. - _MODULE_FILE = '/opt/conda/lib/python3.10/site-packages/tfx/examples/penguin/penguin_utils_cloud_tuner.py' - - # Parameterize worker type/count for easily ramping up the pipeline scale. - _WORKER_COUNT = data_types.RuntimeParameter( - name='worker_count', - default=2, - ptype=int, - ) - - _WORKER_TYPE = data_types.RuntimeParameter( - name='worker_type', - default='standard', - ptype=str, - ) - - # Parameterize parameter server count for easily ramping up the scale. - _PARAMETER_SERVER_COUNT = data_types.RuntimeParameter( - name='parameter_server_count', - default=1, - ptype=int, - ) - - _MODEL_NAME = 'penguin' - - _AI_PLATFORM_SERVING_ARGS = { - 'model_name': _MODEL_NAME, - 'project_id': _GCP_PROJECT_ID, - 'regions': [_GCP_REGION], - } - - # TODO(b/151114974): Remove `disk_size_gb` flag after default is increased. - # TODO(b/156874687): Remove `machine_type` after IP addresses are no longer a - # scaling bottleneck. - # TODO(b/171733562): Remove `use_runner_v2` once it is the default for - #. Dataflow. 
- _BEAM_PIPELINE_ARGS = [ - '--runner=DataflowRunner', - '--project=' + _GCP_PROJECT_ID, - '--temp_location=gs://' + os.path.join(_BUCKET_NAME, 'dataflow', 'tmp'), - '--region=' + _GCP_REGION, - - # In order not to consume in-use global IP addresses by Dataflow workers, - # configure workers to not use public IPs. If workers needs access to - # public Internet, CloudNAT needs to be configured for the VPC in which - # Dataflow runs. - '--no_use_public_ips', - - # Temporary overrides of defaults. - '--disk_size_gb=50', - '--machine_type=e2-standard-8', - '--experiments=use_runner_v2', - ] - - @classmethod - def tearDownClass(cls): - super(kubeflow_test_utils.BaseKubeflowTest, cls).tearDownClass() - # Delete the cluster created in the test. - delete_cluster_command = [ - 'gcloud', 'container', 'clusters', 'delete', cls._CLUSTER_NAME, - '--region=%s' % cls._GCP_ZONE, '--quiet' - ] - logging.info( - subprocess.check_output(delete_cluster_command).decode('utf-8')) - - def _get_workflow_name(self, pipeline_name: str) -> str: - """Gets the Argo workflow name using pipeline name.""" - get_workflow_name_command = ( - 'argo --namespace %s list | grep -o "%s[^ ]*"' % - (self._KFP_NAMESPACE, pipeline_name)) - # Need to explicitly decode because the test fixture is running on - # Python 3.5. Also need to remove the new line at the end of the string. - return subprocess.check_output( - get_workflow_name_command, shell=True).decode('utf-8')[:-1] - - def _get_workflow_log(self, pipeline_name: str) -> str: - """Gets the workflow log for all the pods using pipeline name.""" - get_workflow_log_command = [ - 'argo', '--namespace', self._KFP_NAMESPACE, 'logs', '-w', - self._get_workflow_name(pipeline_name) - ] - # Need to explicitly decode because the test fixture is running on - # Python 3.5. 
- return subprocess.check_output(get_workflow_log_command).decode('utf-8') - - def _assert_successful_run_completion(self, host: str, run_id: str, - pipeline_name: str, - timeout: datetime.timedelta): - """Waits and asserts a successful KFP pipeline execution. - - Args: - host: the endpoint of the KFP deployment. - run_id: the run ID of the execution, can be obtained from the respoonse - when submitting the pipeline. - pipeline_name: the name of the pipeline under test. - timeout: maximal waiting time for this execution, in timedelta. - - Raises: - RuntimeError: when timeout exceeds after waiting for specified duration. - """ - - status = kubeflow_test_utils.poll_kfp_with_retry( - host=host, - run_id=run_id, - retry_limit=self._N_RETRIES, - timeout=timeout, - polling_interval=self._POLLING_INTERVAL) - - workflow_log = self._get_workflow_log(pipeline_name) - - self.assertEqual( - status.lower(), kubeflow_test_utils.KFP_SUCCESS_STATUS, - 'Pipeline %s failed to complete successfully: %s' % - (pipeline_name, workflow_log)) - - def _compile_and_run_pipeline(self, pipeline: tfx_pipeline.Pipeline, - **kwargs): - """Compiles and runs a KFP pipeline. - - In this method, provided TFX pipeline will be submitted via kfp.Client() - instead of from Argo. - - Args: - pipeline: The logical pipeline to run. - **kwargs: Key-value pairs of runtime paramters passed to the pipeline - execution. 
- """ - client = kfp.Client(host=self._KFP_ENDPOINT) - - pipeline_name = pipeline.pipeline_info.pipeline_name - config = kubeflow_dag_runner.KubeflowDagRunnerConfig( - kubeflow_metadata_config=self._get_kubeflow_metadata_config(), - tfx_image=self.container_image) - kubeflow_dag_runner.KubeflowDagRunner(config=config).run(pipeline) - - file_path = os.path.join(self.tmp_dir, '{}.tar.gz'.format(pipeline_name)) - self.assertTrue(fileio.exists(file_path)) - - run_result = client.create_run_from_pipeline_package( - pipeline_file=file_path, arguments=kwargs) - run_id = run_result.run_id - - self._assert_successful_run_completion( - host=self._KFP_ENDPOINT, - run_id=run_id, - pipeline_name=pipeline_name, - timeout=self._TIME_OUT) - - def testFullTaxiGcpPipeline(self): - pipeline_name = 'gcp-perf-test-full-e2e-test-{}'.format( - test_utils.random_id()) - - # Custom CAIP training job using a testing image. - ai_platform_training_args = { - 'project': self._GCP_PROJECT_ID, - 'region': self._GCP_REGION, - 'scaleTier': 'CUSTOM', - 'masterType': 'large_model', - 'masterConfig': { - 'imageUri': self.container_image - }, - 'workerType': self._WORKER_TYPE, - 'parameterServerType': 'standard', - 'workerCount': self._WORKER_COUNT, - 'parameterServerCount': self._PARAMETER_SERVER_COUNT - } - - pipeline = penguin_pipeline_kubeflow.create_pipeline( - pipeline_name=pipeline_name, - pipeline_root=self._pipeline_root(pipeline_name), - module_file=self._MODULE_FILE, - ai_platform_training_args=ai_platform_training_args, - ai_platform_serving_args=self._AI_PLATFORM_SERVING_ARGS, - beam_pipeline_args=self._BEAM_PIPELINE_ARGS) - # TODO(b/162451308): Add this clean-up back after we re-enable AIP pusher - # when AIP prediction service supports TF>=2.3. 
- # self.addCleanup(kubeflow_test_utils.delete_ai_platform_model, - # self._MODEL_NAME) - self._compile_and_run_pipeline( - pipeline=pipeline, - query_sample_rate=1, - # (1M * batch_size=200) / 200M records ~ 1 epoch - train_steps=1000000, - eval_steps=10000, - worker_count=20, - parameter_server_count=3, - ) diff --git a/tfx/orchestration/kubeflow/test_utils.py b/tfx/orchestration/kubeflow/test_utils.py index add7e13968..88533eac2e 100644 --- a/tfx/orchestration/kubeflow/test_utils.py +++ b/tfx/orchestration/kubeflow/test_utils.py @@ -43,7 +43,6 @@ from tfx.dsl.placeholder import placeholder as ph from tfx.orchestration import pipeline as tfx_pipeline from tfx.orchestration import test_utils -from tfx.orchestration.kubeflow import kubeflow_dag_runner from tfx.orchestration.kubeflow.proto import kubeflow_pb2 from tfx.proto import infra_validator_pb2 from tfx.proto import pusher_pb2 diff --git a/tfx/tools/cli/handler/kubeflow_handler_test.py b/tfx/tools/cli/handler/kubeflow_handler_test.py deleted file mode 100644 index 6288b26617..0000000000 --- a/tfx/tools/cli/handler/kubeflow_handler_test.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tests for tfx.tools.cli.handler.kubeflow_handler.""" - -import datetime -import os -import sys -from unittest import mock - -import kfp - -from tfx.dsl.components.base import base_driver -from tfx.dsl.io import fileio -from tfx.tools.cli import labels -from tfx.tools.cli.handler import kubeflow_dag_runner_patcher -from tfx.tools.cli.handler import kubeflow_handler -from tfx.utils import test_case_utils - - -class _MockRunResponse: - - def __init__(self, pipeline_name, run_id, status, created_at): - self.pipeline_spec = mock.MagicMock() - self.pipeline_spec.pipeline_name = pipeline_name - self.id = run_id - self.status = status - self.created_at = created_at - - -class KubeflowHandlerTest(test_case_utils.TfxTest): - - def setUp(self): - super().setUp() - - # Flags for handler. - self.engine = 'kubeflow' - self.chicago_taxi_pipeline_dir = os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'testdata') - - self.enter_context(test_case_utils.change_working_dir(self.tmp_dir)) - self.enter_context( - test_case_utils.override_env_var('KFP_E2E_BASE_CONTAINER_IMAGE', - 'dummy-image')) - self.enter_context( - test_case_utils.override_env_var('KFP_E2E_BUCKET_NAME', 'dummy-bucket')) - self.enter_context( - test_case_utils.override_env_var('KFP_E2E_TEST_DATA_ROOT', - 'dummy-root')) - - self.pipeline_path = os.path.join(self.chicago_taxi_pipeline_dir, - 'test_pipeline_kubeflow_1.py') - self.pipeline_name = 'chicago_taxi_pipeline_kubeflow' - - # Kubeflow client params. 
- self.endpoint = 'dummyEndpoint' - self.namespace = 'kubeflow' - self.iap_client_id = 'dummyID' - - self.runtime_parameter = {'a': '1', 'b': '2'} - - default_flags = { - labels.ENGINE_FLAG: self.engine, - labels.ENDPOINT: self.endpoint, - labels.IAP_CLIENT_ID: self.iap_client_id, - labels.NAMESPACE: self.namespace, - } - - self.flags_with_name = { - **default_flags, - labels.PIPELINE_NAME: self.pipeline_name, - } - - self.flags_with_runtime_param = { - **default_flags, - labels.PIPELINE_NAME: self.pipeline_name, - labels.RUNTIME_PARAMETER: self.runtime_parameter, - } - - self.flags_with_dsl_path = { - **default_flags, - labels.PIPELINE_DSL_PATH: self.pipeline_path, - } - - # Pipeline args for mocking subprocess. - self.pipeline_args = {'pipeline_name': 'chicago_taxi_pipeline_kubeflow'} - self.pipeline_id = 'the_pipeline_id' - self.experiment_id = 'the_experiment_id' - self.pipeline_version_id = 'the_pipeline_version_id' - - mock_client_cls = self.enter_context( - mock.patch.object(kfp, 'Client', autospec=True)) - self.mock_client = mock_client_cls.return_value - # Required to access generated apis. 
- self.mock_client._experiment_api = mock.MagicMock() - - self.mock_client.get_pipeline_id.return_value = self.pipeline_id - self.mock_client.get_experiment.return_value.id = self.experiment_id - versions = [mock.MagicMock()] - versions[0].id = self.pipeline_version_id - self.mock_client.list_pipeline_versions.return_value.versions = versions - - def testCreatePipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_dsl_path) - - self.mock_client.get_pipeline_id.return_value = None - self.mock_client.upload_pipeline.return_value.id = 'new_pipeline_id' - - handler.create_pipeline() - - self.mock_client.upload_pipeline.assert_called_once_with( - pipeline_package_path=mock.ANY, - pipeline_name=self.pipeline_name) - self.mock_client.create_experiment.assert_called_once_with( - self.pipeline_name) - self.mock_client.upload_pipeline_version.assert_not_called() - - def testCreatePipelineExistentPipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_dsl_path) - - # 'the_pipeline_id' will be returned. 
- with self.assertRaises(SystemExit) as err: - handler.create_pipeline() - self.assertIn( - f'Pipeline "{self.pipeline_args[labels.PIPELINE_NAME]}" already exists.', - str(err.exception)) - self.mock_client.upload_pipeline.assert_not_called() - - def testUpdatePipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_dsl_path) - - # Update test_pipeline and run update_pipeline - handler.update_pipeline() - - self.mock_client.upload_pipeline.assert_not_called() - self.mock_client.create_experiment.assert_not_called() - self.mock_client.upload_pipeline_version.assert_called_once_with( - pipeline_package_path=mock.ANY, - pipeline_version_name=mock.ANY, - pipeline_id=self.pipeline_id) - - def testUpdatePipelineNoPipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_dsl_path) - - self.mock_client.get_pipeline_id.return_value = None - - with self.assertRaises(SystemExit) as err: - handler.update_pipeline() - self.assertIn(f'Cannot find pipeline "{self.pipeline_name}".', - str(err.exception)) - - self.mock_client.upload_pipeline.assert_not_called() - self.mock_client.upload_pipeline_version.assert_not_called() - - def testCompilePipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_dsl_path) - with self.captureWritesToStream(sys.stdout) as captured: - handler.compile_pipeline() - self.assertIn('Pipeline compiled successfully', captured.contents()) - self.assertIn('Pipeline package path', captured.contents()) - - def testDeletePipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_name) - - handler.delete_pipeline() - - self.mock_client.delete_pipeline.assert_called_once_with(self.pipeline_id) - self.mock_client._experiment_api.delete_experiment.assert_called_once_with( - self.experiment_id) - - def testDeletePipelineNonExistentPipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_name) - - self.mock_client.get_pipeline_id.return_value = None - - with 
self.assertRaises(SystemExit) as err: - handler.delete_pipeline() - self.assertIn(f'Cannot find pipeline "{self.pipeline_name}".', - str(err.exception)) - self.mock_client.delete_pipeline.assert_not_called() - self.mock_client._experiment_api.delete_experiment.assert_not_called() - - @mock.patch.object( - kubeflow_handler.KubeflowHandler, 'execute_dsl', autospec=True) - def testGetSchema(self, mock_execute_dsl): - temp_pipeline_root = os.path.join(self.tmp_dir, 'pipeline_root') - - handler = kubeflow_handler.KubeflowHandler( - {labels.ENGINE_FLAG: self.engine}) - assert isinstance(handler, kubeflow_handler.KubeflowHandler) - mock_execute_dsl.return_value = { - kubeflow_dag_runner_patcher.KubeflowDagRunnerPatcher.PIPELINE_NAME: - self.pipeline_name, - kubeflow_dag_runner_patcher.KubeflowDagRunnerPatcher.PIPELINE_ROOT: - temp_pipeline_root - } - - # No pipeline root - with self.assertRaises(SystemExit) as err: - handler.get_schema() - self.assertEqual( - str(err.exception), - 'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.' - ) - - # No SchemaGen output. - fileio.makedirs(temp_pipeline_root) - with self.assertRaises(SystemExit) as err: - handler.get_schema() - self.assertEqual( - str(err.exception), - 'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.' - ) - - # Successful pipeline run. - # Create fake schema in pipeline root. 
- component_output_dir = os.path.join(temp_pipeline_root, 'SchemaGen') - schema_path = base_driver._generate_output_uri( # pylint: disable=protected-access - component_output_dir, 'schema', 3) - fileio.makedirs(schema_path) - with open(os.path.join(schema_path, 'schema.pbtxt'), 'w') as f: - f.write('SCHEMA') - with self.captureWritesToStream(sys.stdout) as captured: - handler.get_schema() - curr_dir_path = os.path.join(os.getcwd(), 'schema.pbtxt') - self.assertIn('Path to schema: {}'.format(curr_dir_path), - captured.contents()) - self.assertIn( - '*********SCHEMA FOR {}**********'.format( - self.pipeline_name.upper()), captured.contents()) - self.assertTrue(fileio.exists(curr_dir_path)) - - def testCreateRun(self): - self.mock_client.run_pipeline.return_value = _MockRunResponse( - self.pipeline_name, '1', 'Success', datetime.datetime.now()) - - handler = kubeflow_handler.KubeflowHandler(self.flags_with_runtime_param) - with self.captureWritesToStream(sys.stdout) as captured: - handler.create_run() - self.assertIn('Run created for pipeline: ', captured.contents()) - self.mock_client.run_pipeline.assert_called_once_with( - experiment_id=self.experiment_id, - job_name=self.pipeline_name, - params={ - 'a': '1', - 'b': '2' - }, - version_id=self.pipeline_version_id) - - def testCreateRunNoPipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_name) - - self.mock_client.get_pipeline_id.return_value = None - - with self.assertRaises(SystemExit) as err: - handler.create_run() - self.assertIn(f'Cannot find pipeline "{self.pipeline_name}".', - str(err.exception)) - self.mock_client.run_pipeline.assert_not_called() - - def testListRuns(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_name) - - self.mock_client.list_runs.return_value.runs = [ - _MockRunResponse(self.pipeline_name, '1', 'Success', - datetime.datetime.now()), - _MockRunResponse(self.pipeline_name, '2', 'Failed', - datetime.datetime.now()), - ] - - with 
self.captureWritesToStream(sys.stdout) as captured: - handler.list_runs() - - self.mock_client.list_runs.assert_called_once_with( - experiment_id=self.experiment_id) - self.assertIn('pipeline_name', captured.contents()) - - def testListRunsNoPipeline(self): - handler = kubeflow_handler.KubeflowHandler(self.flags_with_name) - - self.mock_client.get_pipeline_id.return_value = None - - with self.assertRaises(SystemExit) as err: - handler.list_runs() - self.assertIn(f'Cannot find pipeline "{self.pipeline_name}".', - str(err.exception)) diff --git a/tfx/tools/cli/handler/vertex_handler.py b/tfx/tools/cli/handler/vertex_handler.py index 9cb92e5191..50dee8716f 100644 --- a/tfx/tools/cli/handler/vertex_handler.py +++ b/tfx/tools/cli/handler/vertex_handler.py @@ -17,17 +17,22 @@ import os import sys import click +from typing import Optional from google.cloud import aiplatform from google.cloud.aiplatform import pipeline_jobs from tfx.dsl.io import fileio from tfx.tools.cli import labels +from tfx.tools.cli.container_builder import builder from tfx.tools.cli.handler import base_handler -from tfx.tools.cli.handler import kubeflow_handler from tfx.tools.cli.handler import kubeflow_v2_dag_runner_patcher from tfx.utils import io_utils +def create_container_image(image: str, base_image: Optional[str]) -> str: + built_image = builder.build(target_image=image, base_image=base_image) + click.echo(f'New container image "{built_image}" was built.') + return built_image class VertexHandler(base_handler.BaseHandler): """Helper methods for Vertex Handler.""" @@ -40,7 +45,7 @@ def create_pipeline(self, update: bool = False) -> None: """ if self.flags_dict.get(labels.BUILD_IMAGE): build_image_fn = functools.partial( - kubeflow_handler.create_container_image, + create_container_image, base_image=self.flags_dict.get(labels.BASE_IMAGE)) else: build_image_fn = None From 5b335e72c4c51565f42cdd557885cb610286c3a7 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Thu, 3 Oct 2024 02:53:52 +0000 
Subject: [PATCH 07/12] Fix precommit errors --- .../templates/container_based_test_case.py | 6 - tfx/orchestration/kubeflow/test_utils.py | 245 ------------------ 2 files changed, 251 deletions(-) diff --git a/tfx/experimental/templates/container_based_test_case.py b/tfx/experimental/templates/container_based_test_case.py index 3e7733c5d9..bd048e8c27 100644 --- a/tfx/experimental/templates/container_based_test_case.py +++ b/tfx/experimental/templates/container_based_test_case.py @@ -15,22 +15,16 @@ import datetime import os -import subprocess -import tarfile from absl import logging from google.cloud import aiplatform -import kfp -from tfx.dsl.io import fileio from tfx.experimental.templates import test_utils from tfx.orchestration import test_utils as orchestration_test_utils from tfx.orchestration.kubeflow.v2 import vertex_client_utils from tfx.utils import docker_utils from tfx.utils import io_utils from tfx.utils import retry -from tfx.utils import telemetry_utils from tfx.utils import test_case_utils -import yaml class BaseContainerBasedEndToEndTest(test_utils.BaseEndToEndTest): diff --git a/tfx/orchestration/kubeflow/test_utils.py b/tfx/orchestration/kubeflow/test_utils.py index 88533eac2e..9dccf1f778 100644 --- a/tfx/orchestration/kubeflow/test_utils.py +++ b/tfx/orchestration/kubeflow/test_utils.py @@ -380,248 +380,3 @@ def delete_ai_platform_model(model_name): check=True) -class BaseKubeflowTest(test_case_utils.TfxTest): - """Base class that defines testing harness for pipeline on KubeflowRunner.""" - - _POLLING_INTERVAL_IN_SECONDS = 10 - - # The following environment variables need to be set prior to calling the test - # in this file. All variables are required and do not have a default. - - try: - # The base container image name to use when building the image used in tests. 
- _BASE_CONTAINER_IMAGE = os.environ['KFP_E2E_BASE_CONTAINER_IMAGE'] - - # The src path to use to build docker image - _REPO_BASE = os.environ['KFP_E2E_SRC'] - - # The project id to use to run tests. - _GCP_PROJECT_ID = os.environ['KFP_E2E_GCP_PROJECT_ID'] - - # The GCP region in which the end-to-end test is run. - _GCP_REGION = os.environ['KFP_E2E_GCP_REGION'] - - # The GCP bucket to use to write output artifacts. - _BUCKET_NAME = os.environ['KFP_E2E_BUCKET_NAME'] - - # The location of test data. The input files are copied to a test-local - # location for each invocation, and cleaned up at the end of test. - _TEST_DATA_ROOT = os.environ['KFP_E2E_TEST_DATA_ROOT'] - except KeyError as err: - pytest.skip(f"Environment variable {err} not found.", allow_module_level=True) - - # The location of test user module. Will be packaged and copied to under the - # pipeline root before pipeline execution. - _MODULE_ROOT = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(__file__))), - 'components/testdata/module_file') - - @classmethod - def setUpClass(cls): - super(BaseKubeflowTest, cls).setUpClass() - - if ':' not in cls._BASE_CONTAINER_IMAGE: - # Generate base container image for the test if tag is not specified. - cls.container_image = '{}:{}'.format(cls._BASE_CONTAINER_IMAGE, - test_utils.random_id()) - - # Create a container image for use by test pipelines. - test_utils.build_and_push_docker_image(cls.container_image, - cls._REPO_BASE) - else: # Use the given image as a base image. - cls.container_image = cls._BASE_CONTAINER_IMAGE - - @classmethod - def tearDownClass(cls): - super(BaseKubeflowTest, cls).tearDownClass() - - if cls.container_image != cls._BASE_CONTAINER_IMAGE: - # Delete container image used in tests. 
- logging.info('Deleting image %s', cls.container_image) - docker_utils.delete_image(cls.container_image) - - def setUp(self): - super().setUp() - self._test_id = test_utils.random_id() - self.enter_context(test_case_utils.change_working_dir(self.tmp_dir)) - self._test_output_dir = 'gs://{}/test_output'.format(self._BUCKET_NAME) - self._test_data_dir = 'gs://{}/test_data/{}'.format(self._BUCKET_NAME, - self._test_id) - io_utils.copy_dir(self._TEST_DATA_ROOT, self._test_data_dir) - - self._data_root = os.path.join(self._test_data_dir, 'external', 'csv') - - self._transform_module = os.path.join(self._MODULE_ROOT, - 'transform_module.py') - self._trainer_module = os.path.join(self._MODULE_ROOT, 'trainer_module.py') - self._serving_model_dir = os.path.join(self._test_output_dir, 'output') - - self.addCleanup(self._delete_test_dir, self._test_id) - - @retry.retry(ignore_eventual_failure=True) - def _delete_test_dir(self, test_id: str): - """Deletes files for this test including the module file and data files.""" - logging.info('Deleting test data: %s', self._test_data_dir) - io_utils.delete_dir(self._test_data_dir) - - @retry.retry(ignore_eventual_failure=True) - def _delete_workflow(self, workflow_name: str): - """Deletes the specified Argo workflow.""" - logging.info('Deleting workflow %s', workflow_name) - subprocess.run(['argo', '--namespace', 'kubeflow', 'delete', workflow_name], - check=True) - - def _run_workflow(self, - workflow_file: str, - workflow_name: str, - parameter: Dict[str, str] = None): - """Runs the specified workflow with Argo. - - Blocks until the workflow has run (successfully or not) to completion. - - Args: - workflow_file: YAML file with Argo workflow spec for the pipeline. - workflow_name: Name to use for the workflow. - parameter: mapping from pipeline parameter name to its runtime value. - """ - - # TODO(ajaygopinathan): Consider using KFP cli instead. 
- def _format_parameter(parameter: Dict[str, Any]) -> List[str]: - """Format the pipeline parameter section of argo workflow.""" - if parameter: - result = [] - for k, v in parameter.items(): - result.append('-p') - result.append('{}={}'.format(k, v)) - return result - else: - return [] - - run_command = [ - 'argo', - 'submit', - '--name', - workflow_name, - '--namespace', - 'kubeflow', - '--serviceaccount', - 'pipeline-runner', - workflow_file, - ] - run_command += _format_parameter(parameter) - logging.info('Launching workflow %s with parameter %s', workflow_name, - _format_parameter(parameter)) - with test_utils.Timer('RunningPipelineToCompletion'): - subprocess.run(run_command, check=True) - # Wait in the loop while pipeline is pending or running state. - status = 'Pending' - while status in ('Pending', 'Running'): - time.sleep(self._POLLING_INTERVAL_IN_SECONDS) - status = self._get_argo_pipeline_status(workflow_name) - - @retry.retry(ignore_eventual_failure=True) - def _delete_pipeline_output(self, pipeline_name: str): - """Deletes output produced by the named pipeline.""" - io_utils.delete_dir(self._pipeline_root(pipeline_name)) - - def _pipeline_root(self, pipeline_name: str): - return os.path.join(self._test_output_dir, pipeline_name) - - def _create_pipeline(self, pipeline_name: str, - components: List[BaseComponent], - beam_pipeline_args: Optional[List[str]] = None): - """Creates a pipeline given name and list of components.""" - return tfx_pipeline.Pipeline( - pipeline_name=pipeline_name, - pipeline_root=self._pipeline_root(pipeline_name), - components=components, - enable_cache=True, - beam_pipeline_args=beam_pipeline_args, - ) - - def _create_dataflow_pipeline(self, - pipeline_name: str, - components: List[BaseComponent], - wait_until_finish_ms: int = 1000 * 60 * 20): - """Creates a pipeline with Beam DataflowRunner.""" - beam_pipeline_args = [ - '--runner=TestDataflowRunner', - '--wait_until_finish_duration=%d' % wait_until_finish_ms, - '--project=' + 
self._GCP_PROJECT_ID, - '--temp_location=' + - os.path.join(self._pipeline_root(pipeline_name), 'tmp'), - '--region=' + self._GCP_REGION, - - # TODO(b/171733562): Remove `use_runner_v2` once it is the default for - # Dataflow. - '--experiments=use_runner_v2', - ] - return self._create_pipeline( - pipeline_name, components, beam_pipeline_args=beam_pipeline_args) - - def _get_kubeflow_metadata_config( - self) -> kubeflow_pb2.KubeflowMetadataConfig: - config = kubeflow_dag_runner.get_default_kubeflow_metadata_config() - return config - - def _get_argo_pipeline_status(self, workflow_name: str) -> str: - """Get Pipeline status. - - Args: - workflow_name: The name of the workflow. - - Returns: - Simple status string which is returned from `argo get` command. - """ - get_workflow_command = [ - 'argo', '--namespace', 'kubeflow', 'get', workflow_name - ] - output = subprocess.check_output(get_workflow_command).decode('utf-8') - logging.info('Argo output ----\n%s', output) - match = re.search(r'^Status:\s+(.+)$', output, flags=re.MULTILINE) - self.assertIsNotNone(match) - return match.group(1) - - def _compile_and_run_pipeline(self, - pipeline: tfx_pipeline.Pipeline, - workflow_name: str = None, - parameters: Dict[str, Any] = None): - """Compiles and runs a KFP pipeline. - - Args: - pipeline: The logical pipeline to run. - workflow_name: The argo workflow name, default to pipeline name. - parameters: Value of runtime paramters of the pipeline. 
- """ - pipeline_name = pipeline.pipeline_info.pipeline_name - config = kubeflow_dag_runner.KubeflowDagRunnerConfig( - kubeflow_metadata_config=self._get_kubeflow_metadata_config(), - tfx_image=self.container_image) - kubeflow_dag_runner.KubeflowDagRunner(config=config).run(pipeline) - - file_path = os.path.join(self.tmp_dir, '{}.tar.gz'.format(pipeline_name)) - self.assertTrue(fileio.exists(file_path)) - tarfile.TarFile.open(file_path).extract('pipeline.yaml') - pipeline_file = os.path.join(self.tmp_dir, 'pipeline.yaml') - self.assertIsNotNone(pipeline_file) - - workflow_name = workflow_name or pipeline_name - # Ensure cleanup regardless of whether pipeline succeeds or fails. - self.addCleanup(self._delete_workflow, workflow_name) - self.addCleanup(self._delete_pipeline_output, pipeline_name) - - # Run the pipeline to completion. - self._run_workflow(pipeline_file, workflow_name, parameters) - - # Obtain workflow logs. - get_logs_command = [ - 'argo', '--namespace', 'kubeflow', 'logs', '-w', workflow_name - ] - logs_output = subprocess.check_output(get_logs_command).decode('utf-8') - - # Check if pipeline completed successfully. 
- status = self._get_argo_pipeline_status(workflow_name) - self.assertEqual( - 'Succeeded', status, 'Pipeline {} failed to complete successfully: {}' - '\nFailed workflow logs:\n{}'.format(pipeline_name, status, - logs_output)) From 333d02f8732ff0dee56904b2200c1937c2e135a5 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Thu, 3 Oct 2024 02:57:35 +0000 Subject: [PATCH 08/12] Fix pre-commit errors --- tfx/orchestration/kubeflow/test_utils.py | 47 +----------------------- 1 file changed, 1 insertion(+), 46 deletions(-) diff --git a/tfx/orchestration/kubeflow/test_utils.py b/tfx/orchestration/kubeflow/test_utils.py index 9dccf1f778..90ee675f2d 100644 --- a/tfx/orchestration/kubeflow/test_utils.py +++ b/tfx/orchestration/kubeflow/test_utils.py @@ -16,11 +16,9 @@ import datetime import json import os -import re import subprocess -import tarfile import time -from typing import Any, Dict, List, Optional +from typing import List from absl import logging import kfp @@ -39,11 +37,7 @@ from tfx.dsl.components.base.base_component import BaseComponent from tfx.dsl.components.common import resolver from tfx.dsl.input_resolution.strategies import latest_artifact_strategy -from tfx.dsl.io import fileio from tfx.dsl.placeholder import placeholder as ph -from tfx.orchestration import pipeline as tfx_pipeline -from tfx.orchestration import test_utils -from tfx.orchestration.kubeflow.proto import kubeflow_pb2 from tfx.proto import infra_validator_pb2 from tfx.proto import pusher_pb2 from tfx.proto import trainer_pb2 @@ -52,13 +46,8 @@ from tfx.types import component_spec from tfx.types import standard_artifacts from tfx.types.standard_artifacts import Model -from tfx.utils import docker_utils -from tfx.utils import io_utils from tfx.utils import kube_utils from tfx.utils import retry -from tfx.utils import test_case_utils - -import pytest # TODO(jiyongjung): Merge with kube_utils.PodStatus @@ -346,37 +335,3 @@ def create_e2e_components( ] -@retry.retry(ignore_eventual_failure=True) 
-def delete_ai_platform_model(model_name): - """Delete pushed model with the given name in AI Platform.""" - # In order to delete model, all versions in the model must be deleted first. - versions_command = ('gcloud', 'ai-platform', 'versions', 'list', - '--model={}'.format(model_name), '--region=global') - # The return code of the following subprocess call will be explicitly checked - # using the logic below, so we don't need to call check_output(). - versions = subprocess.run(versions_command, stdout=subprocess.PIPE) # pylint: disable=subprocess-run-check - if versions.returncode == 0: - logging.info('Model %s has versions %s', model_name, versions.stdout) - # The first stdout line is headers, ignore. The columns are - # [NAME] [DEPLOYMENT_URI] [STATE] - # - # By specification of test case, the last version in the output list is the - # default version, which will be deleted last in the for loop, so there's no - # special handling needed hear. - # The operation setting default version is at - # https://github.com/tensorflow/tfx/blob/65633c772f6446189e8be7c6332d32ea221ff836/tfx/extensions/google_cloud_ai_platform/runner.py#L309 - for version in versions.stdout.decode('utf-8').strip('\n').split('\n')[1:]: - version = version.split()[0] - logging.info('Deleting version %s of model %s', version, model_name) - version_delete_command = ('gcloud', '--quiet', 'ai-platform', 'versions', - 'delete', version, - '--model={}'.format(model_name), - '--region=global') - subprocess.run(version_delete_command, check=True) - - logging.info('Deleting model %s', model_name) - subprocess.run(('gcloud', '--quiet', 'ai-platform', 'models', 'delete', - model_name, '--region=global'), - check=True) - - From 3c71d6a550ce94904f8396d2146210cb068d7b95 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Thu, 3 Oct 2024 02:59:36 +0000 Subject: [PATCH 09/12] Fix pre-commit errors --- tfx/orchestration/kubeflow/test_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/tfx/orchestration/kubeflow/test_utils.py b/tfx/orchestration/kubeflow/test_utils.py index 90ee675f2d..89a1f2f432 100644 --- a/tfx/orchestration/kubeflow/test_utils.py +++ b/tfx/orchestration/kubeflow/test_utils.py @@ -16,7 +16,6 @@ import datetime import json import os -import subprocess import time from typing import List @@ -47,7 +46,6 @@ from tfx.types import standard_artifacts from tfx.types.standard_artifacts import Model from tfx.utils import kube_utils -from tfx.utils import retry # TODO(jiyongjung): Merge with kube_utils.PodStatus From 10543686e909f889f82534fbf64f993ff51dc8f9 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Thu, 3 Oct 2024 03:01:22 +0000 Subject: [PATCH 10/12] Fix pre-commit errors --- tfx/orchestration/kubeflow/test_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tfx/orchestration/kubeflow/test_utils.py b/tfx/orchestration/kubeflow/test_utils.py index 89a1f2f432..50f87104ce 100644 --- a/tfx/orchestration/kubeflow/test_utils.py +++ b/tfx/orchestration/kubeflow/test_utils.py @@ -331,5 +331,3 @@ def create_e2e_components( infra_validator, pusher, ] - - From efb0b5f6f55a337d9b4d76c08b76a5a2236d6751 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Thu, 3 Oct 2024 03:15:43 +0000 Subject: [PATCH 11/12] Fix pre-commit errors --- .../kubeflow_dataflow_integration_test.py | 106 ---- .../kubeflow/e2e_tests/kubeflow_e2e_test.py | 279 ---------- .../kubeflow_gcp_integration_test.py | 481 ------------------ 3 files changed, 866 deletions(-) delete mode 100644 tfx/orchestration/kubeflow/e2e_tests/kubeflow_dataflow_integration_test.py delete mode 100644 tfx/orchestration/kubeflow/e2e_tests/kubeflow_e2e_test.py delete mode 100644 tfx/orchestration/kubeflow/e2e_tests/kubeflow_gcp_integration_test.py diff --git a/tfx/orchestration/kubeflow/e2e_tests/kubeflow_dataflow_integration_test.py b/tfx/orchestration/kubeflow/e2e_tests/kubeflow_dataflow_integration_test.py deleted file mode 100644 index 5bc1ac9e5e..0000000000 --- 
a/tfx/orchestration/kubeflow/e2e_tests/kubeflow_dataflow_integration_test.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2019 Google LLC. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Integration tests for Kubeflow-based orchestrator and Dataflow.""" - -import os - -from tfx.components.evaluator.component import Evaluator -from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen -from tfx.components.statistics_gen.component import StatisticsGen -from tfx.components.transform.component import Transform -from tfx.dsl.components.common import importer -from tfx.orchestration import test_utils -from tfx.orchestration.kubeflow import test_utils as kubeflow_test_utils -from tfx.proto import evaluator_pb2 -from tfx.types import standard_artifacts - -import pytest - - -# TODO(b/202799145): Check whether dataflow jobs have actually been launched. -@pytest.mark.integration -@pytest.mark.e2e -class KubeflowDataflowIntegrationTest(kubeflow_test_utils.BaseKubeflowTest): - - def setUp(self): - super().setUp() - - # Example artifacts for testing. - self.raw_examples_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'csv_example_gen'), - artifact_type=standard_artifacts.Examples, - reimport=True, - properties={ - 'split_names': '["train", "eval"]' - }).with_id('raw_examples') - - # Schema artifact for testing. 
- self.schema_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'schema_gen'), - artifact_type=standard_artifacts.Schema, - reimport=True).with_id('schema') - - # Model artifact for testing. - self.model_1_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'trainer', 'previous'), - artifact_type=standard_artifacts.Model, - reimport=True).with_id('model_1') - - def testCsvExampleGenOnDataflowRunner(self): - """CsvExampleGen-only test pipeline on DataflowRunner invocation.""" - pipeline_name = 'kubeflow-csv-example-gen-dataflow-test-{}'.format( - test_utils.random_id()) - pipeline = self._create_dataflow_pipeline(pipeline_name, [ - CsvExampleGen(input_base=self._data_root), - ]) - self._compile_and_run_pipeline(pipeline) - - def testStatisticsGenOnDataflowRunner(self): - """StatisticsGen-only test pipeline on DataflowRunner.""" - pipeline_name = 'kubeflow-statistics-gen-dataflow-test-{}'.format( - test_utils.random_id()) - pipeline = self._create_dataflow_pipeline(pipeline_name, [ - self.raw_examples_importer, - StatisticsGen(examples=self.raw_examples_importer.outputs['result']) - ]) - self._compile_and_run_pipeline(pipeline) - - def testTransformOnDataflowRunner(self): - """Transform-only test pipeline on DataflowRunner.""" - pipeline_name = 'kubeflow-transform-dataflow-test-{}'.format( - test_utils.random_id()) - pipeline = self._create_dataflow_pipeline(pipeline_name, [ - self.raw_examples_importer, self.schema_importer, - Transform( - examples=self.raw_examples_importer.outputs['result'], - schema=self.schema_importer.outputs['result'], - module_file=self._transform_module) - ]) - self._compile_and_run_pipeline(pipeline) - - def testEvaluatorOnDataflowRunner(self): - """Evaluator-only test pipeline on DataflowRunner.""" - pipeline_name = 'kubeflow-evaluator-dataflow-test-{}'.format( - test_utils.random_id()) - pipeline = self._create_dataflow_pipeline(pipeline_name, [ - self.raw_examples_importer, 
self.model_1_importer, - Evaluator( - examples=self.raw_examples_importer.outputs['result'], - model=self.model_1_importer.outputs['result'], - feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ - evaluator_pb2.SingleSlicingSpec( - column_for_slicing=['trip_start_hour']) - ])) - ]) - self._compile_and_run_pipeline(pipeline) diff --git a/tfx/orchestration/kubeflow/e2e_tests/kubeflow_e2e_test.py b/tfx/orchestration/kubeflow/e2e_tests/kubeflow_e2e_test.py deleted file mode 100644 index 8eba5787aa..0000000000 --- a/tfx/orchestration/kubeflow/e2e_tests/kubeflow_e2e_test.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright 2019 Google LLC. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""End to end tests for Kubeflow-based orchestrator.""" - -import os -import subprocess -import time -from typing import List - -from absl import logging -from grpc import insecure_channel -from tfx.dsl.io import fileio -from tfx.orchestration import test_utils -from tfx.orchestration.experimental.core.testing import test_dynamic_exec_properties_pipeline -from tfx.orchestration.kubeflow import test_utils as kubeflow_test_utils -from tfx.orchestration.test_pipelines import download_grep_print_pipeline -from tfx.types import standard_artifacts - -from ml_metadata.proto import metadata_store_pb2 -from ml_metadata.proto import metadata_store_service_pb2 -from ml_metadata.proto import metadata_store_service_pb2_grpc - -import pytest - - -# The range of port-forwarding addresses used by Kubeflow E2E test. -# If the current specified address is occupied, the test will scan forward until -# a unused port is met, or stop at _KFP_E2E_TEST_FORWARDING_PORT_END. -_KFP_E2E_TEST_FORWARDING_PORT_BEGIN = 8081 -_KFP_E2E_TEST_FORWARDING_PORT_END = 8888 - -# Number of attempts to bind one port. -_MAX_ATTEMPTS = 5 - -# Context name of pipeline contexts. -_CONTEXT_TYPE_PIPELINE = 'pipeline' - - -@pytest.mark.e2e -class KubeflowEndToEndTest(kubeflow_test_utils.BaseKubeflowTest): - - @classmethod - def setUpClass(cls): - # Initializes the port-forward process to talk MLMD. - super().setUpClass() - cls._port_forwarding_process = cls._setup_mlmd_port_forward() - - @classmethod - def tearDownClass(cls): - super(KubeflowEndToEndTest, cls).tearDownClass() - - # Delete container image used in tests. 
- logging.info('Killing the GRPC port-forwarding process.') - cls._port_forwarding_process.kill() - - @classmethod - def _get_grpc_port(cls) -> str: - """Get the port number used by MLMD gRPC server.""" - get_grpc_port_command = [ - 'kubectl', '-n', 'kubeflow', 'get', 'configmap', - 'metadata-grpc-configmap', '-o', - 'jsonpath={.data.METADATA_GRPC_SERVICE_PORT}' - ] - - grpc_port = subprocess.check_output(get_grpc_port_command) - return grpc_port.decode('utf-8') - - @classmethod - def _setup_mlmd_port_forward(cls) -> subprocess.Popen: - """Uses port forward to talk to MLMD gRPC server.""" - grpc_port = cls._get_grpc_port() - - is_bind = False - forwarded_port = None - - for port in range(_KFP_E2E_TEST_FORWARDING_PORT_BEGIN, - _KFP_E2E_TEST_FORWARDING_PORT_END): - grpc_forward_command = [ - 'kubectl', 'port-forward', 'deployment/metadata-grpc-deployment', - '-n', 'kubeflow', ('%s:%s' % (port, grpc_port)) - ] - # Begin port forwarding. - proc = subprocess.Popen(grpc_forward_command) - try: - # Wait while port forward to pod is being established - poll_grpc_port_command = ['lsof', '-i', ':%s' % port] - result = subprocess.run( # pylint: disable=subprocess-run-check - poll_grpc_port_command, - stdout=subprocess.PIPE) - for _ in range(_MAX_ATTEMPTS): - if (result.returncode == 0 and - 'kubectl' in result.stdout.decode('utf-8')): - is_bind = True - break - logging.info( - 'Waiting while gRPC port-forward is being established...') - time.sleep(5) - result = subprocess.run( # pylint: disable=subprocess-run-check - poll_grpc_port_command, - stdout=subprocess.PIPE) - - except Exception as e: - logging.exception("An unexpected error occurred", exc_info = e) - # Kill the process in case unexpected error occurred. 
- proc.kill() - - if is_bind: - forwarded_port = port - break - - if not is_bind: - raise RuntimeError('Failed to establish gRPC port-forward to cluster in ' - 'the specified range: port %s to %s' % - (_KFP_E2E_TEST_FORWARDING_PORT_BEGIN, - _KFP_E2E_TEST_FORWARDING_PORT_END)) - - # Establish MLMD gRPC channel. - forwarding_channel = insecure_channel('localhost:%s' % forwarded_port) - cls._stub = metadata_store_service_pb2_grpc.MetadataStoreServiceStub( - forwarding_channel) - - return proc - - def _get_artifacts_with_type_and_pipeline( - self, type_name: str, - pipeline_name: str) -> List[metadata_store_pb2.Artifact]: - """Helper function returns artifacts of specified pipeline and type.""" - # 1. Find the pipeline context according to its name. - request = metadata_store_service_pb2.GetContextByTypeAndNameRequest( - type_name=_CONTEXT_TYPE_PIPELINE, context_name=pipeline_name) - pipeline_context = self._stub.GetContextByTypeAndName(request) - # 2. Find the artifacts associated with the pipeline context. - request = metadata_store_service_pb2.GetArtifactsByContextRequest( - context_id=pipeline_context.context.id) - artifacts_response = self._stub.GetArtifactsByContext(request) - # 3. Find the specified artifact type id. - artifact_type_request = metadata_store_service_pb2.GetArtifactTypeRequest( - type_name=type_name) - artifact_type = self._stub.GetArtifactType( - artifact_type_request).artifact_type - # 4. Filter the returned artifacts according to their types and return. 
- return [ - artifact for artifact in artifacts_response.artifacts - if artifact.type_id == artifact_type.id - ] - - def _get_value_of_string_artifact( - self, string_artifact: metadata_store_pb2.Artifact) -> str: - """Helper function returns the actual value of a ValueArtifact.""" - - string_artifact_obj = standard_artifacts.String() - string_artifact_obj.uri = string_artifact.uri - string_artifact_obj.read() - return string_artifact_obj.value - - def _get_executions_by_pipeline_name( - self, pipeline_name: str) -> List[metadata_store_pb2.Execution]: - """Helper function returns executions under a given pipeline name.""" - # step 1: get context id by context name - request = metadata_store_service_pb2.GetContextByTypeAndNameRequest( - type_name='pipeline', context_name=pipeline_name) - context_id = self._stub.GetContextByTypeAndName(request).context.id - # step 2: get executions by context id - request = metadata_store_service_pb2.GetExecutionsByContextRequest( - context_id=context_id) - return self._stub.GetExecutionsByContext(request).executions - - def _get_executions_by_pipeline_name_and_state( - self, pipeline_name: str, state: metadata_store_pb2.Execution.State - ) -> List[metadata_store_pb2.Execution]: - """Helper function returns executions for a given state.""" - executions = self._get_executions_by_pipeline_name(pipeline_name) - result = [] - for e in executions: - if e.last_known_state == state: - result.append(e) - - return result - - def _assert_infra_validator_passed(self, pipeline_name: str): - artifacts = self._get_artifacts_with_type_and_pipeline( - type_name='InfraBlessing', pipeline_name=pipeline_name) - self.assertGreaterEqual(len(artifacts), 1) - for artifact in artifacts: - blessed = os.path.join(artifact.uri, 'INFRA_BLESSED') - self.assertTrue( - fileio.exists(blessed), - 'Expected InfraBlessing results cannot be found under path %s for ' - 'artifact %s' % (blessed, artifact)) - - def testSimpleEnd2EndPipeline(self): - """End-to-End test for 
simple pipeline.""" - pipeline_name = 'kubeflow-e2e-test-{}'.format(test_utils.random_id()) - # Test data is copied from the repository(tfx/components/testdata/) to an - # ephemeral location in GCS bucket(BaseKubeflowTest._BUCKET_NAME). - # See kubeflow_test_utils.BaseKubeflowTest.setUp() for the detail. - components = kubeflow_test_utils.create_e2e_components( - self._pipeline_root(pipeline_name), - self._data_root, - self._transform_module, - self._trainer_module, - ) - pipeline = self._create_pipeline(pipeline_name, components) - - self._compile_and_run_pipeline(pipeline) - self._assert_infra_validator_passed(pipeline_name) - - def testPrimitiveEnd2EndPipeline(self): - """End-to-End test for primitive artifacts passing.""" - pipeline_name = 'kubeflow-primitive-e2e-test-{}'.format( - test_utils.random_id()) - components = kubeflow_test_utils.create_primitive_type_components( - pipeline_name) - # Test that the pipeline can be executed successfully. - pipeline = self._create_pipeline(pipeline_name, components) - self._compile_and_run_pipeline( - pipeline=pipeline, workflow_name=pipeline_name + '-run-1') - # Test if the correct value has been passed. - str_artifacts = self._get_artifacts_with_type_and_pipeline( - type_name='String', pipeline_name=pipeline_name) - # There should be exactly one string artifact. - self.assertEqual(1, len(str_artifacts)) - self.assertEqual( - self._get_value_of_string_artifact(str_artifacts[0]), - 'hello %s\n' % pipeline_name) - # Test caching. 
- self._compile_and_run_pipeline( - pipeline=pipeline, workflow_name=pipeline_name + '-run-2') - cached_execution = self._get_executions_by_pipeline_name_and_state( - pipeline_name=pipeline_name, - state=metadata_store_pb2.Execution.State.CACHED) - self.assertEqual(2, len(cached_execution)) - - def testCreateContainerComponentEnd2EndPipeline(self): - """End-to-End test for container components.""" - pipeline_name = 'kubeflow-container-e2e-test-{}'.format( - test_utils.random_id()) - text_url = ( - 'https://storage.googleapis.com/ml-pipeline-playground/hamlet.txt') - pattern = 'art thou' - component_instances = download_grep_print_pipeline.create_pipeline_component_instances( - text_url=text_url, - pattern=pattern, - ) - # Test that the pipeline can be executed successfully. - pipeline = self._create_pipeline(pipeline_name, component_instances) - self._compile_and_run_pipeline( - pipeline=pipeline, workflow_name=pipeline_name) - # Test if the correct value has been passed. - artifacts = self._get_artifacts_with_type_and_pipeline( - type_name='ExternalArtifact', pipeline_name=pipeline_name) - # There should be exactly two artifacts. - self.assertEqual(len(artifacts), 2) - for artifact in artifacts: - # TODO(b/150515270) Remove the '/data' suffix when b/150515270 is fixed. 
- artifact_value = fileio.open(artifact.uri + '/data', 'r').read() - self.assertGreater(len(artifact_value), 100) - - def testDynamicPropertiesEnd2EndPipeline(self): - pipeline_name = 'kubeflow-dynamic-exec-e2e-test-{}'.format( - test_utils.random_id()) - components = test_dynamic_exec_properties_pipeline.create_components() - pipeline = self._create_pipeline(pipeline_name, components) - self._compile_and_run_pipeline( - pipeline=pipeline, workflow_name=pipeline_name) - artifacts = self._get_artifacts_with_type_and_pipeline( - type_name='String', pipeline_name=pipeline_name) - self.assertEqual(len(artifacts), 1) diff --git a/tfx/orchestration/kubeflow/e2e_tests/kubeflow_gcp_integration_test.py b/tfx/orchestration/kubeflow/e2e_tests/kubeflow_gcp_integration_test.py deleted file mode 100644 index 86b6686132..0000000000 --- a/tfx/orchestration/kubeflow/e2e_tests/kubeflow_gcp_integration_test.py +++ /dev/null @@ -1,481 +0,0 @@ -# Copyright 2019 Google LLC. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Integration tests for Kubeflow-based orchestrator and GCP backend.""" - -import os - -import absl -from googleapiclient import discovery -from googleapiclient import errors as googleapiclient_errors -from tfx import v1 as tfx -from tfx.components.pusher.component import Pusher -from tfx.components.trainer.component import Trainer -from tfx.dsl.components.base import executor_spec -from tfx.dsl.components.common import importer -from tfx.dsl.io import fileio -from tfx.extensions.google_cloud_ai_platform import constants -from tfx.extensions.google_cloud_ai_platform import runner -from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor -from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor -from tfx.extensions.google_cloud_ai_platform.tuner import component as ai_platform_tuner_component -from tfx.extensions.google_cloud_ai_platform.tuner import executor as ai_platform_tuner_executor -from tfx.extensions.google_cloud_big_query.pusher import executor as bigquery_pusher_executor -from tfx.orchestration import test_utils -from tfx.orchestration.kubeflow import test_utils as kubeflow_test_utils -from tfx.proto import trainer_pb2 -from tfx.proto import tuner_pb2 -from tfx.types import standard_artifacts -from tfx.utils import path_utils -from tfx.utils import telemetry_utils - -import pytest - - -@pytest.mark.integration -@pytest.mark.e2e -class KubeflowGCPIntegrationTest(kubeflow_test_utils.BaseKubeflowTest): - - def setUp(self): - super().setUp() - - # Transformed Example artifacts for testing. - self.transformed_examples_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'transform', - 'transformed_examples'), - artifact_type=standard_artifacts.Examples, - reimport=True, - properties={ - 'split_names': '["train", "eval"]' - }).with_id('transformed_examples') - - # Schema artifact for testing. 
- self.schema_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'schema_gen'), - artifact_type=standard_artifacts.Schema, - reimport=True).with_id('schema') - - # TransformGraph artifact for testing. - self.transform_graph_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'transform', - 'transform_graph'), - artifact_type=standard_artifacts.TransformGraph, - reimport=True).with_id('transform_graph') - - # Model artifact for testing. - self.model_1_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'trainer', 'previous'), - artifact_type=standard_artifacts.Model, - reimport=True).with_id('model_1') - - self.model_2_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'trainer', 'current'), - artifact_type=standard_artifacts.Model, - reimport=True).with_id('model_2') - - # ModelBlessing artifact for testing. - self.model_blessing_1_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'model_validator', - 'blessed'), - artifact_type=standard_artifacts.ModelBlessing, - reimport=True, - custom_properties={ - 'blessed': 1 - }).with_id('model_blessing_1') - - self.model_blessing_2_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'model_validator', - 'blessed'), - artifact_type=standard_artifacts.ModelBlessing, - reimport=True, - custom_properties={ - 'blessed': 1 - }).with_id('model_blessing_2') - - ### Test data and modules for native Keras trainer and tuner. 
- self._penguin_tuner_module = os.path.join(self._MODULE_ROOT, - 'tuner_module.py') - self.penguin_examples_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'penguin', 'data'), - artifact_type=standard_artifacts.Examples, - reimport=True, - properties={ - 'split_names': '["train", "eval"]' - }).with_id('penguin_examples') - self.penguin_schema_importer = importer.Importer( - source_uri=os.path.join(self._test_data_dir, 'penguin', 'schema'), - artifact_type=standard_artifacts.Schema, - reimport=True).with_id('penguin_schema') - - def _getCaipTrainingArgs(self, pipeline_name): - """Training args for Google CAIP Training.""" - return { - 'project': self._GCP_PROJECT_ID, - 'region': self._GCP_REGION, - 'jobDir': os.path.join(self._pipeline_root(pipeline_name), 'tmp'), - 'masterConfig': { - 'imageUri': self.container_image, - }, - } - - def _getCaipTrainingArgsForDistributed(self, pipeline_name): - """Training args to test that distributed training is behaves properly.""" - args = self._getCaipTrainingArgs(pipeline_name) - args.update({ - 'scaleTier': 'CUSTOM', - 'masterType': 'large_model', - 'parameterServerType': 'standard', - 'parameterServerCount': 1, - 'workerType': 'standard', - 'workerCount': 2, - }) - return args - - def _getVertexTrainingArgs(self, pipeline_name): - """Training args for Google Vertex AI Training.""" - return { - 'project': self._GCP_PROJECT_ID, - 'job_spec': { - 'worker_pool_specs': [{ - 'machine_spec': { - 'machine_type': 'e2-standard-8' - }, - 'replica_count': 1, - 'container_spec': { - 'image_uri': self.container_image - } - }] - } - } - - def _assertNumberOfTrainerOutputIsOne(self, pipeline_name): - """Make sure the number of trainer executions and output models.""" - # There must be only one execution of Trainer. 
- trainer_output_base_dir = os.path.join( - self._pipeline_root(pipeline_name), 'Trainer', 'model') - trainer_outputs = fileio.listdir(trainer_output_base_dir) - self.assertEqual(1, len(trainer_outputs)) - - # There must be only one saved models each for serving and eval. - model_uri = os.path.join(trainer_output_base_dir, trainer_outputs[0]) - eval_model_dir = path_utils.eval_model_dir(model_uri) - serving_model_dir = path_utils.serving_model_dir(model_uri) - self.assertEqual(1, fileio.listdir(eval_model_dir).count('saved_model.pb')) - self.assertEqual(1, - fileio.listdir(serving_model_dir).count('saved_model.pb')) - - def _make_unique_pipeline_name(self, prefix): - return '-'.join([prefix, 'test', test_utils.random_id()]) - - def testAIPlatformTrainerPipeline(self): - """Trainer-only test pipeline on AI Platform Training.""" - pipeline_name = self._make_unique_pipeline_name('kubeflow-aip-trainer') - pipeline = self._create_pipeline(pipeline_name, [ - self.schema_importer, self.transformed_examples_importer, - self.transform_graph_importer, - Trainer( - custom_executor_spec=executor_spec.ExecutorClassSpec( - ai_platform_trainer_executor.Executor), - module_file=self._trainer_module, - transformed_examples=self.transformed_examples_importer - .outputs['result'], - schema=self.schema_importer.outputs['result'], - transform_graph=self.transform_graph_importer.outputs['result'], - train_args=trainer_pb2.TrainArgs(num_steps=10), - eval_args=trainer_pb2.EvalArgs(num_steps=5), - custom_config={ - ai_platform_trainer_executor.TRAINING_ARGS_KEY: - self._getCaipTrainingArgsForDistributed(pipeline_name) - }) - ]) - self._compile_and_run_pipeline(pipeline) - self._assertNumberOfTrainerOutputIsOne(pipeline_name) - - def testAIPlatformGenericTrainerPipeline(self): - """Trainer-only pipeline on AI Platform Training with GenericTrainer.""" - pipeline_name = self._make_unique_pipeline_name( - 'kubeflow-aip-generic-trainer') - pipeline = self._create_pipeline(pipeline_name, [ - 
self.schema_importer, self.transformed_examples_importer, - self.transform_graph_importer, - Trainer( - custom_executor_spec=executor_spec.ExecutorClassSpec( - ai_platform_trainer_executor.GenericExecutor), - module_file=self._trainer_module, - transformed_examples=self.transformed_examples_importer - .outputs['result'], - schema=self.schema_importer.outputs['result'], - transform_graph=self.transform_graph_importer.outputs['result'], - train_args=trainer_pb2.TrainArgs(num_steps=10), - eval_args=trainer_pb2.EvalArgs(num_steps=5), - custom_config={ - ai_platform_trainer_executor.TRAINING_ARGS_KEY: - self._getCaipTrainingArgs(pipeline_name) - }) - ]) - self._compile_and_run_pipeline(pipeline) - self._assertNumberOfTrainerOutputIsOne(pipeline_name) - - # TODO(b/150661783): Add tests using distributed training with a generic - # trainer. - # TODO(b/150576271): Add Trainer tests using Keras models. - - def _assertHyperparametersAreWritten(self, pipeline_name): - """Make sure the tuner execution and hyperpearameters output.""" - # There must be only one execution of Tuner. - tuner_output_base_dir = os.path.join( - self._pipeline_root(pipeline_name), 'Tuner', 'best_hyperparameters') - tuner_outputs = fileio.listdir(tuner_output_base_dir) - self.assertEqual(1, len(tuner_outputs)) - - # There must be only one best hyperparameters. 
- best_hyperparameters_uri = os.path.join(tuner_output_base_dir, - tuner_outputs[0]) - self.assertTrue(fileio.exists(best_hyperparameters_uri)) - - def testVertexSequentialTunerPipeline(self): - """Tuner-only pipeline for sequential Tuner flock on Vertex AI Training.""" - pipeline_name = self._make_unique_pipeline_name( - 'kubeflow-vertex-seq-tuner') - pipeline = self._create_pipeline( - pipeline_name, - [ - self.penguin_examples_importer, - self.penguin_schema_importer, - ai_platform_tuner_component.Tuner( - examples=self.penguin_examples_importer.outputs['result'], - module_file=self._penguin_tuner_module, - schema=self.penguin_schema_importer.outputs['result'], - train_args=trainer_pb2.TrainArgs(num_steps=1), - eval_args=trainer_pb2.EvalArgs(num_steps=1), - # Single worker sequential tuning. - tune_args=tuner_pb2.TuneArgs(num_parallel_trials=1), - custom_config={ - ai_platform_tuner_executor.TUNING_ARGS_KEY: - self._getVertexTrainingArgs(pipeline_name), - constants.ENABLE_VERTEX_KEY: - True, - constants.VERTEX_REGION_KEY: - self._GCP_REGION - }) - ]) - self._compile_and_run_pipeline(pipeline) - self._assertHyperparametersAreWritten(pipeline_name) - - def testVertexDistributedTunerPipeline(self): - """Tuner-only pipeline for distributed Tuner flock on Vertex AI Training.""" - pipeline_name = self._make_unique_pipeline_name( - 'kubeflow-vertex-dist-tuner') - pipeline = self._create_pipeline( - pipeline_name, - [ - self.penguin_examples_importer, - self.penguin_schema_importer, - ai_platform_tuner_component.Tuner( - examples=self.penguin_examples_importer.outputs['result'], - module_file=self._penguin_tuner_module, - schema=self.penguin_schema_importer.outputs['result'], - train_args=trainer_pb2.TrainArgs(num_steps=10), - eval_args=trainer_pb2.EvalArgs(num_steps=5), - # 3 worker parallel tuning. 
- tune_args=tuner_pb2.TuneArgs(num_parallel_trials=3), - custom_config={ - ai_platform_tuner_executor.TUNING_ARGS_KEY: - self._getVertexTrainingArgs(pipeline_name), - constants.ENABLE_VERTEX_KEY: - True, - constants.VERTEX_REGION_KEY: - self._GCP_REGION - }) - ]) - self._compile_and_run_pipeline(pipeline) - self._assertHyperparametersAreWritten(pipeline_name) - - def testAIPlatformDistributedTunerPipeline(self): - """Tuner-only pipeline for distributed Tuner flock on AIP Training.""" - pipeline_name = self._make_unique_pipeline_name('kubeflow-aip-dist-tuner') - pipeline = self._create_pipeline( - pipeline_name, - [ - self.penguin_examples_importer, - self.penguin_schema_importer, - ai_platform_tuner_component.Tuner( - examples=self.penguin_examples_importer.outputs['result'], - module_file=self._penguin_tuner_module, - schema=self.penguin_schema_importer.outputs['result'], - train_args=trainer_pb2.TrainArgs(num_steps=10), - eval_args=trainer_pb2.EvalArgs(num_steps=5), - # 3 worker parallel tuning. - tune_args=tuner_pb2.TuneArgs(num_parallel_trials=3), - custom_config={ - ai_platform_tuner_executor.TUNING_ARGS_KEY: - self._getCaipTrainingArgs(pipeline_name) - }) - ]) - self._compile_and_run_pipeline(pipeline) - self._assertHyperparametersAreWritten(pipeline_name) - - def _get_list_bigqueryml_models(self, api, dataset_name): - r = api.models().list( - projectId=self._GCP_PROJECT_ID, - datasetId=dataset_name).execute() - if r: - return [m['modelReference']['modelId'] for m in r['models']] - else: - return [] - - def testBigQueryMlPusherPipeline(self): - """BigQuery ML Pusher pipeline on CAIP.""" - pipeline_name = self._make_unique_pipeline_name( - 'kubeflow-aip-bqml-pusher') - # Big Query does not accept '-' in the dataset name. 
- dataset_name = ('%s_model' % pipeline_name).replace('-', '_') - self.addCleanup(_delete_bigquery_dataset, - dataset_name, self._GCP_PROJECT_ID) - - api = discovery.build('bigquery', 'v2') - api.datasets().insert( - projectId=self._GCP_PROJECT_ID, - body={'location': 'US', - 'projectId': self._GCP_PROJECT_ID, - 'datasetReference': {'datasetId': dataset_name, - 'projectId': self._GCP_PROJECT_ID} - }).execute() - - def _pusher(model_importer, model_blessing_importer, bigquery_dataset_id): - return Pusher( - custom_executor_spec=executor_spec.ExecutorClassSpec( - bigquery_pusher_executor.Executor), - model=model_importer.outputs['result'], - model_blessing=model_blessing_importer.outputs['result'], - custom_config={ - bigquery_pusher_executor.SERVING_ARGS_KEY: { - 'bq_dataset_id': bigquery_dataset_id, - 'model_name': pipeline_name, - 'project_id': self._GCP_PROJECT_ID, - } - }, - ) - - # The model list should be empty - self.assertEmpty(self._get_list_bigqueryml_models( - api, dataset_name)) - - # Test creation of multiple versions under the same model_name. - pipeline = self._create_pipeline(pipeline_name, [ - self.model_1_importer, - self.model_blessing_1_importer, - _pusher(self.model_1_importer, self.model_blessing_1_importer, - dataset_name), - ]) - self._compile_and_run_pipeline(pipeline) - self.assertIn( - pipeline_name, self._get_list_bigqueryml_models( - api, dataset_name)) - - def _getNumberOfVersionsForModel(self, api, project, model_name): - resource_name = f'projects/{project}/models/{model_name}' - res = api.projects().models().versions().list( - parent=resource_name).execute() - return len(res['versions']) - - def _sendDummyRequestToModel(self, api, project, model_name): - resource_name = f'projects/{project}/models/{model_name}' - res = api.projects().predict( - name=resource_name, - body={ - 'instances': { - 'inputs': '' # Just use dummy input for basic check. 
- } - }).execute() - absl.logging.info('Response from the pushed model: %s', res) - - def testAIPlatformPusherPipeline(self): - """Pusher-only test pipeline to AI Platform Prediction.""" - pipeline_name_base = self._make_unique_pipeline_name('kubeflow-aip-pusher') - # AI Platform does not accept '-' in the model name. - model_name = ('%s_model' % pipeline_name_base).replace('-', '_') - self.addCleanup(kubeflow_test_utils.delete_ai_platform_model, model_name) - - def _pusher(model_importer, model_blessing_importer): - return Pusher( - custom_executor_spec=executor_spec.ExecutorClassSpec( - ai_platform_pusher_executor.Executor), - model=model_importer.outputs['result'], - model_blessing=model_blessing_importer.outputs['result'], - custom_config={ - tfx.extensions.google_cloud_ai_platform.experimental - .PUSHER_SERVING_ARGS_KEY: { - 'model_name': model_name, - 'project_id': self._GCP_PROJECT_ID, - } - }, - ) - - # Use default service_name / api_version. - service_name, api_version = runner.get_service_name_and_api_version({}) - api = discovery.build( - service_name, - api_version, - requestBuilder=telemetry_utils.TFXHttpRequest, - ) - - # The model should be NotFound yet. - with self.assertRaisesRegex(googleapiclient_errors.HttpError, - 'HttpError 404'): - self._sendDummyRequestToModel(api, self._GCP_PROJECT_ID, model_name) - - # Test creation of multiple versions under the same model_name. 
- pipeline_name_1 = '%s-1' % pipeline_name_base - pipeline_1 = self._create_pipeline(pipeline_name_1, [ - self.model_1_importer, - self.model_blessing_1_importer, - _pusher(self.model_1_importer, self.model_blessing_1_importer), - ]) - self._compile_and_run_pipeline(pipeline_1) - self.assertEqual( - 1, - self._getNumberOfVersionsForModel(api, self._GCP_PROJECT_ID, - model_name)) - self._sendDummyRequestToModel(api, self._GCP_PROJECT_ID, model_name) - - pipeline_name_2 = '%s-2' % pipeline_name_base - pipeline_2 = self._create_pipeline(pipeline_name_2, [ - self.model_2_importer, - self.model_blessing_2_importer, - _pusher(self.model_2_importer, self.model_blessing_2_importer), - ]) - self._compile_and_run_pipeline(pipeline_2) - self.assertEqual( - 2, - self._getNumberOfVersionsForModel(api, self._GCP_PROJECT_ID, - model_name)) - self._sendDummyRequestToModel(api, self._GCP_PROJECT_ID, model_name) - - -def _delete_bigquery_dataset(dataset_name, project_id): - """Deletes Big Query dataset with all the content.""" - api = discovery.build('bigquery', 'v2') - try: - api.datasets().delete( - projectId=project_id, - datasetId=dataset_name, - deleteContents=True).execute() - except googleapiclient_errors.HttpError as err: - err_descr = err._get_reson() # pylint: disable=protected-access - if err.args[0].status == 404 and err_descr.startswith('Not found'): - absl.logging.info('Dataset %s not found at project %s!', - dataset_name, project_id) - pass - else: - raise From 51284073b52798d467323069659f74f3cb22a250 Mon Sep 17 00:00:00 2001 From: Doojin Park Date: Thu, 3 Oct 2024 12:52:11 +0000 Subject: [PATCH 12/12] update sklearn_gcp_test to use KFP v2 dag runner --- .../experimental/penguin_pipeline_sklearn_gcp_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tfx/examples/penguin/experimental/penguin_pipeline_sklearn_gcp_test.py b/tfx/examples/penguin/experimental/penguin_pipeline_sklearn_gcp_test.py index d0a8b7ac03..d8d828f3a4 100644 ---
a/tfx/examples/penguin/experimental/penguin_pipeline_sklearn_gcp_test.py +++ b/tfx/examples/penguin/experimental/penguin_pipeline_sklearn_gcp_test.py @@ -30,7 +30,7 @@ def setUp(self): self._experimental_root = os.path.dirname(__file__) self._penguin_root = os.path.dirname(self._experimental_root) - self._pipeline_name = 'sklearn_test' + self._pipeline_name = 'sklearn-test' self._data_root = os.path.join(self._penguin_root, 'data') self._trainer_module_file = os.path.join( self._experimental_root, 'penguin_utils_sklearn.py') @@ -66,6 +66,8 @@ def testPipelineConstruction(self, resolve_mock): beam_pipeline_args=[]) self.assertEqual(8, len(logical_pipeline.components)) - tfx.orchestration.experimental.KubeflowDagRunner().run(logical_pipeline) - file_path = os.path.join(self.tmp_dir, 'sklearn_test.tar.gz') + tfx.orchestration.experimental.KubeflowV2DagRunner( + config=tfx.orchestration.experimental.KubeflowV2DagRunnerConfig(), + output_filename='sklearn_test.yaml').run(logical_pipeline) + file_path = os.path.join(self.tmp_dir, 'sklearn_test.yaml') self.assertTrue(tfx.dsl.io.fileio.exists(file_path))