diff --git a/.github/workflows/cd-docs.yml b/.github/workflows/cd-docs.yml index 93536f52bb..6616cd5aea 100644 --- a/.github/workflows/cd-docs.yml +++ b/.github/workflows/cd-docs.yml @@ -39,7 +39,7 @@ jobs: mkdocs-material- - name: Install Dependencies - run: pip install mkdocs mkdocs-material mkdocstrings[python] griffe-inherited-docstrings mkdocs-autorefs mkdocs-jupyter mkdocs-caption + run: pip install mkdocs mkdocs-material mkdocstrings[python] griffe-inherited-docstrings mkdocs-autorefs mkdocs-jupyter mkdocs-caption markdown-grid-tables - name: Deploy to GitHub Pages run: mkdocs gh-deploy --force diff --git a/docs/api/v1/extensions.md b/docs/api/v1/extensions.md index 2679aae75d..87b68d6713 100644 --- a/docs/api/v1/extensions.md +++ b/docs/api/v1/extensions.md @@ -1,3 +1,5 @@ # Extension ::: tfx.v1.extensions + options: + show_if_no_docstring: true diff --git a/docs/api/v1/index.md b/docs/api/v1/index.md index e69de29bb2..b06cb920bf 100644 --- a/docs/api/v1/index.md +++ b/docs/api/v1/index.md @@ -0,0 +1,17 @@ +# Modules + +[components][tfx.v1.components] module: TFX components module. + +[dsl][tfx.v1.dsl] module: TFX DSL module. + +[extensions][tfx.v1.extensions] module: TFX extensions module. + +[orchestration][tfx.v1.orchestration] module: TFX orchestration module. + +[proto][tfx.v1.proto] module: TFX proto module. + +[testing][tfx.v1.testing] module: Public testing modules for TFX. + +[types][tfx.v1.types] module: TFX types module. + +[utils][tfx.v1.utils] module: TFX utils module. 
diff --git a/docs/api/v1/orchestration.md b/docs/api/v1/orchestration.md index 26250ca1d9..6a13999208 100644 --- a/docs/api/v1/orchestration.md +++ b/docs/api/v1/orchestration.md @@ -1,3 +1,5 @@ # Orchestration ::: tfx.v1.orchestration + options: + show_if_no_docstring: true diff --git a/docs/api/v1/root.md b/docs/api/v1/root.md deleted file mode 100644 index b06cb920bf..0000000000 --- a/docs/api/v1/root.md +++ /dev/null @@ -1,17 +0,0 @@ -# Modules - -[components][tfx.v1.components] module: TFX components module. - -[dsl][tfx.v1.dsl] module: TFX DSL module. - -[extensions][tfx.v1.extensions] module: TFX extensions module. - -[orchestration][tfx.v1.orchestration] module: TFX orchestration module. - -[proto][tfx.v1.proto] module: TFX proto module. - -[testing][tfx.v1.testing] module: Public testing modules for TFX. - -[types][tfx.v1.types] module: TFX types module. - -[utils][tfx.v1.utils] module: TFX utils module. diff --git a/docs/api/v1/testing.md b/docs/api/v1/testing.md index 1369879c3a..f81aedc1ae 100644 --- a/docs/api/v1/testing.md +++ b/docs/api/v1/testing.md @@ -1,3 +1,5 @@ # Testing ::: tfx.v1.testing + options: + show_if_no_docstring: true diff --git a/docs/api/v1/utils.md b/docs/api/v1/utils.md index 349a42c01b..0b061e9d9b 100644 --- a/docs/api/v1/utils.md +++ b/docs/api/v1/utils.md @@ -1,3 +1,5 @@ # Utils ::: tfx.v1.utils + options: + show_if_no_docstring: true diff --git a/docs/guide/build_local_pipeline.md b/docs/guide/build_local_pipeline.md index c5a4e3a998..27475528f2 100644 --- a/docs/guide/build_local_pipeline.md +++ b/docs/guide/build_local_pipeline.md @@ -157,7 +157,7 @@ template. implement a pipeline for tabular data using the TFX standard components. If you are moving an existing ML workflow into a pipeline, you may need to revise your code to make full use of - [TFX standard components](index.md#tfx_standard_components). You may also need + [TFX standard components](index.md#tfx-standard-components). 
You may also need to create [custom components](understanding_custom_components.md) that implement features which are unique to your workflow or that are not yet supported by TFX standard components. @@ -198,7 +198,7 @@ without using a template. features such as data augmentation. * Learn more about - [standard TFX components](index.md#tfx_standard_components). + [standard TFX components](index.md#tfx-standard-components). * Learn more about [custom components](understanding_custom_components.md). 1. Create a script file to define your pipeline using the following example. diff --git a/docs/guide/build_tfx_pipeline.md b/docs/guide/build_tfx_pipeline.md index f03a5f4648..c9294d7e4d 100644 --- a/docs/guide/build_tfx_pipeline.md +++ b/docs/guide/build_tfx_pipeline.md @@ -1,11 +1,13 @@ # Building TFX pipelines -Note: For a conceptual view of TFX Pipelines, see -[Understanding TFX Pipelines](understanding_tfx_pipelines.md). +!!! Note + For a conceptual view of TFX Pipelines, see + [Understanding TFX Pipelines](understanding_tfx_pipelines.md). -Note: Want to build your first pipeline before you dive into the details? Get -started -[building a pipeline using a template](build_local_pipeline.md#build-a-pipeline-using-a-template). +!!! Note + Want to build your first pipeline before you dive into the details? Get + started + [building a pipeline using a template](build_local_pipeline.md#build-a-pipeline-using-a-template). ## Using the `Pipeline` class @@ -61,9 +63,10 @@ statistics. In this example, the instance of `StatisticsGen` must follow ### Task-based dependencies -Note: Using task-based dependencies is typically not recommended. Defining the -execution graph with artifact dependencies lets you take advantage of the -automatic artifact lineage tracking and caching features of TFX. +!!! Note + Using task-based dependencies is typically not recommended. 
Defining the + execution graph with artifact dependencies lets you take advantage of the + automatic artifact lineage tracking and caching features of TFX. You can also define task-based dependencies using your component's [`add_upstream_node` and `add_downstream_node`](https://github.com/tensorflow/tfx/blob/master/tfx/components/base/base_node.py){: .external } @@ -75,7 +78,7 @@ that the current component must be executed before the specified component. The easiest way to get a pipeline set up quickly, and to see how all the pieces fit together, is to use a template. Using templates is covered in [Building a -TFX Pipeline Locally](build_local_pipeline). +TFX Pipeline Locally](build_local_pipeline.md). ## Caching diff --git a/docs/guide/cli.md b/docs/guide/cli.md index 855f5d2bdd..cadcab772f 100644 --- a/docs/guide/cli.md +++ b/docs/guide/cli.md @@ -10,8 +10,9 @@ can use the CLI to: * Run a pipeline and monitor the run on various orchestrators. * List pipelines and pipeline runs. -Note: The TFX CLI doesn't currently provide compatibility guarantees. The CLI -interface might change as new versions are released. +!!! Note + The TFX CLI doesn't currently provide compatibility guarantees. The CLI + interface might change as new versions are released. ## About the TFX CLI @@ -35,8 +36,9 @@ instructions in the [pipeline commands](#tfx-pipeline), [run commands](#tfx-run), and [template commands](#tfx-template-experimental) sections to learn more about using these commands. -Warning: Currently not all commands are supported in every orchestrator. Such -commands explicitly mention the engines supported. +!!! Warning + Currently not all commands are supported in every orchestrator. Such + commands explicitly mention the engines supported. Flags let you pass arguments into CLI commands. Words in flags are separated with either a hyphen (`-`) or an underscore (`_`). For example, the pipeline @@ -62,118 +64,76 @@ Creates a new pipeline in the given orchestrator. 
Usage: ```bash -tfx pipeline create --pipeline_path=pipeline-path [--endpoint=endpoint --engine=engine \ ---iap_client_id=iap-client-id --namespace=namespace \ ---build_image --build_base_image=build-base-image] +tfx pipeline create --pipeline_path=pipeline-path [--endpoint=endpoint --engine=engine \ +--iap_client_id=iap-client-id --namespace=namespace \ +--build_image --build_base_image=build-base-image] ``` -
-
--pipeline_path=pipeline-path
-
The path to the pipeline configuration file.
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint when using Kubeflow Pipelines. -
- -
--namespace=namespace -
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
- -
--build_image
-
-

- (Optional.) When the engine is kubeflow or vertex, TFX - creates a container image for your pipeline if specified. `Dockerfile` in - the current directory will be used, and TFX will automatically generate - one if not exists. -

-

- The built image will be pushed to the remote registry which is specified - in `KubeflowDagRunnerConfig` or `KubeflowV2DagRunnerConfig`. -

-
-
--build_base_image=build-base-image
-
-

- (Optional.) When the engine is kubeflow, TFX - creates a container image for your pipeline. The build base image - specifies the base container image to use when building the pipeline - container image. -

-
-
+\--pipeline\_path=`pipeline-path`{.variable} +: The path to the pipeline configuration file. + +\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **local**: sets engine to local orchestrator + - **vertex**: sets engine to Vertex Pipelines + - **airflow**: (experimental) sets engine to Apache Airflow + - **beam**: (experimental) sets engine to Apache Beam + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint when using Kubeflow Pipelines. + +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`. + +\--build\_image + +: (Optional.) 
When the `engine`{.variable} is **kubeflow** or **vertex**, TFX creates a container image for your pipeline if specified. `Dockerfile` in the current directory will be used, and TFX will automatically generate one if it does not exist. + + The built image will be pushed to the remote registry which is specified in `KubeflowDagRunnerConfig` or `KubeflowV2DagRunnerConfig`. + +\--build\_base\_image=`build-base-image`{.variable} + +: (Optional.) When the `engine`{.variable} is **kubeflow**, TFX creates a container image for your pipeline. The build base image specifies the base container image to use when building the pipeline container image. + #### Examples Kubeflow: ```bash -tfx pipeline create --engine=kubeflow --pipeline_path=pipeline-path \ ---iap_client_id=iap-client-id --namespace=namespace --endpoint=endpoint \ +tfx pipeline create --engine=kubeflow --pipeline_path=pipeline-path \ +--iap_client_id=iap-client-id --namespace=namespace --endpoint=endpoint \ --build_image ``` Local: ```bash -tfx pipeline create --engine=local --pipeline_path=pipeline-path +tfx pipeline create --engine=local --pipeline_path=pipeline-path ``` Vertex: ```bash -tfx pipeline create --engine=vertex --pipeline_path=pipeline-path \ +tfx pipeline create --engine=vertex --pipeline_path=pipeline-path \ --build_image ``` @@ -181,7 +141,7 @@ To autodetect engine from user environment, simply avoid using the engine flag like the example below. For more details, check the flags section. ```bash -tfx pipeline create --pipeline_path=pipeline-path +tfx pipeline create --pipeline_path=pipeline-path ``` ### update @@ -191,106 +151,71 @@ Updates an existing pipeline in the given orchestrator. 
Usage: ```bash -tfx pipeline update --pipeline_path=pipeline-path [--endpoint=endpoint --engine=engine \ ---iap_client_id=iap-client-id --namespace=namespace --build_image] +tfx pipeline update --pipeline_path=pipeline-path [--endpoint=endpoint --engine=engine \ +--iap_client_id=iap-client-id --namespace=namespace --build_image] ``` -
-
--pipeline_path=pipeline-path
-
The path to the pipeline configuration file.
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint. -
- -
--namespace=namespace -
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
-
--build_image
-
-

- (Optional.) When the engine is kubeflow or vertex, TFX - creates a container image for your pipeline if specified. `Dockerfile` in - the current directory will be used. -

-

- The built image will be pushed to the remote registry which is specified - in `KubeflowDagRunnerConfig` or `KubeflowV2DagRunnerConfig`. -

-
-
+\--pipeline\_path=`pipeline-path`{.variable} +: The path to the pipeline configuration file. + +\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **local**: sets engine to local orchestrator + - **vertex**: sets engine to Vertex Pipelines + - **airflow**: (experimental) sets engine to Apache Airflow + - **beam**: (experimental) sets engine to Apache Beam + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint. + +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`. + +\--build\_image + +: (Optional.) 
When the `engine`{.variable} is **kubeflow** or **vertex**, TFX creates a container image for your pipeline if specified. `Dockerfile` in the current directory will be used. + + The built image will be pushed to the remote registry which is specified in `KubeflowDagRunnerConfig` or `KubeflowV2DagRunnerConfig`. + #### Examples Kubeflow: ```bash -tfx pipeline update --engine=kubeflow --pipeline_path=pipeline-path \ ---iap_client_id=iap-client-id --namespace=namespace --endpoint=endpoint \ +tfx pipeline update --engine=kubeflow --pipeline_path=pipeline-path \ +--iap_client_id=iap-client-id --namespace=namespace --endpoint=endpoint \ --build_image ``` Local: ```bash -tfx pipeline update --engine=local --pipeline_path=pipeline-path +tfx pipeline update --engine=local --pipeline_path=pipeline-path ``` Vertex: ```bash -tfx pipeline update --engine=vertex --pipeline_path=pipeline-path \ +tfx pipeline update --engine=vertex --pipeline_path=pipeline-path \ --build_image ``` @@ -311,57 +236,46 @@ Recommended to use before creating or updating a pipeline. Usage: ```bash -tfx pipeline compile --pipeline_path=pipeline-path [--engine=engine] +tfx pipeline compile --pipeline_path=pipeline-path [--engine=engine] ``` -
-
--pipeline_path=pipeline-path
-
The path to the pipeline configuration file.
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
+\--pipeline\_path=`pipeline-path`{.variable} +: The path to the pipeline configuration file. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **local**: sets engine to local orchestrator + - **vertex**: sets engine to Vertex Pipelines + - **airflow**: (experimental) sets engine to Apache Airflow + - **beam**: (experimental) sets engine to Apache Beam + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + #### Examples Kubeflow: ```bash -tfx pipeline compile --engine=kubeflow --pipeline_path=pipeline-path +tfx pipeline compile --engine=kubeflow --pipeline_path=pipeline-path ``` Local: ```bash -tfx pipeline compile --engine=local --pipeline_path=pipeline-path +tfx pipeline compile --engine=local --pipeline_path=pipeline-path ``` Vertex: ```bash -tfx pipeline compile --engine=vertex --pipeline_path=pipeline-path +tfx pipeline compile --engine=vertex --pipeline_path=pipeline-path ``` ### delete @@ -371,93 +285,64 @@ Deletes a pipeline from the given orchestrator. Usage: ```bash -tfx pipeline delete --pipeline_path=pipeline-path [--endpoint=endpoint --engine=engine \ ---iap_client_id=iap-client-id --namespace=namespace] +tfx pipeline delete --pipeline_path=pipeline-path [--endpoint=endpoint --engine=engine \ +--iap_client_id=iap-client-id --namespace=namespace] ``` -
-
--pipeline_path=pipeline-path
-
The path to the pipeline configuration file.
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint. -
- -
--namespace=namespace -
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
-
+\--pipeline\_path=`pipeline-path`{.variable} +: The path to the pipeline configuration file. + +\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **local**: sets engine to local orchestrator + - **vertex**: sets engine to Vertex Pipelines + - **airflow**: (experimental) sets engine to Apache Airflow + - **beam**: (experimental) sets engine to Apache Beam + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint. + +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`. 
+ #### Examples Kubeflow: ```bash -tfx pipeline delete --engine=kubeflow --pipeline_name=pipeline-name \ ---iap_client_id=iap-client-id --namespace=namespace --endpoint=endpoint +tfx pipeline delete --engine=kubeflow --pipeline_name=pipeline-name \ +--iap_client_id=iap-client-id --namespace=namespace --endpoint=endpoint ``` Local: ```bash -tfx pipeline delete --engine=local --pipeline_name=pipeline-name +tfx pipeline delete --engine=local --pipeline_name=pipeline-name ``` Vertex: ```bash -tfx pipeline delete --engine=vertex --pipeline_name=pipeline-name +tfx pipeline delete --engine=vertex --pipeline_name=pipeline-name ``` ### list @@ -467,79 +352,49 @@ Lists all the pipelines in the given orchestrator. Usage: ```bash -tfx pipeline list [--endpoint=endpoint --engine=engine \ ---iap_client_id=iap-client-id --namespace=namespace] +tfx pipeline list [--endpoint=endpoint --engine=engine \ +--iap_client_id=iap-client-id --namespace=namespace] ``` -
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint. -
- -
--namespace=namespace -
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
-
+\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **local**: sets engine to local orchestrator + - **vertex**: sets engine to Vertex Pipelines + - **airflow**: (experimental) sets engine to Apache Airflow + - **beam**: (experimental) sets engine to Apache Beam + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint. + +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`. 
+ #### Examples Kubeflow: ```bash -tfx pipeline list --engine=kubeflow --iap_client_id=iap-client-id \ ---namespace=namespace --endpoint=endpoint +tfx pipeline list --engine=kubeflow --iap_client_id=iap-client-id \ +--namespace=namespace --endpoint=endpoint ``` Local: @@ -559,7 +414,7 @@ tfx pipeline list --engine=vertex The structure for commands in the `tfx run` command group is as follows: ```bash -tfx run command required-flags [optional-flags] +tfx run command required-flags [optional-flags] ``` Use the following sections to learn more about the commands in the `tfx run` @@ -573,446 +428,295 @@ most recent pipeline version of the pipeline in the cluster is used. Usage: ```bash -tfx run create --pipeline_name=pipeline-name [--endpoint=endpoint \ ---engine=engine --iap_client_id=iap-client-id --namespace=namespace] +tfx run create --pipeline_name=pipeline-name [--endpoint=endpoint \ +--engine=engine --iap_client_id=iap-client-id --namespace=namespace] ``` -
-
--pipeline_name=pipeline-name
-
The name of the pipeline.
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
- -
--runtime_parameter=parameter-name=parameter-value
-
- (Optional.) Sets a runtime parameter value. Can be set multiple times to set - values of multiple variables. Only applicable to `airflow`, `kubeflow` and - `vertex` engine. -
- -
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint. -
- -
--namespace=namespace
-
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
- -
--project=GCP-project-id
-
- (Required for Vertex.) GCP project id for the vertex pipeline. -
- -
--region=GCP-region
-
- (Required for Vertex.) GCP region name like us-central1. See [Vertex documentation](https://cloud.google.com/vertex-ai/docs/general/locations) for available regions. -
- -
+\--pipeline\_name=`pipeline-name`{.variable} +: The name of the pipeline. + +\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **local**: sets engine to local orchestrator + - **vertex**: sets engine to Vertex Pipelines + - **airflow**: (experimental) sets engine to Apache Airflow + - **beam**: (experimental) sets engine to Apache Beam + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--runtime\_parameter=`parameter-name`{.variable}=`parameter-value`{.variable} +: (Optional.) Sets a runtime parameter value. Can be set multiple times to set values of multiple variables. Only applicable to `airflow`, `kubeflow` and `vertex` engine. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint.
+ +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`. + +\--project=`GCP-project-id`{.variable} +: (Required for Vertex.) GCP project id for the vertex pipeline. + +\--region=`GCP-region`{.variable} +: (Required for Vertex.) GCP region name like us-central1. See [Vertex documentation](https://cloud.google.com/vertex-ai/docs/general/locations) for available regions. + #### Examples Kubeflow: ```bash -tfx run create --engine=kubeflow --pipeline_name=pipeline-name --iap_client_id=iap-client-id \ ---namespace=namespace --endpoint=endpoint +tfx run create --engine=kubeflow --pipeline_name=pipeline-name --iap_client_id=iap-client-id \ +--namespace=namespace --endpoint=endpoint ``` Local: ```bash -tfx run create --engine=local --pipeline_name=pipeline-name +tfx run create --engine=local --pipeline_name=pipeline-name ``` Vertex: ```bash -tfx run create --engine=vertex --pipeline_name=pipeline-name \ - --runtime_parameter=var_name=var_value \ - --project=gcp-project-id --region=gcp-region +tfx run create --engine=vertex --pipeline_name=pipeline-name \ + --runtime_parameter=var_name=var_value \ + --project=gcp-project-id --region=gcp-region ``` ### terminate Stops a run of a given pipeline. -** Important Note: Currently supported only in Kubeflow. +!!! note "Important Note" + Currently supported only in Kubeflow. Usage: ```bash -tfx run terminate --run_id=run-id [--endpoint=endpoint --engine=engine \ ---iap_client_id=iap-client-id --namespace=namespace] +tfx run terminate --run_id=run-id [--endpoint=endpoint --engine=engine \ +--iap_client_id=iap-client-id --namespace=namespace] ``` -
-
--run_id=run-id
-
Unique identifier for a pipeline run.
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint. -
- -
--namespace=namespace -
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
-
+\--run\_id=`run-id`{.variable} +: Unique identifier for a pipeline run. + +\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint. + +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`.
+ #### Examples Kubeflow: ```bash -tfx run delete --engine=kubeflow --run_id=run-id --iap_client_id=iap-client-id \ ---namespace=namespace --endpoint=endpoint +tfx run terminate --engine=kubeflow --run_id=run-id --iap_client_id=iap-client-id \ +--namespace=namespace --endpoint=endpoint ``` ### list Lists all runs of a pipeline. -** Important Note: Currently not supported in Local and Apache Beam. +!!! note "Important Note" + Currently not supported in Local and Apache Beam. Usage: ```bash -tfx run list --pipeline_name=pipeline-name [--endpoint=endpoint \ ---engine=engine --iap_client_id=iap-client-id --namespace=namespace] +tfx run list --pipeline_name=pipeline-name [--endpoint=endpoint \ +--engine=engine --iap_client_id=iap-client-id --namespace=namespace] ``` -
-
--pipeline_name=pipeline-name
-
The name of the pipeline.
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint. -
- -
--namespace=namespace -
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
-
+\--pipeline\_name=`pipeline-name`{.variable} +: The name of the pipeline. + +\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **airflow**: (experimental) sets engine to Apache Airflow + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint. + +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`.
#### Examples Kubeflow: ```bash -tfx run list --engine=kubeflow --pipeline_name=pipeline-name --iap_client_id=iap-client-id \ ---namespace=namespace --endpoint=endpoint +tfx run list --engine=kubeflow --pipeline_name=pipeline-name --iap_client_id=iap-client-id \ +--namespace=namespace --endpoint=endpoint ``` ### status Returns the current status of a run. -** Important Note: Currently not supported in Local and Apache Beam. +!!! note "Important Note" + Currently not supported in Local and Apache Beam. Usage: ```bash -tfx run status --pipeline_name=pipeline-name --run_id=run-id [--endpoint=endpoint \ ---engine=engine --iap_client_id=iap-client-id --namespace=namespace] +tfx run status --pipeline_name=pipeline-name --run_id=run-id [--endpoint=endpoint \ +--engine=engine --iap_client_id=iap-client-id --namespace=namespace] ``` -
-
--pipeline_name=pipeline-name
-
The name of the pipeline.
-
--run_id=run-id
-
Unique identifier for a pipeline run.
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint. -
- -
--namespace=namespace -
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
-
+\--pipeline\_name=`pipeline-name`{.variable} +: The name of the pipeline. + +\--run\_id=`run-id`{.variable} +: Unique identifier for a pipeline run. + +\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **airflow**: (experimental) sets engine to Apache Airflow + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint. + +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`.
+ #### Examples Kubeflow: ```bash -tfx run status --engine=kubeflow --run_id=run-id --pipeline_name=pipeline-name \ ---iap_client_id=iap-client-id --namespace=namespace --endpoint=endpoint +tfx run status --engine=kubeflow --run_id=run-id --pipeline_name=pipeline-name \ +--iap_client_id=iap-client-id --namespace=namespace --endpoint=endpoint ``` ### delete Deletes a run of a given pipeline. -** Important Note: Currently supported only in Kubeflow +!!! note "Important Note" + Currently supported only in Kubeflow. Usage: ```bash -tfx run delete --run_id=run-id [--engine=engine --iap_client_id=iap-client-id \ ---namespace=namespace --endpoint=endpoint] +tfx run delete --run_id=run-id [--engine=engine --iap_client_id=iap-client-id \ +--namespace=namespace --endpoint=endpoint] ``` -
-
--run_id=run-id
-
Unique identifier for a pipeline run.
-
--endpoint=endpoint
-
-

- (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
-
--engine=engine
-
-

- (Optional.) The orchestrator to be used for the pipeline. The value of - engine must match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
-
--iap_client_id=iap-client-id
-
- (Optional.) Client ID for IAP protected endpoint. -
- -
--namespace=namespace -
- (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. - If the namespace is not specified, the value defaults to - kubeflow. -
-
+\--run\_id=`run-id`{.variable} +: Unique identifier for a pipeline run. + +\--endpoint=`endpoint`{.variable} + +: (Optional.) Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--engine=`engine`{.variable} + +: (Optional.) The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--iap\_client\_id=`iap-client-id`{.variable} +: (Optional.) Client ID for IAP protected endpoint. + +\--namespace=`namespace`{.variable} +: (Optional.) Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`.
+ #### Examples Kubeflow: ```bash -tfx run delete --engine=kubeflow --run_id=run-id --iap_client_id=iap-client-id \ ---namespace=namespace --endpoint=endpoint +tfx run delete --engine=kubeflow --run_id=run-id --iap_client_id=iap-client-id \ +--namespace=namespace --endpoint=endpoint ``` ## tfx template [Experimental] @@ -1020,7 +724,7 @@ tfx run delete --engine=kubeflow --run_id=run-id --iap_client_id=command required-flags [optional-flags] +tfx template command required-flags [optional-flags] ``` Use the following sections to learn more about the commands in the `tfx @@ -1044,100 +748,67 @@ Copy a template to the destination directory. Usage: ```bash -tfx template copy --model=model --pipeline_name=pipeline-name \ ---destination_path=destination-path +tfx template copy --model=model --pipeline_name=pipeline-name \ +--destination_path=destination-path ``` -
-
--model=model
-
The name of the model built by the pipeline template.
-
--pipeline_name=pipeline-name
-
The name of the pipeline.
-
--destination_path=destination-path
-
The path to copy the template to.
-
+\--model=`model`{.variable} +: The name of the model built by the pipeline template. + +\--pipeline\_name=`pipeline-name`{.variable} +: The name of the pipeline. + +\--destination\_path=`destination-path`{.variable} +: The path to copy the template to. + ## Understanding TFX CLI Flags ### Common flags -
-
--engine=engine
-
-

- The orchestrator to be used for the pipeline. The value of engine must - match on of the following values: -

- -

- If the engine is not set, the engine is auto-detected based on the - environment. -

-

- ** Important note: The orchestrator required by the DagRunner in the - pipeline config file must match the selected or autodetected engine. - Engine auto-detection is based on user environment. If Apache Airflow - and Kubeflow Pipelines are not installed, then the local orchestrator is - used by default. -

-
- -
--pipeline_name=pipeline-name
-
The name of the pipeline.
- -
--pipeline_path=pipeline-path
-
The path to the pipeline configuration file.
- -
--run_id=run-id
-
Unique identifier for a pipeline run.
- -
+\--engine=`engine`{.variable} + +: The orchestrator to be used for the pipeline. The value of engine must match one of the following values: + + - **kubeflow**: sets engine to Kubeflow + - **local**: sets engine to local orchestrator + - **vertex**: sets engine to Vertex Pipelines + - **airflow**: (experimental) sets engine to Apache Airflow + - **beam**: (experimental) sets engine to Apache Beam + + If the engine is not set, the engine is auto-detected based on the environment. + + !!! note "Important Note" + The orchestrator required by the DagRunner in the pipeline config file must match the selected or autodetected engine. Engine auto-detection is based on user environment. If Apache Airflow and Kubeflow Pipelines are not installed, then the local orchestrator is used by default. + +\--pipeline\_name=`pipeline-name`{.variable} +: The name of the pipeline. + +\--pipeline\_path=`pipeline-path`{.variable} +: The path to the pipeline configuration file. + +\--run\_id=`run-id`{.variable} +: Unique identifier for a pipeline run. + ### Kubeflow specific flags -
-
--endpoint=endpoint
-
-

- Endpoint of the Kubeflow Pipelines API service. The endpoint - of your Kubeflow Pipelines API service is the same as URL of the Kubeflow - Pipelines dashboard. Your endpoint value should be something like: -

- -
https://host-name/pipeline
- -

- If you do not know the endpoint for your Kubeflow Pipelines cluster, - contact you cluster administrator. -

- -

- If the --endpoint is not specified, the in-cluster service - DNS name is used as the default value. This name works only if the - CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a - Kubeflow Jupyter notebooks instance. -

-
- -
--iap_client_id=iap-client-id
-
- Client ID for IAP protected endpoint. -
- -
--namespace=namespace -
- Kubernetes namespace to connect to the Kubeflow Pipelines API. If the - namespace is not specified, the value defaults to - kubeflow. -
-
+\--endpoint=`endpoint`{.variable} + +: Endpoint of the Kubeflow Pipelines API service. The endpoint of your Kubeflow Pipelines API service is the same as the URL of the Kubeflow Pipelines dashboard. Your endpoint value should be something like: + + https://host-name/pipeline + + If you do not know the endpoint for your Kubeflow Pipelines cluster, contact your cluster administrator. + + If the `--endpoint` is not specified, the in-cluster service DNS name is used as the default value. This name works only if the CLI command executes in a pod on the Kubeflow Pipelines cluster, such as a [Kubeflow Jupyter notebooks](https://www.kubeflow.org/docs/components/notebooks/jupyter-tensorflow-examples/){.external} instance. + +\--iap\_client\_id=`iap-client-id`{.variable} +: Client ID for IAP protected endpoint. + +\--namespace=`namespace`{.variable} +: Kubernetes namespace to connect to the Kubeflow Pipelines API. If the namespace is not specified, the value defaults to `kubeflow`. + ## Generated files by TFX CLI diff --git a/docs/guide/custom_function_component.md b/docs/guide/custom_function_component.md index 8aca8be9aa..bf61bed771 100644 --- a/docs/guide/custom_function_component.md +++ b/docs/guide/custom_function_component.md @@ -35,9 +35,10 @@ Under the hood, this defines a custom component that is a subclass of [`BaseComponent`](https://github.com/tensorflow/tfx/blob/master/tfx/dsl/components/base/base_component.py){: .external } and its Spec and Executor classes. -Note: the feature (BaseBeamComponent based component by annotating a function -with `@component(use_beam=True)`) described below is experimental and there is -no public backwards compatibility guarantees. +!!! Note + The feature (BaseBeamComponent based component by annotating a function + with `@component(use_beam=True)`) described below is experimental and there are + no public backwards compatibility guarantees.
If you want to define a subclass of [`BaseBeamComponent`](https://github.com/tensorflow/tfx/blob/master/tfx/dsl/components/base/base_beam_component.py){: .external } @@ -79,10 +80,11 @@ arguments and hyperparameters like training iteration count, dropout rate, and other configuration to your component. Parameters are stored as properties of component executions when tracked in ML Metadata. -Note: Currently, output simple data type values cannot be used as parameters -since they are not known at execution time. Similarly, input simple data type -values currently cannot take concrete values known at pipeline construction -time. We may remove this restriction in a future release of TFX. +!!! Note + Currently, output simple data type values cannot be used as parameters + since they are not known at execution time. Similarly, input simple data type + values currently cannot take concrete values known at pipeline construction + time. We may remove this restriction in a future release of TFX. ## Definition diff --git a/docs/guide/examplegen.md b/docs/guide/examplegen.md index aff3284de2..af7be7e662 100644 --- a/docs/guide/examplegen.md +++ b/docs/guide/examplegen.md @@ -34,12 +34,13 @@ components for these data sources and formats: * [Parquet](https://github.com/tensorflow/tfx/blob/master/tfx/components/example_gen/custom_executors/parquet_executor.py) See the usage examples in the source code and -[this discussion](examplegen.md#custom_examplegen) for more information on +[this discussion](examplegen.md#custom-examplegen) for more information on how to use and develop custom executors. -Note: In most case it's better to inherit from `base_example_gen_executor` -instead of `base_executor`. So following the Avro or Parquet example in the -Executor source code may be advisable. +!!! Note + In most cases it's better to inherit from `base_example_gen_executor` + instead of `base_executor`. So following the Avro or Parquet example in the + Executor source code may be advisable.
In addition, these data sources and formats are available as [custom component](understanding_custom_components.md) examples: @@ -50,10 +51,10 @@ In addition, these data sources and formats are available as Apache Beam supports ingesting data from a [broad range of data sources and formats](https://beam.apache.org/documentation/io/built-in/), -([see below](#additional_data_formats)). These capabilities +([see below](#additional-data-formats)). These capabilities can be used to create custom ExampleGen components for TFX, which is demonstrated by some existing ExampleGen components -([see below](#additional_data_formats)). +([see below](#additional-data-formats)). ## How to use an ExampleGen Component @@ -92,7 +93,8 @@ data. ### Custom input/output split -Note: this feature is only available after TFX 0.14. +!!! Note + This feature is only available after TFX 0.14. To customize the train/eval split ratio which ExampleGen will output, set the `output_config` for ExampleGen component. For example: @@ -135,7 +137,7 @@ the train and eval output split is generated with a 2:1 ratio. Please refer to [proto/example_gen.proto](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto) for ExampleGen's input and output split configuration. And refer to -[downstream components guide](#examplegen_downstream_components) for utilizing +[downstream components guide](#examplegen-downstream-components) for utilizing the custom splits downstream. #### Splitting Method @@ -185,7 +187,8 @@ Notice how the `partition_feature_name` was set in this example. ### Span -Note: this feature is only available after TFX 0.15. +!!! Note + This feature is only available after TFX 0.15. Span can be retrieved by using '{SPAN}' spec in the [input glob pattern](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto): @@ -244,7 +247,8 @@ Retrieving a certain span can be done with RangeConfig, which is detailed below. 
### Date -Note: this feature is only availible after TFX 0.24.0. +!!! Note + This feature is only available after TFX 0.24.0. If your data source is organized on filesystem by date, TFX supports mapping dates directly to span numbers. There are three specs to represent mapping from @@ -303,7 +307,8 @@ example_gen = CsvExampleGen(input_base='/tmp', input_config=input) ### Version -Note: this feature is only availible after TFX 0.24.0. +!!! Note + This feature is only available after TFX 0.24.0. Version can be retrieved by using '{VERSION}' spec in the [input glob pattern](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto): @@ -363,7 +368,8 @@ example_gen = CsvExampleGen(input_base='/tmp', input_config=input) ### Range Config -Note: this feature is only available after TFX 0.24.0. +!!! Note + This feature is only available after TFX 0.24.0. TFX supports retrieval and processing of a specific span in file-based ExampleGen using range config, an abstract config used to describe ranges for @@ -630,6 +636,6 @@ evaluator = Evaluator( More details are available in the [CsvExampleGen API reference][tfx.v1.components.CsvExampleGen], -[FileBasedExampleGen API implementation][tfx.v1.components.example_gen.component], +[FileBasedExampleGen API implementation](https://github.com/tensorflow/tfx/blob/master/tfx/components/example_gen/component.py), and -[ImportExampleGen API reference][tfx.v1.components.ImportExampleGen].
diff --git a/docs/guide/fairness_indicators.md b/docs/guide/fairness_indicators.md index 88192873ae..7f891d1408 100644 --- a/docs/guide/fairness_indicators.md +++ b/docs/guide/fairness_indicators.md @@ -308,7 +308,7 @@ contains several examples: * [Fairness_Indicators_Example_Colab.ipynb](https://github.com/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb) gives an overview of Fairness Indicators in - [TensorFlow Model Analysis](https://www.tensorflow.org/tfx/guide/tfma) and + [TensorFlow Model Analysis](../tfma) and how to use it with a real dataset. This notebook also goes over [TensorFlow Data Validation](https://www.tensorflow.org/tfx/data_validation/get_started) and [What-If Tool](https://pair-code.github.io/what-if-tool/), two tools for diff --git a/docs/guide/index.md b/docs/guide/index.md index dd1001ca38..cf70a88ecf 100644 --- a/docs/guide/index.md +++ b/docs/guide/index.md @@ -26,16 +26,18 @@ https://github.com/tensorflow/tfx) pip install tfx ``` -Note: See the -[TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving), -[TensorFlow JS](https://js.tensorflow.org/), and/or -[TensorFlow Lite](https://www.tensorflow.org/lite) documentation for installing -those optional components. - -Note: This installs [Apache Beam](beam.md) with the DirectRunner. You can also -separately install runners that perform distributed computation, such as -[Apache Flink](https://flink.apache.org/) or -[Apache Spark](https://spark.apache.org/). +!!! Note + See the + [TensorFlow Serving](./serving), + [TensorFlow JS](https://js.tensorflow.org/), and/or + [TensorFlow Lite](https://www.tensorflow.org/lite) documentation for installing + those optional components. + +!!! Note + This installs [Apache Beam](beam.md) with the DirectRunner. You can also + separately install runners that perform distributed computation, such as + [Apache Flink](https://flink.apache.org/) or + [Apache Spark](https://spark.apache.org/). 
### Nightly Packages @@ -50,8 +52,9 @@ This will install the nightly packages for the major dependencies of TFX such as TensorFlow Model Analysis (TFMA), TensorFlow Data Validation (TFDV), TensorFlow Transform (TFT), TFX Basic Shared Libraries (TFX-BSL), ML Metadata (MLMD). -Note: These nightly packages are unstable and breakages are likely to happen. -The fix could often take a week or more depending on the complexity involved. +!!! Note + These nightly packages are unstable and breakages are likely to happen. + The fix could often take a week or more depending on the complexity involved. ## About TFX @@ -170,8 +173,9 @@ TFX libraries include: [KerasTuner](https://www.tensorflow.org/tutorials/keras/keras_tuner) is used for tuning hyperparameters for model. - Note: TFX supports TensorFlow 1.15 and, with some exceptions, 2.x. For - details, see [Designing TensorFlow Modeling Code For TFX](train.md). + !!! Note + TFX supports TensorFlow 1.15 and, with some exceptions, 2.x. For + details, see [Designing TensorFlow Modeling Code For TFX](train.md). * [**TensorFlow Model Analysis (TFMA)**](tfma.md) is a library for evaluating TensorFlow models. It is used along with TensorFlow to create an @@ -240,7 +244,7 @@ monitoring, and maintaining an ML pipeline easier. TFX is designed to be portable to multiple environments and orchestration frameworks, including [Apache Airflow](airflow.md), -[Apache Beam](beam_orchestrator.md) and [Kubeflow](kubeflow.md) . It is also +[Apache Beam](beam.md) and [Kubeflow](kubeflow.md) . It is also portable to different computing platforms, including on-premise, and cloud platforms such as the [Google Cloud Platform (GCP)](https://cloud.google.com/). In particular, @@ -250,8 +254,9 @@ TFX interoperates with serveral managed GCP services, such as [Cloud Dataflow](https://cloud.google.com/dataflow/) for distributed data processing for several other aspects of the ML lifecycle. 
-Note: The current revision of this user guide primarily discusses deployment -on a bare-metal system using Apache Airflow for orchestration. +!!! Note + The current revision of this user guide primarily discusses deployment + on a bare-metal system using Apache Airflow for orchestration. ### Model vs. SavedModel @@ -336,16 +341,17 @@ The following components use the schema: In a typical TFX pipeline TensorFlow Data Validation generates a schema, which is consumed by the other components. -Note: The auto-generated schema is best-effort and only tries to infer basic -properties of the data. It is expected that developers review and modify it as -needed. +!!! Note + The auto-generated schema is best-effort and only tries to infer basic + properties of the data. It is expected that developers review and modify it as + needed. ## Developing with TFX TFX provides a powerful platform for every phase of a machine learning project, from research, experimentation, and development on your local machine, through deployment. In order to avoid code duplication and eliminate the potential for -[training/serving skew](https://www.tensorflow.org/tfx/guide/tfdv#training-serving_skew_detection) +[training/serving skew](./tfdv#training-serving-skew-detection) it is strongly recommended to implement your TFX pipeline for both model training and deployment of trained models, and use [Transform](transform.md) components which leverage the [TensorFlow Transform](tft.md) library for both @@ -594,4 +600,4 @@ TFX provides a unified CLI which helps the perform full range of pipeline actions such as create, update, run, list, and delete pipelines on various orchestrators including Apache Airflow, Apache Beam, and Kubeflow. For details, please follow -[these instructions](https://github.com/tensorflow/tfx/blob/master/docs/guide/cli.md). +[these instructions](cli.md). 
diff --git a/docs/guide/infra_validator.md b/docs/guide/infra_validator.md index 1daeea2856..791e9b611c 100644 --- a/docs/guide/infra_validator.md +++ b/docs/guide/infra_validator.md @@ -91,11 +91,12 @@ For model server types (called serving binary) we support - [TensorFlow Serving](serving.md) -Note: InfraValidator allows specifying multiple versions of the same model -server type in order to upgrade the model server version without affecting model -compatibility. For example, user can test `tensorflow/serving` image with both -`2.1.0` and `latest` versions, to ensure the model will be compatible with the -latest `tensorflow/serving` version as well. +!!! Note + InfraValidator allows specifying multiple versions of the same model + server type in order to upgrade the model server version without affecting model + compatibility. For example, user can test `tensorflow/serving` image with both + `2.1.0` and `latest` versions, to ensure the model will be compatible with the + latest `tensorflow/serving` version as well. Following serving platforms are currently supported: @@ -209,7 +210,7 @@ Current InfraValidator is not complete yet, and has some limitations. for deployments to [TensorFlow Lite](https://www.tensorflow.org/lite) and [TensorFlow.js](https://www.tensorflow.org/js), or other inference frameworks. - There's a limited support on `LOAD_AND_QUERY` mode for the - [Predict](/versions/r1.15/api_docs/python/tf/saved_model/predict_signature_def) + [Predict](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/saved_model/predict_signature_def) method signature (which is the only exportable method in TensorFlow 2). InfraValidator requires the Predict signature to consume a serialized [`tf.Example`](https://www.tensorflow.org/tutorials/load_data/tfrecord#tfexample) as the only input. diff --git a/docs/guide/keras.md b/docs/guide/keras.md index dd1454db9a..f0870b8200 100644 --- a/docs/guide/keras.md +++ b/docs/guide/keras.md @@ -87,9 +87,10 @@ unchanged. 
## Native Keras (i.e. Keras without `model_to_estimator`) -Note: Full support for all features in Keras is in progress, in most cases, -Keras in TFX will work as expected. It does not yet work with Sparse Features -for FeatureColumns. +!!! Note + Full support for all features in Keras is in progress, in most cases, + Keras in TFX will work as expected. It does not yet work with Sparse Features + for FeatureColumns. ### Examples and Colab @@ -125,15 +126,16 @@ ops. The serving function and eval function are changed for native Keras. Details will be discussed in the following Trainer and Evaluator sections. -Note: Transformations within the `preprocessing_fn` cannot be applied to the -label feature for training or eval. +!!! Note + Transformations within the `preprocessing_fn` cannot be applied to the + label feature for training or eval. #### Trainer To configure native Keras, the `GenericExecutor` needs to be set for Trainer component to replace the default Estimator based executor. For details, please check -[here](trainer.md#configuring-the-trainer-component-to-use-the-genericexecutor). +[here](trainer.md#configuring-the-trainer-component). ##### Keras Module file with Transform @@ -280,9 +282,10 @@ logging.getLogger("tensorflow").setLevel(logging.INFO) and you should be able to see `Using MirroredStrategy with devices (...)` in the log. -Note: The environment variable `TF_FORCE_GPU_ALLOW_GROWTH=true` might be needed -for a GPU out of memory issue. For details, please refer to -[tensorflow GPU guide](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth). +!!! Note + The environment variable `TF_FORCE_GPU_ALLOW_GROWTH=true` might be needed + for a GPU out of memory issue. For details, please refer to + [tensorflow GPU guide](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth). 
#### Evaluator diff --git a/docs/guide/solutions.md b/docs/guide/solutions.md index f14b6fb47f..c47181eebb 100644 --- a/docs/guide/solutions.md +++ b/docs/guide/solutions.md @@ -3,12 +3,13 @@ Looking for insights into how TFX can be applied to build a solution that meets your needs? These in-depth articles and guides may help! -Note: These articles discuss complete solutions in which TFX is a key part, but -not the only part. This is nearly always the case for real-world deployments. So -implementing these solutions yourself will require more than just TFX. The main -goal is to give you some insight into how others have implemented solutions that -may meet requirements that are similar to yours, and not to serve as a cookbook -or list of approved applications of TFX. +!!! Note + These articles discuss complete solutions in which TFX is a key part, but + not the only part. This is nearly always the case for real-world deployments. So + implementing these solutions yourself will require more than just TFX. The main + goal is to give you some insight into how others have implemented solutions that + may meet requirements that are similar to yours, and not to serve as a cookbook + or list of approved applications of TFX. ## Architecture of a machine learning system for near real-time item matching diff --git a/docs/guide/tfdv.md b/docs/guide/tfdv.md index b496170d86..1628f3de14 100644 --- a/docs/guide/tfdv.md +++ b/docs/guide/tfdv.md @@ -24,7 +24,7 @@ TFX tools can both help find data bugs, and help with feature engineering. ## TensorFlow Data Validation * [Overview](#overview) -* [Schema Based Example Validation](#schema_based-example-validation) +* [Schema Based Example Validation](#schema-based-example-validation) * [Training-Serving Skew Detection](#skewdetect) * [Drift Detection](#drift-detection) @@ -42,7 +42,7 @@ be configured to detect different classes of anomalies in the data. 
It can We document each of these functionalities independently: -* [Schema Based Example Validation](#schema_based-example-validation) +* [Schema Based Example Validation](#schema-based-example-validation) * [Training-Serving Skew Detection](#skewdetect) * [Drift Detection](#drift-detection) @@ -146,9 +146,10 @@ This triggers an automatic schema generation based on the following rules: * Otherwise, TensorFlow Data Validation examines the available data statistics and computes a suitable schema for the data. -_Note: The auto-generated schema is best-effort and only tries to infer basic -properties of the data. It is expected that users review and modify it as -needed._ +!!! Note + The auto-generated schema is best-effort and only tries to infer basic + properties of the data. It is expected that users review and modify it as + needed. ### Training-Serving Skew Detection @@ -164,10 +165,11 @@ the serving data to train on. ##### Example Scenario -Note: For instance, in order to compensate for an underrepresented slice of -data, if a biased sampling is used without upweighting the downsampled examples -appropriately, the distribution of feature values between training and -serving data gets artificially skewed. +!!! Note + For instance, in order to compensate for an underrepresented slice of + data, if a biased sampling is used without upweighting the downsampled examples + appropriately, the distribution of feature values between training and + serving data gets artificially skewed. See the [TensorFlow Data Validation Get Started Guide](https://www.tensorflow.org/tfx/data_validation/get_started#checking_data_skew_and_drift) for information about configuring training-serving skew detection. 
diff --git a/docs/guide/tft_bestpractices.md b/docs/guide/tft_bestpractices.md index 11bd10ad52..44ab9bbc0c 100644 --- a/docs/guide/tft_bestpractices.md +++ b/docs/guide/tft_bestpractices.md @@ -114,13 +114,6 @@ Figure: The flow of data from raw data to prepared data to engineered features t ![Flow diagram showing raw data moving to prepared data moving to engineered features.](images/data-preprocessing-for-ml-with-tf-transform-data-preprocessing-flow.svg) - - In practice, data from the same source is often at different stages of readiness. For example, a field from a table in your data warehouse might be used directly as an engineered feature. At the same time, another field in the @@ -162,7 +155,7 @@ For structured data, data preprocessing operations include the following: lower-dimension, more powerful data representations using techniques such as [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis){: .external }, - [embedding](https://developers.google.com/machine-learning/glossary/#embeddings){: .external } + [embedding](https://developers.google.com/machine-learning/crash-course/embeddings){: .external } extraction, and [hashing](https://medium.com/value-stream-design/introducing-one-of-the-best-hacks-in-machine-learning-the-hashing-trick-bf6a9c8af18f){: .external }. - **Feature selection:** selecting a subset of the input features for @@ -238,7 +231,7 @@ on operation granularity: values that are computed during training are used to adjust the feature value, which is the following simple *instance-level* operation: - \[ value_{scaled} = (value_{raw} - \mu) \div \sigma \] + \[ value_{\text{scaled}} = \frac{value_{\text{raw}} - \mu}{\sigma} \] Full-pass transformations include the following: @@ -306,7 +299,7 @@ on operation granularity: before training and prediction. 
-## ML pipeline on Google Cloud{: id="machine_learning_pipeline_on_gcp" } +## ML pipeline on Google Cloud This section discusses the core components of a typical end-to-end pipeline to train and serve TensorFlow ML models on Google Cloud using @@ -329,13 +322,6 @@ Figure: High-level architecture for ML training and serving on Google Cloud. {#h ![Architecture diagram showing stages for processing data.](images/data-preprocessing-for-ml-with-tf-transform-ml-training-serving-architecture.svg) - - The pipeline consists of the following steps: 1. After raw data is imported, tabular data is stored in BigQuery, and other @@ -461,13 +447,6 @@ Figure: High-level architecture using stream data for prediction in Dataflow. {# ![Architecture for using stream data for prediction.](images/data-preprocessing-for-ml-with-tf-transform-streaming-data-with-dataflow-architecture.svg) - - As shown in figure 3, during processing, events called *data points* are ingested into [Pub/Sub](https://cloud.google.com/pubsub/docs){: .external }. Dataflow consumes these data points, computes features based on aggregates over @@ -627,14 +606,6 @@ Figure: Behavior of `tf.Transform` for preprocessing and transforming data. ![Diagram showing flow from raw data through tf.Transform to predictions.](images/data-preprocessing-for-ml-with-tf-transform-tf-transform-behavior-flow.svg) - - - ### Transform training and evaluation data You preprocess the raw training data using the transformation implemented in @@ -705,196 +676,37 @@ new data points during prediction serving. The following table summarizes the data preprocessing options that this document discussed. In the table, "N/A" stands for "not applicable." - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Data preprocessing option - - Instance-level
- (stateless transformations) -
-

- Full-pass during training and instance-level during serving - (stateful transformations) -

-
-

- Real-time (window) aggregations during training and serving (streaming - transformations) -

-
-

- BigQuery -  (SQL) -

-
-

- Batch scoring: OK—the same transformation implementation is - applied on data during training and batch scoring. -

-

- Online prediction: Not recommended—you can process training data, - but it results in training-serving skew because you process serving data - using different tools. -

-
-

- Batch scoring: Not recommended. -

-

- Online prediction: Not recommended. -

-

- Although you can use statistics computed using BigQuery - for instance-level batch/online transformations, it isn't easy because - you must maintain a stats store to be populated during training and - used during prediction. -

-
-

- Batch scoring: N/A—aggregates like these are computed based on - real-time events. -

-

- Online prediction: Not recommended—you can process training data, - but it results in training-serving skew because you process serving data - using different tools. -

-
-

- Dataflow (Apache Beam) -

-
-

- Batch scoring: OK—the same transformation implementation is - applied on data during training and batch scoring. -

-

- Online prediction: OK—if data at serving time comes from - Pub/Sub to be consumed by Dataflow. - Otherwise, results in training-serving skew. -

-
-

- Batch scoring: Not recommended. -

-

- Online predictions: Not recommended. -

-

- Although you can use statistics computed using Dataflow - for instance-level batch/online transformations, it isn't easy - because you must maintain a stats store to be populated during training - and used during prediction. -

-
-

- Batch scoring: N/A—aggregates like these are computed - based on real-time events. -

-

- Online prediction: OK—the same Apache Beam transformation is - applied on data during training (batch) and serving (stream). -

-
-

- Dataflow (Apache Beam + TFT) -

-
-

- Batch scoring: OK—the same transformation implementation is - applied to data during training and batch scoring. -

-

- Online prediction: Recommended—it avoids training-serving skew - and prepares training data up front. -

-
-

- Batch scoring: Recommended. -

-

- Online prediction: Recommended. -

-

- Both uses are recommended because transformation logic and computed - statistics during training are stored as a TensorFlow - graph that's attached to the exported model for serving. -

-
-

- Batch scoring: N/A—aggregates like these are computed - based on real-time events. -

-

- Online prediction: OK—the same Apache Beam transformation is - applied on data during training (batch) and serving (stream). -

-
-

- TensorFlow * -
- (input_fn & serving_fn) -

-
-

- Batch scoring: Not recommended. -

-

- Online prediction: Not recommended. -

-

- For training efficiency in both cases, it's better to prepare the - training data up front. -

-
-

- Batch scoring: Not Possible. -

-

- Online prediction: Not Possible. -

-
-

- Batch scoring: N/A—aggregates like these are computed - based on real-time events. -

- Online prediction: Not Possible. -

-
- -* With TensorFlow, transformations like crossing, embedding, ++----------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Data preprocessing option | Instance-level | Full-pass during training and instance-level during serving | Real-time (window) aggregations during training and serving | +| | | | | +| | (stateless transformations) | (stateful transformations) | (streaming transformations) | ++==================================+=========================================================================================================================================================================+=============================================================================================================================================================================================================================+=========================================================================================================================================================================+ +| **BigQuery** | **Batch scoring: OK**—the same transformation implementation is applied on data during training and batch scoring. | **Batch scoring: Not recommended**. | **Batch scoring: N/A**—aggregates like these are computed based on real-time events. 
| +| | | | | +| (SQL) | **Online prediction: Not recommended**—you can process training data, but it results in training-serving skew because you process serving data using different | **Online prediction: Not recommended**. | **Online prediction: Not recommended**—you can process training data, but it results in training-serving skew because you process serving data using different | +| | tools. | | tools. | +| | | Although you can use statistics computed using BigQuery for instance-level batch/online transformations, it isn't easy because you must maintain a stats store to be populated during training and used during prediction. | | ++----------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Dataflow** | **Batch scoring: OK**—the same transformation implementation is applied on data during training and batch scoring. | **Batch scoring: Not recommended**. | **Batch scoring: N/A**---aggregates like these are computed based on real-time events. | +| | | | | +| (Apache Beam) | **Online prediction: OK**—if data at serving time comes from Pub/Sub to be consumed by Dataflow. Otherwise, results in training-serving skew. | **Online predictions: Not recommended**. | **Online prediction: OK**—the same Apache Beam transformation is applied on data during training (batch) and serving (stream). 
| +| | | | | +| | | Although you can use statistics computed using Dataflow for instance-level batch/online transformations, it isn't easy because you must maintain a stats store to be populated during training and used during prediction. | | ++----------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Dataflow** | **Batch scoring: OK**—the same transformation implementation is applied to data during training and batch scoring. | **Batch scoring: Recommended**. | **Batch scoring: N/A**---aggregates like these are computed based on real-time events. | +| | | | | +| (Apache Beam + TFT) | **Online prediction: Recommended**—it avoids training-serving skew and prepares training data up front. | **Online prediction: Recommended**. | **Online prediction: OK**—the same Apache Beam transformation is applied on data during training (batch) and serving (stream). | +| | | | | +| | | Both uses are recommended because transformation logic and computed statistics during training are stored as a TensorFlow graph that's attached to the exported model for serving. 
| | ++----------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **TensorFlow** ^\*^ | **Batch scoring: Not recommended**. | **Batch scoring: Not Possible**. | **Batch scoring: N/A**—aggregates like these are computed based on real-time events. | +| | | | | +| (`input_fn` & `serving_fn`) | **Online prediction: Not recommended**. | **Online prediction: Not Possible**. | **Online prediction: Not Possible**. | +| | | | | +| | For training efficiency in both cases, it's better to prepare the training data up front. | | | ++----------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +^\*^ With TensorFlow, transformations like crossing, embedding, and one-hot encoding should be performed declaratively as `feature_columns` columns. @@ -908,5 +720,5 @@ columns. - Learn about best practices for ML engineering in [Rules of ML](https://developers.google.com/machine-learning/guides/rules-of-ml/){: .external }. 
+ For more reference architectures, diagrams, and best practices, explore the - TFX + TFX Cloud Solutions. diff --git a/docs/guide/trainer.md b/docs/guide/trainer.md index 0b94a62c09..ba80f2e4ca 100644 --- a/docs/guide/trainer.md +++ b/docs/guide/trainer.md @@ -7,7 +7,8 @@ The Trainer TFX pipeline component trains a TensorFlow model. Trainer makes extensive use of the Python [TensorFlow](https://www.tensorflow.org) API for training models. -Note: TFX supports TensorFlow 1.15 and 2.x. +!!! Note + TFX supports TensorFlow 1.15 and 2.x. ## Component diff --git a/docs/guide/transform.md b/docs/guide/transform.md index 753f82fa42..db01b4e371 100644 --- a/docs/guide/transform.md +++ b/docs/guide/transform.md @@ -78,8 +78,9 @@ By contrast, TensorFlow Transform is designed for transformations that require a full pass over the data to compute values that are not known in advance. For example, vocabulary generation requires a full pass over the data. -Note: These computations are implemented in [Apache Beam](https://beam.apache.org/) -under the hood. +!!! Note + These computations are implemented in [Apache Beam](https://beam.apache.org/) + under the hood. In addition to computing values using Apache Beam, TensorFlow Transform allows users to embed these values into a TensorFlow graph, which can then be loaded @@ -125,7 +126,7 @@ disk. As a TFX user, you only have to define a single function called the In `preprocessing_fn` you define a series of functions that manipulate the input dict of tensors to produce the output dict of tensors. You can find helper functions like scale_to_0_1 and compute_and_apply_vocabulary the -[TensorFlow Transform API](/tfx/transform/api_docs/python/tft) or use +[TensorFlow Transform API](https://www.tensorflow.org/tfx/transform/api_docs/python/tft) or use regular TensorFlow functions as shown below. 
```python diff --git a/docs/guide/tuner.md b/docs/guide/tuner.md index abba1a7505..a2cb39f790 100644 --- a/docs/guide/tuner.md +++ b/docs/guide/tuner.md @@ -8,8 +8,9 @@ The Tuner component makes extensive use of the Python [KerasTuner](https://www.tensorflow.org/tutorials/keras/keras_tuner) API for tuning hyperparameters. -Note: The KerasTuner library can be used for hyperparameter tuning regardless of -the modeling API, not just for Keras models only. +!!! Note + The KerasTuner library can be used for hyperparameter tuning regardless of + the modeling API, not just for Keras models only. ## Component @@ -206,22 +207,24 @@ algorithm uses information from results of prior trials, such as Google Vizier algorithm implemented in the AI Platform Vizier does, an excessively parallel search would negatively affect the efficacy of the search. -Note: Each trial in each parallel search is conducted on a single machine in the -worker flock, i.e., each trial does not take advantage of multi-worker -distributed training. If multi-worker distribution is desired for each trial, -refer to -[`DistributingCloudTuner`](https://github.com/tensorflow/cloud/blob/b9c8752f5c53f8722dfc0b5c7e05be52e62597a8/src/python/tensorflow_cloud/tuner/tuner.py#L384-L676), -instead of `CloudTuner`. - -Note: Both `CloudTuner` and the Google Cloud AI Platform extensions Tuner -component can be used together, in which case it allows distributed parallel -tuning backed by the AI Platform Vizier's hyperparameter search algorithm. -However, in order to do so, the Cloud AI Platform Job must be given access to -the AI Platform Vizier service. See this -[guide](https://cloud.google.com/ai-platform/training/docs/custom-service-account#custom) -to set up a custom service account. After that, you should specify the custom -service account for your training job in the pipeline code. 
More details see -[E2E CloudTuner on GCP example](https://github.com/tensorflow/tfx/blob/master/tfx/examples/penguin/penguin_pipeline_kubeflow.py). +!!! Note + Each trial in each parallel search is conducted on a single machine in the + worker flock, i.e., each trial does not take advantage of multi-worker + distributed training. If multi-worker distribution is desired for each trial, + refer to + [`DistributingCloudTuner`](https://github.com/tensorflow/cloud/blob/b9c8752f5c53f8722dfc0b5c7e05be52e62597a8/src/python/tensorflow_cloud/tuner/tuner.py#L384-L676), + instead of `CloudTuner`. + +!!! Note + Both `CloudTuner` and the Google Cloud AI Platform extensions Tuner + component can be used together, in which case it allows distributed parallel + tuning backed by the AI Platform Vizier's hyperparameter search algorithm. + However, in order to do so, the Cloud AI Platform Job must be given access to + the AI Platform Vizier service. See this + [guide](https://cloud.google.com/ai-platform/training/docs/custom-service-account#custom) + to set up a custom service account. After that, you should specify the custom + service account for your training job in the pipeline code. For more details, see + [E2E CloudTuner on GCP example](https://github.com/tensorflow/tfx/blob/master/tfx/examples/penguin/penguin_pipeline_kubeflow.py). ## Links diff --git a/docs/guide/understanding_tfx_pipelines.md b/docs/guide/understanding_tfx_pipelines.md index f0edac2546..21a043063c 100644 --- a/docs/guide/understanding_tfx_pipelines.md +++ b/docs/guide/understanding_tfx_pipelines.md @@ -35,7 +35,7 @@ which components such as the `StatisticsGen` standard component use as inputs. Artifacts must be strongly typed with an **artifact type** registered in the [ML Metadata](mlmd.md) store. Learn more about the -[concepts used in ML Metadata](mlmd.md#concepts). +[concepts used in ML Metadata](mlmd.md). Artifact types have a name and define a schema of its properties. 
Artifact type names must be unique in your ML Metadata store. TFX provides several diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index e734efefd6..21c97aa98c 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -13,3 +13,30 @@ width: 100%; aspect-ratio: 16 / 9; } + +.buttons-wrapper { + flex-wrap: wrap; + gap: 1em; + display: flex; + /* flex-grow: 1; */ + /* justify-content: center; */ + /* align-content: center; */ +} + +.buttons-wrapper > a { + justify-content: center; + align-content: center; + flex-wrap: nowrap; + /* gap: 1em; */ + align-items: center; + text-align: center; + flex: 1 1 30%; + display: flex; +} + +.md-button > .buttons-content { + align-items: center; + justify-content: center; + display: flex; + gap: 1em; +} diff --git a/docs/tutorials/data_validation/tfdv_basic.ipynb b/docs/tutorials/data_validation/tfdv_basic.ipynb index f8e44389a0..6b412fc3c8 100644 --- a/docs/tutorials/data_validation/tfdv_basic.ipynb +++ b/docs/tutorials/data_validation/tfdv_basic.ipynb @@ -46,18 +46,42 @@ "id": "rLsMb4vqY244" }, "source": [ - "Note: You can run this example right now in a Jupyter-style notebook, no setup required! 
Just click \"Run in Google Colab\"\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/data_validation/tfdv_basic\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.sandbox.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/data_validation/tfdv_basic.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/blob/master/docs/tutorials/data_validation/tfdv_basic.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/data_validation/tfdv_basic.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index d4163ca297..6085d56ace 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -21,7 +21,7 @@ you'll learn the two main styles of developing a TFX pipeline: Probably the simplest pipeline you can build, to help you get started. Click the _Run in Google Colab_ button. - [:octicons-arrow-right-24: Starter Pipeline](tutorials/tfx/penguin_simple.md) + [:octicons-arrow-right-24: Starter Pipeline](tfx/penguin_simple) - __2. Adding Data Validation__ @@ -29,7 +29,7 @@ you'll learn the two main styles of developing a TFX pipeline: Building on the simple pipeline to add data validation components. - [:octicons-arrow-right-24: Data Validation](tutorials/tfx/penguin_tfdv) + [:octicons-arrow-right-24: Data Validation](tfx/penguin_tfdv) - __3. Adding Feature Engineering__ @@ -37,7 +37,7 @@ you'll learn the two main styles of developing a TFX pipeline: Building on the data validation pipeline to add a feature engineering component. - [:octicons-arrow-right-24: Feature Engineering](tutorials/tfx/penguin_tft) + [:octicons-arrow-right-24: Feature Engineering](tfx/penguin_tft) - __4. Adding Model Analysis__ @@ -45,7 +45,7 @@ you'll learn the two main styles of developing a TFX pipeline: Building on the simple pipeline to add a model analysis component. - [:octicons-arrow-right-24: Model Analysis](tutorials/tfx/penguin_tfma) + [:octicons-arrow-right-24: Model Analysis](tfx/penguin_tfma) @@ -64,7 +64,7 @@ in your TFX pipeline. Running pipelines on a managed pipeline service, Vertex Pipelines. - [:octicons-arrow-right-24: Vertex Pipelines](tutorials/tfx/gcp/vertex_pipelines_simple) + [:octicons-arrow-right-24: Vertex Pipelines](tfx/gcp/vertex_pipelines_simple) - __Read data from BigQuery__ @@ -72,7 +72,7 @@ in your TFX pipeline. Using BigQuery as a data source of ML pipelines. 
- [:octicons-arrow-right-24: BigQuery](tutorials/tfx/gcp/vertex_pipelines_bq) + [:octicons-arrow-right-24: BigQuery](tfx/gcp/vertex_pipelines_bq) - __Vertex AI Training and Serving__ @@ -80,7 +80,7 @@ in your TFX pipeline. Using cloud resources for ML training and serving with Vertex AI. - [:octicons-arrow-right-24: Vertex Training and Serving](tutorials/tfx/gcp/vertex_pipelines_vertex_training) + [:octicons-arrow-right-24: Vertex Training and Serving](tfx/gcp/vertex_pipelines_vertex_training) - __TFX on Cloud AI Platform Pipelines__ @@ -88,14 +88,14 @@ in your TFX pipeline. An introduction to using TFX and Cloud AI Platform Pipelines. - [:octicons-arrow-right-24: Cloud Pipelines](tutorials/tfx/cloud-ai-platform-pipelines) + [:octicons-arrow-right-24: Cloud Pipelines](tfx/cloud-ai-platform-pipelines) ## Next Steps Once you have a basic understanding of TFX, check these additional tutorials and -guides. And don't forget to read the [TFX User Guide](guide/index.md). +guides. And don't forget to read the [TFX User Guide](../../guide).
@@ -107,7 +107,7 @@ guides. And don't forget to read the [TFX User Guide](guide/index.md). context_, a very useful development tool. Click the _Run in Google Colab_ button. - [:octicons-arrow-right-24: Keras](tutorials/tfx/components_keras) + [:octicons-arrow-right-24: Keras](tfx/components_keras) - __Custom Component Tutorial__ @@ -115,7 +115,7 @@ guides. And don't forget to read the [TFX User Guide](guide/index.md). A tutorial showing how to develop your own custom TFX components. - [:octicons-arrow-right-24: Custom Component](tutorials/tfx/python_function_component) + [:octicons-arrow-right-24: Custom Component](tfx/python_function_component) - __Data Validation__ @@ -126,7 +126,7 @@ guides. And don't forget to read the [TFX User Guide](guide/index.md). generating descriptive statistics, inferring a schema, and finding anomalies. - [:octicons-arrow-right-24: Data Validation](tutorials/data_validation/tfdv_basic) + [:octicons-arrow-right-24: Data Validation](data_validation/tfdv_basic) - __Model Analysis__ @@ -137,7 +137,7 @@ guides. And don't forget to read the [TFX User Guide](guide/index.md). dataset and evaluate the performance of a model along several axes of accuracy. - [:octicons-arrow-right-24: Model Analysis](tutorials/model_analysis/tfma_basic) + [:octicons-arrow-right-24: Model Analysis](model_analysis/tfma_basic) - __Serve a Model__ @@ -146,7 +146,7 @@ guides. And don't forget to read the [TFX User Guide](guide/index.md). This tutorial demonstrates how TensorFlow Serving can be used to serve a model using a simple REST API. - [:octicons-arrow-right-24: Model Analysis](tutorials/serving/rest_simple) + [:octicons-arrow-right-24: Model Analysis](serving/rest_simple)
diff --git a/docs/tutorials/mlmd/mlmd_tutorial.ipynb b/docs/tutorials/mlmd/mlmd_tutorial.ipynb index 5f869c6363..73027a6cb8 100644 --- a/docs/tutorials/mlmd/mlmd_tutorial.ipynb +++ b/docs/tutorials/mlmd/mlmd_tutorial.ipynb @@ -50,20 +50,42 @@ "id": "MfBg1C5NB3X0" }, "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/mlmd/mlmd_tutorial\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/mlmd/mlmd_tutorial.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/blob/master/docs/tutorials/mlmd/mlmd_tutorial.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/mlmd/mlmd_tutorial.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/download_logo_32px.png\"\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -96,7 +118,7 @@ "source": [ "## TFX Pipelines in Colab\n", "\n", - "Colab is a lightweight development environment which differs significantly from a production environment. In production, you may have various pipeline components like data ingestion, transformation, model training, run histories, etc. across multiple, distributed systems. For this tutorial, you should be aware that siginificant differences exist in Orchestration and Metadata storage - it is all handled locally within Colab. Learn more about TFX in Colab [here](https://www.tensorflow.org/tfx/tutorials/tfx/components_keras#background).\n", + "Colab is a lightweight development environment which differs significantly from a production environment. In production, you may have various pipeline components like data ingestion, transformation, model training, run histories, etc. across multiple, distributed systems. For this tutorial, you should be aware that significant differences exist in Orchestration and Metadata storage - it is all handled locally within Colab. Learn more about TFX in Colab [here](/tutorials/tfx/components_keras#background).\n", "\n" ] }, @@ -280,7 +302,7 @@ "\n", "A TFX pipeline consists of several components that perform different aspects of the ML workflow. In this notebook, you create and run the `ExampleGen`, `StatisticsGen`, `SchemaGen`, and `Trainer` components and use the `Evaluator` and `Pusher` component to evaluate and push the trained model. \n", "\n", - "Refer to the [components tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/components_keras) for more information on TFX pipeline components." + "Refer to the [components tutorial](/tutorials/tfx/components_keras) for more information on TFX pipeline components." 
] }, { @@ -919,7 +941,7 @@ "To learn more about how to use MLMD, check out these additional resources:\n", "\n", "* [MLMD API documentation](https://www.tensorflow.org/tfx/ml_metadata/api_docs/python/mlmd)\n", - "* [MLMD guide](https://www.tensorflow.org/tfx/guide/mlmd)" + "* [MLMD guide](../../../guide/mlmd)" ] } ], diff --git a/docs/tutorials/model_analysis/tfma_basic.ipynb b/docs/tutorials/model_analysis/tfma_basic.ipynb index e3251c0222..d22d3b0604 100644 --- a/docs/tutorials/model_analysis/tfma_basic.ipynb +++ b/docs/tutorials/model_analysis/tfma_basic.ipynb @@ -37,19 +37,42 @@ "id": "rLsMb4vqY244" }, "source": [ - "Note: You can run this example right now in a Jupyter-style notebook, no setup required! Just click \"Run in Google Colab\"\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/model_analysis/tfma_basic\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.sandbox.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/model_analysis/tfma_basic.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/blob/master/docs/tutorials/model_analysis/tfma_basic.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/model_analysis/tfma_basic.ipynb\"\u003e\n", - "\u003cimg width=32px 
src=\"https://www.tensorflow.org/images/download_logo_32px.png\"\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -67,7 +90,7 @@ "id": "mPt5BHTwy_0F" }, "source": [ - "[TensorFlow Model Analysis (TFMA)](https://www.tensorflow.org/tfx/guide/tfma) is a library for performing model evaluation across different slices of data. TFMA performs its computations in a distributed manner over large amounts of data using [Apache Beam](https://beam.apache.org/documentation/programming-guide/).\n", + "[TensorFlow Model Analysis (TFMA)](../../../guide/tfma) is a library for performing model evaluation across different slices of data. TFMA performs its computations in a distributed manner over large amounts of data using [Apache Beam](https://beam.apache.org/documentation/programming-guide/).\n", "\n", "This example colab notebook illustrates how TFMA can be used to investigate and visualize the performance of a model with respect to characteristics of the dataset. We'll use a model that we trained previously, and now you get to play with the results! The model we trained was for the [Chicago Taxi Example](https://github.com/tensorflow/tfx/tree/master/tfx/examples/chicago_taxi_pipeline), which uses the [Taxi Trips dataset](https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew) released by the City of Chicago. Explore the full dataset in the [BigQuery UI](https://bigquery.cloud.google.com/dataset/bigquery-public-data:chicago_taxi_trips).\n", "\n", diff --git a/docs/tutorials/serving/rest_simple.ipynb b/docs/tutorials/serving/rest_simple.ipynb index aa13c8d202..a3c25bbf9e 100644 --- a/docs/tutorials/serving/rest_simple.ipynb +++ b/docs/tutorials/serving/rest_simple.ipynb @@ -46,20 +46,42 @@ "id": "E6FwTNtl3S4v" }, "source": [ - "**Warning: This notebook is designed to be run in a Google Colab only**. It installs packages on the system and requires root access. If you want to run it in a local Jupyter notebook, please proceed with caution.\n", - "\n", - "Note: You can run this example right now in a Jupyter-style notebook, no setup required! 
Just click \"Run in Google Colab\"\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctr\u003e\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/serving/rest_simple\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/serving/rest_simple.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/blob/master/docs/tutorials/serving/rest_simple.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/serving/rest_simple.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/tr\u003e\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -67,7 +89,7 @@ "id": "FbVhjPpzn6BM" }, "source": [ - "This guide trains a neural network model to classify [images of clothing, like sneakers and shirts](https://github.com/zalandoresearch/fashion-mnist), saves the trained model, and then serves it with [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving). The focus is on TensorFlow Serving, rather than the modeling and training in TensorFlow, so for a complete example which focuses on the modeling and training see the [Basic Classification example](https://github.com/tensorflow/docs/blob/master/site/en/r1/tutorials/keras/basic_classification.ipynb).\n", + "This guide trains a neural network model to classify [images of clothing, like sneakers and shirts](https://github.com/zalandoresearch/fashion-mnist), saves the trained model, and then serves it with [TensorFlow Serving](../../../guide/serving). The focus is on TensorFlow Serving, rather than the modeling and training in TensorFlow, so for a complete example which focuses on the modeling and training see the [Basic Classification example](https://github.com/tensorflow/docs/blob/master/site/en/r1/tutorials/keras/basic_classification.ipynb).\n", "\n", "This guide uses [tf.keras](https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/keras.ipynb), a high-level API to build and train models in TensorFlow." ] @@ -217,7 +239,7 @@ "source": [ "## Save your model\n", "\n", - "To load our trained model into TensorFlow Serving we first need to save it in [SavedModel](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/saved_model) format. This will create a protobuf file in a well-defined directory hierarchy, and will include a version number. [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving) allows us to select which version of a model, or \"servable\" we want to use when we make inference requests. Each version will be exported to a different sub-directory under the given path." 
+ "To load our trained model into TensorFlow Serving we first need to save it in [SavedModel](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/saved_model) format. This will create a protobuf file in a well-defined directory hierarchy, and will include a version number. [TensorFlow Serving](../../../guide/serving) allows us to select which version of a model, or \"servable\" we want to use when we make inference requests. Each version will be exported to a different sub-directory under the given path." ] }, { diff --git a/docs/tutorials/tfx/airflow_workshop.md b/docs/tutorials/tfx/airflow_workshop.md index 61b8d7abdf..9dc033d5e3 100644 --- a/docs/tutorials/tfx/airflow_workshop.md +++ b/docs/tutorials/tfx/airflow_workshop.md @@ -24,7 +24,7 @@ You’ll learn how to create an ML pipeline using TFX important * Google uses TFX pipelines for production ML -Please see the [TFX User Guide](https://www.tensorflow.org/tfx/guide) to learn +Please see the [TFX User Guide](../../../guide) to learn more. You'll follow a typical ML development process: @@ -42,7 +42,7 @@ TFX orchestrators are responsible for scheduling components of the TFX pipeline based on the dependencies defined by the pipeline. TFX is designed to be portable to multiple environments and orchestration frameworks. One of the default orchestrators supported by TFX is -[Apache Airflow](https://www.tensorflow.org/tfx/guide/airflow). This lab +[Apache Airflow](../../../guide/airflow). This lab illustrates the use of Apache Airflow for TFX pipeline orchestration. Apache Airflow is a platform to programmatically author, schedule and monitor workflows. TFX uses Airflow to author workflows as directed acyclic graphs @@ -56,16 +56,17 @@ In this example, we are going to run a TFX pipeline on an instance by manually setting up Airflow. The other default orchestrators supported by TFX are Apache Beam and Kubeflow. 
-[Apache Beam](https://www.tensorflow.org/tfx/guide/beam_orchestrator) can run on +[Apache Beam](../../../guide/beam_orchestrator) can run on multiple data processing backends (Beam Ruunners). Cloud Dataflow is one such beam runner which can be used for running TFX pipelines. Apache Beam can be used -for both streaming and batch processing pipelines. \ -[Kubeflow](https://www.tensorflow.org/tfx/guide/kubeflow) is an open source ML +for both streaming and batch processing pipelines. + +[Kubeflow](../../../guide/kubeflow) is an open source ML platform dedicated to making deployments of machine learning (ML) workflows on Kubernetes simple, portable and scalable. Kubeflow can be used as an orchestrator for TFFX pipelines when they need to be deployed on Kubernetes clusters. In addition, you can also use your own -[custom orchestrator](https://www.tensorflow.org/tfx/guide/custom_orchestrator) +[custom orchestrator](../../../guide/custom_orchestrator) to run a TFX pipeline. Read more about Airflow [here](https://airflow.apache.org/). @@ -80,13 +81,14 @@ You'll be using the [Taxi Trips dataset](https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew) released by the City of Chicago. -Note: This tutorial builds an application using data that has been modified for -use from its original source, www.cityofchicago.org, the official website of the -City of Chicago. The City of Chicago makes no claims as to the content, -accuracy, timeliness, or completeness of any of the data provided at in this -tutorial. The data provided at this site is subject to change at any time. It is -understood that the data provided in this tutorial is being used at one’s own -risk. +!!! Note + This tutorial builds an application using data that has been modified for + use from its original source, www.cityofchicago.org, the official website of the + City of Chicago. 
The City of Chicago makes no claims as to the content, + accuracy, timeliness, or completeness of any of the data provided in this + tutorial. The data provided at this site is subject to change at any time. It is + understood that the data provided in this tutorial is being used at one’s own + risk. ### Model Goal - Binary classification Will the customer tip more or less than 20%? @@ -107,11 +109,13 @@ the duration of the lab. * Access to a standard internet browser (Chrome browser recommended). * Time to complete the lab. -**Note:** If you already have your own personal Google Cloud account or project, -do not use it for this lab. +!!! Note + If you already have your own personal Google Cloud account or project, + do not use it for this lab. -**Note:** If you are using a Chrome OS device, open an Incognito window to run -this lab. +!!! Note + If you are using a Chrome OS device, open an Incognito window to run + this lab. **How to start your lab and sign in to the Google Cloud Console** 1. Click the **Start Lab** button. If you need to pay for the lab, a pop-up opens for you to @@ -146,8 +150,9 @@ account, do not use it for this lab (avoids incurring charges). After a few moments, the Cloud Console opens in this tab. -**Note:** You can view the menu with a list of Google Cloud Products and -Services by clicking the **Navigation menu** at the top-left. +!!! Note + You can view the menu with a list of Google Cloud Products and + Services by clicking the **Navigation menu** at the top-left. ![qwiksetup4.png](images/airflow_workshop/qwiksetup4.png) @@ -242,8 +247,9 @@ followed by **Open Jupyterlab**. Next you'll clone the `tfx` repository in your JupyterLab instance. 1. In JupyterLab, click the **Terminal** icon to open a new terminal. -Note: If prompted, click Cancel for -Build Recommended. +!!! Note + If prompted, click `Cancel` for + Build Recommended. 1. To clone the `tfx` Github repository, type in the following command, and press **Enter**. 
diff --git a/docs/tutorials/tfx/cloud-ai-platform-pipelines.md b/docs/tutorials/tfx/cloud-ai-platform-pipelines.md index b0f9dd33c8..7edd78f6ab 100644 --- a/docs/tutorials/tfx/cloud-ai-platform-pipelines.md +++ b/docs/tutorials/tfx/cloud-ai-platform-pipelines.md @@ -14,14 +14,16 @@ At the end of this tutorial, you will have created and run an ML Pipeline, hosted on Google Cloud. You'll be able to visualize the results of each run, and view the lineage of the created artifacts. -Key Term: A TFX pipeline is a Directed Acyclic Graph, or "DAG". We will often -refer to pipelines as DAGs. +!!! abstract "Key Term" + A TFX pipeline is a Directed Acyclic Graph, or "DAG". We will often + refer to pipelines as DAGs. You'll follow a typical ML development process, starting by examining the dataset, and ending up with a complete working pipeline. Along the way you'll explore ways to debug and update your pipeline, and measure performance. -Note: Completing this tutorial may take 45-60 minutes. +!!! Note + Completing this tutorial may take 45-60 minutes. ### Chicago Taxi Dataset @@ -35,12 +37,13 @@ You're using the [Taxi Trips dataset](https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew) released by the City of Chicago. -Note: This site provides applications using data that has been modified for use -from its original source, www.cityofchicago.org, the official website of the -City of Chicago. The City of Chicago makes no claims as to the content, -accuracy, timeliness, or completeness of any of the data provided at this site. -The data provided at this site is subject to change at any time. It is -understood that the data provided at this site is being used at one’s own risk. +!!! Note + This site provides applications using data that has been modified for use + from its original source, www.cityofchicago.org, the official website of the + City of Chicago. 
The City of Chicago makes no claims as to the content, + accuracy, timeliness, or completeness of any of the data provided at this site. + The data provided at this site is subject to change at any time. It is + understood that the data provided at this site is being used at one’s own risk. You can [read more](https://cloud.google.com/bigquery/public-data/chicago-taxi) about the dataset in [Google BigQuery](https://cloud.google.com/bigquery/). @@ -58,17 +61,18 @@ Will the customer tip more or less than 20%? To get started, you need a Google Cloud Account. If you already have one, skip ahead to [Create New Project](#create_project). -Warning: This demo is designed to not exceed -[Google Cloud's Free Tier](https://cloud.google.com/free) limits. If you already -have a Google Account, you may have reached your Free Tier limits, or exhausted -any free Google Cloud credits given to new users. **If that is the case, -following this demo will result in charges to your Google Cloud account**. +!!! Warning + This demo is designed to not exceed + [Google Cloud's Free Tier](https://cloud.google.com/free) limits. If you already + have a Google Account, you may have reached your Free Tier limits, or exhausted + any free Google Cloud credits given to new users. **If that is the case, + following this demo will result in charges to your Google Cloud account**. 1. Go to the [Google Cloud Console](https://console.cloud.google.com/). 1. Agree to Google Cloud terms and conditions - + ![](images/cloud-ai-platform-pipelines/welcome-popup.png){ width="65%" } 1. If you would like to start with a free trial account, click on [**Try For Free**](https://console.cloud.google.com/freetrial) (or @@ -85,19 +89,22 @@ following this demo will result in charges to your Google Cloud account**. [Google Cloud Free Tier](https://cloud.google.com/free) limits, which includes a max of 8 cores running at the same time. 
-Note: You can choose at this point to become a paid user instead of relying on -the free trial. Since this tutorial stays within the Free Tier limits, you still -won't be charged if this is your only project and you stay within those limits. -For more details, see -[Google Cloud Cost Calculator](https://cloud.google.com/products/calculator/) -and [Google Cloud Platform Free Tier](https://cloud.google.com/free). +!!! Note + You can choose at this point to become a paid user instead of relying on + the free trial. Since this tutorial stays within the Free Tier limits, you still + won't be charged if this is your only project and you stay within those limits. + For more details, see + [Google Cloud Cost Calculator](https://cloud.google.com/products/calculator/) + and [Google Cloud Platform Free Tier](https://cloud.google.com/free). ### 1.b Create a new project. -Note: This tutorial assumes you want to work on this demo in a new project. You -can, if you want, work in an existing project. +!!! Note + This tutorial assumes you want to work on this demo in a new project. You + can, if you want, work in an existing project. -Note: You must have a verified credit card on file before creating the project. +!!! Note + You must have a verified credit card on file before creating the project. 1. From the [main Google Cloud dashboard](https://console.cloud.google.com/home/dashboard), @@ -109,8 +116,9 @@ drop-down.** ## 2. Set up and deploy an AI Platform Pipeline on a new Kubernetes cluster -Note: This will take up to 10 minutes, as it requires waiting at several points -for resources to be provisioned. +!!! Note + This will take up to 10 minutes, as it requires waiting at several points + for resources to be provisioned. 1. Go to the [AI Platform Pipelines Clusters](https://console.cloud.google.com/ai-platform/pipelines) @@ -120,17 +128,18 @@ for resources to be provisioned. 1. Click **+ New Instance** to create a new cluster. 
- + ![](images/cloud-ai-platform-pipelines/new-instance.png){ width="65%" } 1. On the **Kubeflow Pipelines** overview page, click **Configure**. - + ![](images/cloud-ai-platform-pipelines/configure.png){ width="65%" } 1. Click "Enable" to enable the Kubernetes Engine API - + ![](images/cloud-ai-platform-pipelines/enable_api.png){ width="65%" } - Note: You may have to wait several minutes before moving on, while the Kubernetes Engine APIs are being enabled for you. + !!! Note + You may have to wait several minutes before moving on, while the Kubernetes Engine APIs are being enabled for you. 1. On the **Deploy Kubeflow Pipelines** page: @@ -142,7 +151,7 @@ for resources to be provisioned. APIs*. (This is required for this cluster to access the other pieces of your project. If you miss this step, fixing it later is a bit tricky.) - + ![](images/cloud-ai-platform-pipelines/check-the-box.png){ width="50%" } 1. Click **Create New Cluster**, and wait several minutes until the cluster has been created. This will take a few minutes. When it completes you @@ -172,7 +181,7 @@ for resources to be provisioned. 1. Create a **New Notebook** with TensorFlow Enterprise 2.7 (or above) installed. - + ![](images/cloud-ai-platform-pipelines/new-notebook.png){ width="65%" } New Notebook -> TensorFlow Enterprise 2.7 -> Without GPU @@ -186,19 +195,21 @@ for resources to be provisioned. 1. Under **Machine configuration** you may want to select a configuration with 1 or 2 vCPUs if you need to stay in the free tier. - + ![](images/cloud-ai-platform-pipelines/two-cpus.png){ width="65%" } + 1. Wait for the new notebook to be created, and then click **Enable Notebooks API** -Note: You may experience slow performance in your notebook if you use 1 or 2 -vCPUs instead of the default or higher. This should not seriously hinder your -completion of this tutorial. 
If would like to use the default settings, -[upgrade your account](https://cloud.google.com/free/docs/gcp-free-tier#to_upgrade_your_account) -to at least 12 vCPUs. This will accrue charges. See -[Google Kubernetes Engine Pricing](https://cloud.google.com/kubernetes-engine/pricing/) -for more details on pricing, including a -[pricing calculator](https://cloud.google.com/products/calculator) and -information about the [Google Cloud Free Tier](https://cloud.google.com/free). +!!! Note + You may experience slow performance in your notebook if you use 1 or 2 + vCPUs instead of the default or higher. This should not seriously hinder your + completion of this tutorial. If you would like to use the default settings, + [upgrade your account](https://cloud.google.com/free/docs/gcp-free-tier#to_upgrade_your_account) + to at least 12 vCPUs. This will accrue charges. See + [Google Kubernetes Engine Pricing](https://cloud.google.com/kubernetes-engine/pricing/) + for more details on pricing, including a + [pricing calculator](https://cloud.google.com/products/calculator) and + information about the [Google Cloud Free Tier](https://cloud.google.com/free). ## 4. Launch the Getting Started Notebook @@ -210,12 +221,12 @@ information about the [Google Cloud Free Tier](https://cloud.google.com/free). 1. On the line for the cluster you are using in this tutorial, click **Open Pipelines Dashboard**. - + ![](images/cloud-ai-platform-pipelines/open-dashboard.png) 1. On the **Getting Started** page, click **Open a Cloud AI Platform Notebook on Google Cloud**. - + ![](images/cloud-ai-platform-pipelines/open-template.png) 1. Select the Notebook instance you are using for this tutorial and **Continue**, and then **Confirm**. @@ -379,13 +390,14 @@ Kubeflow Pipelines Dashboard. You can view your pipeline from the Kubeflow Pipelines Dashboard. -Note: If your pipeline run fails, you can see detailed logs in the KFP -Dashboard. One of the major sources of failure is permission related problems. 
-Make sure your KFP cluster has permissions to access Google Cloud APIs. This can -be configured -[when you create a KFP cluster in GCP](https://cloud.google.com/ai-platform/pipelines/docs/setting-up), -or see -[Troubleshooting document in GCP](https://cloud.google.com/ai-platform/pipelines/docs/troubleshooting). +!!! Note + If your pipeline run fails, you can see detailed logs in the KFP + Dashboard. One of the major sources of failure is permission-related problems. + Make sure your KFP cluster has permissions to access Google Cloud APIs. This can + be configured + [when you create a KFP cluster in GCP](https://cloud.google.com/ai-platform/pipelines/docs/setting-up), + or see + [Troubleshooting document in GCP](https://cloud.google.com/ai-platform/pipelines/docs/troubleshooting). ## 8. Validate your data @@ -398,16 +410,16 @@ data. ### Components -![Data Components](images/airflow_workshop/examplegen1.png) -![Data Components](images/airflow_workshop/examplegen2.png) +![Data Components](images/cloud-ai-platform-pipelines/examplegen1.png) +![Data Components](images/cloud-ai-platform-pipelines/examplegen2.png) -* [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen) ingests and +* [ExampleGen](../../../guide/examplegen) ingests and splits the input dataset. -* [StatisticsGen](https://www.tensorflow.org/tfx/guide/statsgen) calculates +* [StatisticsGen](../../../guide/statsgen) calculates statistics for the dataset. -* [SchemaGen](https://www.tensorflow.org/tfx/guide/schemagen) SchemaGen +* [SchemaGen](../../../guide/schemagen) examines the statistics and creates a data schema. -* [ExampleValidator](https://www.tensorflow.org/tfx/guide/exampleval) looks +* [ExampleValidator](../../../guide/exampleval) looks for anomalies and missing values in the dataset. ### In Jupyter lab file editor: @@ -445,7 +457,7 @@ your pipeline. The example presented here is really only meant to get you started. 
For a more advanced example see the -[TensorFlow Data Validation Colab](https://www.tensorflow.org/tfx/tutorials/data_validation/chicago_taxi). +[TensorFlow Data Validation Colab](/tutorials/data_validation/chicago_taxi). For more information on using TFDV to explore and validate a dataset, [see the examples on tensorflow.org](https://www.tensorflow.org/tfx/data_validation). @@ -467,15 +479,15 @@ serving. ### Components -![Transform](images/airflow_workshop/transform.png) +![Transform](images/cloud-ai-platform-pipelines/transform.png) -* [Transform](https://www.tensorflow.org/tfx/guide/transform) performs feature +* [Transform](../../../guide/transform) performs feature engineering on the dataset. ### In Jupyter lab file editor: In `pipeline`/`pipeline.py`, find and uncomment the line which appends -[Transform](https://www.tensorflow.org/tfx/guide/transform) to the pipeline. +[Transform](../../../guide/transform) to the pipeline. ```python # components.append(transform) @@ -503,7 +515,7 @@ your pipeline. The example presented here is really only meant to get you started. For a more advanced example see the -[TensorFlow Transform Colab](https://www.tensorflow.org/tfx/tutorials/transform/census). +[TensorFlow Transform Colab](/tutorials/transform/census). ## 10. Training @@ -517,7 +529,7 @@ Train a TensorFlow model with your nice, clean, transformed data. ### Components -* [Trainer](https://www.tensorflow.org/tfx/guide/trainer) trains a TensorFlow +* [Trainer](../../../guide/trainer) trains a TensorFlow model. ### In Jupyter lab file editor: @@ -568,7 +580,7 @@ Understanding more than just the top level metrics. ### Components -* [Evaluator](https://www.tensorflow.org/tfx/guide/evaluator) performs deep +* [Evaluator](../../../guide/evaluator) performs deep analysis of the training results. 
### In Jupyter lab file editor: @@ -613,7 +625,7 @@ Deployment targets receive new models from well-known locations ### Components -* [Pusher](https://www.tensorflow.org/tfx/guide/pusher) deploys the model to a +* [Pusher](../../../guide/pusher) deploys the model to a serving infrastructure. ### In Jupyter lab file editor: @@ -638,7 +650,7 @@ You have now trained and validated your model, and your model is now ready for production. You can now deploy your model to any of the TensorFlow deployment targets, including: -* [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving), for +* [TensorFlow Serving](../../../guide/serving), for serving your model on a server or server farm and processing REST and/or gRPC inference requests. * [TensorFlow Lite](https://www.tensorflow.org/lite), for including your model @@ -713,8 +725,9 @@ setting `--project` in `beam_pipeline_args` when creating a pipeline. should replace the project id and the region value in this file with the correct values for your GCP project. ->**Note: You MUST set your GCP project ID and region in the `configs.py` file -before proceeding.** +!!! Note + You MUST set your GCP project ID and region in the `configs.py` file + before proceeding. **Change directory one level up.** Click the name of the directory above the file list. The name of the directory is the name of the pipeline which is @@ -739,16 +752,17 @@ pipeline as before and create a new execution run as we did in step 5 and 6. ### Try Dataflow Several -[TFX Components use Apache Beam](https://www.tensorflow.org/tfx/guide/beam) to +[TFX Components use Apache Beam](../../../guide/beam) to implement data-parallel pipelines, and it means that you can distribute data processing workloads using [Google Cloud Dataflow](https://cloud.google.com/dataflow/). In this step, we will set the Kubeflow orchestrator to use Dataflow as the data processing back-end for Apache Beam. 
->**Note:** If the Dataflow API is not already enabled, you can enable it using -the console, or from the CLI using this command (for example, in the Cloud -Shell): +!!! Note + If the Dataflow API is not already enabled, you can enable it using + the console, or from the CLI using this command (for example, in the Cloud + Shell): ```bash # Select your project: @@ -765,15 +779,16 @@ gcloud services list --available | grep Dataflow gcloud services enable dataflow.googleapis.com ``` -> **Note:** Execution speed may be limited by default -> [Google Compute Engine (GCE)](https://cloud.google.com/compute) quota. We -> recommend setting a sufficient quota for approximately 250 Dataflow VMs: **250 -> CPUs, 250 IP Addresses, and 62500 GB of Persistent Disk**. For more details, -> please see the [GCE Quota](https://cloud.google.com/compute/quotas) and -> [Dataflow Quota](https://cloud.google.com/dataflow/quotas) documentation. If -> you are blocked by IP Address quota, using a bigger -> [`worker_type`](https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options) -> will reduce the number of needed IPs. +!!! Note + Execution speed may be limited by default + [Google Compute Engine (GCE)](https://cloud.google.com/compute) quota. We + recommend setting a sufficient quota for approximately 250 Dataflow VMs: **250 + CPUs, 250 IP Addresses, and 62500 GB of Persistent Disk**. For more details, + please see the [GCE Quota](https://cloud.google.com/compute/quotas) and + [Dataflow Quota](https://cloud.google.com/dataflow/quotas) documentation. If + you are blocked by IP Address quota, using a bigger + [`worker_type`](https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options) + will reduce the number of needed IPs. **Double-click `pipeline` to change directory, and double-click to open `configs.py`**. 
Uncomment the definition of `GOOGLE_CLOUD_REGION`, and @@ -825,11 +840,12 @@ the same value as `CUSTOM_TFX_IMAGE` above. `kubeflow_runner.py`**. Uncomment `ai_platform_training_args` and `ai_platform_serving_args`. -> Note: If you receive a permissions error in the Training step, you may need to -> provide Storage Object Viewer permissions to the Cloud Machine Learning Engine -> (AI Platform Prediction & Training) service account. More information is -> available in the -> [Container Registry documentation](https://cloud.google.com/container-registry/docs/access-control#grant). +!!! Note + If you receive a permissions error in the Training step, you may need to + provide Storage Object Viewer permissions to the Cloud Machine Learning Engine + (AI Platform Prediction & Training) service account. More information is + available in the + [Container Registry documentation](https://cloud.google.com/container-registry/docs/access-control#grant). #### Update the pipeline and re-run it @@ -865,13 +881,13 @@ You need to modify the pipeline definition to accommodate your data. 1. Modify `BIG_QUERY_QUERY` in configs.py to your query statement. 1. Add features in `models`/`features.py`. 1. Modify `models`/`preprocessing.py` to - [transform input data for training](https://www.tensorflow.org/tfx/guide/transform). + [transform input data for training](../../../guide/transform). 1. Modify `models`/`keras`/`model.py` and `models`/`keras`/`constants.py` to - [describe your ML model](https://www.tensorflow.org/tfx/guide/trainer). + [describe your ML model](../../../guide/trainer). ### Learn more about Trainer -See [Trainer component guide](https://www.tensorflow.org/tfx/guide/trainer) for +See [Trainer component guide](../../../guide/trainer) for more details on Training pipelines. 
## Cleaning up diff --git a/docs/tutorials/tfx/components.ipynb b/docs/tutorials/tfx/components.ipynb index 74b9435523..49959bc8a8 100644 --- a/docs/tutorials/tfx/components.ipynb +++ b/docs/tutorials/tfx/components.ipynb @@ -48,19 +48,42 @@ "id": "LidV2qsXm4XC" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/components\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/components.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/components.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/components.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/download_logo_32px.png\"\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -385,7 +408,7 @@ "\n", "`ExampleGen` takes as input the path to your data source. In our case, this is the `_data_root` path that contains the downloaded CSV.\n", "\n", - "Note: In this notebook, we can instantiate components one-by-one and run them with `InteractiveContext.run()`. By contrast, in a production setting, we would specify all the components upfront in a `Pipeline` to pass to the orchestrator (see the [Building a TFX Pipeline Guide](https://www.tensorflow.org/tfx/guide/build_tfx_pipeline))." + "Note: In this notebook, we can instantiate components one-by-one and run them with `InteractiveContext.run()`. By contrast, in a production setting, we would specify all the components upfront in a `Pipeline` to pass to the orchestrator (see the [Building a TFX Pipeline Guide](../../../guide/build_tfx_pipeline))." ] }, { @@ -564,7 +587,7 @@ "source": [ "Each feature in your dataset shows up as a row in the schema table, alongside its properties. The schema also captures all the values that a categorical feature takes on, denoted as its domain.\n", "\n", - "To learn more about schemas, see [the SchemaGen documentation](https://www.tensorflow.org/tfx/guide/schemagen)." + "To learn more about schemas, see [the SchemaGen documentation](../../../guide/schemagen)." ] }, { @@ -633,7 +656,7 @@ "\n", "`Transform` will take as input the data from `ExampleGen`, the schema from `SchemaGen`, as well as a module that contains user-defined Transform code.\n", "\n", - "Let's see an example of user-defined Transform code below (for an introduction to the TensorFlow Transform APIs, [see the tutorial](https://www.tensorflow.org/tfx/tutorials/transform/simple)). First, we define a few constants for feature engineering:\n", + "Let's see an example of user-defined Transform code below (for an introduction to the TensorFlow Transform APIs, [see the tutorial](/tutorials/transform/simple)). 
First, we define a few constants for feature engineering:\n", "\n", "Note: The `%%writefile` cell magic will save the contents of the cell as a `.py` file on disk. This allows the `Transform` component to load your code as a module.\n", "\n" @@ -1407,7 +1430,7 @@ "source": [ "This visualization shows the same metrics, but computed at every feature value of `trip_start_hour` instead of on the entire evaluation set.\n", "\n", - "TensorFlow Model Analysis supports many other visualizations, such as Fairness Indicators and plotting a time series of model performance. To learn more, see [the tutorial](https://www.tensorflow.org/tfx/tutorials/model_analysis/tfma_basic)." + "TensorFlow Model Analysis supports many other visualizations, such as Fairness Indicators and plotting a time series of model performance. To learn more, see [the tutorial](/tutorials/model_analysis/tfma_basic)." ] }, { diff --git a/docs/tutorials/tfx/components_keras.ipynb b/docs/tutorials/tfx/components_keras.ipynb index 2b0e5edfb6..37d3843ae1 100644 --- a/docs/tutorials/tfx/components_keras.ipynb +++ b/docs/tutorials/tfx/components_keras.ipynb @@ -48,19 +48,42 @@ "id": "LidV2qsXm4XC" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! 
Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/components_keras\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/components_keras.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/components_keras.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/components_keras.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/download_logo_32px.png\"\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -371,7 +394,7 @@ "\n", "`ExampleGen` takes as input the path to your data source. In our case, this is the `_data_root` path that contains the downloaded CSV.\n", "\n", - "Note: In this notebook, we can instantiate components one-by-one and run them with `InteractiveContext.run()`. By contrast, in a production setting, we would specify all the components upfront in a `Pipeline` to pass to the orchestrator (see the [Building a TFX Pipeline Guide](https://www.tensorflow.org/tfx/guide/build_tfx_pipeline)).\n", + "Note: In this notebook, we can instantiate components one-by-one and run them with `InteractiveContext.run()`. By contrast, in a production setting, we would specify all the components upfront in a `Pipeline` to pass to the orchestrator (see the [Building a TFX Pipeline Guide](../../../guide/build_tfx_pipeline)).\n", "\n", "#### Enabling the Cache\n", "When using the `InteractiveContext` in a notebook to develop a pipeline you can control when individual components will cache their outputs. Set `enable_cache` to `True` when you want to reuse the previous output artifacts that the component generated. Set `enable_cache` to `False` when you want to recompute the output artifacts for a component, if you are making changes to the code for example." @@ -556,7 +579,7 @@ "source": [ "Each feature in your dataset shows up as a row in the schema table, alongside its properties. The schema also captures all the values that a categorical feature takes on, denoted as its domain.\n", "\n", - "To learn more about schemas, see [the SchemaGen documentation](https://www.tensorflow.org/tfx/guide/schemagen)." + "To learn more about schemas, see [the SchemaGen documentation](../../../guide/schemagen)." 
] }, { @@ -625,7 +648,7 @@ "\n", "`Transform` will take as input the data from `ExampleGen`, the schema from `SchemaGen`, as well as a module that contains user-defined Transform code.\n", "\n", - "Let's see an example of user-defined Transform code below (for an introduction to the TensorFlow Transform APIs, [see the tutorial](https://www.tensorflow.org/tfx/tutorials/transform/simple)). First, we define a few constants for feature engineering:\n", + "Let's see an example of user-defined Transform code below (for an introduction to the TensorFlow Transform APIs, [see the tutorial](/tutorials/transform/simple)). First, we define a few constants for feature engineering:\n", "\n", "Note: The `%%writefile` cell magic will save the contents of the cell as a `.py` file on disk. This allows the `Transform` component to load your code as a module.\n", "\n" @@ -1432,7 +1455,7 @@ "source": [ "This visualization shows the same metrics, but computed at every feature value of `trip_start_hour` instead of on the entire evaluation set.\n", "\n", - "TensorFlow Model Analysis supports many other visualizations, such as Fairness Indicators and plotting a time series of model performance. To learn more, see [the tutorial](https://www.tensorflow.org/tfx/tutorials/model_analysis/tfma_basic)." + "TensorFlow Model Analysis supports many other visualizations, such as Fairness Indicators and plotting a time series of model performance. To learn more, see [the tutorial](/tutorials/model_analysis/tfma_basic)." 
] }, { diff --git a/docs/tutorials/tfx/gcp/vertex_pipelines_bq.ipynb b/docs/tutorials/tfx/gcp/vertex_pipelines_bq.ipynb index c864e1ee40..bc35bdb777 100644 --- a/docs/tutorials/tfx/gcp/vertex_pipelines_bq.ipynb +++ b/docs/tutorials/tfx/gcp/vertex_pipelines_bq.ipynb @@ -45,17 +45,42 @@ "id": "_445qeKq8e3-" }, "source": [ - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_bq\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/gcp/vertex_pipelines_bq.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/gcp/vertex_pipelines_bq.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/gcp/vertex_pipelines_bq.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?q=download_url%3Dhttps%253A%252F%252Fraw.githubusercontent.com%252Ftensorflow%252Ftfx%252Fmaster%252Fdocs%252Ftutorials%252Ftfx%252Fgcp%252Fvertex_pipelines_bq.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eRun in Google Cloud Vertex AI 
Workbench\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e\n" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -69,7 +94,7 @@ "Google Cloud Vertex Pipelines.\n", "\n", "This notebook is based on the TFX pipeline we built in\n", - "[Simple TFX Pipeline for Vertex Pipelines Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple).\n", + "[Simple TFX Pipeline for Vertex Pipelines Tutorial](/tutorials/tfx/gcp/vertex_pipelines_simple).\n", "If you have not read that tutorial yet, you should read it before proceeding\n", "with this notebook.\n", "\n", @@ -98,7 +123,7 @@ "\n", "## Set up\n", "If you have completed\n", - "[Simple TFX Pipeline for Vertex Pipelines Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple),\n", + "[Simple TFX Pipeline for Vertex Pipelines Tutorial](/tutorials/tfx/gcp/vertex_pipelines_simple),\n", "you will have a working GCP project and a GCS bucket and that is all we need\n", "for this tutorial. Please read the preliminary tutorial first if you missed it." ] @@ -372,7 +397,7 @@ "## Create a pipeline\n", "\n", "TFX pipelines are defined using Python APIs as we did in\n", - "[Simple TFX Pipeline for Vertex Pipelines Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple).\n", + "[Simple TFX Pipeline for Vertex Pipelines Tutorial](/tutorials/tfx/gcp/vertex_pipelines_simple).\n", "We previously used `CsvExampleGen` which reads data from a CSV file. In this\n", "tutorial, we will use\n", "[`BigQueryExampleGen`](https://www.tensorflow.org/tfx/api_docs/python/tfx/v1/extensions/google_cloud_big_query/BigQueryExampleGen)\n", @@ -448,7 +473,7 @@ "### Write model code.\n", "\n", "We will use the same model code as in the\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple)." + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple)." 
] }, { @@ -687,7 +712,7 @@ "## Run the pipeline on Vertex Pipelines.\n", "\n", "We will use Vertex Pipelines to run the pipeline as we did in\n", - "[Simple TFX Pipeline for Vertex Pipelines Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple).\n" + "[Simple TFX Pipeline for Vertex Pipelines Tutorial](/tutorials/tfx/gcp/vertex_pipelines_simple).\n" ] }, { diff --git a/docs/tutorials/tfx/gcp/vertex_pipelines_simple.ipynb b/docs/tutorials/tfx/gcp/vertex_pipelines_simple.ipynb index 465637753a..3c63483712 100644 --- a/docs/tutorials/tfx/gcp/vertex_pipelines_simple.ipynb +++ b/docs/tutorials/tfx/gcp/vertex_pipelines_simple.ipynb @@ -45,17 +45,42 @@ "id": "_445qeKq8e3-" }, "source": [ - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/gcp/vertex_pipelines_simple.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/gcp/vertex_pipelines_simple.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/gcp/vertex_pipelines_simple.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload 
notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?q=download_url%3Dhttps%253A%252F%252Fraw.githubusercontent.com%252Ftensorflow%252Ftfx%252Fmaster%252Fdocs%252Ftutorials%252Ftfx%252Fgcp%252Fvertex_pipelines_simple.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eRun in Google Cloud Vertex AI Workbench\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e\n" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -66,7 +91,7 @@ "This notebook-based tutorial will create a simple TFX pipeline and run it using\n", "Google Cloud Vertex Pipelines. This notebook is based on the TFX pipeline\n", "we built in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "If you are not familiar with TFX and you have not read that tutorial yet, you\n", "should read it before proceeding with this notebook.\n", "\n", @@ -336,7 +361,7 @@ "We will use the same\n", "[Palmer Penguins dataset](https://allisonhorst.github.io/palmerpenguins/articles/intro.html)\n", "as\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "\n", "There are four numeric features in this dataset which were already normalized\n", "to have range [0,1]. We will build a classification model which predicts the\n", @@ -396,11 +421,11 @@ "TFX pipelines are defined using Python APIs. We will define a pipeline which\n", "consists of three components, CsvExampleGen, Trainer and Pusher. The pipeline\n", "and model definition is almost the same as\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "\n", "The only difference is that we don't need to set `metadata_connection_config`\n", "which is used to locate\n", - "[ML Metadata](https://www.tensorflow.org/tfx/guide/mlmd) database. Because\n", + "[ML Metadata](../../../guide/mlmd) database. 
Because\n", "Vertex Pipelines uses a managed metadata service, users don't need to care\n", "of it, and we don't need to specify the parameter.\n", "\n", @@ -417,7 +442,7 @@ "### Write model code.\n", "\n", "We will use the same model code as in the\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple)." + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple)." ] }, { @@ -650,7 +675,7 @@ "## Run the pipeline on Vertex Pipelines.\n", "\n", "We used `LocalDagRunner` which runs on local environment in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "TFX provides multiple orchestrators to run your pipeline. In this tutorial we\n", "will use the Vertex Pipelines together with the Kubeflow V2 dag runner." ] diff --git a/docs/tutorials/tfx/gcp/vertex_pipelines_vertex_training.ipynb b/docs/tutorials/tfx/gcp/vertex_pipelines_vertex_training.ipynb index ee7c821ea0..9773b9f317 100644 --- a/docs/tutorials/tfx/gcp/vertex_pipelines_vertex_training.ipynb +++ b/docs/tutorials/tfx/gcp/vertex_pipelines_vertex_training.ipynb @@ -45,17 +45,42 @@ "id": "_445qeKq8e3-" }, "source": [ - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_vertex_training\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/gcp/vertex_pipelines_vertex_training.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - 
"\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/gcp/vertex_pipelines_vertex_training.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/gcp/vertex_pipelines_vertex_training.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?q=download_url%3Dhttps%253A%252F%252Fraw.githubusercontent.com%252Ftensorflow%252Ftfx%252Fmaster%252Fdocs%252Ftutorials%252Ftfx%252Fgcp%252Fvertex_pipelines_vertex_training.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eRun in Google Cloud Vertex AI Workbench\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e\n" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -67,7 +92,7 @@ "ML model using Vertex AI Training service and publishes it to Vertex AI for serving.\n", "\n", "This notebook is based on the TFX pipeline we built in\n", - "[Simple TFX Pipeline for Vertex Pipelines Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple).\n", + "[Simple TFX Pipeline for Vertex Pipelines Tutorial](/tutorials/tfx/gcp/vertex_pipelines_simple).\n", "If you have not read that tutorial yet, you should read it before proceeding\n", "with this notebook.\n", "\n", @@ -98,7 +123,7 @@ "\n", "## Set up\n", "If you have completed\n", - "[Simple TFX Pipeline for Vertex Pipelines Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple),\n", + "[Simple TFX Pipeline for Vertex Pipelines Tutorial](/tutorials/tfx/gcp/vertex_pipelines_simple),\n", "you will have a working GCP project and a GCS bucket and that is all we need\n", "for this tutorial. Please read the preliminary tutorial first if you missed it." ] @@ -333,7 +358,7 @@ "We will use the same\n", "[Palmer Penguins dataset](https://allisonhorst.github.io/palmerpenguins/articles/intro.html)\n", "as\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "\n", "There are four numeric features in this dataset which were already normalized\n", "to have range [0,1]. We will build a classification model which predicts the\n", @@ -391,7 +416,7 @@ "## Create a pipeline\n", "\n", "Our pipeline will be very similar to the pipeline we created in\n", - "[Simple TFX Pipeline for Vertex Pipelines Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple).\n", + "[Simple TFX Pipeline for Vertex Pipelines Tutorial](/tutorials/tfx/gcp/vertex_pipelines_simple).\n", "The pipeline will consists of three components, CsvExampleGen, Trainer and\n", "Pusher. 
But we will use a special Trainer and Pusher component. The Trainer component will move\n", "training workloads to Vertex AI, and the Pusher component will publish the\n", @@ -421,7 +446,7 @@ "### Write model code.\n", "\n", "The model itself is almost similar to the model in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "\n", "We will add `_get_distribution_strategy()` function which creates a\n", "[TensorFlow distribution strategy](https://www.tensorflow.org/guide/distributed_training)\n", @@ -616,7 +641,7 @@ "\n", "We will define a function to create a TFX pipeline. It has the same three\n", "Components as in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple),\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple),\n", "but we use a `Trainer` and `Pusher` component in the GCP extension module.\n", "\n", "`tfx.extensions.google_cloud_ai_platform.Trainer` behaves like a regular\n", @@ -745,7 +770,7 @@ "## Run the pipeline on Vertex Pipelines.\n", "\n", "We will use Vertex Pipelines to run the pipeline as we did in\n", - "[Simple TFX Pipeline for Vertex Pipelines Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/gcp/vertex_pipelines_simple)." + "[Simple TFX Pipeline for Vertex Pipelines Tutorial](/tutorials/tfx/gcp/vertex_pipelines_simple)." 
] }, { diff --git a/docs/tutorials/tfx/gpt2_finetuning_and_conversion.ipynb b/docs/tutorials/tfx/gpt2_finetuning_and_conversion.ipynb index 35f8af7b4e..688268512f 100644 --- a/docs/tutorials/tfx/gpt2_finetuning_and_conversion.ipynb +++ b/docs/tutorials/tfx/gpt2_finetuning_and_conversion.ipynb @@ -64,24 +64,42 @@ "id": "uf3QpfdiIl7O" }, "source": [ - "# TFX Pipeline for Fine-Tuning a Large Language Model (LLM)\n", - "\n", - "\n", - "This codelab demonstrates how to leverage the power of Keras 3, KerasNLP and TFX pipelines to fine-tune a pre-trained GPT-2 model on the IMDb movie reviews dataset. The dataset that is used in this demo is [IMDB Reviews dataset](https://www.tensorflow.org/datasets/catalog/imdb_reviews).\n", - "\n", - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/gpt2_finetuning_and_conversion\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/gpt2_finetuning_and_conversion.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/gpt2_finetuning_and_conversion.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca 
href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/gpt2_finetuning_and_conversion.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e\n", - "\n" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -1383,7 +1401,7 @@ "source": [ "TFX supports multiple orchestrators to run pipelines. In this tutorial we will use LocalDagRunner which is included in the TFX Python package and runs pipelines on local environment. We often call TFX pipelines \"DAGs\" which stands for directed acyclic graph.\n", "\n", - "LocalDagRunner provides fast iterations for development and debugging. TFX also supports other orchestrators including Kubeflow Pipelines and Apache Airflow which are suitable for production use cases. See [TFX on Cloud AI Platform Pipelines](https://www.tensorflow.org/tfx/tutorials/tfx/cloud-ai-platform-pipelines) or [TFX Airflow](https://www.tensorflow.org/tfx/tutorials/tfx/airflow_workshop) Tutorial to learn more about other orchestration systems.\n", + "LocalDagRunner provides fast iterations for development and debugging. TFX also supports other orchestrators including Kubeflow Pipelines and Apache Airflow which are suitable for production use cases. See [TFX on Cloud AI Platform Pipelines](/tutorials/tfx/cloud-ai-platform-pipelines) or [TFX Airflow](/tutorials/tfx/airflow_workshop) Tutorial to learn more about other orchestration systems.\n", "\n", "Now we create a LocalDagRunner and pass a Pipeline object created from the function we already defined. The pipeline runs directly and you can see logs for the progress of the pipeline including ML model training." 
] diff --git a/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/examplegen1.png b/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/examplegen1.png new file mode 120000 index 0000000000..1a26a5688c --- /dev/null +++ b/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/examplegen1.png @@ -0,0 +1 @@ +../../../../../tfx/examples/airflow_workshop/taxi/notebooks/img/examplegen1.png \ No newline at end of file diff --git a/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/examplegen2.png b/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/examplegen2.png new file mode 120000 index 0000000000..789aab9f09 --- /dev/null +++ b/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/examplegen2.png @@ -0,0 +1 @@ +../../../../../tfx/examples/airflow_workshop/taxi/notebooks/img/examplegen2.png \ No newline at end of file diff --git a/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/transform.png b/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/transform.png new file mode 120000 index 0000000000..9391389e98 --- /dev/null +++ b/docs/tutorials/tfx/images/cloud-ai-platform-pipelines/transform.png @@ -0,0 +1 @@ +../../../../../tfx/examples/airflow_workshop/taxi/notebooks/img/transform.png \ No newline at end of file diff --git a/docs/tutorials/tfx/neural_structured_learning.ipynb b/docs/tutorials/tfx/neural_structured_learning.ipynb index 1ba25acf08..6011f258c3 100644 --- a/docs/tutorials/tfx/neural_structured_learning.ipynb +++ b/docs/tutorials/tfx/neural_structured_learning.ipynb @@ -50,26 +50,42 @@ "id": "vyAF26z9IDoq" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! 
Just click \"Run in Google Colab\".\n", - "\n", - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/neural_structured_learning\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/neural_structured_learning.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/neural_structured_learning.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/neural_structured_learning.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/hub_logo_32px.png\" /\u003eSee TF Hub model\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", diff --git a/docs/tutorials/tfx/penguin_simple.ipynb b/docs/tutorials/tfx/penguin_simple.ipynb index 52e4a54df6..a9339e295d 100644 --- a/docs/tutorials/tfx/penguin_simple.ipynb +++ b/docs/tutorials/tfx/penguin_simple.ipynb @@ -1,648 +1,685 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "penguin_simple.ipynb", - "provenance": [], - "collapsed_sections": [ - "DjUA6S30k52h" - ], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "DjUA6S30k52h" - }, - "source": [ - "##### Copyright 2021 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "SpNWyqewk8fE" - }, - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6x1ypzczQCwy" - }, - "source": [ - "# Simple TFX Pipeline Tutorial using Penguin dataset\n", - "\n", - "***A Short tutorial to run a simple TFX pipeline.***" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HU9YYythm0dx" - }, - "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! 
Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/penguin_simple.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/penguin_simple.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/penguin_simple.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_VuwrlnvQJ5k" - }, - "source": [ - "In this notebook-based tutorial, we will create and run a TFX pipeline\n", - "for a simple classification model.\n", - "The pipeline will consist of three essential TFX components: ExampleGen,\n", - "Trainer and Pusher. The pipeline includes the most minimal ML workflow like\n", - "importing data, training a model and exporting the trained model.\n", - "\n", - "Please see\n", - "[Understanding TFX Pipelines](https://www.tensorflow.org/tfx/guide/understanding_tfx_pipelines)\n", - "to learn more about various concepts in TFX." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fmgi8ZvQkScg" - }, - "source": [ - "## Set Up\n", - "We first need to install the TFX Python package and download\n", - "the dataset which we will use for our model.\n", - "\n", - "### Upgrade Pip\n", - "\n", - "To avoid upgrading Pip in a system when running locally,\n", - "check to make sure that we are running in Colab.\n", - "Local systems can of course be upgraded separately." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "as4OTe2ukSqm" - }, - "source": [ - "try:\n", - " import colab\n", - " !pip install --upgrade pip\n", - "except:\n", - " pass" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MZOYTt1RW4TK" - }, - "source": [ - "### Install TFX\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "iyQtljP-qPHY" - }, - "source": [ - "!pip install -U tfx" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EwT0nov5QO1M" - }, - "source": [ - "### Did you restart the runtime?\n", - "\n", - "If you are using Google Colab, the first time that you run\n", - "the cell above, you must restart the runtime by clicking\n", - "above \"RESTART RUNTIME\" button or using \"Runtime \u003e Restart\n", - "runtime ...\" menu. This is because of the way that Colab\n", - "loads packages." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BDnPgN8UJtzN" - }, - "source": [ - "Check the TensorFlow and TFX versions." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6jh7vKSRqPHb" - }, - "source": [ - "import tensorflow as tf\n", - "print('TensorFlow version: {}'.format(tf.__version__))\n", - "from tfx import v1 as tfx\n", - "print('TFX version: {}'.format(tfx.__version__))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aDtLdSkvqPHe" - }, - "source": [ - "### Set up variables\n", - "\n", - "There are some variables used to define a pipeline. You can customize these\n", - "variables as you want. By default all output from the pipeline will be\n", - "generated under the current directory." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "EcUseqJaE2XN" - }, - "source": [ - "import os\n", - "\n", - "PIPELINE_NAME = \"penguin-simple\"\n", - "\n", - "# Output directory to store artifacts generated from the pipeline.\n", - "PIPELINE_ROOT = os.path.join('pipelines', PIPELINE_NAME)\n", - "# Path to a SQLite DB file to use as an MLMD storage.\n", - "METADATA_PATH = os.path.join('metadata', PIPELINE_NAME, 'metadata.db')\n", - "# Output directory where created models from the pipeline will be exported.\n", - "SERVING_MODEL_DIR = os.path.join('serving_model', PIPELINE_NAME)\n", - "\n", - "from absl import logging\n", - "logging.set_verbosity(logging.INFO) # Set default logging level." - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8F2SRwRLSYGa" - }, - "source": [ - "### Prepare example data\n", - "We will download the example dataset for use in our TFX pipeline. 
The dataset we\n", - "are using is\n", - "[Palmer Penguins dataset](https://allisonhorst.github.io/palmerpenguins/articles/intro.html)\n", - "which is also used in other\n", - "[TFX examples](https://github.com/tensorflow/tfx/tree/master/tfx/examples/penguin).\n", - "\n", - "There are four numeric features in this dataset:\n", - "\n", - "- culmen_length_mm\n", - "- culmen_depth_mm\n", - "- flipper_length_mm\n", - "- body_mass_g\n", - "\n", - "All features were already normalized to have range [0,1]. We will build a\n", - "classification model which predicts the `species` of penguins." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "11J7XiCq6AFP" - }, - "source": [ - "Because TFX ExampleGen reads inputs from a directory, we need to create a\n", - "directory and copy dataset to it." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4fxMs6u86acP" - }, - "source": [ - "import urllib.request\n", - "import tempfile\n", - "\n", - "DATA_ROOT = tempfile.mkdtemp(prefix='tfx-data') # Create a temporary directory.\n", - "_data_url = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/penguin/data/labelled/penguins_processed.csv'\n", - "_data_filepath = os.path.join(DATA_ROOT, \"data.csv\")\n", - "urllib.request.urlretrieve(_data_url, _data_filepath)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ASpoNmxKSQjI" - }, - "source": [ - "Take a quick look at the CSV file." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "-eSz28UDSnlG" - }, - "source": [ - "!head {_data_filepath}" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OTtQNq1DdVvG" - }, - "source": [ - "You should be able to see five values. `species` is one of 0, 1 or 2, and all\n", - "other features should have values between 0 and 1." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nH6gizcpSwWV" - }, - "source": [ - "## Create a pipeline\n", - "\n", - "TFX pipelines are defined using Python APIs. We will define a pipeline which\n", - "consists of following three components.\n", - "- CsvExampleGen: Reads in data files and convert them to TFX internal format\n", - "for further processing. There are multiple\n", - "[ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen)s for various\n", - "formats. In this tutorial, we will use CsvExampleGen which takes CSV file input.\n", - "- Trainer: Trains an ML model.\n", - "[Trainer component](https://www.tensorflow.org/tfx/guide/trainer) requires a\n", - "model definition code from users. You can use TensorFlow APIs to specify how to\n", - "train a model and save it in a _saved_model_ format.\n", - "- Pusher: Copies the trained model outside of the TFX pipeline.\n", - "[Pusher component](https://www.tensorflow.org/tfx/guide/pusher) can be thought\n", - "of as a deployment process of the trained ML model.\n", - "\n", - "Before actually define the pipeline, we need to write a model code for the\n", - "Trainer component first." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lOjDv93eS5xV" - }, - "source": [ - "### Write model training code\n", - "\n", - "We will create a simple DNN model for classification using TensorFlow Keras\n", - "API. This model training code will be saved to a separate file.\n", - "\n", - "In this tutorial we will use\n", - "[Generic Trainer](https://www.tensorflow.org/tfx/guide/trainer#generic_trainer)\n", - "of TFX which support Keras-based models. You need to write a Python file\n", - "containing `run_fn` function, which is the entrypoint for the `Trainer`\n", - "component." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "aES7Hv5QTDK3" - }, - "source": [ - "_trainer_module_file = 'penguin_trainer.py'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Gnc67uQNTDfW" - }, - "source": [ - "%%writefile {_trainer_module_file}\n", - "\n", - "from typing import List\n", - "from absl import logging\n", - "import tensorflow as tf\n", - "from tensorflow import keras\n", - "from tensorflow_transform.tf_metadata import schema_utils\n", - "\n", - "from tfx import v1 as tfx\n", - "from tfx_bsl.public import tfxio\n", - "from tensorflow_metadata.proto.v0 import schema_pb2\n", - "\n", - "_FEATURE_KEYS = [\n", - " 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g'\n", - "]\n", - "_LABEL_KEY = 'species'\n", - "\n", - "_TRAIN_BATCH_SIZE = 20\n", - "_EVAL_BATCH_SIZE = 10\n", - "\n", - "# Since we're not generating or creating a schema, we will instead create\n", - "# a feature spec. Since there are a fairly small number of features this is\n", - "# manageable for this dataset.\n", - "_FEATURE_SPEC = {\n", - " **{\n", - " feature: tf.io.FixedLenFeature(shape=[1], dtype=tf.float32)\n", - " for feature in _FEATURE_KEYS\n", - " },\n", - " _LABEL_KEY: tf.io.FixedLenFeature(shape=[1], dtype=tf.int64)\n", - "}\n", - "\n", - "\n", - "def _input_fn(file_pattern: List[str],\n", - " data_accessor: tfx.components.DataAccessor,\n", - " schema: schema_pb2.Schema,\n", - " batch_size: int = 200) -\u003e tf.data.Dataset:\n", - " \"\"\"Generates features and label for training.\n", - "\n", - " Args:\n", - " file_pattern: List of paths or patterns of input tfrecord files.\n", - " data_accessor: DataAccessor for converting input to RecordBatch.\n", - " schema: schema of the input data.\n", - " batch_size: representing the number of consecutive elements of returned\n", - " dataset to combine in a single batch\n", - "\n", - " Returns:\n", - " A dataset that contains (features, indices) 
tuple where features is a\n", - " dictionary of Tensors, and indices is a single Tensor of label indices.\n", - " \"\"\"\n", - " return data_accessor.tf_dataset_factory(\n", - " file_pattern,\n", - " tfxio.TensorFlowDatasetOptions(\n", - " batch_size=batch_size, label_key=_LABEL_KEY),\n", - " schema=schema).repeat()\n", - "\n", - "\n", - "def _build_keras_model() -\u003e tf.keras.Model:\n", - " \"\"\"Creates a DNN Keras model for classifying penguin data.\n", - "\n", - " Returns:\n", - " A Keras Model.\n", - " \"\"\"\n", - " # The model below is built with Functional API, please refer to\n", - " # https://www.tensorflow.org/guide/keras/overview for all API options.\n", - " inputs = [keras.layers.Input(shape=(1,), name=f) for f in _FEATURE_KEYS]\n", - " d = keras.layers.concatenate(inputs)\n", - " for _ in range(2):\n", - " d = keras.layers.Dense(8, activation='relu')(d)\n", - " outputs = keras.layers.Dense(3)(d)\n", - "\n", - " model = keras.Model(inputs=inputs, outputs=outputs)\n", - " model.compile(\n", - " optimizer=keras.optimizers.Adam(1e-2),\n", - " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", - " metrics=[keras.metrics.SparseCategoricalAccuracy()])\n", - "\n", - " model.summary(print_fn=logging.info)\n", - " return model\n", - "\n", - "\n", - "# TFX Trainer will call this function.\n", - "def run_fn(fn_args: tfx.components.FnArgs):\n", - " \"\"\"Train the model based on given args.\n", - "\n", - " Args:\n", - " fn_args: Holds args used to train the model as name/value pairs.\n", - " \"\"\"\n", - "\n", - " # This schema is usually either an output of SchemaGen or a manually-curated\n", - " # version provided by pipeline author. A schema can also derived from TFT\n", - " # graph if a Transform component is used. 
In the case when either is missing,\n", - " # `schema_from_feature_spec` could be used to generate schema from very simple\n", - " # feature_spec, but the schema returned would be very primitive.\n", - " schema = schema_utils.schema_from_feature_spec(_FEATURE_SPEC)\n", - "\n", - " train_dataset = _input_fn(\n", - " fn_args.train_files,\n", - " fn_args.data_accessor,\n", - " schema,\n", - " batch_size=_TRAIN_BATCH_SIZE)\n", - " eval_dataset = _input_fn(\n", - " fn_args.eval_files,\n", - " fn_args.data_accessor,\n", - " schema,\n", - " batch_size=_EVAL_BATCH_SIZE)\n", - "\n", - " model = _build_keras_model()\n", - " model.fit(\n", - " train_dataset,\n", - " steps_per_epoch=fn_args.train_steps,\n", - " validation_data=eval_dataset,\n", - " validation_steps=fn_args.eval_steps)\n", - "\n", - " # The result of the training should be saved in `fn_args.serving_model_dir`\n", - " # directory.\n", - " model.save(fn_args.serving_model_dir, save_format='tf')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "blaw0rs-emEf" - }, - "source": [ - "Now you have completed all preparation steps to build a TFX pipeline." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w3OkNz3gTLwM" - }, - "source": [ - "### Write a pipeline definition\n", - "\n", - "We define a function to create a TFX pipeline. 
A `Pipeline` object\n", - "represents a TFX pipeline which can be run using one of the pipeline\n", - "orchestration systems that TFX supports.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "M49yYVNBTPd4" - }, - "source": [ - "def _create_pipeline(pipeline_name: str, pipeline_root: str, data_root: str,\n", - " module_file: str, serving_model_dir: str,\n", - " metadata_path: str) -\u003e tfx.dsl.Pipeline:\n", - " \"\"\"Creates a three component penguin pipeline with TFX.\"\"\"\n", - " # Brings data into the pipeline.\n", - " example_gen = tfx.components.CsvExampleGen(input_base=data_root)\n", - "\n", - " # Uses user-provided Python function that trains a model.\n", - " trainer = tfx.components.Trainer(\n", - " module_file=module_file,\n", - " examples=example_gen.outputs['examples'],\n", - " train_args=tfx.proto.TrainArgs(num_steps=100),\n", - " eval_args=tfx.proto.EvalArgs(num_steps=5))\n", - "\n", - " # Pushes the model to a filesystem destination.\n", - " pusher = tfx.components.Pusher(\n", - " model=trainer.outputs['model'],\n", - " push_destination=tfx.proto.PushDestination(\n", - " filesystem=tfx.proto.PushDestination.Filesystem(\n", - " base_directory=serving_model_dir)))\n", - "\n", - " # Following three components will be included in the pipeline.\n", - " components = [\n", - " example_gen,\n", - " trainer,\n", - " pusher,\n", - " ]\n", - "\n", - " return tfx.dsl.Pipeline(\n", - " pipeline_name=pipeline_name,\n", - " pipeline_root=pipeline_root,\n", - " metadata_connection_config=tfx.orchestration.metadata\n", - " .sqlite_metadata_connection_config(metadata_path),\n", - " components=components)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mJbq07THU2GV" - }, - "source": [ - "## Run the pipeline\n", - "\n", - "TFX supports multiple orchestrators to run pipelines.\n", - "In this tutorial we will use `LocalDagRunner` which is included in the TFX\n", - "Python package and runs 
pipelines on local environment.\n", - "We often call TFX pipelines \"DAGs\" which stands for directed acyclic graph.\n", - "\n", - "`LocalDagRunner` provides fast iterations for development and debugging.\n", - "TFX also supports other orchestrators including Kubeflow Pipelines and Apache\n", - "Airflow which are suitable for production use cases.\n", - "\n", - "See\n", - "[TFX on Cloud AI Platform Pipelines](https://www.tensorflow.org/tfx/tutorials/tfx/cloud-ai-platform-pipelines)\n", - "or\n", - "[TFX Airflow Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/airflow_workshop)\n", - "to learn more about other orchestration systems." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7mp0AkmrPdUb" - }, - "source": [ - "Now we create a `LocalDagRunner` and pass a `Pipeline` object created from the\n", - "function we already defined.\n", - "\n", - "The pipeline runs directly and you can see logs for the progress of the pipeline including ML model training." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "fAtfOZTYWJu-" - }, - "source": [ - "tfx.orchestration.LocalDagRunner().run(\n", - " _create_pipeline(\n", - " pipeline_name=PIPELINE_NAME,\n", - " pipeline_root=PIPELINE_ROOT,\n", - " data_root=DATA_ROOT,\n", - " module_file=_trainer_module_file,\n", - " serving_model_dir=SERVING_MODEL_DIR,\n", - " metadata_path=METADATA_PATH))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ppERq0Mj6xvW" - }, - "source": [ - "You should see \"INFO:absl:Component Pusher is finished.\" at the end of the\n", - "logs if the pipeline finished successfully. Because `Pusher` component is the\n", - "last component of the pipeline.\n", - "\n", - "The pusher component pushes the trained model to the `SERVING_MODEL_DIR` which\n", - "is the `serving_model/penguin-simple` directory if you did not change the\n", - "variables in the previous steps. 
You can see the result from the file browser\n", - "in the left-side panel in Colab, or using the following command:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NTHROkqX6yHx" - }, - "source": [ - "# List files in created model directory.\n", - "!find {SERVING_MODEL_DIR}" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "08R8qvweThRf" - }, - "source": [ - "## Next steps\n", - "\n", - "You can find more resources on https://www.tensorflow.org/tfx/tutorials.\n", - "\n", - "Please see\n", - "[Understanding TFX Pipelines](https://www.tensorflow.org/tfx/guide/understanding_tfx_pipelines)\n", - "to learn more about various concepts in TFX.\n" - ] - } - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "DjUA6S30k52h" + }, + "source": [ + "##### Copyright 2021 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SpNWyqewk8fE" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6x1ypzczQCwy" + }, + "source": [ + "# Simple TFX Pipeline Tutorial using Penguin dataset\n", + "\n", + "***A Short tutorial to run a simple TFX pipeline.***" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HU9YYythm0dx" + }, + "source": [ + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_VuwrlnvQJ5k" + }, + "source": [ + "In this notebook-based tutorial, we will create and run a TFX pipeline\n", + "for a simple classification model.\n", + "The pipeline will consist of three essential TFX components: ExampleGen,\n", + "Trainer and Pusher. The pipeline includes the most minimal ML workflow like\n", + "importing data, training a model and exporting the trained model.\n", + "\n", + "Please see\n", + "[Understanding TFX Pipelines](../../../guide/understanding_tfx_pipelines)\n", + "to learn more about various concepts in TFX." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fmgi8ZvQkScg" + }, + "source": [ + "## Set Up\n", + "We first need to install the TFX Python package and download\n", + "the dataset which we will use for our model.\n", + "\n", + "### Upgrade Pip\n", + "\n", + "To avoid upgrading Pip in a system when running locally,\n", + "check to make sure that we are running in Colab.\n", + "Local systems can of course be upgraded separately." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "as4OTe2ukSqm" + }, + "outputs": [], + "source": [ + "try:\n", + " import colab\n", + " !pip install --upgrade pip\n", + "except:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MZOYTt1RW4TK" + }, + "source": [ + "### Install TFX\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iyQtljP-qPHY" + }, + "outputs": [], + "source": [ + "!pip install -U tfx" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EwT0nov5QO1M" + }, + "source": [ + "### Did you restart the runtime?\n", + "\n", + "If you are using Google Colab, the first time that you run\n", + "the cell above, you must restart the runtime by clicking\n", + "above \"RESTART RUNTIME\" button or using \"Runtime > Restart\n", + "runtime ...\" menu. 
This is because of the way that Colab\n", + "loads packages." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BDnPgN8UJtzN" + }, + "source": [ + "Check the TensorFlow and TFX versions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6jh7vKSRqPHb" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "print('TensorFlow version: {}'.format(tf.__version__))\n", + "from tfx import v1 as tfx\n", + "print('TFX version: {}'.format(tfx.__version__))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aDtLdSkvqPHe" + }, + "source": [ + "### Set up variables\n", + "\n", + "There are some variables used to define a pipeline. You can customize these\n", + "variables as you want. By default all output from the pipeline will be\n", + "generated under the current directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EcUseqJaE2XN" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "PIPELINE_NAME = \"penguin-simple\"\n", + "\n", + "# Output directory to store artifacts generated from the pipeline.\n", + "PIPELINE_ROOT = os.path.join('pipelines', PIPELINE_NAME)\n", + "# Path to a SQLite DB file to use as an MLMD storage.\n", + "METADATA_PATH = os.path.join('metadata', PIPELINE_NAME, 'metadata.db')\n", + "# Output directory where created models from the pipeline will be exported.\n", + "SERVING_MODEL_DIR = os.path.join('serving_model', PIPELINE_NAME)\n", + "\n", + "from absl import logging\n", + "logging.set_verbosity(logging.INFO) # Set default logging level." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8F2SRwRLSYGa" + }, + "source": [ + "### Prepare example data\n", + "We will download the example dataset for use in our TFX pipeline. 
The dataset we\n", + "are using is\n", + "[Palmer Penguins dataset](https://allisonhorst.github.io/palmerpenguins/articles/intro.html)\n", + "which is also used in other\n", + "[TFX examples](https://github.com/tensorflow/tfx/tree/master/tfx/examples/penguin).\n", + "\n", + "There are four numeric features in this dataset:\n", + "\n", + "- culmen_length_mm\n", + "- culmen_depth_mm\n", + "- flipper_length_mm\n", + "- body_mass_g\n", + "\n", + "All features were already normalized to have range [0,1]. We will build a\n", + "classification model which predicts the `species` of penguins." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "11J7XiCq6AFP" + }, + "source": [ + "Because TFX ExampleGen reads inputs from a directory, we need to create a\n", + "directory and copy dataset to it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4fxMs6u86acP" + }, + "outputs": [], + "source": [ + "import urllib.request\n", + "import tempfile\n", + "\n", + "DATA_ROOT = tempfile.mkdtemp(prefix='tfx-data') # Create a temporary directory.\n", + "_data_url = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/penguin/data/labelled/penguins_processed.csv'\n", + "_data_filepath = os.path.join(DATA_ROOT, \"data.csv\")\n", + "urllib.request.urlretrieve(_data_url, _data_filepath)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ASpoNmxKSQjI" + }, + "source": [ + "Take a quick look at the CSV file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-eSz28UDSnlG" + }, + "outputs": [], + "source": [ + "!head {_data_filepath}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OTtQNq1DdVvG" + }, + "source": [ + "You should be able to see five values. `species` is one of 0, 1 or 2, and all\n", + "other features should have values between 0 and 1." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nH6gizcpSwWV" + }, + "source": [ + "## Create a pipeline\n", + "\n", + "TFX pipelines are defined using Python APIs. We will define a pipeline which\n", + "consists of the following three components.\n", + "- CsvExampleGen: Reads in data files and converts them to TFX internal format\n", + "for further processing. There are multiple\n", + "[ExampleGen](../../../guide/examplegen)s for various\n", + "formats. In this tutorial, we will use CsvExampleGen which takes CSV file input.\n", + "- Trainer: Trains an ML model.\n", + "[Trainer component](../../../guide/trainer) requires\n", + "model definition code from users. You can use TensorFlow APIs to specify how to\n", + "train a model and save it in a _saved_model_ format.\n", + "- Pusher: Copies the trained model outside of the TFX pipeline.\n", + "[Pusher component](../../../guide/pusher) can be thought\n", + "of as a deployment process of the trained ML model.\n", + "\n", + "Before actually defining the pipeline, we need to write model code for the\n", + "Trainer component first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aES7Hv5QTDK3" + }, + "outputs": [], + "source": [ + "_trainer_module_file = 'penguin_trainer.py'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Gnc67uQNTDfW" + }, + "outputs": [], + "source": [ + "%%writefile {_trainer_module_file}\n", + "\n", + "from typing import List\n", + "from absl import logging\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow_transform.tf_metadata import schema_utils\n", + "\n", + "from tfx import v1 as tfx\n", + "from tfx_bsl.public import tfxio\n", + "from tensorflow_metadata.proto.v0 import schema_pb2\n", + "\n", + "_FEATURE_KEYS = [\n", + " 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g'\n", + "]\n", + "_LABEL_KEY = 'species'\n", + "\n", + "_TRAIN_BATCH_SIZE = 20\n", + "_EVAL_BATCH_SIZE = 10\n", + "\n", + "# Since we're not generating or creating a schema, we will instead create\n", + "# a feature spec. 
Since there are a fairly small number of features this is\n", + "# manageable for this dataset.\n", + "_FEATURE_SPEC = {\n", + " **{\n", + " feature: tf.io.FixedLenFeature(shape=[1], dtype=tf.float32)\n", + " for feature in _FEATURE_KEYS\n", + " },\n", + " _LABEL_KEY: tf.io.FixedLenFeature(shape=[1], dtype=tf.int64)\n", + "}\n", + "\n", + "\n", + "def _input_fn(file_pattern: List[str],\n", + " data_accessor: tfx.components.DataAccessor,\n", + " schema: schema_pb2.Schema,\n", + " batch_size: int = 200) -> tf.data.Dataset:\n", + " \"\"\"Generates features and label for training.\n", + "\n", + " Args:\n", + " file_pattern: List of paths or patterns of input tfrecord files.\n", + " data_accessor: DataAccessor for converting input to RecordBatch.\n", + " schema: schema of the input data.\n", + " batch_size: representing the number of consecutive elements of returned\n", + " dataset to combine in a single batch\n", + "\n", + " Returns:\n", + " A dataset that contains (features, indices) tuple where features is a\n", + " dictionary of Tensors, and indices is a single Tensor of label indices.\n", + " \"\"\"\n", + " return data_accessor.tf_dataset_factory(\n", + " file_pattern,\n", + " tfxio.TensorFlowDatasetOptions(\n", + " batch_size=batch_size, label_key=_LABEL_KEY),\n", + " schema=schema).repeat()\n", + "\n", + "\n", + "def _build_keras_model() -> tf.keras.Model:\n", + " \"\"\"Creates a DNN Keras model for classifying penguin data.\n", + "\n", + " Returns:\n", + " A Keras Model.\n", + " \"\"\"\n", + " # The model below is built with Functional API, please refer to\n", + " # https://www.tensorflow.org/guide/keras/overview for all API options.\n", + " inputs = [keras.layers.Input(shape=(1,), name=f) for f in _FEATURE_KEYS]\n", + " d = keras.layers.concatenate(inputs)\n", + " for _ in range(2):\n", + " d = keras.layers.Dense(8, activation='relu')(d)\n", + " outputs = keras.layers.Dense(3)(d)\n", + "\n", + " model = keras.Model(inputs=inputs, outputs=outputs)\n", + " 
model.compile(\n", + " optimizer=keras.optimizers.Adam(1e-2),\n", + " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.SparseCategoricalAccuracy()])\n", + "\n", + " model.summary(print_fn=logging.info)\n", + " return model\n", + "\n", + "\n", + "# TFX Trainer will call this function.\n", + "def run_fn(fn_args: tfx.components.FnArgs):\n", + " \"\"\"Train the model based on given args.\n", + "\n", + " Args:\n", + " fn_args: Holds args used to train the model as name/value pairs.\n", + " \"\"\"\n", + "\n", + " # This schema is usually either an output of SchemaGen or a manually-curated\n", + " # version provided by pipeline author. A schema can also derived from TFT\n", + " # graph if a Transform component is used. In the case when either is missing,\n", + " # `schema_from_feature_spec` could be used to generate schema from very simple\n", + " # feature_spec, but the schema returned would be very primitive.\n", + " schema = schema_utils.schema_from_feature_spec(_FEATURE_SPEC)\n", + "\n", + " train_dataset = _input_fn(\n", + " fn_args.train_files,\n", + " fn_args.data_accessor,\n", + " schema,\n", + " batch_size=_TRAIN_BATCH_SIZE)\n", + " eval_dataset = _input_fn(\n", + " fn_args.eval_files,\n", + " fn_args.data_accessor,\n", + " schema,\n", + " batch_size=_EVAL_BATCH_SIZE)\n", + "\n", + " model = _build_keras_model()\n", + " model.fit(\n", + " train_dataset,\n", + " steps_per_epoch=fn_args.train_steps,\n", + " validation_data=eval_dataset,\n", + " validation_steps=fn_args.eval_steps)\n", + "\n", + " # The result of the training should be saved in `fn_args.serving_model_dir`\n", + " # directory.\n", + " model.save(fn_args.serving_model_dir, save_format='tf')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "blaw0rs-emEf" + }, + "source": [ + "Now you have completed all preparation steps to build a TFX pipeline." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w3OkNz3gTLwM" + }, + "source": [ + "### Write a pipeline definition\n", + "\n", + "We define a function to create a TFX pipeline. A `Pipeline` object\n", + "represents a TFX pipeline which can be run using one of the pipeline\n", + "orchestration systems that TFX supports.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "M49yYVNBTPd4" + }, + "outputs": [], + "source": [ + "def _create_pipeline(pipeline_name: str, pipeline_root: str, data_root: str,\n", + " module_file: str, serving_model_dir: str,\n", + " metadata_path: str) -> tfx.dsl.Pipeline:\n", + " \"\"\"Creates a three component penguin pipeline with TFX.\"\"\"\n", + " # Brings data into the pipeline.\n", + " example_gen = tfx.components.CsvExampleGen(input_base=data_root)\n", + "\n", + " # Uses user-provided Python function that trains a model.\n", + " trainer = tfx.components.Trainer(\n", + " module_file=module_file,\n", + " examples=example_gen.outputs['examples'],\n", + " train_args=tfx.proto.TrainArgs(num_steps=100),\n", + " eval_args=tfx.proto.EvalArgs(num_steps=5))\n", + "\n", + " # Pushes the model to a filesystem destination.\n", + " pusher = tfx.components.Pusher(\n", + " model=trainer.outputs['model'],\n", + " push_destination=tfx.proto.PushDestination(\n", + " filesystem=tfx.proto.PushDestination.Filesystem(\n", + " base_directory=serving_model_dir)))\n", + "\n", + " # Following three components will be included in the pipeline.\n", + " components = [\n", + " example_gen,\n", + " trainer,\n", + " pusher,\n", + " ]\n", + "\n", + " return tfx.dsl.Pipeline(\n", + " pipeline_name=pipeline_name,\n", + " pipeline_root=pipeline_root,\n", + " metadata_connection_config=tfx.orchestration.metadata\n", + " .sqlite_metadata_connection_config(metadata_path),\n", + " components=components)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mJbq07THU2GV" + }, + "source": [ + "## Run the 
pipeline\n", + "\n", + "TFX supports multiple orchestrators to run pipelines.\n", + "In this tutorial we will use `LocalDagRunner` which is included in the TFX\n", + "Python package and runs pipelines on local environment.\n", + "We often call TFX pipelines \"DAGs\" which stands for directed acyclic graph.\n", + "\n", + "`LocalDagRunner` provides fast iterations for development and debugging.\n", + "TFX also supports other orchestrators including Kubeflow Pipelines and Apache\n", + "Airflow which are suitable for production use cases.\n", + "\n", + "See\n", + "[TFX on Cloud AI Platform Pipelines](/tutorials/tfx/cloud-ai-platform-pipelines)\n", + "or\n", + "[TFX Airflow Tutorial](/tutorials/tfx/airflow_workshop)\n", + "to learn more about other orchestration systems." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7mp0AkmrPdUb" + }, + "source": [ + "Now we create a `LocalDagRunner` and pass a `Pipeline` object created from the\n", + "function we already defined.\n", + "\n", + "The pipeline runs directly and you can see logs for the progress of the pipeline including ML model training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fAtfOZTYWJu-" + }, + "outputs": [], + "source": [ + "tfx.orchestration.LocalDagRunner().run(\n", + " _create_pipeline(\n", + " pipeline_name=PIPELINE_NAME,\n", + " pipeline_root=PIPELINE_ROOT,\n", + " data_root=DATA_ROOT,\n", + " module_file=_trainer_module_file,\n", + " serving_model_dir=SERVING_MODEL_DIR,\n", + " metadata_path=METADATA_PATH))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ppERq0Mj6xvW" + }, + "source": [ + "You should see \"INFO:absl:Component Pusher is finished.\" at the end of the\n", + "logs if the pipeline finished successfully. 
This is because the `Pusher` component is the\n", + "last component of the pipeline.\n", + "\n", + "The pusher component pushes the trained model to the `SERVING_MODEL_DIR` which\n", + "is the `serving_model/penguin-simple` directory if you did not change the\n", + "variables in the previous steps. You can see the result from the file browser\n", + "in the left-side panel in Colab, or using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NTHROkqX6yHx" + }, + "outputs": [], + "source": [ + "# List files in created model directory.\n", + "!find {SERVING_MODEL_DIR}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08R8qvweThRf" + }, + "source": [ + "## Next steps\n", + "\n", + "You can find more resources on https://www.tensorflow.org/tfx/tutorials.\n", + "\n", + "Please see\n", + "[Understanding TFX Pipelines](../../../guide/understanding_tfx_pipelines)\n", + "to learn more about various concepts in TFX.\n" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "DjUA6S30k52h" + ], + "name": "penguin_simple.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/docs/tutorials/tfx/penguin_template.ipynb b/docs/tutorials/tfx/penguin_template.ipynb index 9ce1babc6b..4d343e35cc 100644 --- a/docs/tutorials/tfx/penguin_template.ipynb +++ b/docs/tutorials/tfx/penguin_template.ipynb @@ -48,19 +48,42 @@ "id": "ZQmvgl9nsqPW" }, "source": [ - "Note: We recommend running this tutorial on Google Cloud [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench). 
[Go to Vertex AI Workbench](https://console.cloud.google.com/vertex-ai/workbench).\n", - "\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/penguin_template\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/penguin_template.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/penguin_template.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/penguin_template.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -312,7 +335,7 @@ "By default, the template only includes standard TFX components. If you need\n", "some customized actions, you can create custom components for your pipeline.\n", "Please see\n", - "[TFX custom component guide](https://www.tensorflow.org/tfx/guide/understanding_custom_components)\n", + "[TFX custom component guide](../../../guide/understanding_custom_components)\n", "for the detail." ] }, @@ -414,7 +437,7 @@ "### Choose an ExampleGen\n", "\n", "Your data can be stored anywhere your pipeline can access, on either a local or distributed filesystem, or a query-able system. TFX provides various\n", - "[`ExampleGen` components](https://www.tensorflow.org/tfx/guide/examplegen)\n", + "[`ExampleGen` components](../../../guide/examplegen)\n", "to bring your data into a TFX pipeline. You can choose one from following\n", "example generating components.\n", "\n", @@ -436,7 +459,7 @@ "You can also create your own ExampleGen, for example, tfx includes\n", "[a custom ExecampleGen which uses Presto](https://github.com/tensorflow/tfx/tree/master/tfx/examples/custom_components/presto_example_gen)\n", "as a data source. See\n", - "[the guide](https://www.tensorflow.org/tfx/guide/examplegen#custom_examplegen)\n", + "[the guide](../../../guide/examplegen#custom_examplegen)\n", "for more information on how to use and develop custom executors.\n", "\n", "Once you decide which ExampleGen to use, you will need to modify the pipeline\n", @@ -475,7 +498,7 @@ "\n", "1. Replace existing CsvExampleGen to your ExampleGen class in\n", "`pipeline/pipeline.py`. Each ExampleGen class has different signature.\n", - "Please see [ExampleGen component guide](https://www.tensorflow.org/tfx/guide/examplegen) for more detail. Don't forget to import required modules with\n", + "Please see [ExampleGen component guide](../../../guide/examplegen) for more detail. 
Don't forget to import required modules with\n", "`import` statements in `pipeline/pipeline.py`." ] }, @@ -529,7 +552,7 @@ }, "source": [ "TFX pipeline produces two kinds of output, artifacts and a\n", - "[metadata DB(MLMD)](https://www.tensorflow.org/tfx/guide/mlmd) which contains\n", + "[metadata DB(MLMD)](../../../guide/mlmd) which contains\n", "metadata of artifacts and pipeline executions. The location to the output is\n", "defined in `local_runner.py`. By default, artifacts are stored under\n", "`tfx_pipeline_output` directory and metadata is stored as an sqlite database\n", @@ -701,7 +724,7 @@ "\n", "In this tutorial, we will use visualzation helper methods in TFX which use TFDV\n", "internally to show the visualization. Please see\n", - "[TFX components tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/components_keras)\n", + "[TFX components tutorial](/tutorials/tfx/components_keras)\n", "to learn more about each component." ] }, @@ -736,7 +759,7 @@ "source": [ "By default, TFX ExampleGen divides examples into two splits, *train* and\n", "*eval*, but you can\n", - "[adjust your split configuration](https://www.tensorflow.org/tfx/guide/examplegen#span_version_and_split)." + "[adjust your split configuration](../../../guide/examplegen#span_version_and_split)." ] }, { @@ -799,7 +822,7 @@ "source": [ "This schema is automatically inferred from the output of StatisticsGen.\n", "We will use this generated schema in this tutorial, but you also can\n", - "[modify and customize the schema](https://www.tensorflow.org/tfx/guide/statsgen#creating_a_curated_schema)." + "[modify and customize the schema](../../../guide/statsgen#creating_a_curated_schema)." ] }, { @@ -858,7 +881,7 @@ "\n", "In this step, you will define various feature engineering job which will be\n", "used by `Transform` component in the pipeline. 
See\n", - "[Transform component guide](https://www.tensorflow.org/tfx/guide/transform)\n", + "[Transform component guide](../../../guide/transform)\n", "for more information.\n", "\n", "This is only necessary if you training code requires additional feature(s)\n", @@ -1001,7 +1024,7 @@ "## Step 4. Train your model with Trainer component.\n", "\n", "We will build a ML model using `Trainer` component. See\n", - "[Trainer component guide](https://www.tensorflow.org/tfx/guide/trainer)\n", + "[Trainer component guide](../../../guide/trainer)\n", "for more information. You need to provide your model code to the Trainer\n", "component.\n", "\n", @@ -1011,7 +1034,7 @@ "`Trainer` component. It means that `run_fn()` function in `models/model.py`\n", "will be called when `Trainer` component runs. You can see the code to construct\n", "a simple DNN model using `keras` API in given code. See\n", - "[TensorFlow 2.x in TFX](https://www.tensorflow.org/tfx/guide/keras)\n", + "[TensorFlow 2.x in TFX](../../../guide/keras)\n", "guide for more information about using keras API in TFX.\n", "\n", "In this `run_fn`, you should build a model and save it to a directory pointed\n", @@ -1109,9 +1132,9 @@ "id": "5DID2nzH-IR7" }, "source": [ - "[`Evaluator`](https://www.tensorflow.org/tfx/guide/evaluator) component\n", + "[`Evaluator`](../../../guide/evaluator) component\n", "continuously evaluate every built model from `Trainer`, and\n", - "[`Pusher`](https://www.tensorflow.org/tfx/guide/pusher) copies the model to\n", + "[`Pusher`](../../../guide/pusher) copies the model to\n", "a predefined location in the file system or even to\n", "[Google Cloud AI Platform Models](https://console.cloud.google.com/ai-platform/models).\n", "\n", @@ -1127,7 +1150,7 @@ "because we are solving a multi category classification problem. You also need\n", "to specify `tfma.SliceSpec` to analyze your model for specific slices. 
For more\n", "detail, see\n", - "[Evaluator component guide](https://www.tensorflow.org/tfx/guide/evaluator).\n", + "[Evaluator component guide](../../../guide/evaluator).\n", "1. Uncomment `# components.append(evaluator)` to add the component to the\n", "pipeline.\n", "\n", @@ -1222,13 +1245,13 @@ "### Adds Pusher component to the pipeline.\n", "\n", "If the model looks promising, we need to publish the model.\n", - "[Pusher component](https://www.tensorflow.org/tfx/guide/pusher)\n", + "[Pusher component](../../../guide/pusher)\n", "can publish the model to a location in the filesystem or to GCP AI Platform\n", "Models using\n", "[a custom executor](https://github.com/tensorflow/tfx/blob/master/tfx/extensions/google_cloud_ai_platform/pusher/executor.py).\n", "\n", "`Evaluator` component continuously evaluate every built model from `Trainer`,\n", - "and [`Pusher`](https://www.tensorflow.org/tfx/guide/pusher) copies the model to\n", + "and [`Pusher`](../../../guide/pusher) copies the model to\n", "a predefined location in the file system or even to\n", "[Google Cloud AI Platform Models](https://console.cloud.google.com/ai-platform/models).\n", "\n", @@ -1330,7 +1353,7 @@ "source": [ "You also need a Kubeflow Pipelines cluster to run the pipeline. Please\n", "follow Step 1 and 2 in\n", - "[TFX on Cloud AI Platform Pipelines tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/cloud-ai-platform-pipelines).\n", + "[TFX on Cloud AI Platform Pipelines tutorial](/tutorials/tfx/cloud-ai-platform-pipelines).\n", "\n", "When your cluster is ready, open the pipeline dashboard by clicking\n", "*Open Pipelines Dashboard* in the\n", @@ -1494,7 +1517,7 @@ "source": [ "If you are interested in running your pipeline on Kubeflow Pipelines,\n", "find more instructions in\n", - "[TFX on Cloud AI Platform Pipelines tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/cloud-ai-platform-pipelines)." 
+ "[TFX on Cloud AI Platform Pipelines tutorial](/tutorials/tfx/cloud-ai-platform-pipelines)." ] }, { diff --git a/docs/tutorials/tfx/penguin_tfdv.ipynb b/docs/tutorials/tfx/penguin_tfdv.ipynb index 09fb11a0af..4a707b26d6 100644 --- a/docs/tutorials/tfx/penguin_tfdv.ipynb +++ b/docs/tutorials/tfx/penguin_tfdv.ipynb @@ -45,18 +45,42 @@ "id": "HU9YYythm0dx" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/penguin_tfdv\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/penguin_tfdv.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/penguin_tfdv.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/penguin_tfdv.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -67,7 +91,7 @@ "In this notebook-based tutorial, we will create and run TFX pipelines\n", "to validate input data and create an ML model. This notebook is based on the\n", "TFX pipeline we built in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "If you have not read that tutorial yet, you should read it before proceeding\n", "with this notebook.\n", "\n", @@ -93,10 +117,10 @@ "The three new components, StatisticsGen, SchemaGen and ExampleValidator, are\n", "TFX components for data analysis and validation, and they are implemented\n", "using the\n", - "[TensorFlow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv) library.\n", + "[TensorFlow Data Validation](../../../guide/tfdv) library.\n", "\n", "Please see\n", - "[Understanding TFX Pipelines](https://www.tensorflow.org/tfx/guide/understanding_tfx_pipelines)\n", + "[Understanding TFX Pipelines](../../../guide/understanding_tfx_pipelines)\n", "to learn more about various concepts in TFX." 
] }, @@ -328,16 +352,16 @@ "be used for training and example validation in later tasks.\n", "\n", "In addition to `CsvExampleGen` which is used in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple),\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple),\n", "we will use `StatisticsGen` and `SchemaGen`:\n", "\n", - "- [StatisticsGen](https://www.tensorflow.org/tfx/guide/statsgen) calculates\n", + "- [StatisticsGen](../../../guide/statsgen) calculates\n", "statistics for the dataset.\n", - "- [SchemaGen](https://www.tensorflow.org/tfx/guide/schemagen) examines the\n", + "- [SchemaGen](../../../guide/schemagen) examines the\n", "statistics and creates an initial data schema.\n", "\n", "See the guides for each component or\n", - "[TFX components tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/components_keras)\n", + "[TFX components tutorial](/tutorials/tfx/components_keras)\n", "to learn more on these components." ] }, @@ -448,7 +472,7 @@ "source": [ "As explained in the previous tutorial, a TFX pipeline produces two kinds of\n", "outputs, artifacts and a\n", - "[metadata DB(MLMD)](https://www.tensorflow.org/tfx/guide/mlmd) which contains\n", + "[metadata DB(MLMD)](../../../guide/mlmd) which contains\n", "metadata of artifacts and pipeline executions. We defined the location of \n", "these outputs in the above cells. 
By default, artifacts are stored under\n", "the `pipelines` directory and metadata is stored as a sqlite database\n", @@ -700,12 +724,12 @@ "## Validate input examples and train an ML model\n", "\n", "We will go back to the pipeline that we created in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple),\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple),\n", "to train an ML model and use the generated schema for writing the model\n", "training code.\n", "\n", "We will also add an\n", - "[ExampleValidator](https://www.tensorflow.org/tfx/guide/exampleval)\n", + "[ExampleValidator](../../../guide/exampleval)\n", "component which will look for anomalies and missing values in the incoming\n", "dataset with respect to the schema.\n" ] @@ -719,7 +743,7 @@ "### Write model training code\n", "\n", "We need to write the model code as we did in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "\n", "The model itself is the same as in the previous tutorial, but this time we will\n", "use the schema generated from the previous pipeline instead of specifying\n", @@ -1063,7 +1087,7 @@ "You can find more resources on https://www.tensorflow.org/tfx/tutorials.\n", "\n", "Please see\n", - "[Understanding TFX Pipelines](https://www.tensorflow.org/tfx/guide/understanding_tfx_pipelines)\n", + "[Understanding TFX Pipelines](../../../guide/understanding_tfx_pipelines)\n", "to learn more about various concepts in TFX.\n", "\n" ] diff --git a/docs/tutorials/tfx/penguin_tfma.ipynb b/docs/tutorials/tfx/penguin_tfma.ipynb index 706ac1e546..2ee9524917 100644 --- a/docs/tutorials/tfx/penguin_tfma.ipynb +++ b/docs/tutorials/tfx/penguin_tfma.ipynb @@ -62,18 +62,42 @@ "id": "HU9YYythm0dx" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! 
Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/penguin_tfma\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/penguin_tfma.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/penguin_tfma.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/penguin_tfma.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -84,7 +108,7 @@ "In this notebook-based tutorial, we will create and run a TFX pipeline\n", "which creates a simple classification model and analyzes its performance\n", "across multiple runs. This notebook is based on the TFX pipeline we built in\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "If you have not read that tutorial yet, you should read it before proceeding\n", "with this notebook.\n", "\n", @@ -97,10 +121,10 @@ "tutorial. The Evaluator component performs deep analysis for your models and\n", "compare the new model against a baseline to determine they are \"good enough\".\n", "It is implemented using the\n", - "[TensorFlow Model Analysis](https://www.tensorflow.org/tfx/guide/tfma) library.\n", + "[TensorFlow Model Analysis](../../../guide/tfma) library.\n", "\n", "Please see\n", - "[Understanding TFX Pipelines](https://www.tensorflow.org/tfx/guide/understanding_tfx_pipelines)\n", + "[Understanding TFX Pipelines](../../../guide/understanding_tfx_pipelines)\n", "to learn more about various concepts in TFX." ] }, @@ -282,9 +306,9 @@ "source": [ "## Create a pipeline\n", "\n", - "We will add an [`Evaluator`](https://www.tensorflow.org/tfx/guide/evaluator)\n", + "We will add an [`Evaluator`](../../../guide/evaluator)\n", "component to the pipeline we created in the\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple).\n", + "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple).\n", "\n", "An Evaluator component requires input data from an `ExampleGen` component and\n", "a model from a `Trainer` component and a\n", @@ -308,7 +332,7 @@ "### Write model training code\n", "\n", "We will use the same model code as in the\n", - "[Simple TFX Pipeline Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple)." 
+ "[Simple TFX Pipeline Tutorial](/tutorials/tfx/penguin_simple)." ] }, { @@ -464,7 +488,7 @@ "[`Resolver`](https://www.tensorflow.org/tfx/api_docs/python/tfx/v1/dsl/Resolver).\n", "To check a new model is getting better than previous model, we need to compare\n", "it against a previous published model, called baseline.\n", - "[ML Metadata(MLMD)](https://www.tensorflow.org/tfx/guide/mlmd) tracks all\n", + "[ML Metadata(MLMD)](../../../guide/mlmd) tracks all\n", "previous artifacts of the pipeline and `Resolver` can find what was the latest\n", "*blessed* model -- a model passed Evaluator successfully -- from MLMD using a\n", "strategy class called `LatestBlessedModelStrategy`.\n" @@ -591,7 +615,7 @@ "model from the previous run and it will be used as a baseline model for the\n", "comparison.\n", "\n", - "See [Evaluator component guide](https://www.tensorflow.org/tfx/guide/evaluator#using_the_evaluator_component) for more information." + "See [Evaluator component guide](../../../guide/evaluator#using-the-evaluator-component) for more information."
] }, { @@ -803,12 +827,12 @@ "## Next steps\n", "\n", "Learn more on model analysis at\n", - "[TensorFlow Model Analysis library tutorial](https://www.tensorflow.org/tfx/tutorials/model_analysis/tfma_basic).\n", + "[TensorFlow Model Analysis library tutorial](/tutorials/model_analysis/tfma_basic).\n", "\n", "You can find more resources on https://www.tensorflow.org/tfx/tutorials.\n", "\n", "Please see\n", - "[Understanding TFX Pipelines](https://www.tensorflow.org/tfx/guide/understanding_tfx_pipelines)\n", + "[Understanding TFX Pipelines](../../../guide/understanding_tfx_pipelines)\n", "to learn more about various concepts in TFX.\n" ] } diff --git a/docs/tutorials/tfx/penguin_tft.ipynb b/docs/tutorials/tfx/penguin_tft.ipynb index 7bfb8213b9..0e979f4f49 100644 --- a/docs/tutorials/tfx/penguin_tft.ipynb +++ b/docs/tutorials/tfx/penguin_tft.ipynb @@ -47,18 +47,42 @@ "id": "HU9YYythm0dx" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! 
Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/penguin_tft\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/penguin_tft.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/penguin_tft.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/penguin_tft.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -69,7 +93,7 @@ "In this notebook-based tutorial, we will create and run a TFX pipeline\n", "to ingest raw input data and preprocess it appropriately for ML training.\n", "This notebook is based on the TFX pipeline we built in\n", - "[Data validation using TFX Pipeline and TensorFlow Data Validation Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_tfdv).\n", + "[Data validation using TFX Pipeline and TensorFlow Data Validation Tutorial](/tutorials/tfx/penguin_tfdv).\n", "If you have not read that one yet, you should read it before proceeding with\n", "this notebook.\n", "\n", @@ -84,7 +108,7 @@ "[tf.transform](https://www.tensorflow.org/tfx/transform/get_started) library.\n", "\n", "Please see\n", - "[Understanding TFX Pipelines](https://www.tensorflow.org/tfx/guide/understanding_tfx_pipelines)\n", + "[Understanding TFX Pipelines](../../../guide/understanding_tfx_pipelines)\n", "to learn more about various concepts in TFX." ] }, @@ -322,7 +346,7 @@ "### Prepare a schema file\n", "\n", "As described in\n", - "[Data validation using TFX Pipeline and TensorFlow Data Validation Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_tfdv),\n", + "[Data validation using TFX Pipeline and TensorFlow Data Validation Tutorial](/tutorials/tfx/penguin_tfdv),\n", "we need a schema file for the dataset. Because the dataset is different from the previous tutorial we need to generate it again. In this tutorial, we will skip those steps and just use a prepared schema file.\n" ] }, @@ -366,7 +390,7 @@ "\n", "TFX pipelines are defined using Python APIs. 
We will add `Transform`\n", "component to the pipeline we created in the\n", - "[Data Validation tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/penguin_tfdv).\n", + "[Data Validation tutorial](/tutorials/tfx/penguin_tfdv).\n", "\n", "A Transform component requires input data from an `ExampleGen` component and\n", "a schema from a `SchemaGen` component, and produces a \"transform graph\". The\n", @@ -880,11 +904,11 @@ "## Next steps\n", "\n", "If you want to learn more about Transform component, see\n", - "[Transform Component guide](https://www.tensorflow.org/tfx/guide/transform).\n", + "[Transform Component guide](../../../guide/transform).\n", "You can find more resources on https://www.tensorflow.org/tfx/tutorials.\n", "\n", "Please see\n", - "[Understanding TFX Pipelines](https://www.tensorflow.org/tfx/guide/understanding_tfx_pipelines)\n", + "[Understanding TFX Pipelines](../../../guide/understanding_tfx_pipelines)\n", "to learn more about various concepts in TFX.\n" ] } diff --git a/docs/tutorials/tfx/python_function_component.ipynb b/docs/tutorials/tfx/python_function_component.ipynb index ab6df9f0c5..639abbeec3 100644 --- a/docs/tutorials/tfx/python_function_component.ipynb +++ b/docs/tutorials/tfx/python_function_component.ipynb @@ -75,20 +75,42 @@ "id": "WdRDkO2wQHUw" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup\n", - "required! 
Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/python_function_component\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/python_function_component.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/python_function_component.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/python_function_component.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/download_logo_32px.png\"\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -101,7 +123,7 @@ "components within the TFX InteractiveContext and in a locally-orchestrated TFX\n", "pipeline.\n", "\n", - "For more context and information, see the [Custom Python function components](https://www.tensorflow.org/tfx/guide/custom_function_component)\n", + "For more context and information, see the [Custom Python function components](../../../guide/custom_function_component)\n", "page on the TFX documentation site." ] }, @@ -238,7 +260,7 @@ "the Python function component development process.\n", "\n", "See [Python function based component\n", - "guide](https://www.tensorflow.org/tfx/guide/custom_function_component)\n", + "guide](../../../guide/custom_function_component)\n", "for more documentation." ] }, @@ -340,7 +362,7 @@ "InteractiveContext.\n", "\n", "For more information on what you can do with the TFX notebook\n", - "InteractiveContext, see the in-notebook [TFX Keras Component Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/components_keras)." + "InteractiveContext, see the in-notebook [TFX Keras Component Tutorial](/tutorials/tfx/components_keras)." ] }, { diff --git a/docs/tutorials/tfx/recommenders.ipynb b/docs/tutorials/tfx/recommenders.ipynb index 78bc375039..b77ae2f672 100644 --- a/docs/tutorials/tfx/recommenders.ipynb +++ b/docs/tutorials/tfx/recommenders.ipynb @@ -46,20 +46,42 @@ "id": "Z17OmgavQfp4" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup\n", - "required! 
Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/recommenders\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/recommenders.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/recommenders.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/recommenders.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/download_logo_32px.png\"\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", @@ -209,7 +231,7 @@ "source": [ "## Create a TFDS ExampleGen\n", "\n", - "We create a [custom ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen#custom_examplegen) which we use to load a TensorFlow Datasets (TFDS) dataset. This uses a custom executor in a FileBasedExampleGen." + "We create a [custom ExampleGen component](../../../guide/examplegen#custom-examplegen) which we use to load a TensorFlow Datasets (TFDS) dataset. This uses a custom executor in a FileBasedExampleGen." ] }, { @@ -396,7 +418,7 @@ "source": [ "## Generate statistics for movies and ratings\n", "\n", - "For a TFX pipeline we need to generate statistics for the dataset. We do that by using a [StatisticsGen component](https://www.tensorflow.org/tfx/guide/statsgen). These will be used by the [SchemaGen component](https://www.tensorflow.org/tfx/guide/schemagen) below when we generate a schema for our dataset. This is good practice anyway, because it's important to examine and analyze your data on an ongoing basis. Since we have two datasets we will create two StatisticsGen components." + "For a TFX pipeline we need to generate statistics for the dataset. We do that by using a [StatisticsGen component](../../../guide/statsgen). These will be used by the [SchemaGen component](../../../guide/schemagen) below when we generate a schema for our dataset. This is good practice anyway, because it's important to examine and analyze your data on an ongoing basis. Since we have two datasets we will create two StatisticsGen components." ] }, { @@ -455,7 +477,7 @@ "source": [ "## Create schemas for movies and ratings\n", "\n", - "For a TFX pipeline we need to generate a data schema from our dataset. We do that by using a [SchemaGen component](https://www.tensorflow.org/tfx/guide/schemagen).
This will be used by the [Transform component](https://www.tensorflow.org/tfx/guide/transform) below to do our feature engineering in a way that is highly scalable to large datasets, and avoids training/serving skew. Since we have two datasets we will create two SchemaGen components." + "For a TFX pipeline we need to generate a data schema from our dataset. We do that by using a [SchemaGen component](../../../guide/schemagen). This will be used by the [Transform component](../../../guide/transform) below to do our feature engineering in a way that is highly scalable to large datasets, and avoids training/serving skew. Since we have two datasets we will create two SchemaGen components." ] }, { @@ -516,7 +538,7 @@ "source": [ "## Feature Engineering using Transform\n", "\n", - "For a structured and repeatable design of a TFX pipeline we will need a scalable approach to feature engineering. This allows us to handle the large datasets which are usually part of many recommender systems, and it also avoids training/serving skew. We will do that using the [Transform component](https://www.tensorflow.org/tfx/guide/transform).\n", + "For a structured and repeatable design of a TFX pipeline we will need a scalable approach to feature engineering. This allows us to handle the large datasets which are usually part of many recommender systems, and it also avoids training/serving skew. We will do that using the [Transform component](../../../guide/transform).\n", "\n", "The Transform component uses a module file to supply user code for the feature engineering what we want to do, so our first step is to create that module file. Since we have two datasets, we will create two of these module files and two Transform components.\n", "\n", @@ -684,7 +706,7 @@ "source": [ "## Implementing a model in TFX\n", "\n", - "In the [basic_retrieval](https://www.tensorflow.org/recommenders/examples/basic_retrieval) tutorial the model was created inline in the Python runtime. 
In a TFX pipeline, the model, metric, and loss are defined and trained in the module file for a [pipeline component called Trainer](https://www.tensorflow.org/tfx/guide/trainer). This makes the model, metric, and loss part of a repeatable process which can be automated and monitored.\n", + "In the [basic_retrieval](https://www.tensorflow.org/recommenders/examples/basic_retrieval) tutorial the model was created inline in the Python runtime. In a TFX pipeline, the model, metric, and loss are defined and trained in the module file for a [pipeline component called Trainer](../../../guide/trainer). This makes the model, metric, and loss part of a repeatable process which can be automated and monitored.\n", "\n", "### TensorFlow Recommenders model architecture\n", "\n", @@ -989,7 +1011,7 @@ "source": [ "## Training the model\n", "\n", - "After defining the model, we can run the [Trainer component](https://www.tensorflow.org/tfx/guide/trainer) to do the model training." + "After defining the model, we can run the [Trainer component](../../../guide/trainer) to do the model training." ] }, { @@ -1027,7 +1049,7 @@ "source": [ "## Exporting the model\n", "\n", - "After training the model, we can use the [Pusher component](https://www.tensorflow.org/tfx/guide/pusher) to export the model." + "After training the model, we can use the [Pusher component](../../../guide/pusher) to export the model." ] }, { diff --git a/docs/tutorials/tfx/stub_template.md b/docs/tutorials/tfx/stub_template.md index 04dd58b9ec..d99fa455dd 100644 --- a/docs/tutorials/tfx/stub_template.md +++ b/docs/tutorials/tfx/stub_template.md @@ -26,7 +26,7 @@ over the artifacts from the recorded outputs. Since this tutorial assumes that you have completed `template.ipynb` up to step 6, a successful pipeline run must have been saved in the -[MLMD](https://www.tensorflow.org/tfx/guide/mlmd). The execution information in +[MLMD](../../../guide/mlmd). 
The execution information in MLMD can be accessed using gRPC server. Open a Terminal and run the following commands: @@ -92,9 +92,10 @@ following two files in the copied source files. test_component_ids=test_component_ids) ``` - NOTE: This stub component launcher cannot be defined within - `kubeflow_dag_runner.py` because launcher class is imported by the module - path. + !!! Note + This stub component launcher cannot be defined within + `kubeflow_dag_runner.py` because launcher class is imported by the module + path. 1. Set component ids to be list of component ids that are to be tested (in other words, other components' executors are replaced with BaseStubExecutor) diff --git a/docs/tutorials/tfx/template.ipynb b/docs/tutorials/tfx/template.ipynb index fd5454b57e..bf9592cbd4 100644 --- a/docs/tutorials/tfx/template.ipynb +++ b/docs/tutorials/tfx/template.ipynb @@ -45,19 +45,42 @@ "id": "wD2KOXlZuAOj" }, "source": [ - "Note: We recommend running this tutorial on Google Cloud Vertex AI Workbench. 
[Launch this notebook on Vertex AI Workbench](https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?q=download_url%3Dhttps%253A%252F%252Fraw.githubusercontent.com%252Ftensorflow%252Ftfx%252Fmaster%252Fdocs%252Ftutorials%252Ftfx%252Ftemplate.ipynb).\n", - "\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/template\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/template.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/template.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/template.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial on Google Cloud Vertex AI Workbench. [Launch this notebook on Vertex AI Workbench](https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?q=download_url%3Dhttps%253A%252F%252Fraw.githubusercontent.com%252Ftensorflow%252Ftfx%252Fmaster%252Fdocs%252Ftutorials%252Ftfx%252Ftemplate.ipynb).\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "metadata": { @@ -316,7 +339,7 @@ "source": [ "## Step 3. Browse your copied source files\n", "\n", - "The TFX template provides basic scaffold files to build a pipeline, including Python source code, sample data, and Jupyter Notebooks to analyse the output of the pipeline. The `taxi` template uses the same *Chicago Taxi* dataset and ML model as the [Airflow Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/airflow_workshop).\n", + "The TFX template provides basic scaffold files to build a pipeline, including Python source code, sample data, and Jupyter Notebooks to analyse the output of the pipeline. The `taxi` template uses the same *Chicago Taxi* dataset and ML model as the [Airflow Tutorial](/tutorials/tfx/airflow_workshop).\n", "\n", "Here is brief introduction to each of the Python files.\n", "- `pipeline` - This directory contains the definition of the pipeline\n", @@ -365,7 +388,7 @@ "source": [ "## Step 4. Run your first TFX pipeline\n", "\n", - "Components in the TFX pipeline will generate outputs for each run as [ML Metadata Artifacts](https://www.tensorflow.org/tfx/guide/mlmd), and they need to be stored somewhere. You can use any storage which the KFP cluster can access, and for this example we will use Google Cloud Storage (GCS). A default GCS bucket should have been created automatically. Its name will be `\u003cyour-project-id\u003e-kubeflowpipelines-default`.\n" + "Components in the TFX pipeline will generate outputs for each run as [ML Metadata Artifacts](../../../guide/mlmd), and they need to be stored somewhere. You can use any storage which the KFP cluster can access, and for this example we will use Google Cloud Storage (GCS). A default GCS bucket should have been created automatically. Its name will be `\u003cyour-project-id\u003e-kubeflowpipelines-default`.\n" ] }, { @@ -592,7 +615,7 @@ "source": [ "## Step 8. 
(*Optional*) Try Dataflow with KFP\n", "\n", - "Several [TFX Components uses Apache Beam](https://www.tensorflow.org/tfx/guide/beam) to implement data-parallel pipelines, and it means that you can distribute data processing workloads using [Google Cloud Dataflow](https://cloud.google.com/dataflow/). In this step, we will set the Kubeflow orchestrator to use dataflow as the data processing back-end for Apache Beam.\n", + "Several [TFX Components use Apache Beam](../../../guide/beam) to implement data-parallel pipelines, and it means that you can distribute data processing workloads using [Google Cloud Dataflow](https://cloud.google.com/dataflow/). In this step, we will set the Kubeflow orchestrator to use dataflow as the data processing back-end for Apache Beam.\n", "\n", "\u003e**Double-click `pipeline` to change directory, and double-click to open `configs.py`**. Uncomment the definition of `GOOGLE_CLOUD_REGION`, and `DATAFLOW_BEAM_PIPELINE_ARGS`.\n", "\n", @@ -682,11 +705,11 @@ "\n", "1. If your data is stored in files, modify the `DATA_PATH` in `kubeflow_runner.py` or `local_runner.py` and set it to the location of your files. If your data is stored in BigQuery, modify `BIG_QUERY_QUERY` in `pipeline/configs.py` to correctly query for your data.\n", "1. Add features in `models/features.py`.\n", - "1. Modify `models/preprocessing.py` to [transform input data for training](https://www.tensorflow.org/tfx/guide/transform).\n", - "1. Modify `models/keras/model.py` and `models/keras/constants.py` to [describe your ML model](https://www.tensorflow.org/tfx/guide/trainer).\n", + "1. Modify `models/preprocessing.py` to [transform input data for training](../../../guide/transform).\n", + "1. Modify `models/keras/model.py` and `models/keras/constants.py` to [describe your ML model](../../../guide/trainer).\n", " - You can use an estimator based model, too.
Change `RUN_FN` constant to `models.estimator.model.run_fn` in `pipeline/configs.py`.\n", "\n", - "Please see [Trainer component guide](https://www.tensorflow.org/tfx/guide/trainer) for more introduction." + "Please see [Trainer component guide](../../../guide/trainer) for more introduction." ] }, { diff --git a/docs/tutorials/tfx/template_local.ipynb b/docs/tutorials/tfx/template_local.ipynb index 4cad4d5988..1263259c0e 100644 --- a/docs/tutorials/tfx/template_local.ipynb +++ b/docs/tutorials/tfx/template_local.ipynb @@ -45,16 +45,42 @@ "id": "XdSXv1DrxdLL" }, "source": [ - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/template_local\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/template_local.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/template_local.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/template_local.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "metadata": { @@ -83,7 +109,7 @@ "released by the City of Chicago. We strongly encourage you to try to build\n", "your own pipeline using your dataset by utilizing this pipeline as a baseline.\n", "\n", - "We will build a pipeline which runs on local environment. If you are interested in using Kubeflow orchestrator on Google Cloud, please see [TFX on Cloud AI Platform Pipelines tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/cloud-ai-platform-pipelines).\n", + "We will build a pipeline which runs on local environment. If you are interested in using Kubeflow orchestrator on Google Cloud, please see [TFX on Cloud AI Platform Pipelines tutorial](/tutorials/tfx/cloud-ai-platform-pipelines).\n", "\n", "## Prerequisites\n", "\n", @@ -292,7 +318,7 @@ "id": "QdiHik_w42xN" }, "source": [ - "The TFX template provides basic scaffold files to build a pipeline, including Python source code, sample data, and Jupyter Notebooks to analyse the output of the pipeline. The `taxi` template uses the same *Chicago Taxi* dataset and ML model as the [Airflow Tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/airflow_workshop).\n", + "The TFX template provides basic scaffold files to build a pipeline, including Python source code, sample data, and Jupyter Notebooks to analyse the output of the pipeline. The `taxi` template uses the same *Chicago Taxi* dataset and ML model as the [Airflow Tutorial](/tutorials/tfx/airflow_workshop).\n", "\n", "In Google Colab, you can browse files by clicking a folder icon on the left. Files should be copied under the project directoy, whose name is `my_pipeline` in this case. You can click directory names to see the content of the directory, and double-click file names to open them.\n", "\n", @@ -595,11 +621,11 @@ "\n", "1. If your data is stored in files, modify the `DATA_PATH` in `kubeflow_runner.py` or `local_runner.py` and set it to the location of your files. 
If your data is stored in BigQuery, modify `BIG_QUERY_QUERY` in `pipeline/configs.py` to correctly query for your data.\n", "1. Add features in `models/features.py`.\n", - "1. Modify `models/preprocessing.py` to [transform input data for training](https://www.tensorflow.org/tfx/guide/transform).\n", - "1. Modify `models/keras/model.py` and `models/keras/constants.py` to [describe your ML model](https://www.tensorflow.org/tfx/guide/trainer).\n", + "1. Modify `models/preprocessing.py` to [transform input data for training](../../../guide/transform).\n", + "1. Modify `models/keras/model.py` and `models/keras/constants.py` to [describe your ML model](../../../guide/trainer).\n", " - You can use an estimator based model, too. Change `RUN_FN` constant to `models.estimator.model.run_fn` in `pipeline/configs.py`.\n", "\n", - "Please see [Trainer component guide](https://www.tensorflow.org/tfx/guide/trainer) for more introduction." + "Please see [Trainer component guide](../../../guide/trainer) for more introduction." ] } ], diff --git a/docs/tutorials/tfx/tfx_for_mobile.md b/docs/tutorials/tfx/tfx_for_mobile.md index 95fe2899a8..ec12a0575c 100644 --- a/docs/tutorials/tfx/tfx_for_mobile.md +++ b/docs/tutorials/tfx/tfx_for_mobile.md @@ -16,12 +16,12 @@ standard Keras-based [SavedModel](https://www.tensorflow.org/guide/saved_model) as well as the TFLite one, allowing users to compare the quality of the two. We assume you are familiar with TFX, our components, and our pipelines. If not, -then please see this [tutorial](https://www.tensorflow.org/tfx/tutorials/tfx/components). +then please see this [tutorial](/tutorials/tfx/components). ## Steps Only two steps are required to create and evaluate a TFLite model in TFX. The first step is invoking the TFLite rewriter within the context of the -[TFX Trainer](https://www.tensorflow.org/tfx/guide/trainer) to convert the +[TFX Trainer](../../../guide/trainer) to convert the trained TensorFlow model into a TFLite one. 
The second step is configuring the Evaluator to evaluate TFLite models. We now discuss each in turn. @@ -79,7 +79,7 @@ components will be expecting to find the model. ### Evaluating the TFLite model. -The [TFX Evaluator](https://www.tensorflow.org/tfx/guide/evaluator) provides the +The [TFX Evaluator](../../../guide/evaluator) provides the ability to analyze trained models to understand their quality across a wide range of metrics. In addition to analyzing SavedModels, the TFX Evaluator is now able to analyze TFLite models as well. diff --git a/docs/tutorials/transform/census.ipynb b/docs/tutorials/transform/census.ipynb index 5e2ac99985..f90dcc944f 100644 --- a/docs/tutorials/transform/census.ipynb +++ b/docs/tutorials/transform/census.ipynb @@ -6,17 +6,42 @@ "id": "uAttKaKmT435" }, "source": [ - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/transform/census\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.sandbox.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/transform/census.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/blob/master/docs/tutorials/transform/census.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/transform/census.ipynb\"\u003e\n", - "\u003cimg width=32px 
src=\"https://www.tensorflow.org/images/download_logo_32px.png\"\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", diff --git a/docs/tutorials/transform/data_preprocessing_with_cloud.md b/docs/tutorials/transform/data_preprocessing_with_cloud.md index 88d6ef9428..8b4db2a29b 100644 --- a/docs/tutorials/transform/data_preprocessing_with_cloud.md +++ b/docs/tutorials/transform/data_preprocessing_with_cloud.md @@ -11,16 +11,16 @@ and they create as byproducts a TensorFlow graph to apply the same transformations during prediction as when the model is served. This tutorial provides an end-to-end example using -[Dataflow](https://cloud.google.com/dataflow/docs){: .external } +[Dataflow](https://cloud.google.com/dataflow/docs) as a runner for Apache Beam. It assumes that you're familiar with -[BigQuery](https://cloud.google.com/bigquery/docs){: .external }, +[BigQuery](https://cloud.google.com/bigquery/docs), Dataflow, -[Vertex AI](https://cloud.google.com/vertex-ai/docs/start/introduction-unified-platform){: .external }, +[Vertex AI](https://cloud.google.com/vertex-ai/docs/start/introduction-unified-platform), and the TensorFlow [Keras](https://www.tensorflow.org/guide/keras/overview) API. It also assumes that you have some experience using Jupyter Notebooks, such as with -[Vertex AI Workbench](https://cloud.google.com/vertex-ai/docs/workbench/introduction){: .external }. +[Vertex AI Workbench](https://cloud.google.com/vertex-ai/docs/workbench/introduction). This tutorial also assumes that you're familiar with the concepts of preprocessing types, challenges, and options on Google Cloud, as described in @@ -47,7 +47,7 @@ This tutorial uses the following billable components of Google Cloud: To estimate the cost to run this tutorial, assuming you use every resource for an entire day, use the preconfigured -[pricing calculator](/products/calculator/#id=fad408d8-dd68-45b8-954e-5a5619a5d148){: .external }. +[pricing calculator](https://cloud.google.com/products/calculator/#id=fad408d8-dd68-45b8-954e-5a5619a5d148). 
## Before you begin @@ -55,28 +55,30 @@ an entire day, use the preconfigured 1. In the Google Cloud console, on the project selector page, select or [create a Google Cloud project](https://cloud.google.com/resource-manager/docs/creating-managing-projects). - Note: If you don't plan to keep the resources that you create in this - procedure, create a project instead of selecting an existing project. - After you finish these steps, you can delete the project, removing all - resources associated with the project. + !!! Note + If you don't plan to keep the resources that you create in this + procedure, create a project instead of selecting an existing project. + After you finish these steps, you can delete the project, removing all + resources associated with the project. - [Go to project selector](https://console.cloud.google.com/projectselector2/home/dashboard){: class="button button-primary" target="console" track-type="solution" track-name="consoleLink" track-metadata-position="body" } + [Go to project selector](https://console.cloud.google.com/projectselector2/home/dashboard){ .md-button .md-button--primary } 2. Make sure that billing is enabled for your Cloud project. Learn how to [check if billing is enabled on a project](https://cloud.google.com/billing/docs/how-to/verify-billing-enabled). 3. Enable the Dataflow, Vertex AI, and Notebooks APIs. 
- [Enable the APIs](https://console.cloud.google.com/flows/enableapi?apiid=dataflow,aiplatform.googleapis.com,notebooks.googleapis.com){: class="button button-primary" target="console" track-type="solution" track-name="consoleLink" track-metadata-position="body" } + + [Enable the APIs](https://console.cloud.google.com/flows/enableapi?apiid=dataflow,aiplatform.googleapis.com,notebooks.googleapis.com){ .md-button .md-button--primary } ## Jupyter notebooks for this solution The following Jupyter notebooks show the implementation example: -* [Notebook 1](https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/blogs/babyweight_tft/babyweight_tft_keras_01.ipynb){: .external } +* [Notebook 1](https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/blogs/babyweight_tft/babyweight_tft_keras_01.ipynb) covers data preprocessing. Details are provided in the [Implementing the Apache Beam pipeline](#implement-the-apache-beam-pipeline) section later. -* [Notebook 2](https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/blogs/babyweight_tft/babyweight_tft_keras_02.ipynb){: .external } +* [Notebook 2](https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/blogs/babyweight_tft/babyweight_tft_keras_02.ipynb) covers model training. Details are provided in the [Implementing the TensorFlow model](#implement-the-tensorflow-model) section later. @@ -88,7 +90,7 @@ notebooks to learn how the implementation example works. 1. In the Google Cloud console, go to the **Vertex AI Workbench** page. - [Go to Workbench](https://console.cloud.google.com/ai-platform/notebooks/list/instances){: class="button button-primary" target="console" track-type="solution" track-name="consoleLink" track-metadata-position="body" } + [Go to Workbench](https://console.cloud.google.com/ai-platform/notebooks/list/instances){ .md-button .md-button--primary } 1. On the **User-managed notebooks** tab, click **+New notebook**. 1. 
Select **TensorFlow Enterprise 2.8 (with LTS) without GPUs** for the @@ -116,12 +118,12 @@ notebook name. ## Implement the Apache Beam pipeline This section and the next section -[Run the pipeline in Dataflow](#run-the-pipeline-in-dataflow){: track-type="solution" track-name="internalLink" track-metadata-position="body" } +[Run the pipeline in Dataflow](#run-the-pipeline-in-dataflow) provide an overview and context for Notebook 1. The notebook provides a practical example to describe how to use the `tf.Transform` library to preprocess data. This example uses the Natality dataset, which is used to predict baby weights based on various inputs. The data is stored in the public -[natality](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=samples&t=natality&page=table&_ga=2.267763789.2122871960.1676620306-376763843.1676620306){: target="console" track-type="solution" track-name="consoleLink" track-metadata-position="body" } +[natality](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=samples&t=natality&page=table&_ga=2.267763789.2122871960.1676620306-376763843.1676620306) table in BigQuery. ### Run Notebook 1 @@ -139,7 +141,7 @@ table in BigQuery. The last part of the output is the following: - ```none{:.devsite-disable-click-to-copy} + ``` {.no-copy } Successfully installed ... ``` @@ -149,7 +151,7 @@ table in BigQuery. 1. Execute the second cell to run the `pip install tensorflow-transform `command. The last part of the output is the following: - ```none{:.devsite-disable-click-to-copy} + ``` { .no-copy } Successfully installed ... Note: you may need to restart the kernel to use updated packages. ``` @@ -176,7 +178,7 @@ the pipeline. The overall pipeline steps are as follows: 1. Read training data from BigQuery. 1. Analyze and transform training data using the `tf.Transform` library. 1. 
Write transformed training data to Cloud Storage in the - [TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord){: target="external" class="external" track-type="solution" track-name="externalLink" track-metadata-position="body" } + [TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) format. 1. Read evaluation data from BigQuery. 1. Transform evaluation data using the `transform_fn` graph produced by step 2. @@ -188,7 +190,7 @@ the pipeline. The overall pipeline steps are as follows: The following example shows the Python code for the overall pipeline. The sections that follow provide explanations and code listings for each step. -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } def run_transformation_pipeline(args): pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) @@ -232,7 +234,7 @@ def run_transformation_pipeline(args): write_text(transformed_train_dataset, transformed_data_location, step) ``` -### Read raw training data from BigQuery{: id="read_raw_training_data"} +### Read raw training data from BigQuery The first step is to read the raw training data from BigQuery using the `read_from_bq` function. This function returns a `raw_dataset` object @@ -241,7 +243,7 @@ pass a `step` value of `train` or `eval`. 
The BigQuery source query is constructed using the `get_source_query` function, as shown in the following example: -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } def read_from_bq(pipeline, step, data_size): source_query = get_source_query(step, data_size) @@ -270,7 +272,7 @@ In addition, to use the `tf.Transform` library to analyze and transform the The `raw_metadata` object is created using the `create_raw_metadata` function, as follows: -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } CATEGORICAL_FEATURE_NAMES = ['is_male', 'mother_race'] NUMERIC_FEATURE_NAMES = ['mother_age', 'plurality', 'gestation_weeks'] TARGET_FEATURE_NAME = 'weight_pounds' @@ -306,84 +308,17 @@ input raw features of the training data in order to prepare it for ML. These transformations include both full-pass and instance-level operations, as shown in the following table: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Input featureTransformationStats neededTypeOutput feature
weight_poundNoneNoneNAweight_pound
mother_ageNormalizemean, varFull-passmother_age_normalized
mother_ageEqual size bucketizationquantilesFull-passmother_age_bucketized
mother_ageCompute the logNoneInstance-level - mother_age_log -
pluralityIndicate if it is single or multiple babiesNoneInstance-levelis_multiple
is_multipleConvert nominal values to numerical indexvocabFull-passis_multiple_index
gestation_weeksScale between 0 and 1min, maxFull-passgestation_weeks_scaled
mother_raceConvert nominal values to numerical indexvocabFull-passmother_race_index
is_maleConvert nominal values to numerical indexvocabFull-passis_male_index
+ | Input feature | Transformation | Stats needed | Type | Output feature | + | ------------------- | --------------------------------------------- | -------------- | ---------------- | -------------------------- | + | `weight_pounds` | None | None | NA | `weight_pounds` | + | `mother_age` | Normalize | mean, var | Full-pass | `mother_age_normalized` | + | `mother_age` | Equal size bucketization | quantiles | Full-pass | `mother_age_bucketized` | + | `mother_age` | Compute the log | None | Instance-level | `mother_age_log` | + | `plurality` | Indicate if it is single or multiple babies | None | Instance-level | `is_multiple` | + | `is_multiple` | Convert nominal values to numerical index | vocab | Full-pass | `is_multiple_index` | + | `gestation_weeks` | Scale between 0 and 1 | min, max | Full-pass | `gestation_weeks_scaled` | + | `mother_race` | Convert nominal values to numerical index | vocab | Full-pass | `mother_race_index` | + | `is_male` | Convert nominal values to numerical index | vocab | Full-pass | `is_male_index` | These transformations are implemented in a `preprocess_fn` function, which expects a dictionary of tensors (`input_features`) and returns a dictionary of @@ -393,7 +328,7 @@ The following code shows the implementation of the `preprocess_fn` function, using the `tf.Transform` full-pass transformation APIs (prefixed with `tft.`), and TensorFlow (prefixed with `tf.`) instance-level operations: -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } def preprocess_fn(input_features): output_features = {} @@ -425,81 +360,22 @@ def preprocess_fn(input_features): ``` The `tf.Transform` -[framework](https://github.com/tensorflow/transform){: .external } +[framework](https://github.com/tensorflow/transform) has several other transformations in addition to those in the preceding example, including those listed in the following table: -
TransformationApplies toDescription
scale_by_min_maxNumeric features - Scales a numerical column into the range [output_min, - output_max] -
scale_to_0_1Numeric features - Returns a column which is the input column scaled to have range - [0,1] -
scale_to_z_scoreNumeric featuresReturns a standardized column with mean 0 and variance 1
tfidfText features - Maps the terms in x to their term frequency * inverse document - frequency -
compute_and_apply_vocabularyCategorical features - Generates a vocabulary for a categorical feature and maps it to - an integer with this vocab -
ngramsText featuresCreates a SparseTensor of n-grams
hash_stringsCategorical featuresHashes strings into buckets
pcaNumeric featuresComputes PCA on the dataset using biased covariance
bucketizeNumeric features - Returns an equal-sized (quantiles-based) bucketized column, with - a bucket index assigned to each input -
+ | Transformation | Applies to | Description | + | -------------------------------- | ---------------------- | -------------------------------------------------------------------------------------------------------- | + | `scale_by_min_max` | Numeric features | Scales a numerical column into the range \[`output_min`, `output_max`\] | + | `scale_to_0_1` | Numeric features | Returns a column which is the input column scaled to have range \[`0`,`1`\] | + | `scale_to_z_score` | Numeric features | Returns a standardized column with mean 0 and variance 1 | + | `tfidf` | Text features | Maps the terms in *x* to their term frequency \* inverse document frequency | + | `compute_and_apply_vocabulary` | Categorical features | Generates a vocabulary for a categorical feature and maps it to an integer with this vocab | + | `ngrams` | Text features | Creates a `SparseTensor` of n-grams | + | `hash_strings` | Categorical features | Hashes strings into buckets | + | `pca` | Numeric features | Computes PCA on the dataset using biased covariance | + | `bucketize` | Numeric features | Returns an equal-sized (quantiles-based) bucketized column, with a bucket index assigned to each input | + In order to apply the transformations implemented in the `preprocess_fn` function to the `raw_train_dataset` object produced in the previous step of the @@ -508,7 +384,7 @@ the `raw_dataset` object as input, applies the `preprocess_fn` function, and it produces the `transformed_dataset` object and the `transform_fn` graph. The following code illustrates this processing: -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } def analyze_and_transform(raw_dataset, step): transformed_dataset, transform_fn = ( @@ -536,7 +412,7 @@ produces two outputs: - `transform_fn`: a TensorFlow graph that contains the computed stats from the analyze phase and the transformation logic (which uses the stats) as instance-level operations. 
As discussed later in - [Save the graph](#save_the_graph){: track-type="solution" track-name="internalLink" track-metadata-position="body" }, + [Save the graph](#save-the-graph), the `transform_fn` graph is saved to be attached to the model `serving_fn` function. This makes it possible to apply the same transformation to the online prediction data points. @@ -545,14 +421,12 @@ produces two outputs: The analyze phase is illustrated in the following diagram, figure 1: -
- The tf.Transform analyze phase. -
Figure 1. The tf.Transform analyze phase.
-
+Figure: The `tf.Transform` analyze phase. { #tf-transform-analyze-phase } + +![The tf.Transform analyze phase.](images/data-preprocessing-for-ml-with-tf-transform-tf-transform-analyze-phase.svg) The `tf.Transform` -[analyzers](https://github.com/tensorflow/transform/blob/master/tensorflow_transform/beam/analyzer_impls.py){: target="github" class="external" track-type="solution" track-name="gitHubLink" track-metadata-position="body" } +[analyzers](https://github.com/tensorflow/transform/blob/master/tensorflow_transform/beam/analyzer_impls.py) include `min`, `max`, `sum`, `size`, `mean`, `var`, `covariance`, `quantiles`, `vocabulary`, and `pca`. @@ -566,11 +440,9 @@ the `transformed_train_dataset` dataset. The transform phase is illustrated in the following diagram, figure 2: -
- The tf.Transform transform phase. -
Figure 2. The tf.Transform transform phase.
-
+Figure: The `tf.Transform` transform phase. { #tf-transform-transform-phase } + +![The tf.Transform transform phase.](images/data-preprocessing-for-ml-with-tf-transform-tf-transform-transform-phase.svg) To preprocess the features, you call the required `tensorflow_transform` transformations (imported as `tft` in the code) in your implementation of the @@ -594,7 +466,7 @@ following columns: - `weight_pounds` (type: `FLOAT`) As explained in -[Preprocessing operations](data-preprocessing-for-ml-with-tf-transform-pt1#preprocessing_operations) +[Preprocessing operations](../../../guide/tft_bestpractices#preprocessing-operations) in the first part of this series, the feature transformation converts categorical features to a numeric representation. After the transformation, the categorical features are represented by integer values. In the @@ -602,7 +474,7 @@ categorical features are represented by integer values. In the columns indicates whether the column represents a categorical feature or a true numeric feature. -### Write transformed training data{: id="step_3_write_transformed_training_data"} +### Write transformed training data After the training data is preprocessed with the `preprocess_fn` function through the analyze and transform phases, you can write the data to a sink to be @@ -619,7 +491,7 @@ converted into tensors when they are fed to the model for training. The following code writes the transformed dataset to TFRecord files in the specified location: -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } def write_tfrecords(transformed_dataset, location, step): from tfx_bsl.coders import example_coder @@ -640,12 +512,12 @@ After you transform the training data and produce the `transform_fn` graph, you can use it to transform the evaluation data. 
First, you read and clean the evaluation data from BigQuery using the `read_from_bq` function described earlier in -[Read raw training data from BigQuery](#read-raw-training-data-from-bigquery){: track-type="solution" track-name="internalLink" track-metadata-position="body" }, +[Read raw training data from BigQuery](#read-raw-training-data-from-bigquery), and passing a value of `eval` for the `step` parameter. Then, you use the following code to transform the raw evaluation dataset (`raw_dataset`) to the expected transformed format (`transformed_dataset`): -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } def transform(raw_dataset, transform_fn, step): transformed_dataset = ( @@ -673,16 +545,14 @@ You then write the data to a sink (Cloud Storage or local disk, depending on the runner) in the TFRecord format for evaluating the TensorFlow model during the training process. To do this, you use the `write_tfrecords` function that's discussed in -[Write transformed training data](#step_3_write_transformed_training_data){: track-type="solution" track-name="internalLink" track-metadata-position="body" }. +[Write transformed training data](#write-transformed-training-data). The following diagram, figure 3, shows how the `transform_fn` graph that's produced in the analyze phase of the training data is used to transform the evaluation data. -
- Transforming evaluation data using the transform_fn graph. -
Figure 3. Transforming evaluation data using the transform_fn graph.
-
+Figure: Transforming evaluation data using the `transform_fn` graph. { #transform-eval-data-using-transform-fn } + +![Transforming evaluation data using the transform_fn graph.](images/data-preprocessing-for-ml-with-tf-transform-transforming-eval-data-using-transform_fn.svg) ### Save the graph @@ -691,7 +561,7 @@ artifacts, which includes the `transform_fn` graph that's produced by the analyze phase on the training data. The code for storing the artifacts is shown in the following `write_transform_artefacts` function: -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } def write_transform_artefacts(transform_fn, location): ( @@ -716,19 +586,16 @@ The following artifacts are also produced, as shown in the next section: - `transformed_metadata`: a directory that contains the `schema.json` file that describes the schema of the transformed data. -## Run the pipeline in Dataflow{:#run_the_pipeline_in_dataflow} +## Run the pipeline in Dataflow After you define the `tf.Transform` pipeline, you run the pipeline using Dataflow. The following diagram, figure 4, shows the Dataflow execution graph of the `tf.Transform` pipeline described in the example. -
- Dataflow execution graph of the tf.Transform pipeline. -
Figure 4. Dataflow execution graph - of the tf.Transform pipeline.
-
+Figure: Dataflow execution graph of the `tf.Transform` pipeline. { #dataflow-execution-graph } + +![Dataflow execution graph of the tf.Transform pipeline.](images/data-preprocessing-for-ml-with-tf-transform-dataflow-execution-graph.png) After you execute the Dataflow pipeline to preprocess the training and evaluation data, you can explore the produced objects in @@ -740,20 +607,20 @@ bucket. The transformed training and evaluation data in TFRecord format are stored at the following location: -```none{:.devsite-disable-click-to-copy} +``` { .yaml .no-copy } gs://YOUR_BUCKET_NAME/babyweight_tft/transformed ``` The transform artifacts are produced at the following location: -```none{:.devsite-disable-click-to-copy} +``` { .yaml .no-copy } gs://YOUR_BUCKET_NAME/babyweight_tft/transform ``` The following list is the output of the pipeline, showing the produced data objects and artifacts: -```none{:.devsite-disable-click-to-copy} +``` { .yaml .no-copy } transformed data: gs://YOUR_BUCKET_NAME/babyweight_tft/transformed/eval-00000-of-00001.tfrecords gs://YOUR_BUCKET_NAME/babyweight_tft/transformed/train-00000-of-00002.tfrecords @@ -777,10 +644,10 @@ gs://YOUR_BUCKET_NAME/babyweight_tft/transform/transform_fn/assets/is_multiple gs://YOUR_BUCKET_NAME/babyweight_tft/transform/transform_fn/assets/mother_race ``` -## Implement the TensorFlow model{: id="implementing_the_tensorflow_model"} +## Implement the TensorFlow model This section and the next section, -[Train and use the model for predictions](#train_and_use_the_model_for_predictions){: track-type="solution" track-name="internalLink" track-metadata-position="body" }, +[Train and use the model for predictions](#train-and-use-the-model-for-predictions), provide an overview and context for Notebook 2. The notebook provides an example ML model to predict baby weights. In this example, a TensorFlow model is implemented using the Keras API. The model @@ -802,7 +669,7 @@ preprocessing pipeline explained earlier. 
The last part of the output is the following: - ```none{:.devsite-disable-click-to-copy} + ``` { .yaml .no-copy } Successfully installed ... Note: you may need to restart the kernel to use updated packages. ``` @@ -866,7 +733,7 @@ the previous step: 1. Create a `TFTransformOutput` object from the artifacts that are generated and saved in the previous preprocessing step, as described in the - [Save the graph](#save_the_graph){: track-type="solution" track-name="internalLink" track-metadata-position="body" } + [Save the graph](#save-the-graph) section: ```py @@ -965,7 +832,7 @@ features, and a `tf.feature_column.categorical_column_with_identity` column for categorical features. You can also create extended feature columns, as described in -[Option C: TensorFlow](/architecture/data-preprocessing-for-ml-with-tf-transform-pt1#option_c_tensorflow){: track-type="solution" track-name="internalLink" track-metadata-position="body" } +[Option C: TensorFlow](../../../guide/tft_bestpractices#option-c-tensorflow) in the first part of this series. In the example used for this series, a new feature is created, `mother_race_X_mother_age_bucketized`, by crossing the `mother_race` and `mother_age_bucketized` features using the @@ -977,12 +844,9 @@ The following diagram, figure 5, shows the transformed data and how the transformed metadata is used to define and train the TensorFlow model: -
- Training the TensorFlow model with transformed data. -
Figure 5. Training the TensorFlow model with - the transformed data.
-
+Figure: Training the TensorFlow model with the transformed data. { #training-tf-with-transformed-data } + +![Training the TensorFlow model with transformed data.](images/data-preprocessing-for-ml-with-tf-transform-training-tf-model-with-transformed-data.svg) ### Export the model for serving prediction @@ -993,7 +857,7 @@ interface—that is, the input features schema that is expected during serving. This input features schema is defined in the `serving_fn` function, as shown in the following code: -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } def export_serving_model(model, output_dir): tf_transform_output = tft.TFTransformOutput(TRANSFORM_ARTEFACTS_DIR) @@ -1062,26 +926,23 @@ following steps: The following diagram, figure 6, illustrates the final step of exporting a model for serving: -
- Exporting the model for serving with the transform_fn graph attached. -
Figure 6. Exporting the model for serving with the - transform_fn graph attached.
-
+Figure: Exporting the model for serving with the `transform_fn` graph attached. { #exporting-model-for-serving-with-transform_fn } + +![Exporting the model for serving with the transform_fn graph attached.](images/data-preprocessing-for-ml-with-tf-transform-exporting-model-for-serving-with-transform_fn.svg) ## Train and use the model for predictions You can train the model locally by executing the cells of the notebook. For examples of how to package the code and train your model at scale using Vertex AI Training, see the samples and guides in the Google Cloud -[cloudml-samples](https://github.com/GoogleCloudPlatform/cloudml-samples){: .external } +[cloudml-samples](https://github.com/GoogleCloudPlatform/cloudml-samples) GitHub repository. When you inspect the exported SavedModel object using the `saved_model_cli` tool, you see that the `inputs` elements of the signature definition `signature_def` include the raw features, as shown in the following example: -```py{:.devsite-disable-click-to-copy} +``` { .py .yaml .no-copy } signature_def['serving_default']: The given SavedModel SignatureDef contains the following input(s): inputs['gestation_weeks'] tensor_info: @@ -1132,30 +993,20 @@ resources used in this tutorial, delete the project that contains the resources. ### Delete the project - +!!! danger "Caution" + + Deleting a project has the following effects: + + - __Everything in the project is deleted.__ If you used an existing project for + this tutorial, when you delete it, you also delete any other work you've done in the project. + - __Custom project IDs are lost.__ When you created this project, you might have created a custom project ID that you want to use in the future. To preserve the URLs that use the project ID, such as an `appspot.com`{translate="no" dir="ltr"} URL, delete selected resources inside the project instead of deleting the whole project. 
+ + If you plan to explore multiple tutorials and quickstarts, reusing projects can help you avoid exceeding project quota limits. 1. In the Google Cloud console, go to the **Manage resources** page. - [Go to Manage resources](https://console.cloud.google.com/iam-admin/projects){: class="button button-primary" target="console" track-type="solution" track-name="consoleLink" track-metadata-position="body" } + [Go to Manage resources](https://console.cloud.google.com/iam-admin/projects){ .md-button .md-button--primary } 1. In the project list, select the project that you want to delete, and then click **Delete**. @@ -1167,14 +1018,14 @@ resources used in this tutorial, delete the project that contains the resources. - To learn about the concepts, challenges, and options of data preprocessing for machine learning on Google Cloud, see the first article in this series, - [Data preprocessing for ML: options and recommendations](../guide/tft_bestpractices). + [Data preprocessing for ML: options and recommendations](../../../guide/tft_bestpractices). - For more information about how to implement, package, and run a tf.Transform pipeline on Dataflow, see the - [Predicting income with Census Dataset](https://github.com/GoogleCloudPlatform/cloudml-samples/tree/master/census/tftransformestimator){: .external } + [Predicting income with Census Dataset](https://github.com/GoogleCloudPlatform/cloudml-samples/tree/master/census/tftransformestimator) sample. - Take the Coursera specialization on ML with - [TensorFlow on Google Cloud](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp){: .external }. + [TensorFlow on Google Cloud](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp). - Learn about best practices for ML engineering in - [Rules of ML](https://developers.google.com/machine-learning/guides/rules-of-ml/){: .external }. + [Rules of ML](https://developers.google.com/machine-learning/guides/rules-of-ml/). 
- For more reference architectures, diagrams, and best practices, explore the [Cloud Architecture Center](https://cloud.google.com/architecture). diff --git a/docs/tutorials/transform/simple.ipynb b/docs/tutorials/transform/simple.ipynb index 70e9f6963d..e49ca7f86b 100644 --- a/docs/tutorials/transform/simple.ipynb +++ b/docs/tutorials/transform/simple.ipynb @@ -47,19 +47,42 @@ "id": "S5ST8dI25wbA" }, "source": [ - "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", - "\n", - "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/transform/simple\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/transform/simple.ipynb\"\u003e\n", - "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/blob/master/docs/tutorials/transform/simple.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", - "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/transform/simple.ipynb\"\u003e\n", - "\u003cimg width=32px src=\"https://www.tensorflow.org/images/download_logo_32px.png\"\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", - "\u003c/table\u003e\u003c/div\u003e" - ] + "Note: We recommend running this tutorial in a Colab notebook, with no setup required! Just click \"Run in Google Colab\".\n", + "\n", + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] }, { "cell_type": "markdown", diff --git a/mkdocs.yml b/mkdocs.yml index 4fa2d04b08..0de5c9f376 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: TFX -repo_name: "Tensorflow TFX" +repo_name: "TFX" repo_url: https://github.com/tensorflow/tfx theme: @@ -30,12 +30,12 @@ theme: toggle: icon: material/brightness-4 name: Switch to system preference - logo: assets/tf_full_color_primary_icon.svg favicon: assets/tf_full_color_primary_icon.svg features: - content.code.copy - content.code.select + - content.action.edit plugins: - search - autorefs @@ -82,6 +82,8 @@ plugins: markdown_extensions: - admonition - attr_list + - def_list + - tables - toc: permalink: true - pymdownx.highlight: @@ -94,6 +96,12 @@ markdown_extensions: - pymdownx.superfences - pymdownx.arithmatex: generic: true + - pymdownx.critic + - pymdownx.caret + - pymdownx.keys + - pymdownx.mark + - pymdownx.tilde + - markdown_grid_tables - md_in_html - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji @@ -130,7 +138,7 @@ nav: - LLM finetuning and conversion: tutorials/tfx/gpt2_finetuning_and_conversion - Custom component tutorial: tutorials/tfx/python_function_component - Recommenders with TFX: tutorials/tfx/recommenders - - Ranking with TFX: mmenders/examples/ranking_tfx + - Ranking with TFX: https://www.tensorflow.org/recommenders/examples/ranking_tfx - Airflow tutorial: tutorials/tfx/airflow_workshop - Neural Structured Learning in TFX: tutorials/tfx/neural_structured_learning - Data Validation: @@ -141,7 +149,7 @@ nav: - Data preprocessing for ML with Google Cloud: tutorials/transform/data_preprocessing_with_cloud - Model Analysis: - Get started with TFMA: tutorials/model_analysis/tfma_basic - - Fairness Indicators tutorial: onsible_ai/fairness_indicators/tutorials/Fairness_Indicators_Example_Colab + - Fairness Indicators tutorial: https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_Indicators_Example_Colab - Deploy a trained 
model: - 'Servers: TFX for TensorFlow Serving': tutorials/serving/rest_simple - 'Mobile & IoT: TFX for TensorFlow Lite': tutorials/tfx/tfx_for_mobile @@ -156,7 +164,7 @@ nav: - "TFX Cloud Solutions": guide/solutions.md - "Using Keras with TFX": guide/keras - "Using Non-TensorFlow Frameworks in TFX": guide/non_tf - - "Mobile & IoT: TFX for TensorFlow Lite": tutorials/tfx_for_mobile + - "Mobile & IoT: TFX for TensorFlow Lite": tutorials/tfx/tfx_for_mobile - "TFX Pipelines": - "Understanding TFX pipelines": guide/understanding_tfx_pipelines @@ -242,7 +250,7 @@ nav: - "TensorBoard": "https://www.tensorflow.org/tensorboard" - API: - v1: - - "Overview": api/v1/root + - "Overview": api/v1/index.md - "components": api/v1/components - "dsl": api/v1/dsl - "extensions": api/v1/extensions diff --git a/tfx/components/bulk_inferrer/component.py b/tfx/components/bulk_inferrer/component.py index 297e1fe305..a5fe87e378 100644 --- a/tfx/components/bulk_inferrer/component.py +++ b/tfx/components/bulk_inferrer/component.py @@ -42,14 +42,15 @@ class BulkInferrer(base_beam_component.BaseBeamComponent): ``` Component `outputs` contains: - - `inference_result`: Channel of type `standard_artifacts.InferenceResult` + + - `inference_result`: Channel of type [`standard_artifacts.InferenceResult`][tfx.v1.types.standard_artifacts.InferenceResult] to store the inference results. - - `output_examples`: Channel of type `standard_artifacts.Examples` + - `output_examples`: Channel of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples] to store the output examples. This is optional controlled by `output_example_spec`. See [the BulkInferrer - guide](https://www.tensorflow.org/tfx/guide/bulkinferrer) for more details. + guide](../../../guide/bulkinferrer) for more details. """ SPEC_CLASS = standard_component_specs.BulkInferrerSpec @@ -69,11 +70,11 @@ def __init__( """Construct an BulkInferrer component. 
Args: - examples: A BaseChannel of type `standard_artifacts.Examples`, usually + examples: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples], usually produced by an ExampleGen component. _required_ - model: A BaseChannel of type `standard_artifacts.Model`, usually produced - by a Trainer component. - model_blessing: A BaseChannel of type `standard_artifacts.ModelBlessing`, + model: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Model`][tfx.v1.types.standard_artifacts.Model], usually produced + by a [Trainer][tfx.v1.components.Trainer] component. + model_blessing: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.ModelBlessing`][tfx.v1.types.standard_artifacts.ModelBlessing], usually produced by a ModelValidator component. data_spec: bulk_inferrer_pb2.DataSpec instance that describes data selection. diff --git a/tfx/components/evaluator/component.py b/tfx/components/evaluator/component.py index 191ce7ac27..e8ccfbe7d1 100644 --- a/tfx/components/evaluator/component.py +++ b/tfx/components/evaluator/component.py @@ -33,13 +33,13 @@ class Evaluator(base_beam_component.BaseBeamComponent): """A TFX component to evaluate models trained by a TFX Trainer component. Component `outputs` contains: - - `evaluation`: Channel of type `standard_artifacts.ModelEvaluation` to - store - the evaluation results. - - `blessing`: Channel of type `standard_artifacts.ModelBlessing' that + + - `evaluation`: Channel of type [`standard_artifacts.ModelEvaluation`][tfx.v1.types.standard_artifacts.ModelEvaluation] to + store the evaluation results. + - `blessing`: Channel of type [`standard_artifacts.ModelBlessing`][tfx.v1.types.standard_artifacts.ModelBlessing] that contains the blessing result. - See [the Evaluator guide](https://www.tensorflow.org/tfx/guide/evaluator) for + See [the Evaluator guide](../../../guide/evaluator) for more details. 
""" @@ -64,18 +64,18 @@ def __init__( """Construct an Evaluator component. Args: - examples: A BaseChannel of type `standard_artifacts.Examples`, usually + examples: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples], usually produced by an ExampleGen component. _required_ - model: A BaseChannel of type `standard_artifacts.Model`, usually produced - by a Trainer component. - baseline_model: An optional channel of type 'standard_artifacts.Model' as + model: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Model`][tfx.v1.types.standard_artifacts.Model], usually produced + by a [Trainer][tfx.v1.components.Trainer] component. + baseline_model: An optional channel of type ['standard_artifacts.Model'][tfx.v1.types.standard_artifacts.Model] as the baseline model for model diff and model validation purpose. feature_slicing_spec: Deprecated, please use eval_config instead. Only support estimator. [evaluator_pb2.FeatureSlicingSpec](https://github.com/tensorflow/tfx/blob/master/tfx/proto/evaluator.proto) instance that describes how Evaluator should slice the data. fairness_indicator_thresholds: Optional list of float (or - RuntimeParameter) threshold values for use with TFMA fairness + [RuntimeParameter][tfx.v1.dsl.experimental.RuntimeParameter]) threshold values for use with TFMA fairness indicators. Experimental functionality: this interface and functionality may change at any time. TODO(b/142653905): add a link to additional documentation for TFMA fairness indicators here. @@ -90,12 +90,16 @@ def __init__( customization. This functionality is experimental and may change at any time. The module_file can implement following functions at its top level. 
+ ``` {.py .no-copy} def custom_eval_shared_model( eval_saved_model_path, model_name, eval_config, **kwargs, ) -> tfma.EvalSharedModel: + ``` + ``` {.py .no-copy} def custom_extractors( eval_shared_model, eval_config, tensor_adapter_config, ) -> List[tfma.extractors.Extractor]: + ``` module_path: A python path to the custom module that contains the UDFs. See 'module_file' for the required signature of UDFs. This functionality is experimental and this API may change at any time. Note this can not diff --git a/tfx/components/evaluator/constants.py b/tfx/components/evaluator/constants.py index 5aec8b2c71..c57106527a 100644 --- a/tfx/components/evaluator/constants.py +++ b/tfx/components/evaluator/constants.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Constants for [Evaluator](https://www.tensorflow.org/tfx/guide/evaluator).""" +"""Constants for [Evaluator](../../../guide/evaluator).""" # Keys for artifact (custom) properties. ARTIFACT_PROPERTY_BLESSED_KEY = 'blessed' diff --git a/tfx/components/evaluator/executor.py b/tfx/components/evaluator/executor.py index 2fad481272..f01f2e12e3 100644 --- a/tfx/components/evaluator/executor.py +++ b/tfx/components/evaluator/executor.py @@ -40,7 +40,7 @@ class Executor(base_beam_executor.BaseBeamExecutor): - """Executor for [Evaluator](https://www.tensorflow.org/tfx/guide/evaluator).""" + """Executor for [Evaluator](../../../guide/evaluator).""" def _get_slice_spec_from_feature_slicing_spec( self, spec: evaluator_pb2.FeatureSlicingSpec diff --git a/tfx/components/example_diff/component.py b/tfx/components/example_diff/component.py index 4229b4556c..001b3197f2 100644 --- a/tfx/components/example_diff/component.py +++ b/tfx/components/example_diff/component.py @@ -29,7 +29,8 @@ class ExampleDiff(base_beam_component.BaseBeamComponent): """TFX ExampleDiff component. Computes example level diffs according to an ExampleDiffConfig. 
See TFDV - feature_skew_detector.py for more details. + [feature_skew_detector.py](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/skew/feature_skew_detector.py) + for more details. This executor is under development and may change. """ @@ -45,10 +46,10 @@ def __init__(self, """Construct an ExampleDiff component. Args: - examples_test: A BaseChannel of `ExamplesPath` type, as generated by the - [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen). + examples_test: A [BaseChannel][tfx.v1.types.BaseChannel] of `ExamplesPath` type, as generated by the + [ExampleGen component](../../../guide/examplegen). This needs to contain any splits referenced in `include_split_pairs`. - examples_base: A second BaseChannel of `ExamplesPath` type to which + examples_base: A second [BaseChannel][tfx.v1.types.BaseChannel] of `ExamplesPath` type to which `examples` should be compared. This needs to contain any splits referenced in `include_split_pairs`. config: A ExampleDiffConfig that defines configuration for the skew diff --git a/tfx/components/example_gen/csv_example_gen/component.py b/tfx/components/example_gen/csv_example_gen/component.py index eb246e5a71..cedabb6566 100644 --- a/tfx/components/example_gen/csv_example_gen/component.py +++ b/tfx/components/example_gen/csv_example_gen/component.py @@ -32,31 +32,37 @@ class CsvExampleGen(component.FileBasedExampleGen): # pylint: disable=protected The csv examplegen encodes column values to tf.Example int/float/byte feature. For the case when there's missing cells, the csv examplegen uses: - -- tf.train.Feature(`type`_list=tf.train.`type`List(value=[])), when the + + - tf.train.Feature(`type`_list=tf.train.`type`List(value=[])), when the `type` can be inferred. - -- tf.train.Feature() when it cannot infer the `type` from the column. + - tf.train.Feature() when it cannot infer the `type` from the column. Note that the type inferring will be per input split. 
If input isn't a single split, users need to ensure the column types align in each pre-splits. For example, given the following csv rows of a split: - header:A,B,C,D - row1: 1,,x,0.1 - row2: 2,,y,0.2 - row3: 3,,,0.3 - row4: + ``` + header:A,B,C,D + row1: 1,,x,0.1 + row2: 2,,y,0.2 + row3: 3,,,0.3 + row4: + ``` The output example will be - example1: 1(int), empty feature(no type), x(string), 0.1(float) - example2: 2(int), empty feature(no type), x(string), 0.2(float) - example3: 3(int), empty feature(no type), empty list(string), 0.3(float) + ``` + example1: 1(int), empty feature(no type), x(string), 0.1(float) + example2: 2(int), empty feature(no type), y(string), 0.2(float) + example3: 3(int), empty feature(no type), empty list(string), 0.3(float) + ``` - Note that the empty feature is `tf.train.Feature()` while empty list string - feature is `tf.train.Feature(bytes_list=tf.train.BytesList(value=[]))`. + Note that the empty feature is `tf.train.Feature()` while empty list string + feature is `tf.train.Feature(bytes_list=tf.train.BytesList(value=[]))`. Component `outputs` contains: - - `examples`: Channel of type `standard_artifacts.Examples` for output train + + - `examples`: Channel of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples] for output train and eval examples. """ diff --git a/tfx/components/example_gen/import_example_gen/component.py b/tfx/components/example_gen/import_example_gen/component.py index a07856bc9b..5a16a0bf2e 100644 --- a/tfx/components/example_gen/import_example_gen/component.py +++ b/tfx/components/example_gen/import_example_gen/component.py @@ -32,9 +32,9 @@ class ImportExampleGen(component.FileBasedExampleGen): # pylint: disable=protec shuffle the dataset for ML best practice. Component `outputs` contains: - - `examples`: Channel of type `standard_artifacts.Examples` for output - train - and eval examples. 
+ + - `examples`: Channel of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples] for output + train and eval examples. """ EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor) diff --git a/tfx/components/example_validator/component.py b/tfx/components/example_validator/component.py index 2d8e3a9837..2d23244daf 100644 --- a/tfx/components/example_validator/component.py +++ b/tfx/components/example_validator/component.py @@ -36,29 +36,32 @@ class ExampleValidator(base_component.BaseComponent): The ExampleValidator component identifies anomalies in training and serving data. The component can be configured to detect different classes of anomalies in the data. It can: - - perform validity checks by comparing data statistics against a schema that - codifies expectations of the user. - - run custom validations based on an optional SQL-based config. - Schema Based Example Validation + - perform validity checks by comparing data statistics against a schema that + codifies expectations of the user. + - run custom validations based on an optional SQL-based config. + + ## Schema Based Example Validation + The ExampleValidator component identifies any anomalies in the example data by - comparing data statistics computed by the StatisticsGen component against a + comparing data statistics computed by the [StatisticsGen][tfx.v1.components.StatisticsGen] component against a schema. The schema codifies properties which the input data is expected to satisfy, and is provided and maintained by the user. - ## Example - ``` - # Performs anomaly detection based on statistics and data schema. - validate_stats = ExampleValidator( - statistics=statistics_gen.outputs['statistics'], - schema=infer_schema.outputs['schema']) - ``` + !!! Example + ``` python + # Performs anomaly detection based on statistics and data schema. 
+ validate_stats = ExampleValidator( + statistics=statistics_gen.outputs['statistics'], + schema=infer_schema.outputs['schema']) + ``` Component `outputs` contains: + - `anomalies`: Channel of type `standard_artifacts.ExampleAnomalies`. See [the ExampleValidator - guide](https://www.tensorflow.org/tfx/guide/exampleval) for more details. + guide](../../../guide/exampleval) for more details. """ SPEC_CLASS = standard_component_specs.ExampleValidatorSpec @@ -73,8 +76,8 @@ def __init__(self, """Construct an ExampleValidator component. Args: - statistics: A BaseChannel of type `standard_artifacts.ExampleStatistics`. - schema: A BaseChannel of type `standard_artifacts.Schema`. _required_ + statistics: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.ExampleStatistics`][tfx.v1.types.standard_artifacts.ExampleStatistics]. + schema: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Schema`][tfx.v1.types.standard_artifacts.Schema]. _required_ exclude_splits: Names of splits that the example validator should not validate. Default behavior (when exclude_splits is set to None) is excluding no splits. diff --git a/tfx/components/infra_validator/component.py b/tfx/components/infra_validator/component.py index 4161567c88..ef053100bd 100644 --- a/tfx/components/infra_validator/component.py +++ b/tfx/components/infra_validator/component.py @@ -36,7 +36,7 @@ class InfraValidator(base_component.BaseComponent): Full example using TensorFlowServing binary running on local docker. - ``` + ``` python infra_validator = InfraValidator( model=trainer.outputs['model'], examples=test_example_gen.outputs['examples'], @@ -59,7 +59,7 @@ class InfraValidator(base_component.BaseComponent): Minimal example when running on Kubernetes. 
- ``` + ``` python infra_validator = InfraValidator( model=trainer.outputs['model'], examples=test_example_gen.outputs['examples'], @@ -73,11 +73,12 @@ class InfraValidator(base_component.BaseComponent): ``` Component `outputs` contains: - - `blessing`: Channel of type `standard_artifacts.InfraBlessing` that + + - `blessing`: Channel of type [`standard_artifacts.InfraBlessing`][tfx.v1.types.standard_artifacts.InfraBlessing] that contains the validation result. See [the InfraValidator - guide](https://www.tensorflow.org/tfx/guide/infra_validator) for more + guide](../../../guide/infra_validator) for more details. """ @@ -95,13 +96,13 @@ def __init__( """Construct a InfraValidator component. Args: - model: A `BaseChannel` of `ModelExportPath` type, usually produced by - [Trainer](https://www.tensorflow.org/tfx/guide/trainer) component. + model: A [`BaseChannel`][tfx.v1.types.BaseChannel] of `ModelExportPath` type, usually produced by + [Trainer](../../../guide/trainer) component. _required_ serving_spec: A `ServingSpec` configuration about serving binary and test platform config to launch model server for validation. _required_ - examples: A `BaseChannel` of `ExamplesPath` type, usually produced by - [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen) component. + examples: A [`BaseChannel`][tfx.v1.types.BaseChannel] of `ExamplesPath` type, usually produced by + [ExampleGen](../../../guide/examplegen) component. If not specified, InfraValidator does not issue requests for validation. 
request_spec: Optional `RequestSpec` configuration about making requests diff --git a/tfx/components/model_validator/component.py b/tfx/components/model_validator/component.py index f82e74422f..ea7ffe170d 100644 --- a/tfx/components/model_validator/component.py +++ b/tfx/components/model_validator/component.py @@ -74,11 +74,11 @@ def __init__(self, Args: examples: A BaseChannel of type `standard_artifacts.Examples`, usually produced by an - [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen) component. + [ExampleGen](../../../guide/examplegen) component. _required_ model: A BaseChannel of type `standard_artifacts.Model`, usually produced by - a [Trainer](https://www.tensorflow.org/tfx/guide/trainer) component. + a [Trainer](../../../guide/trainer) component. _required_ blessing: Output channel of type `standard_artifacts.ModelBlessing` that contains the validation result. diff --git a/tfx/components/pusher/component.py b/tfx/components/pusher/component.py index 28bc0460dc..f4bffa1800 100644 --- a/tfx/components/pusher/component.py +++ b/tfx/components/pusher/component.py @@ -32,37 +32,41 @@ class Pusher(base_component.BaseComponent): """A TFX component to push validated TensorFlow models to a model serving platform. The `Pusher` component can be used to push an validated SavedModel from output - of the [Trainer component](https://www.tensorflow.org/tfx/guide/trainer) to + of the [Trainer component](../../../guide/trainer) to [TensorFlow Serving](https://www.tensorflow.org/tfx/serving). The Pusher will check the validation results from the [Evaluator - component](https://www.tensorflow.org/tfx/guide/evaluator) and [InfraValidator - component](https://www.tensorflow.org/tfx/guide/infra_validator) + component](../../../guide/evaluator) and [InfraValidator + component](../../../guide/infra_validator) before deploying the model. If the model has not been blessed, then the model will not be pushed. 
- *Note:* The executor for this component can be overriden to enable the model - to be pushed to other serving platforms than tf.serving. The [Cloud AI - Platform custom - executor](https://github.com/tensorflow/tfx/tree/master/tfx/extensions/google_cloud_ai_platform/pusher) - provides an example how to implement this. + !!! Note + The executor for this component can be overridden to enable the model + to be pushed to other serving platforms than tf.serving. The [Cloud AI + Platform custom executor](https://github.com/tensorflow/tfx/tree/master/tfx/extensions/google_cloud_ai_platform/pusher) + provides an example of how to implement this. - ## Example - ``` - # Checks whether the model passed the validation steps and pushes the model - # to a file destination if check passed. - pusher = Pusher( - model=trainer.outputs['model'], - model_blessing=evaluator.outputs['blessing'], - push_destination=proto.PushDestination( - filesystem=proto.PushDestination.Filesystem( - base_directory=serving_model_dir))) - ``` + !!! Example + ``` python + # Checks whether the model passed the validation steps and pushes the model + # to a file destination if check passed. + pusher = Pusher( + model=trainer.outputs['model'], + model_blessing=evaluator.outputs['blessing'], + push_destination=proto.PushDestination( + filesystem=proto.PushDestination.Filesystem( + base_directory=serving_model_dir, + ) + ), + ) + ``` Component `outputs` contains: - - `pushed_model`: Channel of type `standard_artifacts.PushedModel` with + + - `pushed_model`: Channel of type [`standard_artifacts.PushedModel`][tfx.v1.types.standard_artifacts.PushedModel] with result of push. - See [the Pusher guide](https://www.tensorflow.org/tfx/guide/pusher) for more + See [the Pusher guide](../../../guide/pusher) for more details. """ @@ -81,14 +85,14 @@ def __init__( """Construct a Pusher component. Args: - model: An optional BaseChannel of type `standard_artifacts.Model`, usually - produced by a Trainer component. 
- model_blessing: An optional BaseChannel of type - `standard_artifacts.ModelBlessing`, usually produced from an Evaluator - component. - infra_blessing: An optional BaseChannel of type - `standard_artifacts.InfraBlessing`, usually produced from an - InfraValidator component. + model: An optional [BaseChannel][tfx.v1.types.BaseChannel] of type `standard_artifacts.Model`, usually + produced by a [Trainer][tfx.v1.components.Trainer] component. + model_blessing: An optional [BaseChannel][tfx.v1.types.BaseChannel] of type + [`standard_artifacts.ModelBlessing`][tfx.v1.types.standard_artifacts.ModelBlessing], + usually produced from an [Evaluator][tfx.v1.components.Evaluator] component. + infra_blessing: An optional [BaseChannel][tfx.v1.types.BaseChannel] of type + [`standard_artifacts.InfraBlessing`][tfx.v1.types.standard_artifacts.InfraBlessing], + usually produced from an [InfraValidator][tfx.v1.components.InfraValidator] component. push_destination: A pusher_pb2.PushDestination instance, providing info for tensorflow serving to load models. Optional if executor_class doesn't require push_destination. diff --git a/tfx/components/pusher/executor.py b/tfx/components/pusher/executor.py index 2d37ad8d38..2ff068699c 100644 --- a/tfx/components/pusher/executor.py +++ b/tfx/components/pusher/executor.py @@ -56,8 +56,8 @@ class Executor(base_executor.BaseExecutor): https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_pipeline_simple.py#L104. For more details on tf.serving itself, please refer to - https://tensorflow.org/tfx/guide/pusher. For a tutuorial on TF Serving, - please refer to https://www.tensorflow.org/tfx/guide/serving. + [the pusher guide](../../../guide/pusher). For a tutorial on TF Serving, + please refer to [the serving guide](../../../guide/serving). 
""" def CheckBlessing(self, input_dict: Dict[str, List[types.Artifact]]) -> bool: diff --git a/tfx/components/schema_gen/component.py b/tfx/components/schema_gen/component.py index 914e2966f1..3123129a8e 100644 --- a/tfx/components/schema_gen/component.py +++ b/tfx/components/schema_gen/component.py @@ -40,17 +40,18 @@ class SchemaGen(base_component.BaseComponent): In a typical TFX pipeline, the SchemaGen component generates a schema which is consumed by the other pipeline components. - ## Example - ``` - # Generates schema based on statistics files. - infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics']) - ``` + !!! Example + ``` python + # Generates schema based on statistics files. + infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics']) + ``` Component `outputs` contains: - - `schema`: Channel of type `standard_artifacts.Schema` for schema + + - `schema`: Channel of type [`standard_artifacts.Schema`][tfx.v1.types.standard_artifacts.Schema] for schema result. - See [the SchemaGen guide](https://www.tensorflow.org/tfx/guide/schemagen) + See [the SchemaGen guide](../../../guide/schemagen) for more details. """ SPEC_CLASS = standard_component_specs.SchemaGenSpec @@ -65,10 +66,11 @@ def __init__( """Constructs a SchemaGen component. Args: - statistics: A BaseChannel of `ExampleStatistics` type (required if spec is - not passed). This should contain at least a `train` split. Other splits + statistics: A [BaseChannel][tfx.v1.types.BaseChannel] + of `ExampleStatistics` type (required if spec is not passed). + This should contain at least a `train` split. Other splits are currently ignored. _required_ - infer_feature_shape: Boolean (or RuntimeParameter) value indicating + infer_feature_shape: Boolean (or [RuntimeParameter][tfx.v1.dsl.experimental.RuntimeParameter]) value indicating whether or not to infer the shape of features. 
If the feature shape is not inferred, downstream Tensorflow Transform component using the schema will parse input as tf.SparseTensor. Default to True if not set. diff --git a/tfx/components/schema_gen/import_schema_gen/__init__.py b/tfx/components/schema_gen/import_schema_gen/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tfx/components/schema_gen/import_schema_gen/component.py b/tfx/components/schema_gen/import_schema_gen/component.py index 7e61dacb20..626c2793c7 100644 --- a/tfx/components/schema_gen/import_schema_gen/component.py +++ b/tfx/components/schema_gen/import_schema_gen/component.py @@ -38,12 +38,14 @@ class ImportSchemaGen(base_component.BaseComponent): ``` Component `outputs` contains: - - `schema`: Channel of type `standard_artifacts.Schema` for schema result. - See [the SchemaGen guide](https://www.tensorflow.org/tfx/guide/schemagen) + - `schema`: Channel of type `standard_artifacts.Schema` for schema result. + + See [the SchemaGen guide](../../../guide/schemagen) for more details. ImportSchemaGen works almost similar to `Importer` except following: + - `schema_file` should be the full file path instead of directory holding it. - `schema_file` is copied to the output artifact. This is different from `Importer` that loads an "Artifact" by setting its URI to the given path. diff --git a/tfx/components/statistics_gen/component.py b/tfx/components/statistics_gen/component.py index addccc4c59..5fbeaae479 100644 --- a/tfx/components/statistics_gen/component.py +++ b/tfx/components/statistics_gen/component.py @@ -44,7 +44,7 @@ class StatisticsGen(base_beam_component.BaseBeamComponent): statistics of each split provided in the input examples. Please see [the StatisticsGen - guide](https://www.tensorflow.org/tfx/guide/statsgen) for more details. + guide](../../../guide/statsgen) for more details. 
""" SPEC_CLASS = standard_component_specs.StatisticsGenSpec @@ -59,7 +59,7 @@ def __init__(self, Args: examples: A BaseChannel of `ExamplesPath` type, likely generated by the - [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen). + [ExampleGen component](../../../guide/examplegen). This needs to contain two splits labeled `train` and `eval`. _required_ schema: A `Schema` channel to use for automatically configuring the value diff --git a/tfx/components/trainer/component.py b/tfx/components/trainer/component.py index 7357e615b6..93dd4052cc 100644 --- a/tfx/components/trainer/component.py +++ b/tfx/components/trainer/component.py @@ -32,35 +32,38 @@ class Trainer(base_component.BaseComponent): """A TFX component to train a TensorFlow model. The Trainer component is used to train and eval a model using given inputs and - a user-supplied run_fn function. + a user-supplied `run_fn` function. An example of `run_fn()` can be found in the [user-supplied code](https://github.com/tensorflow/tfx/blob/master/tfx/examples/penguin/penguin_utils_keras.py) of the TFX penguin pipeline example. - *Note:* This component trains locally. For cloud distributed training, please - refer to [Cloud AI Platform - Trainer](https://github.com/tensorflow/tfx/tree/master/tfx/extensions/google_cloud_ai_platform/trainer). - - ## Example - ``` - # Uses user-provided Python function that trains a model using TF. - trainer = Trainer( - module_file=module_file, - examples=transform.outputs['transformed_examples'], - schema=infer_schema.outputs['schema'], - transform_graph=transform.outputs['transform_graph'], - train_args=proto.TrainArgs(splits=['train'], num_steps=10000), - eval_args=proto.EvalArgs(splits=['eval'], num_steps=5000)) - ``` + !!! Note + This component trains locally. For cloud distributed training, please + refer to [Cloud AI Platform + Trainer](https://github.com/tensorflow/tfx/tree/master/tfx/extensions/google_cloud_ai_platform/trainer). + + !!! 
Example + ``` + # Uses user-provided Python function that trains a model using TF. + trainer = Trainer( + module_file=module_file, + examples=transform.outputs["transformed_examples"], + schema=infer_schema.outputs["schema"], + transform_graph=transform.outputs["transform_graph"], + train_args=proto.TrainArgs(splits=["train"], num_steps=10000), + eval_args=proto.EvalArgs(splits=["eval"], num_steps=5000), + ) + ``` Component `outputs` contains: - - `model`: Channel of type `standard_artifacts.Model` for trained model. - - `model_run`: Channel of type `standard_artifacts.ModelRun`, as the working + + - `model`: Channel of type [`standard_artifacts.Model`][tfx.v1.types.standard_artifacts.Model] for trained model. + - `model_run`: Channel of type [`standard_artifacts.ModelRun`][tfx.v1.types.standard_artifacts.ModelRun], as the working dir of models, can be used to output non-model related output (e.g., TensorBoard logs). - Please see [the Trainer guide](https://www.tensorflow.org/tfx/guide/trainer) + Please see [the Trainer guide](../../../guide/trainer) for more details. """ @@ -89,54 +92,62 @@ def __init__( """Construct a Trainer component. Args: - examples: A BaseChannel of type `standard_artifacts.Examples`, serving as - the source of examples used in training (required). May be raw or + examples: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples], + serving as the source of examples used in training (required). May be raw or transformed. transformed_examples: Deprecated (no compatibility guarantee). Please set 'examples' instead. - transform_graph: An optional BaseChannel of type - `standard_artifacts.TransformGraph`, serving as the input transform - graph if present. 
- schema: An optional BaseChannel of type `standard_artifacts.Schema`, + transform_graph: An optional [BaseChannel][tfx.v1.types.BaseChannel] of type + [`standard_artifacts.TransformGraph`][tfx.v1.types.standard_artifacts.TransformGraph], + serving as the input transform graph if present. + schema: An optional [BaseChannel][tfx.v1.types.BaseChannel] of type + [`standard_artifacts.Schema`][tfx.v1.types.standard_artifacts.Schema], serving as the schema of training and eval data. Schema is optional when - 1) transform_graph is provided which contains schema. 2) user module - bypasses the usage of schema, e.g., hardcoded. - base_model: A BaseChannel of type `Model`, containing model that will be + + 1. transform_graph is provided which contains schema. + 2. user module bypasses the usage of schema, e.g., hardcoded. + base_model: A [BaseChannel][tfx.v1.types.BaseChannel] of type `Model`, containing model that will be used for training. This can be used for warmstart, transfer learning or model ensembling. - hyperparameters: A BaseChannel of type - `standard_artifacts.HyperParameters`, serving as the hyperparameters for - training module. Tuner's output best hyperparameters can be feed into - this. + hyperparameters: A [BaseChannel][tfx.v1.types.BaseChannel] of type + [`standard_artifacts.HyperParameters`][tfx.v1.types.standard_artifacts.HyperParameters], + serving as the hyperparameters for training module. Tuner's output best + hyperparameters can be fed into this. module_file: A path to python module file containing UDF model definition. - The module_file must implement a function named `run_fn` at its top + The `module_file` must implement a function named `run_fn` at its top level with function signature: - `def run_fn(trainer.fn_args_utils.FnArgs)`, - and the trained model must be saved to FnArgs.serving_model_dir when + ```python + def run_fn(trainer.fn_args_utils.FnArgs) + ``` + and the trained model must be saved to `FnArgs.serving_model_dir` when this function is executed.
- For Estimator based Executor, The module_file must implement a function + For Estimator based Executor, The `module_file` must implement a function named `trainer_fn` at its top level. The function must have the following signature. - def trainer_fn(trainer.fn_args_utils.FnArgs, - tensorflow_metadata.proto.v0.schema_pb2) -> Dict: + ``` python + def trainer_fn(trainer.fn_args_utils.FnArgs, + tensorflow_metadata.proto.v0.schema_pb2) -> Dict: ... - where the returned Dict has the following key-values. - 'estimator': an instance of tf.estimator.Estimator - 'train_spec': an instance of tf.estimator.TrainSpec - 'eval_spec': an instance of tf.estimator.EvalSpec - 'eval_input_receiver_fn': an instance of tfma EvalInputReceiver. - Exactly one of 'module_file' or 'run_fn' must be supplied if Trainer - uses GenericExecutor (default). Use of a RuntimeParameter for this + ``` + where the returned Dict has the following key-values. + + - `estimator`: an instance of `tf.estimator.Estimator` + - `train_spec`: an instance of `tf.estimator.TrainSpec` + - `eval_spec`: an instance of `tf.estimator.EvalSpec` + - `eval_input_receiver_fn`: an instance of tfma `EvalInputReceiver`. + + Exactly one of `module_file` or `run_fn` must be supplied if Trainer + uses GenericExecutor (default). Use of a [RuntimeParameter][tfx.v1.dsl.experimental.RuntimeParameter] for this argument is experimental. run_fn: A python path to UDF model definition function for generic trainer. See 'module_file' for details. Exactly one of 'module_file' or 'run_fn' must be supplied if Trainer uses GenericExecutor (default). Use - of a RuntimeParameter for this argument is experimental. + of a [RuntimeParameter][tfx.v1.dsl.experimental.RuntimeParameter] for this argument is experimental. trainer_fn: A python path to UDF model definition function for estimator based trainer. See 'module_file' for the required signature of the UDF. 
Exactly one of 'module_file' or 'trainer_fn' must be supplied if Trainer - uses Estimator based Executor. Use of a RuntimeParameter for this + uses Estimator based Executor. Use of a [RuntimeParameter][tfx.v1.dsl.experimental.RuntimeParameter] for this argument is experimental. train_args: A proto.TrainArgs instance, containing args used for training Currently only splits and num_steps are available. Default behavior @@ -151,11 +162,11 @@ def trainer_fn(trainer.fn_args_utils.FnArgs, Raises: ValueError: - - When both or neither of 'module_file' and user function + - When both or neither of `module_file` and user function (e.g., trainer_fn and run_fn) is supplied. - - When both or neither of 'examples' and 'transformed_examples' + - When both or neither of `examples` and `transformed_examples` is supplied. - - When 'transformed_examples' is supplied but 'transform_graph' + - When `transformed_examples` is supplied but `transform_graph` is not supplied. """ if [bool(module_file), bool(run_fn), bool(trainer_fn)].count(True) != 1: diff --git a/tfx/components/trainer/fn_args_utils.py b/tfx/components/trainer/fn_args_utils.py index 613f84702e..30ad5fc8cd 100644 --- a/tfx/components/trainer/fn_args_utils.py +++ b/tfx/components/trainer/fn_args_utils.py @@ -48,7 +48,7 @@ Optional[schema_pb2.Schema], ], Iterator[pa.RecordBatch]]), ('data_view_decode_fn', Optional[Callable[[tf.Tensor], Dict[str, Any]]])]) -DataAccessor.__doc__ = """ +""" For accessing the data on disk. Contains factories that can create tf.data.Datasets or other means to access diff --git a/tfx/components/transform/component.py b/tfx/components/transform/component.py index 7ee88c6df0..1430917e1e 100644 --- a/tfx/components/transform/component.py +++ b/tfx/components/transform/component.py @@ -60,26 +60,28 @@ class Transform(base_beam_component.BaseBeamComponent): code](https://github.com/tensorflow/tfx/blob/master/tfx/examples/bert/mrpc/bert_mrpc_utils.py) of the TFX BERT MRPC pipeline example. 
- ## Example - ``` - # Performs transformations and feature engineering in training and serving. - transform = Transform( - examples=example_gen.outputs['examples'], - schema=infer_schema.outputs['schema'], - module_file=module_file) - ``` + !!! Example + ``` python + # Performs transformations and feature engineering in training and serving. + transform = Transform( + examples=example_gen.outputs['examples'], + schema=infer_schema.outputs['schema'], + module_file=module_file, + ) + ``` Component `outputs` contains: - - `transform_graph`: Channel of type `standard_artifacts.TransformGraph`, + + - `transform_graph`: Channel of type [`standard_artifacts.TransformGraph`][tfx.v1.types.standard_artifacts.TransformGraph], which includes an exported Tensorflow graph suitable for both training and serving. - - `transformed_examples`: Channel of type `standard_artifacts.Examples` for + - `transformed_examples`: Channel of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples] for materialized transformed examples, which includes transform splits as specified in splits_config. This is optional controlled by `materialize`. Please see [the Transform - guide](https://www.tensorflow.org/tfx/guide/transform) for more details. + guide](../../../guide/transform) for more details. """ SPEC_CLASS = standard_component_specs.TransformSpec @@ -103,20 +105,20 @@ def __init__( """Construct a Transform component. Args: - examples: A BaseChannel of type `standard_artifacts.Examples` (required). + examples: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples] _required_. This should contain custom splits specified in splits_config. If custom split is not provided, this should contain two splits 'train' and 'eval'. - schema: A BaseChannel of type `standard_artifacts.Schema`. 
This should + schema: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Schema`][tfx.v1.types.standard_artifacts.Schema]. This should contain a single schema artifact. module_file: The file path to a python module file, from which the 'preprocessing_fn' function will be loaded. Exactly one of 'module_file' or 'preprocessing_fn' must be supplied. The function needs to have the following signature: - ``` + ``` {.python .no-copy} def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]: - ... + ... ``` where the values of input and returned Dict are either tf.Tensor or tf.SparseTensor. @@ -124,26 +126,29 @@ def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]: If additional inputs are needed for preprocessing_fn, they can be passed in custom_config: - ``` - def preprocessing_fn(inputs: Dict[Text, Any], custom_config: - Dict[Text, Any]) -> Dict[Text, Any]: - ... + ``` {.python .no-copy} + def preprocessing_fn( + inputs: Dict[Text, Any], + custom_config: Dict[Text, Any], + ) -> Dict[Text, Any]: + ... ``` To update the stats options used to compute the pre-transform or post-transform statistics, optionally define the 'stats-options_updater_fn' within the same module. If implemented, this function needs to have the following signature: + ``` {.python .no-copy} + def stats_options_updater_fn( + stats_type: tfx.components.transform.stats_options_util.StatsType, + stats_options: tfdv.StatsOptions, + ) -> tfdv.StatsOptions: + ... ``` - def stats_options_updater_fn(stats_type: tfx.components.transform - .stats_options_util.StatsType, stats_options: tfdv.StatsOptions) - -> tfdv.StatsOptions: - ... - ``` - Use of a RuntimeParameter for this argument is experimental. + Use of a [RuntimeParameter][tfx.v1.dsl.experimental.RuntimeParameter] for this argument is experimental. preprocessing_fn: The path to python function that implements a 'preprocessing_fn'. See 'module_file' for expected signature of the function. 
Exactly one of 'module_file' or 'preprocessing_fn' must be - supplied. Use of a RuntimeParameter for this argument is experimental. + supplied. Use of a [RuntimeParameter][tfx.v1.dsl.experimental.RuntimeParameter] for this argument is experimental. splits_config: A transform_pb2.SplitsConfig instance, providing splits that should be analyzed and splits that should be transformed. Note analyze and transform splits can have overlap. Default behavior (when diff --git a/tfx/components/tuner/component.py b/tfx/components/tuner/component.py index 9b28062574..87fe5ef3cf 100644 --- a/tfx/components/tuner/component.py +++ b/tfx/components/tuner/component.py @@ -33,7 +33,7 @@ # args depend on the tuner's implementation. TunerFnResult = NamedTuple('TunerFnResult', [('tuner', base_tuner.BaseTuner), ('fit_kwargs', Dict[str, Any])]) -TunerFnResult.__doc__ = """ +""" Return type of tuner_fn. tuner_fn returns a TunerFnResult that contains: @@ -48,14 +48,15 @@ class Tuner(base_component.BaseComponent): """A TFX component for model hyperparameter tuning. Component `outputs` contains: + - `best_hyperparameters`: Channel of type - `standard_artifacts.HyperParameters` for result of + [`standard_artifacts.HyperParameters`][tfx.v1.types.standard_artifacts.HyperParameters] for result of the best hparams. - - `tuner_results`: Channel of type `standard_artifacts.TunerResults` for + - `tuner_results`: Channel of type [`standard_artifacts.TunerResults`][tfx.v1.types.standard_artifacts.TunerResults] for results of all trials. Experimental: subject to change and no backwards compatibility guarantees. - See [the Tuner guide](https://www.tensorflow.org/tfx/guide/tuner) + See [the Tuner guide](../../../guide/tuner) for more details. """ @@ -76,22 +77,25 @@ def __init__(self, """Construct a Tuner component. 
Args: - examples: A BaseChannel of type `standard_artifacts.Examples`, serving as + examples: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples], serving as the source of examples that are used in tuning (required). - schema: An optional BaseChannel of type `standard_artifacts.Schema`, + schema: An optional [BaseChannel][tfx.v1.types.BaseChannel] of type [`standard_artifacts.Schema`][tfx.v1.types.standard_artifacts.Schema], serving as the schema of training and eval data. This is used when raw examples are provided. - transform_graph: An optional BaseChannel of type - `standard_artifacts.TransformGraph`, serving as the input transform + transform_graph: An optional [BaseChannel][tfx.v1.types.BaseChannel] of type + [`standard_artifacts.TransformGraph`][tfx.v1.types.standard_artifacts.TransformGraph], serving as the input transform graph if present. This is used when transformed examples are provided. - base_model: A BaseChannel of type `Model`, containing model that will be + base_model: A [BaseChannel][tfx.v1.types.BaseChannel] of type [`Model`][tfx.v1.types.standard_artifacts.Model], containing model that will be used for training. This can be used for warmstart, transfer learning or model ensembling. module_file: A path to python module file containing UDF tuner definition. The module_file must implement a function named `tuner_fn` at its top level. The function must have the following signature. - def tuner_fn(fn_args: FnArgs) -> TunerFnResult: Exactly one of - 'module_file' or 'tuner_fn' must be supplied. + ``` {.python .no-copy} + def tuner_fn(fn_args: FnArgs) -> TunerFnResult: + ... + ``` + Exactly one of 'module_file' or 'tuner_fn' must be supplied. tuner_fn: A python path to UDF model definition function. See 'module_file' for the required signature of the UDF. Exactly one of 'module_file' or 'tuner_fn' must be supplied. 
diff --git a/tfx/dependencies.py b/tfx/dependencies.py index 8ed768835b..7cb051c75c 100644 --- a/tfx/dependencies.py +++ b/tfx/dependencies.py @@ -33,6 +33,7 @@ branch HEAD. - For the release, we use a range of version, which is also used as a default. """ + from __future__ import annotations import os @@ -98,7 +99,7 @@ def make_required_install_packages(): # TODO(b/332616741): Scipy version 1.13 breaks the TFX OSS test. # Unpin once the issue is resolved. "scipy<1.13", - 'scikit-learn==1.5.1', + "scikit-learn==1.5.1", # TODO(b/291837844): Pinned pyyaml to 5.3.1. # Unpin once the issue with installation is resolved. "pyyaml>=6,<7", @@ -270,6 +271,7 @@ def make_extra_packages_docs() -> list[str]: "mkdocs-jupyter", "mkdocs-caption", "pymdown-extensions", + "markdown-grid-tables", ] diff --git a/tfx/dsl/component/experimental/container_component.py b/tfx/dsl/component/experimental/container_component.py index 7e771976bf..923f55800d 100644 --- a/tfx/dsl/component/experimental/container_component.py +++ b/tfx/dsl/component/experimental/container_component.py @@ -48,29 +48,28 @@ def create_container_component( Returns: Component that can be instantiated and user inside pipeline. - Example: - - ``` - component = create_container_component( - name='TrainModel', - inputs={ - 'training_data': Dataset, - }, - outputs={ - 'model': Model, - }, - parameters={ - 'num_training_steps': int, - }, - image='gcr.io/my-project/my-trainer', - command=[ - 'python3', 'my_trainer', - '--training_data_uri', InputUriPlaceholder('training_data'), - '--model_uri', OutputUriPlaceholder('model'), - '--num_training-steps', InputValuePlaceholder('num_training_steps'), - ] - ) - ``` + !!! 
Example + ``` python + component = create_container_component( + name="TrainModel", + inputs={ + "training_data": Dataset, + }, + outputs={ + "model": Model, + }, + parameters={ + "num_training_steps": int, + }, + image="gcr.io/my-project/my-trainer", + command=[ + "python3", "my_trainer", + "--training_data_uri", InputUriPlaceholder("training_data"), + "--model_uri", OutputUriPlaceholder("model"), + "--num_training-steps", InputValuePlaceholder("num_training_steps"), + ], + ) + ``` """ if not name: raise ValueError('Component name cannot be empty.') diff --git a/tfx/dsl/component/experimental/decorators.py b/tfx/dsl/component/experimental/decorators.py index d9719f4075..d83bd3cc18 100644 --- a/tfx/dsl/component/experimental/decorators.py +++ b/tfx/dsl/component/experimental/decorators.py @@ -320,7 +320,7 @@ def component( BaseFunctionalComponentFactory, Callable[[types.FunctionType], BaseFunctionalComponentFactory], ]: - """Decorator: creates a component from a typehint-annotated Python function. + '''Decorator: creates a component from a typehint-annotated Python function. This decorator creates a component based on typehint annotations specified for the arguments and return value for a Python function. 
The decorator can be @@ -368,65 +368,67 @@ def component( This is example usage of component definition using this decorator: - from tfx import v1 as tfx - - InputArtifact = tfx.dsl.components.InputArtifact - OutputArtifact = tfx.dsl.components.OutputArtifact - Parameter = tfx.dsl.components.Parameter - Examples = tfx.types.standard_artifacts.Examples - Model = tfx.types.standard_artifacts.Model - - class MyOutput(TypedDict): - loss: float - accuracy: float - - @component(component_annotation=tfx.dsl.standard_annotations.Train) - def MyTrainerComponent( - training_data: InputArtifact[Examples], - model: OutputArtifact[Model], - dropout_hyperparameter: float, - num_iterations: Parameter[int] = 10 - ) -> MyOutput: - '''My simple trainer component.''' - - records = read_examples(training_data.uri) - model_obj = train_model(records, num_iterations, dropout_hyperparameter) - model_obj.write_to(model.uri) - - return { - 'loss': model_obj.loss, - 'accuracy': model_obj.accuracy - } - - # Example usage in a pipeline graph definition: - # ... - trainer = MyTrainerComponent( - training_data=example_gen.outputs['examples'], - dropout_hyperparameter=other_component.outputs['dropout'], - num_iterations=1000) - pusher = Pusher(model=trainer.outputs['model']) - # ... 
+ ``` python + from tfx import v1 as tfx + + InputArtifact = tfx.dsl.components.InputArtifact + OutputArtifact = tfx.dsl.components.OutputArtifact + Parameter = tfx.dsl.components.Parameter + Examples = tfx.types.standard_artifacts.Examples + Model = tfx.types.standard_artifacts.Model + + + class MyOutput(TypedDict): + loss: float + accuracy: float + + + @component(component_annotation=tfx.dsl.standard_annotations.Train) + def MyTrainerComponent( + training_data: InputArtifact[Examples], + model: OutputArtifact[Model], + dropout_hyperparameter: float, + num_iterations: Parameter[int] = 10, + ) -> MyOutput: + """My simple trainer component.""" + + records = read_examples(training_data.uri) + model_obj = train_model(records, num_iterations, dropout_hyperparameter) + model_obj.write_to(model.uri) + + return {"loss": model_obj.loss, "accuracy": model_obj.accuracy} + + + # Example usage in a pipeline graph definition: + # ... + trainer = MyTrainerComponent( + training_data=example_gen.outputs["examples"], + dropout_hyperparameter=other_component.outputs["dropout"], + num_iterations=1000, + ) + pusher = Pusher(model=trainer.outputs["model"]) + # ... + ``` When the parameter `component_annotation` is not supplied, the default value is None. 
This is another example usage with `component_annotation` = None: - @component - def MyTrainerComponent( - training_data: InputArtifact[standard_artifacts.Examples], - model: OutputArtifact[standard_artifacts.Model], - dropout_hyperparameter: float, - num_iterations: Parameter[int] = 10 - ) -> Output: - '''My simple trainer component.''' + ``` python + @component + def MyTrainerComponent( + training_data: InputArtifact[standard_artifacts.Examples], + model: OutputArtifact[standard_artifacts.Model], + dropout_hyperparameter: float, + num_iterations: Parameter[int] = 10, + ) -> Output: + """My simple trainer component.""" - records = read_examples(training_data.uri) - model_obj = train_model(records, num_iterations, dropout_hyperparameter) - model_obj.write_to(model.uri) + records = read_examples(training_data.uri) + model_obj = train_model(records, num_iterations, dropout_hyperparameter) + model_obj.write_to(model.uri) - return { - 'loss': model_obj.loss, - 'accuracy': model_obj.accuracy - } + return {"loss": model_obj.loss, "accuracy": model_obj.accuracy} + ``` When the parameter `use_beam` is True, one of the parameters of the decorated function type-annotated by BeamComponentParameter[beam.Pipeline] and the @@ -434,17 +436,19 @@ def MyTrainerComponent( with the tfx pipeline's beam_pipeline_args that's shared with other beam-based components: - @component(use_beam=True) - def DataProcessingComponent( - input_examples: InputArtifact[standard_artifacts.Examples], - output_examples: OutputArtifact[standard_artifacts.Examples], - beam_pipeline: BeamComponentParameter[beam.Pipeline] = None, - ) -> None: - '''My simple trainer component.''' - - records = read_examples(training_data.uri) - with beam_pipeline as p: + ``` python + @component(use_beam=True) + def DataProcessingComponent( + input_examples: InputArtifact[standard_artifacts.Examples], + output_examples: OutputArtifact[standard_artifacts.Examples], + beam_pipeline: BeamComponentParameter[beam.Pipeline] = None, 
+ ) -> None: + """My simple trainer component.""" + + records = read_examples(input_examples.uri) + with beam_pipeline as p: ... + ``` Experimental: no backwards compatibility guarantees. @@ -459,19 +463,15 @@ def DataProcessingComponent( Returns: An object that: - 1. you can call like the initializer of a subclass of - `base_component.BaseComponent` (or `base_component.BaseBeamComponent`). - 2. has a test_call() member function for unit testing the inner - implementation of the component. - Today, the returned object is literally a subclass of BaseComponent, so it - can be used as a `Type` e.g. in isinstance() checks. But you must not rely - on this, as we reserve the right to reserve a different kind of object in - future, which _only_ satisfies the two criteria (1.) and (2.) above - without being a `Type` itself. + + 1. you can call like the initializer of a subclass of [`base_component.BaseComponent`][tfx.v1.types.BaseComponent] (or [`base_component.BaseBeamComponent`][tfx.v1.types.BaseBeamComponent]). + 2. has a test_call() member function for unit testing the inner implementation of the component. + + Today, the returned object is literally a subclass of [BaseComponent][tfx.v1.types.BaseComponent], so it can be used as a `Type` e.g. in isinstance() checks. But you must not rely on this, as we reserve the right to return a different kind of object in the future, which _only_ satisfies the two criteria (1.) and (2.) above without being a `Type` itself. Raises: EnvironmentError: if the current Python interpreter is not Python 3. - """ + ''' if func is None: # Python decorators with arguments in parentheses result in two function # calls.
The first function call supplies the kwargs and the second supplies diff --git a/tfx/dsl/components/common/importer.py b/tfx/dsl/components/common/importer.py index 08ab49d6e5..5d8a100c3c 100644 --- a/tfx/dsl/components/common/importer.py +++ b/tfx/dsl/components/common/importer.py @@ -274,14 +274,16 @@ class Importer(base_node.BaseNode): Here is an example to use the Importer: - ``` + ``` python importer = Importer( - source_uri='uri/to/schema', + source_uri="uri/to/schema", artifact_type=standard_artifacts.Schema, - reimport=False).with_id('import_schema') + reimport=False, + ).with_id("import_schema") schema_gen = SchemaGen( - fixed_schema=importer.outputs['result'], - examples=...) + fixed_schema=importer.outputs["result"], + examples=..., + ) ``` """ diff --git a/tfx/dsl/components/common/resolver.py b/tfx/dsl/components/common/resolver.py index 60f7791bd7..df91a2a89f 100644 --- a/tfx/dsl/components/common/resolver.py +++ b/tfx/dsl/components/common/resolver.py @@ -46,9 +46,9 @@ class ResolverStrategy(abc.ABC): to express the input resolution logic. Currently TFX supports the following builtin ResolverStrategy: - - [LatestArtifactStrategy](/tfx/api_docs/python/tfx/v1/dsl/experimental/LatestArtifactStrategy) - - [LatestBlessedModelStrategy](/tfx/api_docs/python/tfx/v1/dsl/experimental/LatestBlessedModelStrategy) - - [SpanRangeStrategy](/tfx/api_docs/python/tfx/v1/dsl/experimental/SpanRangeStrategy) + - [LatestArtifactStrategy][tfx.v1.dsl.experimental.LatestArtifactStrategy] + - [LatestBlessedModelStrategy][tfx.v1.dsl.experimental.LatestBlessedModelStrategy] + - [SpanRangeStrategy][tfx.v1.dsl.experimental.SpanRangeStrategy] A resolver strategy defines a type behavior used for input selection. A resolver strategy subclass must override the `resolve_artifacts()` function @@ -81,7 +81,7 @@ def resolve_artifacts( Returns: If all entries has enough data after the resolving, returns the resolved - input_dict. Otherise, return None. + input_dict. Otherwise, return None.
""" @@ -193,27 +193,31 @@ class Resolver(base_node.BaseNode): To use Resolver, pass the followings to the Resolver constructor: * Name of the Resolver instance - * A subclass of ResolverStrategy - * Configs that will be used to construct an instance of ResolverStrategy + * A subclass of [ResolverStrategy][tfx.v1.dsl.experimental.ResolverStrategy] + * Configs that will be used to construct an instance of [ResolverStrategy][tfx.v1.dsl.experimental.ResolverStrategy] * Channels to resolve with their tag, in the form of kwargs Here is an example: - ``` + ``` {.python .no-copy} example_gen = ImportExampleGen(...) examples_resolver = Resolver( - strategy_class=tfx.dsl.experimental.SpanRangeStrategy, - config={'range_config': range_config}, - examples=Channel(type=Examples, producer_component_id=example_gen.id) - ).with_id('Resolver.span_resolver') + strategy_class=tfx.dsl.experimental.SpanRangeStrategy, + config={"range_config": range_config}, + examples=Channel( + type=Examples, + producer_component_id=example_gen.id, + ), + ).with_id("Resolver.span_resolver") trainer = Trainer( - examples=examples_resolver.outputs['examples'], - ...) + examples=examples_resolver.outputs["examples"], + ..., + ) ``` - You can find experimental `ResolverStrategy` classes under - `tfx.v1.dsl.experimental` module, including `LatestArtifactStrategy`, - `LatestBlessedModelStrategy`, `SpanRangeStrategy`, etc. + You can find experimental [`ResolverStrategy`][tfx.v1.dsl.experimental.ResolverStrategy] classes under + [`tfx.v1.dsl.experimental`][tfx.v1.dsl.experimental] module, including [`LatestArtifactStrategy`][tfx.v1.dsl.experimental.LatestArtifactStrategy], + [`LatestBlessedModelStrategy`][tfx.v1.dsl.experimental.LatestBlessedModelStrategy], [`SpanRangeStrategy`][tfx.v1.dsl.experimental.SpanRangeStrategy], etc.
""" def __init__(self, diff --git a/tfx/dsl/experimental/conditionals/conditional.py b/tfx/dsl/experimental/conditionals/conditional.py index e2a7aa6ede..1a05f4464a 100644 --- a/tfx/dsl/experimental/conditionals/conditional.py +++ b/tfx/dsl/experimental/conditionals/conditional.py @@ -55,16 +55,18 @@ class Cond(dsl_context_manager.DslContextManager[None]): Usage: - evaluator = Evaluator( - examples=example_gen.outputs['examples'], - model=trainer.outputs['model'], - eval_config=EvalConfig(...)) + ``` python + evaluator = Evaluator( + examples=example_gen.outputs["examples"], + model=trainer.outputs["model"], + eval_config=EvalConfig(...), + ) - with Cond(evaluator.outputs['blessing'].future() - .custom_property('blessed') == 1): + with Cond(evaluator.outputs["blessing"].future().custom_property("blessed") == 1): pusher = Pusher( - model=trainer.outputs['model'], - push_destination=PushDestination(...)) + model=trainer.outputs["model"], push_destination=PushDestination(...) + ) + ``` """ def __init__(self, predicate: placeholder.Predicate): diff --git a/tfx/dsl/input_resolution/strategies/latest_artifact_strategy.py b/tfx/dsl/input_resolution/strategies/latest_artifact_strategy.py index e836e88719..54bea2ce5e 100644 --- a/tfx/dsl/input_resolution/strategies/latest_artifact_strategy.py +++ b/tfx/dsl/input_resolution/strategies/latest_artifact_strategy.py @@ -25,16 +25,16 @@ class LatestArtifactStrategy(resolver.ResolverStrategy): """Strategy that resolves the latest n(=1) artifacts per each channel. - Note that this ResolverStrategy is experimental and is subject to change in - terms of both interface and implementation. + Note that this [ResolverStrategy][tfx.v1.dsl.experimental.ResolverStrategy] is experimental and is subject to change in terms of both interface and implementation. 
Don't construct LatestArtifactStrategy directly, example usage: - ``` - model_resolver = Resolver( - strategy_class=LatestArtifactStrategy, - model=Channel(type=Model), - ).with_id('latest_model_resolver') - model_resolver.outputs['model'] + ``` python + model_resolver.outputs['model'] + model_resolver = Resolver( + strategy_class=LatestArtifactStrategy, + model=Channel(type=Model), + ).with_id("latest_model_resolver") + model_resolver.outputs["model"] ``` """ @@ -63,7 +63,7 @@ def resolve_artifacts( Returns: If `min_count` for every input is met, returns a - Dict[str, List[Artifact]]. Otherwise, return None. + Dict[str, List[Artifact]]. Otherwise, return None. """ resolved_dict = self._resolve(input_dict) all_min_count_met = all( diff --git a/tfx/dsl/input_resolution/strategies/latest_blessed_model_strategy.py b/tfx/dsl/input_resolution/strategies/latest_blessed_model_strategy.py index 109d879f6b..2fee07ac73 100644 --- a/tfx/dsl/input_resolution/strategies/latest_blessed_model_strategy.py +++ b/tfx/dsl/input_resolution/strategies/latest_blessed_model_strategy.py @@ -35,17 +35,17 @@ class LatestBlessedModelStrategy(resolver.ResolverStrategy): """LatestBlessedModelStrategy resolves the latest blessed Model artifact. - Note that this ResolverStrategy is experimental and is subject to change in - terms of both interface and implementation. + Note that this [ResolverStrategy][tfx.v1.dsl.experimental.ResolverStrategy] is experimental and is subject to change in terms of both interface and implementation. 
Don't construct LatestBlessedModelStrategy directly, example usage: - ``` - model_resolver = Resolver( - strategy_class=LatestBlessedModelStrategy, - model=Channel(type=Model), - model_blessing=Channel(type=ModelBlessing), - ).with_id('latest_blessed_model_resolver') - model_resolver.outputs['model'] + ``` python + model_resolver.outputs['model'] + model_resolver = Resolver( + strategy_class=LatestBlessedModelStrategy, + model=Channel(type=Model), + model_blessing=Channel(type=ModelBlessing), + ).with_id("latest_blessed_model_resolver") + model_resolver.outputs["model"] ``` """ @@ -85,8 +85,8 @@ def resolve_artifacts( input_dict: The input_dict to resolve from. Returns: - The latest blessed Model and its corresponding ModelBlessing, respectively - in the same input channel they were contained to. + The latest blessed Model and its corresponding [ModelBlessing][tfx.v1.types.standard_artifacts.ModelBlessing], respectively + in the same input channel they were contained to. Raises: RuntimeError: if input_dict contains unsupported artifact types. diff --git a/tfx/dsl/input_resolution/strategies/span_range_strategy.py b/tfx/dsl/input_resolution/strategies/span_range_strategy.py index 6e74a7d531..aa607776d0 100644 --- a/tfx/dsl/input_resolution/strategies/span_range_strategy.py +++ b/tfx/dsl/input_resolution/strategies/span_range_strategy.py @@ -40,17 +40,16 @@ def _get_span_custom_property(artifact: types.Artifact) -> int: class SpanRangeStrategy(resolver.ResolverStrategy): """SpanRangeStrategy resolves artifacts based on "span" property. - Note that this ResolverStrategy is experimental and is subject to change in - terms of both interface and implementation. + Note that this [ResolverStrategy][tfx.v1.dsl.experimental.ResolverStrategy] is experimental and is subject to change in terms of both interface and implementation. 
Don't construct SpanRangeStrategy directly, example usage: - ``` - examples_resolver = Resolver( - strategy_class=SpanRangeStrategy, - config={'range_config': range_config}, - examples=Channel(type=Examples, producer_component_id=example_gen.id), - ).with_id('span_resolver') - examples_resolver.outputs['examples'] + ``` python + examples_resolver = Resolver( + strategy_class=SpanRangeStrategy, + config={"range_config": range_config}, + examples=Channel(type=Examples, producer_component_id=example_gen.id), + ).with_id("span_resolver") + examples_resolver.outputs["examples"] ``` """ diff --git a/tfx/dsl/placeholder/artifact_placeholder.py b/tfx/dsl/placeholder/artifact_placeholder.py index 2acb4000fe..9ab75d205e 100644 --- a/tfx/dsl/placeholder/artifact_placeholder.py +++ b/tfx/dsl/placeholder/artifact_placeholder.py @@ -31,21 +31,22 @@ def input(key: str) -> ArtifactPlaceholder: # pylint: disable=redefined-builtin Returns: A Placeholder that supports + 1. Rendering the whole MLMD artifact proto as text_format. - Example: input('model') - 2. Accessing a specific index using [index], if multiple artifacts are + Example: `#!python input('model')` + 2. Accessing a specific index using `#!python [index]`, if multiple artifacts are associated with the given key. If not specified, default to the first artifact. - Example: input('model')[0] + Example: `#!python input('model')[0]` 3. Getting the URI of an artifact through .uri property. - Example: input('model').uri or input('model')[0].uri + Example: `#!python input('model').uri or input('model')[0].uri` 4. Getting the URI of a specific split of an artifact using - .split_uri(split_name) method. - Example: input('examples')[0].split_uri('train') + `#!python .split_uri(split_name)` method. + Example: `#!python input('examples')[0].split_uri('train')` 5. Getting the value of a primitive artifact through .value property. - Example: input('primitive').value + Example: `#!python input('primitive').value` 6. 
Concatenating with other placeholders or strings. - Example: input('model').uri + '/model/' + exec_property('version') + Example: `#!python input('model').uri + '/model/' + exec_property('version')` """ return ArtifactPlaceholder(key, is_input=True) @@ -60,21 +61,22 @@ def output(key: str) -> ArtifactPlaceholder: Returns: A Placeholder that supports + 1. Rendering the whole artifact as text_format. - Example: output('model') + Example: `#!python output('model')` 2. Accessing a specific index using [index], if multiple artifacts are associated with the given key. If not specified, default to the first artifact. - Example: output('model')[0] + Example: `#!python output('model')[0]` 3. Getting the URI of an artifact through .uri property. - Example: output('model').uri or output('model')[0].uri + Example: `#!python output('model').uri or output('model')[0].uri` 4. Getting the URI of a specific split of an artifact using - .split_uri(split_name) method. - Example: output('examples')[0].split_uri('train') + `#!python .split_uri(split_name)` method. + Example: `#!python output('examples')[0].split_uri('train')` 5. Getting the value of a primitive artifact through .value property. - Example: output('primitive').value + Example: `#!python output('primitive').value` 6. Concatenating with other placeholders or strings. - Example: output('model').uri + '/model/' + exec_property('version') + Example: `#!python output('model').uri + '/model/' + exec_property('version')` """ return ArtifactPlaceholder(key, is_input=False) diff --git a/tfx/dsl/placeholder/runtime_placeholders.py b/tfx/dsl/placeholder/runtime_placeholders.py index b2b364a7d6..d235ae6c32 100644 --- a/tfx/dsl/placeholder/runtime_placeholders.py +++ b/tfx/dsl/placeholder/runtime_placeholders.py @@ -32,15 +32,16 @@ def exec_property(key: str) -> ExecPropertyPlaceholder: Returns: A Placeholder that supports + 1. Rendering the value of an execution property at a given key. 
- Example: exec_property('version') + Example: `#!python exec_property('version')` 2. Rendering the whole proto or a proto field of an execution property, if the value is a proto type. The (possibly nested) proto field in a placeholder can be accessed as if accessing a proto field in Python. - Example: exec_property('model_config').num_layers + Example: `#!python exec_property('model_config').num_layers` 3. Concatenating with other placeholders or strings. - Example: output('model').uri + '/model/' + exec_property('version') + Example: `#!python output('model').uri + '/model/' + exec_property('version')` """ return ExecPropertyPlaceholder(key) @@ -56,10 +57,10 @@ def runtime_info(key: RuntimeInfoKeys) -> RuntimeInfoPlaceholder: """Returns a Placeholder that contains runtime information for component. Currently the runtime info includes following keys: - 1. executor_spec: The executor spec proto. - 2. platform_config: A proto that contains platform-specific information for + 1. `executor_spec`: The executor spec proto. + 2. `platform_config`: A proto that contains platform-specific information for the current pipeline node. - 3. pipeline_platform_config: A proto that contains platform-specific + 3. `pipeline_platform_config`: A proto that contains platform-specific information for the pipeline as a whole. @@ -68,8 +69,8 @@ def runtime_info(key: RuntimeInfoKeys) -> RuntimeInfoPlaceholder: Returns: A Placeholder that will render to the information associated with the key. - If the placeholder is proto-valued. Accessing a proto field can be - represented as if accessing a proto field in Python. + If the placeholder is proto-valued. Accessing a proto field can be + represented as if accessing a proto field in Python. Raises: ValueError: If received unsupported key. @@ -82,11 +83,11 @@ def execution_invocation() -> ExecInvocationPlaceholder: Returns: A Placeholder that will render to the ExecutionInvocation proto. 
- Accessing a proto field is the same as if accessing a proto field in Python. + Accessing a proto field is the same as if accessing a proto field in Python. - Prefer to use input(key)/output(key)/exec_property(key) functions instead of - input_dict/output_dict/execution_properties field from ExecutionInvocation - proto. + Prefer to use input(key)/output(key)/exec_property(key) functions instead of + input_dict/output_dict/execution_properties field from ExecutionInvocation + proto. """ return ExecInvocationPlaceholder() @@ -99,6 +100,7 @@ def environment_variable(key: str) -> EnvironmentVariablePlaceholder: Returns: A Placeholder that supports + 1. Rendering the value of an environment variable for a given key. Example: environment_variable('FOO') 2. Concatenating with other placeholders or strings. diff --git a/tfx/extensions/google_cloud_ai_platform/bulk_inferrer/component.py b/tfx/extensions/google_cloud_ai_platform/bulk_inferrer/component.py index 4333fdcf7e..029f2c1b6e 100644 --- a/tfx/extensions/google_cloud_ai_platform/bulk_inferrer/component.py +++ b/tfx/extensions/google_cloud_ai_platform/bulk_inferrer/component.py @@ -69,9 +69,10 @@ class CloudAIBulkInferrerComponent(base_component.BaseComponent): TODO(b/155325467): Creates a end-to-end test for this component. Component `outputs` contains: - - `inference_result`: Channel of type `standard_artifacts.InferenceResult` + + - `inference_result`: Channel of type [`standard_artifacts.InferenceResult`][tfx.v1.types.standard_artifacts.InferenceResult] to store the inference results. - - `output_examples`: Channel of type `standard_artifacts.Examples` + - `output_examples`: Channel of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples] to store the output examples. """ @@ -91,11 +92,11 @@ def __init__( """Construct an BulkInferrer component. 
Args: - examples: A Channel of type `standard_artifacts.Examples`, usually + examples: A Channel of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples], usually produced by an ExampleGen component. _required_ - model: A Channel of type `standard_artifacts.Model`, usually produced by + model: A Channel of type [`standard_artifacts.Model`][tfx.v1.types.standard_artifacts.Model], usually produced by a Trainer component. - model_blessing: A Channel of type `standard_artifacts.ModelBlessing`, + model_blessing: A Channel of type [`standard_artifacts.ModelBlessing`][tfx.v1.types.standard_artifacts.ModelBlessing], usually produced by a ModelValidator component. data_spec: bulk_inferrer_pb2.DataSpec instance that describes data selection. @@ -105,7 +106,7 @@ def __init__( passed to Google Cloud AI Platform. custom_config.ai_platform_serving_args need to contain the serving job parameters. For the full set of parameters, refer to - https://cloud.google.com/ml-engine/reference/rest/v1/projects.models + [https://cloud.google.com/ml-engine/reference/rest/v1/projects.models](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models) Raises: ValueError: Must not specify inference_result or output_examples depends diff --git a/tfx/extensions/google_cloud_ai_platform/pusher/component.py b/tfx/extensions/google_cloud_ai_platform/pusher/component.py index a1ebf95bf9..be4afcdfa9 100644 --- a/tfx/extensions/google_cloud_ai_platform/pusher/component.py +++ b/tfx/extensions/google_cloud_ai_platform/pusher/component.py @@ -34,15 +34,15 @@ def __init__(self, """Construct a Pusher component. Args: - model: An optional Channel of type `standard_artifacts.Model`, usually - produced by a Trainer component, representing the model used for + model: An optional Channel of type [`standard_artifacts.Model`][tfx.v1.types.standard_artifacts.Model], usually + produced by a [Trainer][tfx.v1.components.Trainer] component, representing the model used for training. 
model_blessing: An optional Channel of type - `standard_artifacts.ModelBlessing`, usually produced from an Evaluator + [`standard_artifacts.ModelBlessing`][tfx.v1.types.standard_artifacts.ModelBlessing], usually produced from an [Evaluator][tfx.v1.components.Evaluator] component, containing the blessing model. infra_blessing: An optional Channel of type - `standard_artifacts.InfraBlessing`, usually produced from an - InfraValidator component, containing the validation result. + [`standard_artifacts.InfraBlessing`][tfx.v1.types.standard_artifacts.InfraBlessing], usually produced from an + [InfraValidator][tfx.v1.components.InfraValidator] component, containing the validation result. custom_config: A dict which contains the deployment job parameters to be passed to Cloud platforms. """ diff --git a/tfx/extensions/google_cloud_ai_platform/trainer/component.py b/tfx/extensions/google_cloud_ai_platform/trainer/component.py index b6a8b93ecb..49eab5512e 100644 --- a/tfx/extensions/google_cloud_ai_platform/trainer/component.py +++ b/tfx/extensions/google_cloud_ai_platform/trainer/component.py @@ -47,37 +47,46 @@ def __init__(self, """Construct a Trainer component. Args: - examples: A Channel of type `standard_artifacts.Examples`, serving as the + examples: A Channel of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples], serving as the source of examples used in training (required). May be raw or transformed. transformed_examples: Deprecated field. Please set `examples` instead. transform_graph: An optional Channel of type - `standard_artifacts.TransformGraph`, serving as the input transform + [`standard_artifacts.TransformGraph`][tfx.v1.types.standard_artifacts.TransformGraph], serving as the input transform graph if present. 
- schema: An optional Channel of type `standard_artifacts.Schema`, serving + schema: An optional Channel of type [`standard_artifacts.Schema`][tfx.v1.types.standard_artifacts.Schema], serving as the schema of training and eval data. Schema is optional when 1) transform_graph is provided which contains schema. 2) user module bypasses the usage of schema, e.g., hardcoded. - base_model: A Channel of type `Model`, containing model that will be used + base_model: A Channel of type [`Model`][tfx.v1.types.standard_artifacts.Model], containing model that will be used for training. This can be used for warmstart, transfer learning or model ensembling. - hyperparameters: A Channel of type `standard_artifacts.HyperParameters`, + hyperparameters: A Channel of type [`standard_artifacts.HyperParameters`][tfx.v1.types.standard_artifacts.HyperParameters], serving as the hyperparameters for training module. Tuner's output best hyperparameters can be feed into this. module_file: A path to python module file containing UDF model definition. The module_file must implement a function named `run_fn` at its top - level with function signature: `def - run_fn(trainer.fn_args_utils.FnArgs)`, and the trained model must be - saved to FnArgs.serving_model_dir when this function is executed. For - Estimator based Executor, The module_file must implement a function - named `trainer_fn` at its top level. The function must have the - following signature. def trainer_fn(trainer.fn_args_utils.FnArgs, - tensorflow_metadata.proto.v0.schema_pb2) -> Dict: ... - where the returned Dict has the following key-values. - 'estimator': an instance of tf.estimator.Estimator - 'train_spec': an instance of tf.estimator.TrainSpec - 'eval_spec': an instance of tf.estimator.EvalSpec - 'eval_input_receiver_fn': an instance of tfma EvalInputReceiver. + level with function signature: + ```python + def run_fn(trainer.fn_args_utils.FnArgs): ... 
+ ``` + and the trained model must be + saved to FnArgs.serving_model_dir when this function is executed. For + Estimator based Executor, The module_file must implement a function + named `trainer_fn` at its top level. The function must have the + following signature. + ```python + def trainer_fn( + trainer.fn_args_utils.FnArgs, + tensorflow_metadata.proto.v0.schema_pb2 + ) -> Dict: ... + ``` + where the returned Dict has the following key-values. + + - `estimator`: an instance of tf.estimator.Estimator + - `train_spec`: an instance of tf.estimator.TrainSpec + - `eval_spec`: an instance of tf.estimator.EvalSpec + - `eval_input_receiver_fn`: an instance of tfma EvalInputReceiver. run_fn: A python path to UDF model definition function for generic trainer. See 'module_file' for details. Exactly one of 'module_file' or 'run_fn' must be supplied if Trainer uses GenericExecutor (default). diff --git a/tfx/extensions/google_cloud_big_query/example_gen/component.py b/tfx/extensions/google_cloud_big_query/example_gen/component.py index db9dd63228..a8567e6374 100644 --- a/tfx/extensions/google_cloud_big_query/example_gen/component.py +++ b/tfx/extensions/google_cloud_big_query/example_gen/component.py @@ -32,7 +32,8 @@ class BigQueryExampleGen(component.QueryBasedExampleGen): and eval examples for downstream components. Component `outputs` contains: - - `examples`: Channel of type `standard_artifacts.Examples` for output train + + - `examples`: Channel of type [`standard_artifacts.Examples`][tfx.v1.types.standard_artifacts.Examples] for output train and eval examples. """ diff --git a/tfx/extensions/google_cloud_big_query/pusher/component.py b/tfx/extensions/google_cloud_big_query/pusher/component.py index 3bd2551dd1..0728d20cd5 100644 --- a/tfx/extensions/google_cloud_big_query/pusher/component.py +++ b/tfx/extensions/google_cloud_big_query/pusher/component.py @@ -25,6 +25,7 @@ class Pusher(pusher_component.Pusher): """Cloud Big Query Pusher component. 
Component `outputs` contains: + - `pushed_model`: Channel of type `standard_artifacts.PushedModel` with result of push. """ @@ -39,14 +40,14 @@ def __init__(self, """Construct a Pusher component. Args: - model: An optional Channel of type `standard_artifacts.Model`, usually - produced by a Trainer component. + model: An optional Channel of type [`standard_artifacts.Model`][tfx.v1.types.standard_artifacts.Model], usually + produced by a [Trainer][tfx.v1.components.Trainer] component. model_blessing: An optional Channel of type - `standard_artifacts.ModelBlessing`, usually produced from an Evaluator + [`standard_artifacts.ModelBlessing`][tfx.v1.types.standard_artifacts.ModelBlessing], usually produced from an Evaluator component. infra_blessing: An optional Channel of type - `standard_artifacts.InfraBlessing`, usually produced from an - InfraValidator component. + [`standard_artifacts.InfraBlessing`][tfx.v1.types.standard_artifacts.InfraBlessing], usually produced from an + [InfraValidator][tfx.v1.components.InfraValidator] component. custom_config: A dict which contains the deployment job parameters to be passed to Cloud platforms. """ diff --git a/tfx/orchestration/kubeflow/decorators.py b/tfx/orchestration/kubeflow/decorators.py index 03eb99ff7f..65866872cf 100644 --- a/tfx/orchestration/kubeflow/decorators.py +++ b/tfx/orchestration/kubeflow/decorators.py @@ -31,36 +31,40 @@ def exit_handler(func: types.FunctionType) -> Callable[..., Any]: pipeline, parameter should be defined as Parameter[str], passing in FinalStatusStr type when initializing the component. 
- This is example usage of component definition using this decorator: - ``` - from tfx import v1 as tfx - - @tfx.orchestration.experimental.exit_handler - def MyExitHandlerComponent(final_status: tfx.dsl.components.Parameter[str]): - # parse the final status - pipeline_task_status = pipeline_pb2.PipelineTaskFinalStatus() - proto_utils.json_to_proto(final_status, pipeline_task_status) - print(pipeline_task_status) - ``` - - Example usage in a Vertex AI graph definition: - ``` - exit_handler = exit_handler_component( - final_status=tfx.dsl.experimental.FinalStatusStr()) - - dsl_pipeline = tfx.dsl.Pipeline(...) - - runner = tfx.orchestration.experimental.KubeflowV2DagRunner(...) - runner.set_exit_handler([exit_handler]) - runner.run(pipeline=dsl_pipeline) - ``` + !!! example + This is example usage of component definition using this decorator: + ``` python + from tfx import v1 as tfx + + + @tfx.orchestration.experimental.exit_handler + def MyExitHandlerComponent(final_status: tfx.dsl.components.Parameter[str]): + # parse the final status + pipeline_task_status = pipeline_pb2.PipelineTaskFinalStatus() + proto_utils.json_to_proto(final_status, pipeline_task_status) + print(pipeline_task_status) + ``` + + !!! example + Example usage in a Vertex AI graph definition: + ```python + exit_handler = exit_handler_component( + final_status=tfx.dsl.experimental.FinalStatusStr() + ) + + dsl_pipeline = tfx.dsl.Pipeline(...) + + runner = tfx.orchestration.experimental.KubeflowV2DagRunner(...) + runner.set_exit_handler([exit_handler]) + runner.run(pipeline=dsl_pipeline) + ``` Experimental: no backwards compatibility guarantees. Args: func: Typehint-annotated component executor function. Returns: - `base_component.BaseComponent` subclass for the given component executor + [`base_component.BaseComponent`][tfx.v1.types.BaseComponent] subclass for the given component executor function. 
""" return component(func) @@ -70,13 +74,15 @@ class FinalStatusStr(str): """FinalStatusStr: is the type for parameter receiving PipelineTaskFinalStatus. Vertex AI backend passes in jsonlized string of - kfp.pipeline_spec.pipeline_spec_pb2.PipelineTaskFinalStatus. + `#!python kfp.pipeline_spec.pipeline_spec_pb2.PipelineTaskFinalStatus`. - This is example usage of FinalStatusStr definition: - ``` - exit_handler = exit_handler_component( - final_status=tfx.dsl.experimental.FinalStatusStr()) - ``` + !!! example + This is example usage of FinalStatusStr definition: + ``` python + exit_handler = exit_handler_component( + final_status=tfx.dsl.experimental.FinalStatusStr() + ) + ``` """ pass diff --git a/tfx/orchestration/pipeline.py b/tfx/orchestration/pipeline.py index b2622eda97..dd8e4984a1 100644 --- a/tfx/orchestration/pipeline.py +++ b/tfx/orchestration/pipeline.py @@ -233,7 +233,7 @@ class Pipeline(base_node.BaseNode): Pipeline object represents the DAG of TFX components, which can be run using one of the pipeline orchestration systems that TFX supports. For details, please refer to the - [guide](https://github.com/tensorflow/tfx/blob/master/docs/guide/build_tfx_pipeline.md). + [guide](../../../guide/build_tfx_pipeline). Attributes: components: A deterministic list of logical components of this pipeline, diff --git a/tfx/types/artifact.py b/tfx/types/artifact.py index 7d283b07c7..8f8fcc3131 100644 --- a/tfx/types/artifact.py +++ b/tfx/types/artifact.py @@ -113,8 +113,8 @@ class Artifact(json_utils.Jsonable): """TFX artifact used for orchestration. This is used for type-checking and inter-component communication. Currently, - it wraps a tuple of (ml_metadata.proto.Artifact, - ml_metadata.proto.ArtifactType) with additional property accessors for + it wraps a tuple of (`#!python ml_metadata.proto.Artifact`, + `#!python ml_metadata.proto.ArtifactType`) with additional property accessors for internal state. 
A user may create a subclass of Artifact and override the TYPE_NAME property @@ -124,8 +124,9 @@ class Artifact(json_utils.Jsonable): A user may specify artifact type-specific properties for an Artifact subclass by overriding the PROPERTIES dictionary, as detailed below. - Note: the behavior of this class is experimental, without backwards - compatibility guarantees, and may change in upcoming releases. + !!! Note + The behavior of this class is experimental, without backwards + compatibility guarantees, and may change in upcoming releases. """ # String artifact type name used to identify the type in ML Metadata diff --git a/tfx/types/channel.py b/tfx/types/channel.py index c972d221d0..a00c4c3bbc 100644 --- a/tfx/types/channel.py +++ b/tfx/types/channel.py @@ -90,27 +90,29 @@ class TriggerByProperty: class BaseChannel(abc.ABC, Generic[_AT]): """An abstraction for component (BaseNode) artifact inputs. - `BaseChannel` is often interchangeably used with the term 'channel' (not - capital `Channel` which points to the legacy class name). + [`BaseChannel`][tfx.v1.types.BaseChannel] is often interchangeably used with the term 'channel' (not + capital [`Channel`][tfx.v1.dsl.Channel] which points to the legacy class name). Component takes artifact inputs distinguished by each "input key". For example: - trainer = Trainer( - examples=example_gen.outputs['examples']) - ^^^^^^^^ - input key - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - channel + ``` python + trainer = Trainer( + examples=example_gen.outputs['examples'], + ) # ^^^^^^^^ + # input key + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # channel + ``` Here "examples" is the input key of the `Examples` artifact type. - `example_gen.outputs['examples']` is a channel. Typically a single channel - refers to a *list of `Artifact` of a homogeneous type*. Since channel is a + `#!python example_gen.outputs["examples"]` is a channel. Typically a single channel + refers to a *list of [`Artifact`][tfx.v1.dsl.Artifact] of a homogeneous type*. 
Since channel is a declarative abstraction it is not strictly bound to the actual artifact, but is more of an *input selector*. The most commonly used channel type is an `OutputChannel` (in the form of - `component.outputs["key"]`, which selects the artifact produced by the + `#!python component.outputs["key"]`, which selects the artifact produced by the component in the same pipeline run (in synchronous execution mode; more information on OutputChannel docstring), and is typically a single artifact. @@ -217,12 +219,12 @@ def __hash__(self): class Channel(json_utils.Jsonable, BaseChannel): """Legacy channel interface. - `Channel` used to represent the `BaseChannel` concept in the early TFX code, + [`Channel`][tfx.v1.dsl.Channel] used to represent the [`BaseChannel`][tfx.v1.types.BaseChannel] concept in the early TFX code, but due to having too much features in the same class, we refactored it to multiple classes: - BaseChannel for the general input abstraction - - OutputChannel for `component.outputs['key']`. + - OutputChannel for `#!python component.outputs['key']`. - MLMDQueryChannel for simple filter-based input resolution. Please do not use this class directly, but instead use the alternatives. This @@ -732,7 +734,7 @@ def __init__( """Initialization of ExternalPipelineChannel. Args: - artifact_type: Subclass of Artifact for this channel. + artifact_type: Subclass of [Artifact][tfx.v1.dsl.Artifact] for this channel. owner: Owner of the pipeline. pipeline_name: Name of the pipeline the artifacts belong to. producer_component_id: Id of the component produces the artifacts. @@ -780,11 +782,14 @@ class ChannelWrappedPlaceholder(artifact_placeholder.ArtifactPlaceholder): yet reference its name/key wrt. the downstream component in which it is used. So a ChannelWrappedPlaceholder simply remembers the original Channel instance that was used. 
The Placeholder expression tree built from this wrapper is then - passed to the component that uses it, and encode_placeholder_with_channels() + passed to the component that uses it, and `encode_placeholder_with_channels()` is used to inject the key only later, when encoding the Placeholder. For instance, this allows making Predicates using syntax like: - channel.future().value > 5 + + ``` python + channel.future().value > 5 + ``` """ def __init__( @@ -803,8 +808,8 @@ def set_key(self, key: Optional[str]): setter technically violates this guarantee, but we control the effects of it by _only_ calling the setter right before an `encode()` operation on this placeholder or a larger placeholder that contains it, and then calling - set_key(None) right after. encode_placeholder_with_channels() demonstrates - how to do this correctly and should be the preferred way to call set_key(). + `#!python set_key(None)` right after. `#!python encode_placeholder_with_channels()` demonstrates + how to do this correctly and should be the preferred way to call `#!python set_key()`. Args: key: The new key for the channel. diff --git a/tfx/types/standard_artifacts.py b/tfx/types/standard_artifacts.py index b67a5978b3..981309badf 100644 --- a/tfx/types/standard_artifacts.py +++ b/tfx/types/standard_artifacts.py @@ -84,13 +84,13 @@ class Examples(_TfxArtifact): The file and payload format must be specified as optional custom properties if not using default formats. Please see - https://www.tensorflow.org/tfx/guide/examplegen#span_version_and_split to + [the `ExampleGen` guide](../../../guide/examplegen#span-version-and-split) to understand about span, version and splits. * Properties: - `span`: Integer to distinguish group of Examples. - `version`: Integer to represent updated data. - - `splits`: A list of split names. For example, ["train", "test"]. + - `splits`: A list of split names. For example, `#!python ["train", "test"]`. 
* File structure: - `{uri}/` @@ -101,10 +101,10 @@ class Examples(_TfxArtifact): * Commonly used custom properties of the Examples artifact: - `file_format`: a string that represents the file format. See - tfx/components/util/tfxio_utils.py:make_tfxio for + [tfx/components/util/tfxio_utils.py](https://github.com/tensorflow/tfx/blob/v1.15.1/tfx/components/util/tfxio_utils.py):make_tfxio for available values. - `payload_format`: int (enum) value of the data payload format. - See tfx/proto/example_gen.proto:PayloadFormat for available formats. + See [tfx/proto/example_gen.proto](https://github.com/tensorflow/tfx/blob/v1.15.1/tfx/proto/example_gen.proto):PayloadFormat for available formats. """ TYPE_NAME = "Examples" TYPE_ANNOTATION = Dataset @@ -299,7 +299,7 @@ class Schema(_TfxArtifact): Schema artifact is used to store the schema of the data. The schema is a proto that describes the data, including the type of each feature, the range of values for each feature, and other - properties. The schema is usually generated by the SchemaGen component, which + properties. The schema is usually generated by the [SchemaGen][tfx.v1.components.SchemaGen] component, which uses the statistics of the data to infer the schema. The schema can be used by other components in the pipeline to validate the data and to generate models. diff --git a/tfx/types/value_artifact.py b/tfx/types/value_artifact.py index 3716e74014..6215695296 100644 --- a/tfx/types/value_artifact.py +++ b/tfx/types/value_artifact.py @@ -106,20 +106,19 @@ def encode(self, value) -> Any: def annotate_as(cls, type_annotation: Optional[Type[SystemArtifact]] = None): """Annotate the value artifact type with a system artifact class. - Example usage: + !!! 
example "Example usage" - ```python - from tfx import v1 as tfx - OutputArtifact = tfx.dsl.components.OutputArtifact - String = tfx.types.standard_artifacts.String - Model = tfx.dsl.standard_annotations.Model + ```python + from tfx import v1 as tfx - @tfx.dsl.components.component - def MyTrainer( - model: OutputArtifact[String.annotate_as(Model)] - ): - ... - ``` + OutputArtifact = tfx.dsl.components.OutputArtifact + String = tfx.types.standard_artifacts.String + Model = tfx.dsl.standard_annotations.Model + + + @tfx.dsl.components.component + def MyTrainer(model: OutputArtifact[String.annotate_as(Model)]): ... + ``` Args: type_annotation: the standard annotations used to annotate the value @@ -127,9 +126,9 @@ def MyTrainer( `tfx.v1.dsl.standard_annotations`. Returns: - A subclass of the method caller class (e.g., standard_artifacts.String, - standard_artifacts.Float) with TYPE_ANNOTATION attribute set to be - `type_annotation`; returns the original class if`type_annotation` is None. + A subclass of the method caller class (e.g., [`standard_artifacts.String`][tfx.v1.types.standard_artifacts.String], + [`standard_artifacts.Float`][tfx.v1.types.standard_artifacts.Float]) with TYPE_ANNOTATION attribute set to be + `type_annotation`; returns the original class if`type_annotation` is None. """ if not type_annotation: return cls diff --git a/tfx/v1/dsl/standard_annotations.py b/tfx/v1/dsl/standard_annotations.py index beb6c4de7f..36ace9ae18 100644 --- a/tfx/v1/dsl/standard_annotations.py +++ b/tfx/v1/dsl/standard_annotations.py @@ -13,21 +13,20 @@ # limitations under the License. """Public API for base type annotations.""" -from tfx.types import system_artifacts as _system_artifacts -from tfx.types import system_executions as _system_executions - # List of MLMD base artifact type annotations. 
-Dataset = _system_artifacts.Dataset -Model = _system_artifacts.Model -Statistics = _system_artifacts.Statistics -Metrics = _system_artifacts.Metrics +from tfx.types.system_artifacts import Dataset, Model, Statistics, Metrics # List of MLMD base execution type annotations. -Train = _system_executions.Train -Transform = _system_executions.Transform -Process = _system_executions.Process -Evaluate = _system_executions.Evaluate -Deploy = _system_executions.Deploy +from tfx.types.system_executions import Train, Transform, Process, Evaluate, Deploy -del _system_artifacts -del _system_executions +__all__ = [ + "Dataset", + "Deploy", + "Evaluate", + "Metrics", + "Model", + "Process", + "Statistics", + "Train", + "Transform", +] diff --git a/tfx/v1/orchestration/experimental/__init__.py b/tfx/v1/orchestration/experimental/__init__.py index 4f222b8371..df82230e4e 100644 --- a/tfx/v1/orchestration/experimental/__init__.py +++ b/tfx/v1/orchestration/experimental/__init__.py @@ -14,8 +14,10 @@ """TFX orchestration.experimental module.""" try: - from tfx.orchestration.kubeflow import ( - kubeflow_dag_runner, + from tfx.orchestration.kubeflow.kubeflow_dag_runner import ( + KubeflowDagRunner, + KubeflowDagRunnerConfig, + get_default_kubeflow_metadata_config, ) from tfx.orchestration.kubeflow.decorators import ( exit_handler, @@ -23,28 +25,16 @@ from tfx.orchestration.kubeflow.decorators import ( FinalStatusStr, ) - from tfx.utils import telemetry_utils + from tfx.utils.telemetry_utils import LABEL_KFP_SDK_ENV - KubeflowDagRunner = kubeflow_dag_runner.KubeflowDagRunner - KubeflowDagRunnerConfig = kubeflow_dag_runner.KubeflowDagRunnerConfig - get_default_kubeflow_metadata_config = ( - kubeflow_dag_runner.get_default_kubeflow_metadata_config - ) - LABEL_KFP_SDK_ENV = telemetry_utils.LABEL_KFP_SDK_ENV - - del telemetry_utils - del kubeflow_dag_runner except ImportError: # Import will fail without kfp package. 
pass try: - from tfx.orchestration.kubeflow.v2 import ( - kubeflow_v2_dag_runner, + from tfx.orchestration.kubeflow.v2.kubeflow_v2_dag_runner import ( + KubeflowV2DagRunner, + KubeflowV2DagRunnerConfig, ) - - KubeflowV2DagRunner = kubeflow_v2_dag_runner.KubeflowV2DagRunner - KubeflowV2DagRunnerConfig = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig - del kubeflow_v2_dag_runner except ImportError: # Import will fail without kfp package. pass diff --git a/tfx/v1/orchestration/metadata.py b/tfx/v1/orchestration/metadata.py index 2eaaa2f6d8..ccf7f4fab3 100644 --- a/tfx/v1/orchestration/metadata.py +++ b/tfx/v1/orchestration/metadata.py @@ -13,11 +13,11 @@ # limitations under the License. """Public API for metadata.""" -from tfx.orchestration import metadata - -ConnectionConfigType = metadata.ConnectionConfigType -mysql_metadata_connection_config = metadata.mysql_metadata_connection_config -sqlite_metadata_connection_config = metadata.sqlite_metadata_connection_config +from tfx.orchestration.metadata import ( + ConnectionConfigType, + mysql_metadata_connection_config, + sqlite_metadata_connection_config, +) __all__ = [ "mysql_metadata_connection_config", diff --git a/tfx/v1/types/standard_artifacts.py b/tfx/v1/types/standard_artifacts.py index 155ce36ac6..db6b4154b0 100644 --- a/tfx/v1/types/standard_artifacts.py +++ b/tfx/v1/types/standard_artifacts.py @@ -27,6 +27,7 @@ Schema, TransformCache, TransformGraph, + TunerResults, HyperParameters, ) @@ -61,4 +62,5 @@ "String", "TransformCache", "TransformGraph", + "TunerResults", ]