name: model_evaluation_forecasting
description: |
  Computes a google.ForecastingMetrics Artifact containing evaluation metrics from a model's prediction results.
  Creates a Dataflow job with Apache Beam and TFMA to compute evaluation metrics.
  Supports point forecasting and quantile forecasting for tabular data.
  Args:
    project (str):
      Project to run the evaluation container in.
    location (Optional[str]):
      Location for running the evaluation.
      If not set, defaults to `us-central1`.
    root_dir (system.Artifact):
      An artifact whose URI points to the GCS directory for keeping staging files.
      A random subdirectory will be created under the directory to keep job info for resuming
      the job in case of failure.
    predictions_format (Optional[str]):
      The file format for the batch prediction results. `jsonl` is currently the only allowed
      format.
      If not set, defaults to `jsonl`.
    predictions_gcs_source (Optional[system.Artifact]):
      An artifact with its URI pointing to a GCS directory with prediction or explanation
      files to be used for this evaluation.
      For prediction results, the files should be named "prediction.results-*".
      For explanation results, the files should be named "explanation.results-*".
    predictions_bigquery_source (Optional[google.BQTable]):
      BigQuery table with prediction or explanation data to be used for this evaluation.
      For prediction results, the table column should be named "predicted_*".
    ground_truth_format (Optional[str]):
      Required for custom tabular and non-tabular data.
      The file format for the ground truth files. `jsonl` is currently the only allowed format.
      If not set, defaults to `jsonl`.
    ground_truth_gcs_source (Optional[Sequence[str]]):
      Required for custom tabular and non-tabular data.
      The GCS URIs representing where the ground truth is located.
      Used to provide ground truth for each prediction instance when it is not part of the
      batch prediction job's prediction instances.
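      For example, a single JSONL pattern such as `["gs://example-bucket/ground_truth/*.jsonl"]`
      (an illustrative bucket and path, not values from this spec).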
    ground_truth_bigquery_source (Optional[str]):
      Required for custom tabular data.
      The BigQuery table URI representing where the ground truth is located.
      Used to provide ground truth for each prediction instance when it is not part of the
      batch prediction job's prediction instances.
    target_field_name (str):
      The full name path of the target field in the predictions file.
      Formatted to be able to find nested columns, delimited by `.`.
      Alternatively referred to as the ground truth (or ground_truth_column) field.
    model (Optional[google.VertexModel]):
      The Model used for the predictions job.
      Must share the same ancestor Location.
    prediction_score_column (Optional[str]):
      Optional. The column name of the field containing batch prediction scores.
      Formatted to be able to find nested columns, delimited by `.`.
      If not set, defaults to `prediction.value` for a `point` forecasting_type and
      `prediction.quantile_predictions` for a `quantile` forecasting_type.
    forecasting_type (Optional[str]):
      Optional. If the problem_type is `forecasting`, the forecasting type being addressed
      by this regression evaluation run. `point` and `quantile` are the supported types.
      If not set, defaults to `point`.
    forecasting_quantiles (Optional[Sequence[Float]]):
      Required for a `quantile` forecasting_type.
      The list of quantiles in the same order as they appear in the quantile prediction score
      column.
      If one of the quantiles is set to `0.5`, point evaluation will be set on that index.
    example_weight_column (Optional[str]):
      Optional. The column name of the field containing example weights.
    point_evaluation_quantile (Optional[Float]):
      Required for a `quantile` forecasting_type.
      A quantile in the list of forecasting_quantiles that will be used for point evaluation
      metrics.
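      For example, with forecasting_quantiles `[0.1, 0.5, 0.9]` and point_evaluation_quantile
      `0.5`, point metrics are computed from the second entry of the quantile prediction score
      column (illustrative quantiles; only `0.5` is a default in this spec).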
    dataflow_service_account (Optional[str]):
      Optional. Service account to run the Dataflow job.
      If not set, Dataflow will use the default worker service account.
      For more details, see https://cloud.google.com/dataflow/docs/concepts/security-and-permissions#default_worker_service_account
    dataflow_disk_size (Optional[int]):
      Optional. The disk size (in GB) of the machine executing the evaluation run.
      If not set, defaults to `50`.
    dataflow_machine_type (Optional[str]):
      Optional. The machine type executing the evaluation run.
      If not set, defaults to `n1-standard-4`.
    dataflow_workers_num (Optional[int]):
      Optional. The number of workers executing the evaluation run.
      If not set, defaults to `1`.
    dataflow_max_workers_num (Optional[int]):
      Optional. The max number of workers executing the evaluation run.
      If not set, defaults to `5`.
    dataflow_subnetwork (Optional[str]):
      Dataflow's fully qualified subnetwork name; when empty, the default subnetwork will be
      used. More details:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips (Optional[bool]):
      Specifies whether Dataflow workers use public IP addresses.
    encryption_spec_key_name (Optional[str]):
      Customer-managed encryption key.
  Returns:
    evaluation_metrics (google.ForecastingMetrics):
      google.ForecastingMetrics artifact representing the forecasting evaluation metrics in GCS.
inputs:
  - { name: project, type: String }
  - { name: location, type: String, default: "us-central1" }
  - { name: root_dir, type: system.Artifact }
  - { name: predictions_format, type: String, default: "jsonl" }
  - { name: predictions_gcs_source, type: Artifact, optional: True }
  - { name: predictions_bigquery_source, type: google.BQTable, optional: True }
  - { name: ground_truth_format, type: String, default: "jsonl" }
  - { name: ground_truth_gcs_source, type: JsonArray, default: "[]" }
  - { name: ground_truth_bigquery_source, type: String, default: "" }
  - { name: target_field_name, type: String }
  - { name: model, type: google.VertexModel, optional: True }
  - { name: prediction_score_column, type: String, default: "" }
  - { name: forecasting_type, type: String, default: "point" }
  - { name: forecasting_quantiles, type: JsonArray, default: "[0.5]" }
  - { name: example_weight_column, type: String, default: "" }
  - { name: point_evaluation_quantile, type: Float, default: 0.5 }
  - { name: dataflow_service_account, type: String, default: "" }
  - { name: dataflow_disk_size, type: Integer, default: 50 }
  - { name: dataflow_machine_type, type: String, default: "n1-standard-4" }
  - { name: dataflow_workers_num, type: Integer, default: 1 }
  - { name: dataflow_max_workers_num, type: Integer, default: 5 }
  - { name: dataflow_subnetwork, type: String, default: "" }
  - { name: dataflow_use_public_ips, type: Boolean, default: "true" }
  - { name: encryption_spec_key_name, type: String, default: "" }
outputs:
  - { name: evaluation_metrics, type: google.ForecastingMetrics }
  - { name: gcp_resources, type: String }
implementation:
  container:
    image: gcr.io/ml-pipeline/model-evaluation:v0.9
    command:
      - python
      - /main.py
    args:
      - --setup_file
      - /setup.py
      - --json_mode
      - "true"
      - --project_id
      - { inputValue: project }
      - --location
      - { inputValue: location }
      - --problem_type
      - "forecasting"
      - --forecasting_type
      - { inputValue: forecasting_type }
      - --forecasting_quantiles
      - { inputValue: forecasting_quantiles }
      - --point_evaluation_quantile
      - { inputValue: point_evaluation_quantile }
      - --batch_prediction_format
      - { inputValue: predictions_format }
      - if:
          cond: { isPresent: predictions_gcs_source }
          then:
            - --batch_prediction_gcs_source
            - "{{$.inputs.artifacts['predictions_gcs_source'].uri}}"
      - if:
          cond: { isPresent: predictions_bigquery_source }
          then:
            - --batch_prediction_bigquery_source
            - "bq://{{$.inputs.artifacts['predictions_bigquery_source'].metadata['projectId']}}.{{$.inputs.artifacts['predictions_bigquery_source'].metadata['datasetId']}}.{{$.inputs.artifacts['predictions_bigquery_source'].metadata['tableId']}}"
      - if:
          cond: { isPresent: model }
          then:
            - --model_name
            - "{{$.inputs.artifacts['model'].metadata['resourceName']}}"
      - --ground_truth_format
      - { inputValue: ground_truth_format }
      - --ground_truth_gcs_source
      - { inputValue: ground_truth_gcs_source }
      - --ground_truth_bigquery_source
      - { inputValue: ground_truth_bigquery_source }
      - --root_dir
      - "{{$.inputs.artifacts['root_dir'].uri}}"
      - --target_field_name
      - "instance.{{$.inputs.parameters['target_field_name']}}"
      - --prediction_score_column
      - { inputValue: prediction_score_column }
      - --dataflow_job_prefix
      - "evaluation-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}"
      - --dataflow_service_account
      - { inputValue: dataflow_service_account }
      - --dataflow_disk_size
      - { inputValue: dataflow_disk_size }
      - --dataflow_machine_type
      - { inputValue: dataflow_machine_type }
      - --dataflow_workers_num
      - { inputValue: dataflow_workers_num }
      - --dataflow_max_workers_num
      - { inputValue: dataflow_max_workers_num }
      - --dataflow_subnetwork
      - { inputValue: dataflow_subnetwork }
      - --dataflow_use_public_ips
      - { inputValue: dataflow_use_public_ips }
      - --kms_key_name
      - { inputValue: encryption_spec_key_name }
      - --output_metrics_gcs_path
      - { outputUri: evaluation_metrics }
      - --gcp_resources
      - { outputPath: gcp_resources }
      - --executor_input
      - "{{$}}"