# tabular_eval_component.yaml

# Component interface. The inputs and outputs below correspond one-to-one to
# the parameters and return value of the embedded Python function
# `classification_model_eval_metrics`.
name: Classification model eval metrics
inputs:
# Plain string parameters.
- name: project
  type: String
- name: location
  type: String
- name: api_endpoint
  type: String
# JSON-encoded mapping of metric name to minimum acceptable value.
- name: thresholds_dict_str
  type: String
# Input artifact; the program reads its `resourceName` metadata entry.
- name: model
  type: Artifact
outputs:
# Artifact that receives the textual metrics.
- name: metrics
  type: Metrics
# Artifact that receives the ROC curve and the confusion matrix.
- name: metricsc
  type: ClassificationMetrics
# Deployment decision, emitted as the string "true" or "false".
- name: dep_decision
  type: String
implementation:
  container:
    # NOTE(review): `latest` is a floating tag, so the runtime environment can
    # change between runs; consider pinning a specific tag or digest.
    image: gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest
    command:
    # First command: make sure pip is available, install the Python
    # dependencies, then run the remaining command items via "$0" "$@".
    # The apt-get fallback uses -y because there is no terminal to answer
    # the confirmation prompt inside the container.
    - sh
    - -c
    - |2

      if ! [ -x "$(command -v pip)" ]; then
          python3 -m ensurepip || python3 -m ensurepip --user || apt-get install -y python3-pip
      fi

      PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet     --no-warn-script-location 'google-cloud-aiplatform' 'kfp==1.8.9' && "$0" "$@"
    # Second command (started by the first one through "$0" "$@"): write the
    # Python source it receives as "$0" into a temporary directory and run it
    # with the KFP v2 executor, forwarding the remaining arguments ("$@").
    # `-e` makes the shell stop at the first failing step.
    - sh
    - -ec
    - |
      program_path=$(mktemp -d)
      printf "%s" "$0" > "$program_path/ephemeral_component.py"
      python3 -m kfp.v2.components.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
    - |2+

      import kfp
      from kfp.v2 import dsl
      from kfp.v2.dsl import *
      from typing import *

      def classification_model_eval_metrics(
          project: str,
          location: str,  # "us-central1",
          api_endpoint: str,  # "us-central1-aiplatform.googleapis.com",
          thresholds_dict_str: str,
          model: Input[Artifact],
          metrics: Output[Metrics],
          metricsc: Output[ClassificationMetrics],
      ) -> NamedTuple("Outputs", [("dep_decision", str)]):  # Return parameter.
          """Fetch a model's evaluation metrics, log them, and decide on deployment.

          The evaluations of the model are listed through the model service
          client. Only the first evaluation returned is used for logging and
          for the threshold check.

          Args:
              project: Project passed to ``aiplatform.init``.
              location: Accepted for interface compatibility.
                  NOTE(review): this value is not used anywhere in the body;
                  the service endpoint comes from ``api_endpoint`` alone.
              api_endpoint: Endpoint handed to the model service client as
                  its ``api_endpoint`` client option.
              thresholds_dict_str: JSON object mapping metric names to minimum
                  values. Only the keys ``auRoc`` and ``auPrc`` are checked;
                  any other key is ignored (a warning is logged).
              model: Input artifact whose ``metadata["resourceName"]`` holds
                  the model resource name.
              metrics: Output artifact that receives every metric except
                  ``confidenceMetrics`` as a JSON string.
              metricsc: Output artifact that receives the ROC curve and the
                  confusion matrix.

          Returns:
              A 1-tuple ``(dep_decision,)`` where ``dep_decision`` is
              ``"true"`` when every checked threshold is met and ``"false"``
              otherwise.

          Raises:
              RuntimeError: If the model has no evaluations.
              KeyError: If the evaluation lacks a metric that is read here
                  (for example ``confusionMatrix`` or a thresholded metric).
          """

          import json
          import logging

          from google.cloud import aiplatform as aip

          # Fetch model eval info
          def get_eval_info(client, model_name):
              """Return (name of first evaluation, list of metrics dicts)."""
              from google.protobuf.json_format import MessageToDict

              eval_names = []
              metrics_list = []
              for evaluation in client.list_model_evaluations(parent=model_name):
                  print("model_evaluation")
                  print(" name:", evaluation.name)
                  print(" metrics_schema_uri:", evaluation.metrics_schema_uri)
                  eval_metrics = MessageToDict(evaluation._pb.metrics)
                  for metric_name, value in eval_metrics.items():
                      logging.info("metric: %s, value: %s", metric_name, value)
                  eval_names.append(evaluation.name)
                  metrics_list.append(eval_metrics)

              # Without this guard an empty listing would surface later as an
              # unrelated unbound-variable / index error.
              if not metrics_list:
                  raise RuntimeError(
                      "no model evaluations found for {}".format(model_name)
                  )

              # Report the name that matches metrics_list[0], which is the
              # evaluation every later step works with.
              return eval_names[0], metrics_list

          # Use the given metrics threshold(s) to determine whether the model is
          # accurate enough to deploy.
          def classification_thresholds_check(metrics_dict, thresholds_dict):
              """Return False as soon as a checked metric is below its threshold."""
              for k, v in thresholds_dict.items():
                  logging.info("k {}, v {}".format(k, v))
                  if k not in ("auRoc", "auPrc"):
                      # Make ignored (possibly misspelled) keys visible
                      # instead of letting them pass silently.
                      logging.warning(
                          "threshold key %s is not checked; ignoring it", k
                      )
                      continue
                  # higher is better: if under threshold, don't deploy
                  if metrics_dict[k] < v:
                      logging.info("{} < {}; returning False".format(metrics_dict[k], v))
                      return False
              logging.info("threshold checks passed.")
              return True

          def log_metrics(metrics_list, metrics, metricsc):
              """Log the first evaluation's curves, matrix and scalar metrics."""
              eval_metrics = metrics_list[0]
              test_confusion_matrix = eval_metrics["confusionMatrix"]
              logging.info("rows: %s", test_confusion_matrix["rows"])

              # log the ROC curve
              fpr = []
              tpr = []
              thresholds = []
              for item in eval_metrics["confidenceMetrics"]:
                  # Entries may omit a field; the code falls back to 0.0.
                  fpr.append(item.get("falsePositiveRate", 0.0))
                  tpr.append(item.get("recall", 0.0))
                  thresholds.append(item.get("confidenceThreshold", 0.0))
              print(f"fpr: {fpr}")
              print(f"tpr: {tpr}")
              print(f"thresholds: {thresholds}")
              metricsc.log_roc_curve(fpr, tpr, thresholds)

              # log the confusion matrix
              annotations = [
                  spec["displayName"]
                  for spec in test_confusion_matrix["annotationSpecs"]
              ]
              logging.info("confusion matrix annotations: %s", annotations)
              metricsc.log_confusion_matrix(
                  annotations,
                  test_confusion_matrix["rows"],
              )

              # log textual metrics info as well
              for metric_name, value in eval_metrics.items():
                  if metric_name != "confidenceMetrics":
                      metrics.log_metric(metric_name, json.dumps(value))
              # metrics.metadata["model_type"] = "AutoML Tabular classification"

          logging.getLogger().setLevel(logging.INFO)
          aip.init(project=project)
          # extract the model resource name from the input Model Artifact
          model_resource_path = model.metadata["resourceName"]
          logging.info("model path: %s", model_resource_path)

          client_options = {"api_endpoint": api_endpoint}
          # Initialize client that will be used to create and send requests.
          client = aip.gapic.ModelServiceClient(client_options=client_options)
          eval_name, metrics_list = get_eval_info(client, model_resource_path)
          logging.info("got evaluation name: %s", eval_name)
          logging.info("got metrics list: %s", metrics_list)
          log_metrics(metrics_list, metrics, metricsc)

          thresholds_dict = json.loads(thresholds_dict_str)
          deploy = classification_thresholds_check(metrics_list[0], thresholds_dict)
          # The output parameter is a string, so encode the boolean explicitly.
          dep_decision = "true" if deploy else "false"
          logging.info("deployment decision is %s", dep_decision)

          return (dep_decision,)

    args:
    # Executor input, supplied through the `executorInput` placeholder
    # (presumably substituted by the pipeline backend at run time).
    - --executor_input
    - executorInput: null
    # Name of the function in the embedded program that the executor runs.
    - --function_to_execute
    - classification_model_eval_metrics