
Commit cc8fce0
Watsonx: update to the last serving runtime architecture (#88)
2 parents: ba0d26d + e6f1608

File tree: 21 files changed, +365 -160 lines changed

21 files changed

+365
-160
lines changed

roles/watsonx_serving/watsonx_serving_deploy_model/defaults/main/config.yml

Lines changed: 20 additions & 10 deletions
@@ -19,13 +19,21 @@ watsonx_serving_deploy_model_model_id:
 # Mandatory value
 watsonx_serving_deploy_model_serving_runtime_name:
 
-# the image of the serving runtime
+# the image of the Kserve serving runtime container
 # Mandatory value
-watsonx_serving_deploy_model_serving_runtime_image:
+watsonx_serving_deploy_model_sr_kserve_image:
 
-# the resource request of the serving runtime
+# the resource request of the kserve serving runtime container
 # Mandatory value
-watsonx_serving_deploy_model_serving_runtime_resource_request:
+watsonx_serving_deploy_model_sr_kserve_resource_request:
+
+# the image of the Transformer serving runtime container
+# Mandatory value
+watsonx_serving_deploy_model_sr_transformer_image:
+
+# the resource request of the Transformer serving runtime container
+# Mandatory value
+watsonx_serving_deploy_model_sr_transformer_resource_request:
 
 # the name to give to the inference service
 # Mandatory value
@@ -39,6 +47,12 @@ watsonx_serving_deploy_model_storage_uri:
 # Mandatory value
 watsonx_serving_deploy_model_sa_name:
 
+# extra key/value pairs for the kserve container (will override the values from the secret file)
+watsonx_serving_deploy_model_sr_kserve_extra_env_values: {}
+
+# extra key/value pairs for the transformer container (will override the values from the secret file)
+watsonx_serving_deploy_model_sr_transformer_extra_env_values: {}
+
 # the minimum number of replicas. If none, the field is left unset.
 # Type: Int
 watsonx_serving_deploy_model_inference_service_min_replicas: null
@@ -49,12 +63,8 @@ watsonx_serving_deploy_model_secret_env_file_name: null
 # key to the secret environment key/values in the secret file
 watsonx_serving_deploy_model_secret_env_file_key: null
 
-# extra key/value pairs (will override the values from the secret file)
-# Type: Dict
-watsonx_serving_deploy_model_env_extra_values: {}
-
-# if True, mute the serving runtime container logs
-watsonx_serving_deploy_model_mute_serving_logs: false
+# if True, mute the transformer serving runtime container logs
+watsonx_serving_deploy_model_sr_transformer_mute_logs: false
 
 # if True, deletes the other serving runtime/inference services of the namespace
 watsonx_serving_deploy_model_delete_others: true
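
Note: a sketch of how a caller can fill in the new split variables. The values below are illustrative, copied from elsewhere in this commit (testing/watsonx-serving/config.yaml and models/bloom-560m.yaml) rather than role defaults:

    # Illustrative values only -- not defaults of the role.
    watsonx_serving_deploy_model_sr_kserve_image: quay.io/opendatahub/text-generation-inference:stable
    watsonx_serving_deploy_model_sr_kserve_resource_request:
      cpu: 1.5
      memory: 6  # in Gi
      nvidia.com/gpu: 5  # in Gi of GPU memory
    watsonx_serving_deploy_model_sr_transformer_image: quay.io/opendatahub/caikit-tgis-serving:fast
    watsonx_serving_deploy_model_sr_transformer_resource_request:
      cpu: 1
      memory: 3  # in Gi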

roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml

Lines changed: 52 additions & 5 deletions
@@ -116,12 +116,29 @@
 
 # Serving Runtime
 
+- name: Prepare the caikit-tgis-config template
+  template:
+    src: "{{ caikit_tgit_config_template }}"
+    dest: "{{ artifact_extra_logs_dir }}/src/caikit_tgit_config.yaml"
+    mode: '0400'
+
 - name: Prepare the ServingRuntime template
   template:
     src: "{{ serving_runtime_template }}"
     dest: "{{ artifact_extra_logs_dir }}/src/serving_runtime.yaml"
     mode: '0400'
 
+- name: Create or update the Caikit TGIS config
+  shell:
+    set -o pipefail;
+
+    oc create cm {{ watsonx_serving_deploy_model_serving_runtime_name }}-caikit-tgis-config
+       -n {{ watsonx_serving_deploy_model_namespace }}
+       --from-file=caikit.yml="{{ artifact_extra_logs_dir }}/src/caikit_tgit_config.yaml"
+       --dry-run=client
+       -oyaml
+       | oc apply -f-
+
 - name: Create the ServingRuntime
   command:
     oc apply -f "{{ artifact_extra_logs_dir }}/src/serving_runtime.yaml"
@@ -178,19 +195,49 @@
       -n {{ watsonx_serving_deploy_model_namespace }}
   register: inference_service_pod_fetching_cmd
   # wait 60 minutes
-  retries: 60
-  delay: 60
+  retries: 120
+  delay: 30
   until: inference_service_pod_fetching_cmd.stdout | length > 0
 
+- name: Wait for all the containers to be ready
+  shell: |
+    set -o pipefail;
+    set -e;
+
+    status=$(oc get pod \
+               -ojson \
+               -lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }} \
+               -n {{ watsonx_serving_deploy_model_namespace }} \
+               | jq -r '.items[0].status.containerStatuses[] | ("" + .name +" ready="+ (.ready|tostring)) +" restarted="+(.restartCount|tostring)')
+    echo "$status"
+
+    if grep -v restarted=0 <<< $status >&2; then
+      echo "Restart detected, aborting" >&2
+      exit 2
+    fi
+
+    if grep -v ready=true <<< $status >&2; then
+      echo "Containers not ready detected, keep waiting ..." >&2
+      exit 1
+    fi
+
+    echo "All the containers are ready, all good :)" >&2
+  register: inference_service_pod_ready
+  failed_when: inference_service_pod_ready.rc == 2
+  until: inference_service_pod_ready.rc == 0 or inference_service_pod_ready.rc == 2
+  # wait 90 minutes
+  retries: 180
+  delay: 30
+
 - name: Wait for the InferenceService Pod to initialize the model
   shell:
     set -o pipefail;
     oc get -f "{{ artifact_extra_logs_dir }}/src/inference_service.yaml"
       -ojsonpath={.status.modelStatus.states.targetModelState}
   register: inference_service_state_cmd
-  # wait 90 minutes
-  retries: 180
-  delay: 30
+  # wait 5 minutes
+  retries: 30
+  delay: 10
   until: inference_service_state_cmd.stdout == "Loaded"
 
 - name: Capture the state of the InferenceService Pod resource
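
Note: the `oc create cm ... --dry-run=client -oyaml | oc apply -f-` pipeline is the usual idempotent create-or-update idiom: `oc create` only renders the ConfigMap manifest client-side, and `oc apply` then creates or patches it, so re-running the role does not fail when the ConfigMap already exists. A minimal standalone sketch of the same idiom (the ConfigMap name, namespace, and file path are placeholders, not from the role):

    oc create configmap my-config \
       -n my-namespace \
       --from-file=caikit.yml=./caikit_config.yaml \
       --dry-run=client -oyaml \
     | oc apply -f-
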
roles/watsonx_serving/watsonx_serving_deploy_model/templates/caikit-tgis-config.yaml.j2

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+runtime:
+  library: caikit_nlp
+  local_models_dir: /mnt/models/
+  lazy_load_local_models: true
+
+model_management:
+  finders:
+    default:
+      type: MULTI
+      config:
+        finder_priority:
+          - tgis-auto
+    tgis-auto:
+      type: TGIS-AUTO
+      config:
+        test_connection: true
+  initializers:
+    default:
+      type: LOCAL
+      config:
+        backend_priority:
+          - type: TGIS
+            config:
+              connection:
+                hostname: localhost:8033

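Note: this Caikit configuration points the transformer container at the TGIS backend over the pod-local gRPC port (localhost:8033) and lazily loads models from /mnt/models/. To inspect what the role actually rendered into the cluster, something like this should work (the runtime name and namespace are placeholders):

    # The ConfigMap key is caikit.yml, as set by --from-file above;
    # the dot in the key is escaped in the jsonpath expression.
    oc get configmap <serving-runtime-name>-caikit-tgis-config \
       -n <namespace> \
       -ojsonpath='{.data.caikit\.yml}'
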
roles/watsonx_serving/watsonx_serving_deploy_model/templates/serving_runtime.yaml.j2

Lines changed: 43 additions & 37 deletions
@@ -6,72 +6,78 @@ metadata:
 spec:
   containers:
   - name: kserve-container
+    image: {{ watsonx_serving_deploy_model_sr_kserve_image }}
     command: [bash, -cex]
     args:
     - |
-      {% if watsonx_serving_deploy_model_mute_serving_logs %}
-      echo "Starting Caikit-serving without stdout logs ..."
-      {% else %}
-      echo "Starting Caikit-serving _with_ stdout logs ..."
-      {% endif %}
-      TGIS_CONFIG_TEMPLATE="/caikit/config/caikit-tgis.template.yml"
-      EXTENDED_TIMEOUT=20000
-      sed -i 's/load_timeout: .*/load_timeout: '$EXTENDED_TIMEOUT'/' "$TGIS_CONFIG_TEMPLATE"
-      exec ./start-serving.sh {% if watsonx_serving_deploy_model_mute_serving_logs %} > "$HUGGINGFACE_HUB_CACHE/logs" {% endif %}
+      echo "Starting kserver (TGIS) {% if watsonx_serving_deploy_model_sr_transformer_mute_logs %} without {% else %} _with_ {% endif %} stdout logs ..."
+      exec text-generation-launcher --model-name=/mnt/models/artifacts/ {% if watsonx_serving_deploy_model_sr_transformer_mute_logs %} > "/tmp/tgis-logs" {% endif %}
 
     env:
-    - name: RUNTIME_LOCAL_MODELS_DIR
-      value: /mnt/models
-    {% if watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] %}
-    - name: NUM_GPUS
-      value: "1"
-    {% endif %}
-    - name: MODEL_NAME
-      value: "{{ watsonx_serving_deploy_model_model_name }}"
-
     - name: TRANSFORMERS_CACHE
       value: /shared_model_storage/transformers_cache
-    - name: HUGGINGFACE_HUB_CACHE
-      value: /shared_model_storage/transformers_cache
-
-    {% for env_key, env_value in watsonx_serving_deploy_model_env_extra_values.items() %}
+    {% for env_key, env_value in watsonx_serving_deploy_model_sr_kserve_extra_env_values.items() %}
     - name: "{{ env_key }}"
       value: "{{ env_value }}"
     {% endfor %}
-    envFrom:
+
     {% if watsonx_serving_deploy_model_secret_env_file_name is not none %}
+    envFrom:
     - secretRef:
         name: {{ watsonx_serving_deploy_model_serving_runtime_name }}-secret
     {% endif %}
 
-    image: {{ watsonx_serving_deploy_model_serving_runtime_image }}
-    ports:
-      # Note, KServe only allows a single port, this is the gRPC port. Subject to change in the future
-      - containerPort: 8085
-        name: h2c
-        protocol: TCP
     resources:
       requests:
-        cpu: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request.cpu }}"
-        memory: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request.memory }}Gi"
-        {% if watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] %}
-        nvidia.com/gpu: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] }}"
+        cpu: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request.cpu }}"
+        memory: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request.memory }}Gi"
+        {% if watsonx_serving_deploy_model_sr_kserve_resource_request['nvidia.com/gpu'] %}
+        nvidia.com/gpu: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request['nvidia.com/gpu'] }}"
         {% endif %}
       limits:
         {% if watsonx_serving_deploy_model_limits_equals_requests %}
-        cpu: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request.cpu }}"
-        memory: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request.memory }}Gi"
+        cpu: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request.cpu }}"
+        memory: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request.memory }}Gi"
         {% endif %}
-        {% if watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] %}
-        nvidia.com/gpu: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] }}"
+        {% if watsonx_serving_deploy_model_sr_kserve_resource_request['nvidia.com/gpu'] %}
+        nvidia.com/gpu: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request['nvidia.com/gpu'] }}"
         {% endif %}
     volumeMounts:
     - mountPath: /shared_model_storage/transformers_cache
       name: cache-volume
+  # ---
+  # --- transformer-container
+  # ---
+  - name: transformer-container
+    image: {{ watsonx_serving_deploy_model_sr_transformer_image }}
+    ports:
+    - containerPort: 8085
+      name: h2c
+      protocol: TCP
+    {% for env_key, env_value in watsonx_serving_deploy_model_sr_transformer_extra_env_values.items() %}
+    - name: "{{ env_key }}"
+      value: "{{ env_value }}"
+    {% endfor %}
+    resources:
+      requests:
+        cpu: "{{ watsonx_serving_deploy_model_sr_transformer_resource_request.cpu }}"
+        memory: "{{ watsonx_serving_deploy_model_sr_transformer_resource_request.memory }}Gi"
+      limits:
+        {% if watsonx_serving_deploy_model_limits_equals_requests %}
+        cpu: "{{ watsonx_serving_deploy_model_sr_transformer_resource_request.cpu }}"
+        memory: "{{ watsonx_serving_deploy_model_sr_transformer_resource_request.memory }}Gi"
+        {% endif %}
+    volumeMounts:
+    - name: config
+      mountPath: /caikit/config/
+      readOnly: true
   volumes:
   - name: cache-volume
     emptyDir:
       sizeLimit: 180Gi
+  - name: config
+    configMap:
+      name: {{ watsonx_serving_deploy_model_serving_runtime_name }}-caikit-tgis-config
 multiModel: false
 supportedModelFormats:
   # Note: this currently *only* supports caikit format models
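
Note: the runtime is now split across two containers in the same pod: kserve-container runs the TGIS text-generation-launcher on the model artifacts, while transformer-container runs Caikit, exposes the single KServe gRPC port (8085), and reaches TGIS at localhost:8033 through the shared pod network. Since readiness is now per container, the wait task above inspects each containerStatus; a sketch of the same check run by hand (the InferenceService name and namespace are placeholders):

    oc get pod -lserving.kserve.io/inferenceservice=<name> -n <namespace> -ojson \
      | jq -r '.items[0].status.containerStatuses[] | .name + " ready=" + (.ready|tostring) + " restarted=" + (.restartCount|tostring)'
    # Illustrative output shape:
    #   kserve-container ready=true restarted=0
    #   transformer-container ready=true restarted=0
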
Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
+caikit_tgit_config_template: templates/caikit-tgis-config.yaml.j2
 serving_runtime_template: templates/serving_runtime.yaml.j2
 inference_service_template: templates/inference_service.yaml.j2

roles/watsonx_serving/watsonx_serving_validate_model/tasks/validate_model.yaml

Lines changed: 14 additions & 2 deletions
@@ -21,10 +21,22 @@
 
 - name: Wait for the model to answer successfully
   shell: |
-    curl -Ssf -k "{{ ksvc_hostname_cmd.stdout }}"
-  register: curl_working_cmd
+    set -o pipefail;
+    i=0;
+
+    GRPCURL_DATA=$(cat "{{ watsonx_serving_validate_model_dataset }}" | jq .dataset[$i].input )
+
+    grpcurl \
+      -insecure \
+      -d "$GRPCURL_DATA" \
+      -H "mm-model-id: {{ watsonx_serving_validate_model_model_id }}" \
+      {{ ksvc_hostname_cmd.stdout }}:443 \
+      caikit.runtime.Nlp.NlpService/TextGenerationTaskPredict
+
+  register: grpcurl_working_cmd
   retries: 60
   delay: 5
+  until: grpcurl_working_cmd.rc == 0
 
 - name: Inform | Next task runs the load test
   debug: msg="Next task runs the validation test. It runs {{ watsonx_serving_validate_model_query_count }} queries. Artifacts will be saved into '{{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}'."
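
Note: validation now goes through the Caikit gRPC endpoint instead of a plain HTTPS GET. A sketch of reproducing a single query by hand, assuming grpcurl is installed; the hostname, model id, and dataset path are placeholders:

    GRPCURL_DATA=$(jq .dataset[0].input ./dataset.json)  # ./dataset.json is a placeholder path
    grpcurl \
      -insecure \
      -d "$GRPCURL_DATA" \
      -H "mm-model-id: <model-id>" \
      <ksvc-hostname>:443 \
      caikit.runtime.Nlp.NlpService/TextGenerationTaskPredict
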

testing/watsonx-serving/command_args.yml.j2

Lines changed: 6 additions & 3 deletions
@@ -165,14 +165,17 @@ watsonx_serving deploy_model:
   model_name: {{ tests.scale.model.full_name }}
 
   serving_runtime_name: {{ tests.scale.model.name }}
-  serving_runtime_image: {{ watsonx_serving.model.serving_runtime.image }}
 
-  serving_runtime_resource_request: {{ tests.scale.model.serving_runtime.resource_request }}
+  sr_kserve_image: {{ watsonx_serving.model.serving_runtime.kserve.image }}
+  sr_kserve_resource_request: {{ tests.scale.model.serving_runtime.kserve.resource_request }}
+
+  sr_transformer_image: {{ watsonx_serving.model.serving_runtime.transformer.image }}
+  sr_transformer_resource_request: {{ tests.scale.model.serving_runtime.transformer.resource_request }}
+  sr_transformer_mute_logs: {{ watsonx_serving.model.serving_runtime.transformer.mute_logs }}
 
   inference_service_name: {{ tests.scale.model.name }}
   storage_uri: {{ tests.scale.model.inference_service.storage_uri }}
 
-  mute_serving_logs: {{ watsonx_serving.model.serving_runtime.mute_logs }}
   delete_others: {{ tests.e2e.delete_others }}
   limits_equals_requests: true
 
testing/watsonx-serving/config.yaml

Lines changed: 8 additions & 5 deletions
@@ -74,10 +74,11 @@ ci_presets:
     gpu.prepare_cluster: true
     clusters.sutest.compute.machineset.type: g5.2xlarge
 
-  e2e_models:
+  e2e_all_models:
     tests.e2e.models:
-    - flan-t5-large-gpu
+    - flan-t5-small-cpu
     - flan-t5-small-gpu
+    - flan-t5-large-gpu
     - bloom-560m
     - mpt-7b-instruct2
 
@@ -115,7 +116,6 @@
   e2e_perf_flan:
     extends: [e2e_perf]
     tests.e2e.llm_load_test.duration: 3m
-    watsonx_serving.model.serving_runtime.mute_logs: false # secret key has been disabled
     tests.e2e.models:
     - flan-t5-small-gpu-3gb:
         name: flan-t5-small-gpu
@@ -307,8 +307,11 @@ watsonx_serving:
     query_count: 10
   model:
     serving_runtime:
-      image: quay.io/opendatahub/caikit-tgis-serving:stable
-      mute_logs: true
+      kserve:
+        image: quay.io/opendatahub/text-generation-inference:stable
+      transformer:
+        image: quay.io/opendatahub/caikit-tgis-serving:fast
+        mute_logs: true
   customize:
     serverless:
       enabled: false

testing/watsonx-serving/models/bloom-560m.yaml

Lines changed: 13 additions & 7 deletions
@@ -3,11 +3,17 @@ full_name: bloom-560m
 secret_key: # no secret key at the moment
 
 serving_runtime:
-  resource_request:
-    cpu: 1.5
-    memory: 6 # in Gi
-    nvidia.com/gpu: 5 # in Gi of GPU memory
-  extra_env: {}
-  min_replicas: 1
+  kserve:
+    resource_request:
+      cpu: 1.5
+      memory: 6 # in Gi
+      nvidia.com/gpu: 5 # in Gi of GPU memory
+    extra_env: {}
+  transformer:
+    resource_request:
+      cpu: 1
+      memory: 3 # in Gi
+    extra_env: {}
 inference_service:
-  storage_uri: "s3://psap-watsonx-models/bloom-560m"
+  min_replicas: 1
+  storage_uri: "s3://psap-watsonx-models/bloom-560m/bloom-560m"
