
Commit cc8fce0
Watsonx: update to the last serving runtime architecture (#88)
2 parents: ba0d26d + e6f1608

File tree: 21 files changed, +365 -160 lines changed

21 files changed

+365
-160
lines changed

roles/watsonx_serving/watsonx_serving_deploy_model/defaults/main/config.yml

Lines changed: 20 additions & 10 deletions
@@ -19,13 +19,21 @@ watsonx_serving_deploy_model_model_id:
 # Mandatory value
 watsonx_serving_deploy_model_serving_runtime_name:
 
-# the image of the serving runtime
+# the image of the Kserve serving runtime container
 # Mandatory value
-watsonx_serving_deploy_model_serving_runtime_image:
+watsonx_serving_deploy_model_sr_kserve_image:
 
-# the resource request of the serving runtime
+# the resource request of the kserve serving runtime container
 # Mandatory value
-watsonx_serving_deploy_model_serving_runtime_resource_request:
+watsonx_serving_deploy_model_sr_kserve_resource_request:
+
+# the image of the Transformer serving runtime container
+# Mandatory value
+watsonx_serving_deploy_model_sr_transformer_image:
+
+# the resource request of the Transformer serving runtime container
+# Mandatory value
+watsonx_serving_deploy_model_sr_transformer_resource_request:
 
 # the name to give to the inference service
 # Mandatory value
@@ -39,6 +47,12 @@ watsonx_serving_deploy_model_storage_uri:
 # Mandatory value
 watsonx_serving_deploy_model_sa_name:
 
+# extra key/value pairs for the kserve container (will override the values from the secret file)
+watsonx_serving_deploy_model_sr_kserve_extra_env_values: {}
+
+# extra key/value pairs for the transformer container (will override the values from the secret file)
+watsonx_serving_deploy_model_sr_transformer_extra_env_values: {}
+
 # the minimum number of replicas. If none, the field is left unset.
 # Type: Int
 watsonx_serving_deploy_model_inference_service_min_replicas: null
@@ -49,12 +63,8 @@ watsonx_serving_deploy_model_secret_env_file_name: null
 # key to the secret environment key/values in the secret file
 watsonx_serving_deploy_model_secret_env_file_key: null
 
-# extra key/value pairs (will override the values from the secret file)
-# Type: Dict
-watsonx_serving_deploy_model_env_extra_values: {}
-
-# if True, mute the serving runtime container logs
-watsonx_serving_deploy_model_mute_serving_logs: false
+# if True, mute the transformer serving runtime container logs
+watsonx_serving_deploy_model_sr_transformer_mute_logs: false
 
 # if True, deletes the other serving runtime/inference services of the namespace
 watsonx_serving_deploy_model_delete_others: true
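
Note: a sketch of how a caller can fill in the new split variables. The values below are illustrative, copied from elsewhere in this commit (testing/watsonx-serving/config.yaml and models/bloom-560m.yaml) rather than role defaults:

    # Illustrative values only -- not defaults of the role.
    watsonx_serving_deploy_model_sr_kserve_image: quay.io/opendatahub/text-generation-inference:stable
    watsonx_serving_deploy_model_sr_kserve_resource_request:
      cpu: 1.5
      memory: 6  # in Gi
      nvidia.com/gpu: 5  # in Gi of GPU memory
    watsonx_serving_deploy_model_sr_transformer_image: quay.io/opendatahub/caikit-tgis-serving:fast
    watsonx_serving_deploy_model_sr_transformer_resource_request:
      cpu: 1
      memory: 3  # in Gi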

roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml

Lines changed: 52 additions & 5 deletions
@@ -116,12 +116,29 @@
 
 # Serving Runtime
 
+- name: Prepare the caikit-tgis-config template
+  template:
+    src: "{{ caikit_tgit_config_template }}"
+    dest: "{{ artifact_extra_logs_dir }}/src/caikit_tgit_config.yaml"
+    mode: '0400'
+
 - name: Prepare the ServingRuntime template
   template:
     src: "{{ serving_runtime_template }}"
     dest: "{{ artifact_extra_logs_dir }}/src/serving_runtime.yaml"
     mode: '0400'
 
+- name: Create or update the Caikit TGIS config
+  shell:
+    set -o pipefail;
+
+    oc create cm {{ watsonx_serving_deploy_model_serving_runtime_name }}-caikit-tgis-config
+       -n {{ watsonx_serving_deploy_model_namespace }}
+       --from-file=caikit.yml="{{ artifact_extra_logs_dir }}/src/caikit_tgit_config.yaml"
+       --dry-run=client
+       -oyaml
+       | oc apply -f-
+
 - name: Create the ServingRuntime
   command:
     oc apply -f "{{ artifact_extra_logs_dir }}/src/serving_runtime.yaml"
@@ -178,19 +195,49 @@
       -n {{ watsonx_serving_deploy_model_namespace }}
   register: inference_service_pod_fetching_cmd
   # wait 60 minutes
-  retries: 60
-  delay: 60
+  retries: 120
+  delay: 30
   until: inference_service_pod_fetching_cmd.stdout | length > 0
 
+- name: Wait for all the containers to be ready
+  shell: |
+    set -o pipefail;
+    set -e;
+
+    status=$(oc get pod \
+               -ojson \
+               -lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }} \
+               -n {{ watsonx_serving_deploy_model_namespace }} \
+               | jq -r '.items[0].status.containerStatuses[] | ("" + .name +" ready="+ (.ready|tostring)) +" restarted="+(.restartCount|tostring)')
+    echo "$status"
+
+    if grep -v restarted=0 <<< $status >&2; then
+      echo "Restart detected, aborting" >&2
+      exit 2
+    fi
+
+    if grep -v ready=true <<< $status >&2; then
+      echo "Containers not ready detected, keep waiting ..." >&2
+      exit 1
+    fi
+
+    echo "All the containers are ready, all good :)" >&2
+  register: inference_service_pod_ready
+  failed_when: inference_service_pod_ready.rc == 2
+  until: inference_service_pod_ready.rc == 0 or inference_service_pod_ready.rc == 2
+  # wait 90 minutes
+  retries: 180
+  delay: 30
+
 - name: Wait for the InferenceService Pod to initialize the model
   shell:
     set -o pipefail;
     oc get -f "{{ artifact_extra_logs_dir }}/src/inference_service.yaml"
       -ojsonpath={.status.modelStatus.states.targetModelState}
   register: inference_service_state_cmd
-  # wait 90 minutes
-  retries: 180
-  delay: 30
+  # wait 5 minutes
+  retries: 30
+  delay: 10
   until: inference_service_state_cmd.stdout == "Loaded"
 
 - name: Capture the state of the InferenceService Pod resource
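
Note: the `oc create cm ... --dry-run=client -oyaml | oc apply -f-` pipeline is the usual idempotent create-or-update idiom: `oc create` only renders the ConfigMap manifest client-side, and `oc apply` then creates or patches it, so re-running the role does not fail when the ConfigMap already exists. A minimal standalone sketch of the same idiom (the ConfigMap name, namespace, and file path are placeholders, not from the role):

    oc create configmap my-config \
       -n my-namespace \
       --from-file=caikit.yml=./caikit_config.yaml \
       --dry-run=client -oyaml \
     | oc apply -f-
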
roles/watsonx_serving/watsonx_serving_deploy_model/templates/caikit-tgis-config.yaml.j2

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+runtime:
+  library: caikit_nlp
+  local_models_dir: /mnt/models/
+  lazy_load_local_models: true
+
+model_management:
+  finders:
+    default:
+      type: MULTI
+      config:
+        finder_priority:
+          - tgis-auto
+    tgis-auto:
+      type: TGIS-AUTO
+      config:
+        test_connection: true
+  initializers:
+    default:
+      type: LOCAL
+      config:
+        backend_priority:
+          - type: TGIS
+            config:
+              connection:
+                hostname: localhost:8033

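Note: this Caikit configuration points the transformer container at the TGIS backend over the pod-local gRPC port (localhost:8033) and lazily loads models from /mnt/models/. To inspect what the role actually rendered into the cluster, something like this should work (the runtime name and namespace are placeholders):

    # The ConfigMap key is caikit.yml, as set by --from-file above;
    # the dot in the key is escaped in the jsonpath expression.
    oc get configmap <serving-runtime-name>-caikit-tgis-config \
       -n <namespace> \
       -ojsonpath='{.data.caikit\.yml}'
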
roles/watsonx_serving/watsonx_serving_deploy_model/templates/serving_runtime.yaml.j2

Lines changed: 43 additions & 37 deletions
@@ -6,72 +6,78 @@ metadata:
 spec:
   containers:
   - name: kserve-container
+    image: {{ watsonx_serving_deploy_model_sr_kserve_image }}
     command: [bash, -cex]
     args:
     - |
-      {% if watsonx_serving_deploy_model_mute_serving_logs %}
-      echo "Starting Caikit-serving without stdout logs ..."
-      {% else %}
-      echo "Starting Caikit-serving _with_ stdout logs ..."
-      {% endif %}
-      TGIS_CONFIG_TEMPLATE="/caikit/config/caikit-tgis.template.yml"
-      EXTENDED_TIMEOUT=20000
-      sed -i 's/load_timeout: .*/load_timeout: '$EXTENDED_TIMEOUT'/' "$TGIS_CONFIG_TEMPLATE"
-      exec ./start-serving.sh {% if watsonx_serving_deploy_model_mute_serving_logs %} > "$HUGGINGFACE_HUB_CACHE/logs" {% endif %}
+      echo "Starting kserver (TGIS) {% if watsonx_serving_deploy_model_sr_transformer_mute_logs %} without {% else %} _with_ {% endif %} stdout logs ..."
+      exec text-generation-launcher --model-name=/mnt/models/artifacts/ {% if watsonx_serving_deploy_model_sr_transformer_mute_logs %} > "/tmp/tgis-logs" {% endif %}
 
     env:
-    - name: RUNTIME_LOCAL_MODELS_DIR
-      value: /mnt/models
-    {% if watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] %}
-    - name: NUM_GPUS
-      value: "1"
-    {% endif %}
-    - name: MODEL_NAME
-      value: "{{ watsonx_serving_deploy_model_model_name }}"
-
     - name: TRANSFORMERS_CACHE
       value: /shared_model_storage/transformers_cache
-    - name: HUGGINGFACE_HUB_CACHE
-      value: /shared_model_storage/transformers_cache
-
-    {% for env_key, env_value in watsonx_serving_deploy_model_env_extra_values.items() %}
+    {% for env_key, env_value in watsonx_serving_deploy_model_sr_kserve_extra_env_values.items() %}
     - name: "{{ env_key }}"
       value: "{{ env_value }}"
     {% endfor %}
-    envFrom:
+
     {% if watsonx_serving_deploy_model_secret_env_file_name is not none %}
+    envFrom:
     - secretRef:
         name: {{ watsonx_serving_deploy_model_serving_runtime_name }}-secret
     {% endif %}
 
-    image: {{ watsonx_serving_deploy_model_serving_runtime_image }}
-    ports:
-      # Note, KServe only allows a single port, this is the gRPC port. Subject to change in the future
-      - containerPort: 8085
-        name: h2c
-        protocol: TCP
     resources:
       requests:
-        cpu: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request.cpu }}"
-        memory: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request.memory }}Gi"
-        {% if watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] %}
-        nvidia.com/gpu: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] }}"
+        cpu: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request.cpu }}"
+        memory: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request.memory }}Gi"
+        {% if watsonx_serving_deploy_model_sr_kserve_resource_request['nvidia.com/gpu'] %}
+        nvidia.com/gpu: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request['nvidia.com/gpu'] }}"
         {% endif %}
       limits:
         {% if watsonx_serving_deploy_model_limits_equals_requests %}
-        cpu: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request.cpu }}"
-        memory: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request.memory }}Gi"
+        cpu: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request.cpu }}"
+        memory: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request.memory }}Gi"
         {% endif %}
-        {% if watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] %}
-        nvidia.com/gpu: "{{ watsonx_serving_deploy_model_serving_runtime_resource_request['nvidia.com/gpu'] }}"
+        {% if watsonx_serving_deploy_model_sr_kserve_resource_request['nvidia.com/gpu'] %}
+        nvidia.com/gpu: "{{ watsonx_serving_deploy_model_sr_kserve_resource_request['nvidia.com/gpu'] }}"
         {% endif %}
     volumeMounts:
     - mountPath: /shared_model_storage/transformers_cache
       name: cache-volume
+  # ---
+  # --- transformer-container
+  # ---
+  - name: transformer-container
+    image: {{ watsonx_serving_deploy_model_sr_transformer_image }}
+    ports:
+    - containerPort: 8085
+      name: h2c
+      protocol: TCP
+    {% for env_key, env_value in watsonx_serving_deploy_model_sr_transformer_extra_env_values.items() %}
+    - name: "{{ env_key }}"
+      value: "{{ env_value }}"
+    {% endfor %}
+    resources:
+      requests:
+        cpu: "{{ watsonx_serving_deploy_model_sr_transformer_resource_request.cpu }}"
+        memory: "{{ watsonx_serving_deploy_model_sr_transformer_resource_request.memory }}Gi"
+      limits:
+        {% if watsonx_serving_deploy_model_limits_equals_requests %}
+        cpu: "{{ watsonx_serving_deploy_model_sr_transformer_resource_request.cpu }}"
+        memory: "{{ watsonx_serving_deploy_model_sr_transformer_resource_request.memory }}Gi"
+        {% endif %}
+    volumeMounts:
+    - name: config
+      mountPath: /caikit/config/
+      readOnly: true
   volumes:
   - name: cache-volume
     emptyDir:
       sizeLimit: 180Gi
+  - name: config
+    configMap:
+      name: {{ watsonx_serving_deploy_model_serving_runtime_name }}-caikit-tgis-config
 multiModel: false
 supportedModelFormats:
   # Note: this currently *only* supports caikit format models
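
Note: the runtime is now split across two containers in the same pod: kserve-container runs the TGIS text-generation-launcher on the model artifacts, while transformer-container runs Caikit, exposes the single KServe gRPC port (8085), and reaches TGIS at localhost:8033 through the shared pod network. Since readiness is now per container, the wait task above inspects each containerStatus; a sketch of the same check run by hand (the InferenceService name and namespace are placeholders):

    oc get pod -lserving.kserve.io/inferenceservice=<name> -n <namespace> -ojson \
      | jq -r '.items[0].status.containerStatuses[] | .name + " ready=" + (.ready|tostring) + " restarted=" + (.restartCount|tostring)'
    # Illustrative output shape:
    #   kserve-container ready=true restarted=0
    #   transformer-container ready=true restarted=0
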
Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
+caikit_tgit_config_template: templates/caikit-tgis-config.yaml.j2
 serving_runtime_template: templates/serving_runtime.yaml.j2
 inference_service_template: templates/inference_service.yaml.j2

roles/watsonx_serving/watsonx_serving_validate_model/tasks/validate_model.yaml

Lines changed: 14 additions & 2 deletions
@@ -21,10 +21,22 @@
 
 - name: Wait for the model to answer successfully
   shell: |
-    curl -Ssf -k "{{ ksvc_hostname_cmd.stdout }}"
-  register: curl_working_cmd
+    set -o pipefail;
+    i=0;
+
+    GRPCURL_DATA=$(cat "{{ watsonx_serving_validate_model_dataset }}" | jq .dataset[$i].input )
+
+    grpcurl \
+      -insecure \
+      -d "$GRPCURL_DATA" \
+      -H "mm-model-id: {{ watsonx_serving_validate_model_model_id }}" \
+      {{ ksvc_hostname_cmd.stdout }}:443 \
+      caikit.runtime.Nlp.NlpService/TextGenerationTaskPredict
+
+  register: grpcurl_working_cmd
   retries: 60
   delay: 5
+  until: grpcurl_working_cmd.rc == 0
 
 - name: Inform | Next task runs the load test
   debug: msg="Next task runs the validation test. It runs {{ watsonx_serving_validate_model_query_count }} queries. Artifacts will be saved into '{{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}'."
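
Note: validation now goes through the Caikit gRPC endpoint instead of a plain HTTPS GET. A sketch of reproducing a single query by hand, assuming grpcurl is installed; the hostname, model id, and dataset path are placeholders:

    GRPCURL_DATA=$(jq .dataset[0].input ./dataset.json)  # ./dataset.json is a placeholder path
    grpcurl \
      -insecure \
      -d "$GRPCURL_DATA" \
      -H "mm-model-id: <model-id>" \
      <ksvc-hostname>:443 \
      caikit.runtime.Nlp.NlpService/TextGenerationTaskPredict
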

testing/watsonx-serving/command_args.yml.j2

Lines changed: 6 additions & 3 deletions
@@ -165,14 +165,17 @@ watsonx_serving deploy_model:
   model_name: {{ tests.scale.model.full_name }}
 
   serving_runtime_name: {{ tests.scale.model.name }}
-  serving_runtime_image: {{ watsonx_serving.model.serving_runtime.image }}
 
-  serving_runtime_resource_request: {{ tests.scale.model.serving_runtime.resource_request }}
+  sr_kserve_image: {{ watsonx_serving.model.serving_runtime.kserve.image }}
+  sr_kserve_resource_request: {{ tests.scale.model.serving_runtime.kserve.resource_request }}
+
+  sr_transformer_image: {{ watsonx_serving.model.serving_runtime.transformer.image }}
+  sr_transformer_resource_request: {{ tests.scale.model.serving_runtime.transformer.resource_request }}
+  sr_transformer_mute_logs: {{ watsonx_serving.model.serving_runtime.transformer.mute_logs }}
 
   inference_service_name: {{ tests.scale.model.name }}
   storage_uri: {{ tests.scale.model.inference_service.storage_uri }}
 
-  mute_serving_logs: {{ watsonx_serving.model.serving_runtime.mute_logs }}
   delete_others: {{ tests.e2e.delete_others }}
   limits_equals_requests: true
 
testing/watsonx-serving/config.yaml

Lines changed: 8 additions & 5 deletions
@@ -74,10 +74,11 @@ ci_presets:
     gpu.prepare_cluster: true
     clusters.sutest.compute.machineset.type: g5.2xlarge
 
-  e2e_models:
+  e2e_all_models:
     tests.e2e.models:
-    - flan-t5-large-gpu
+    - flan-t5-small-cpu
     - flan-t5-small-gpu
+    - flan-t5-large-gpu
     - bloom-560m
     - mpt-7b-instruct2
 
@@ -115,7 +116,6 @@
   e2e_perf_flan:
     extends: [e2e_perf]
     tests.e2e.llm_load_test.duration: 3m
-    watsonx_serving.model.serving_runtime.mute_logs: false # secret key has been disabled
     tests.e2e.models:
     - flan-t5-small-gpu-3gb:
         name: flan-t5-small-gpu
@@ -307,8 +307,11 @@ watsonx_serving:
     query_count: 10
   model:
     serving_runtime:
-      image: quay.io/opendatahub/caikit-tgis-serving:stable
-      mute_logs: true
+      kserve:
+        image: quay.io/opendatahub/text-generation-inference:stable
+      transformer:
+        image: quay.io/opendatahub/caikit-tgis-serving:fast
+        mute_logs: true
   customize:
     serverless:
       enabled: false

testing/watsonx-serving/models/bloom-560m.yaml

Lines changed: 13 additions & 7 deletions
@@ -3,11 +3,17 @@ full_name: bloom-560m
 secret_key: # no secret key at the moment
 
 serving_runtime:
-  resource_request:
-    cpu: 1.5
-    memory: 6 # in Gi
-    nvidia.com/gpu: 5 # in Gi of GPU memory
-  extra_env: {}
-  min_replicas: 1
+  kserve:
+    resource_request:
+      cpu: 1.5
+      memory: 6 # in Gi
+      nvidia.com/gpu: 5 # in Gi of GPU memory
+    extra_env: {}
+  transformer:
+    resource_request:
+      cpu: 1
+      memory: 3 # in Gi
+    extra_env: {}
 inference_service:
-  storage_uri: "s3://psap-watsonx-models/bloom-560m"
+  min_replicas: 1
+  storage_uri: "s3://psap-watsonx-models/bloom-560m/bloom-560m"
