forked from opendatahub-io/modelmesh
-
Notifications
You must be signed in to change notification settings - Fork 0
/
model-runtime.proto
175 lines (149 loc) · 6.57 KB
/
model-runtime.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/*
* Copyright 2021 IBM Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy
* of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
syntax = "proto3";
package mmesh;
// Java classes generated from this file are placed in this package.
option java_package = "com.ibm.watson.modelmesh.api";
// Generate a separate .java file for each top-level message, enum and
// service instead of nesting them all inside one outer wrapper class.
option java_multiple_files = true;
// Internal "sidecar" API used for interfacing with a colocated
// model runtime container.
service ModelRuntime {
  // Load a model, returning only once the model is fully loaded.
  // Include the size of the loaded model in the response if doing so
  // incurs no additional cost.
  // A gRPC status code of FAILED_PRECONDITION or INVALID_ARGUMENT
  // should be returned if no attempt to load the model was made
  // (so the caller can be sure that no space remains used).
  // Note that the RPC may be cancelled by model-mesh prior to completion,
  // after which an unloadModel call will immediately be sent for the same
  // model. To avoid state inconsistency and "leaking" memory, implementors
  // should ensure that this case is properly handled, i.e. that the model
  // doesn't remain loaded after returning successfully from that
  // unloadModel call.
  rpc loadModel (LoadModelRequest) returns (LoadModelResponse) {}

  // Unload a previously loaded (or failed) model. Return when the model
  // is fully unloaded, or immediately if it is not found/loaded.
  rpc unloadModel (UnloadModelRequest) returns (UnloadModelResponse) {}

  // Predict the size of a not-yet-loaded model - must return almost
  // immediately. Should not perform expensive computation or remote
  // lookups. Should be a conservative estimate.
  // NOTE: Implementation of this RPC is optional.
  rpc predictModelSize (PredictModelSizeRequest) returns (PredictModelSizeResponse) {}

  // Calculate the size (memory consumption) of a currently-loaded model.
  // NOTE: Implementation of this RPC is only required if models' size
  // is not returned in the response to loadModel. If the size computation
  // takes a nontrivial amount of time, it's better to return from loadModel
  // immediately and implement this to perform the sizing separately.
  rpc modelSize (ModelSizeRequest) returns (ModelSizeResponse) {}

  // Provide basic runtime status and parameters; called only during startup.
  // Before returning a READY status, implementations should check for and
  // purge any/all currently-loaded models. Since this is only called during
  // startup, there should very rarely be any, but if there are it implies
  // the model-mesh container restarted unexpectedly and such a purge must
  // be done to ensure continued consistency of state and avoid
  // over-committing resources.
  rpc runtimeStatus (RuntimeStatusRequest) returns (RuntimeStatusResponse) {}
}
// Request message for the loadModel RPC.
// NOTE(review): the individual fields are not documented in this file;
// the per-field notes below are inferred from the names - confirm against
// the model-mesh implementation.
message LoadModelRequest {
  // Presumably the id of the model to be loaded.
  string modelId = 1;

  // Presumably the type/format of the model.
  string modelType = 2;

  // Presumably the location from which the model can be read.
  string modelPath = 3;

  // Presumably an additional key associated with the model.
  string modelKey = 4;
}
// Response message for the loadModel RPC.
message LoadModelResponse {
  // OPTIONAL - size of the loaded model. When determining the size
  // involves a nontrivial cost, return 0 here and perform the sizing
  // in the modelSize function instead.
  uint64 sizeInBytes = 1;

  // EXPERIMENTAL - only applicable when limitModelConcurrency = true
  // was returned from the runtimeStatus rpc.
  // Refer to RuntimeStatusResponse.limitModelConcurrency for more detail.
  uint32 maxConcurrency = 2;
}
// Request message for the unloadModel RPC.
message UnloadModelRequest {
  // Presumably the id of the model to unload - inferred from the
  // field name, confirm against the model-mesh implementation.
  string modelId = 1;
}
// Response message for the unloadModel RPC; it declares no fields.
message UnloadModelResponse {
}
// Request message for the predictModelSize RPC. Declares the same four
// fields, with the same numbers, as LoadModelRequest.
// NOTE(review): field meanings are not documented in this file; the notes
// below are inferred from the names - confirm against model-mesh.
message PredictModelSizeRequest {
  // Presumably the id of the not-yet-loaded model to size.
  string modelId = 1;

  // Presumably the type/format of the model.
  string modelType = 2;

  // Presumably the location from which the model can be read.
  string modelPath = 3;

  // Presumably an additional key associated with the model.
  string modelKey = 4;
}
// Response message for the predictModelSize RPC.
message PredictModelSizeResponse {
  // Predicted size of the model in bytes; per the RPC's description
  // this should be a conservative estimate.
  uint64 sizeInBytes = 1;
}
// Request message for the modelSize RPC.
message ModelSizeRequest {
  // Presumably the id of the currently-loaded model to size - inferred
  // from the field name, confirm against the model-mesh implementation.
  string modelId = 1;
}
// Response message for the modelSize RPC.
message ModelSizeResponse {
  // Size (memory consumption) of the loaded model, in bytes.
  uint64 sizeInBytes = 1;
}
// Request message for the runtimeStatus RPC; it declares no fields.
message RuntimeStatusRequest {}
// Response message for the runtimeStatus RPC: reports the runtime's
// status together with its capacity, limits and method configuration.
message RuntimeStatusResponse {
  // Status values that a runtime can report.
  enum Status {
    // Zero value, so this is also what an unset status field reads as.
    STARTING = 0;
    READY = 1;
    FAILING = 2; // not used yet
  }
  Status status = 1;

  // memory capacity for static loaded models, in bytes
  uint64 capacityInBytes = 2;

  // maximum number of model loads that can be in-flight at the same time
  uint32 maxLoadingConcurrency = 3;

  // timeout for model loads in milliseconds
  uint32 modelLoadingTimeoutMs = 4;

  // conservative "default" model size,
  // such that "most" models are smaller than this
  uint64 defaultModelSizeInBytes = 5;

  // version string for this model server code
  string runtimeVersion = 6;

  // DEPRECATED - the value of this field is not used,
  // it will be removed in a future update.
  // Marked with the deprecated option so that generated code and
  // tooling flag remaining uses; the field number and type are unchanged.
  uint64 numericRuntimeVersion = 7 [deprecated = true];

  // Per-method information; used as the value type of methodInfos.
  message MethodInfo {
    // Optional path of protobuf field numbers, pointing to a
    // string field within the RPC's request message
    // that should be replaced with the id of the model
    // to which the request applies.
    // All but the last field in the list must be of
    // "embedded message" type, the last one must be of string type.
    repeated uint32 idInjectionPath = 1;
  }

  // Map containing information about specific inferencing
  // gRPC methods exposed by this runtime, such as a path
  // within the protobuf message indicating where the model id
  // should be injected.
  // If non-empty, and allowAnyMethod is not set to true,
  // only RPCs of inference methods contained in this map will
  // be forwarded to the runtime (acts as an allow-list).
  // The method name keys in the map must be fully qualified,
  // including the service name, i.e. "package.ServiceName/MethodName"
  map<string, MethodInfo> methodInfos = 8;

  // EXPERIMENTAL - Set to true to enable the mode where
  // each loaded model reports a maximum inferencing
  // concurrency via the maxConcurrency field of
  // the LoadModelResponse message. Additional requests
  // are queued in the modelmesh framework. Turning this
  // on will also enable latency-based autoscaling for
  // the models, which attempts to minimize request
  // queueing time and requires no other configuration/tuning.
  bool limitModelConcurrency = 9;

  // If true, any/all RPCs will be forwarded to the runtime
  // irrespective of the service/method name. Otherwise,
  // only those present in the methodInfos map will be permitted.
  // NOTE that this will default to being effectively true if
  // the methodInfos map is empty.
  bool allowAnyMethod = 10;
}