-
Notifications
You must be signed in to change notification settings - Fork 75
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add new health check configuration fields #1290
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,7 +41,8 @@ | |
DEFAULT_PREDICT_CONCURRENCY = 1 | ||
DEFAULT_STREAMING_RESPONSE_READ_TIMEOUT = 60 | ||
DEFAULT_ENABLE_TRACING_DATA = False # This should be in sync with tracing.py. | ||
|
||
MAX_FAILURE_THRESHOLD_SECONDS = 1800 | ||
MIN_FAILURE_THRESHOLD_SECONDS = 30 | ||
DEFAULT_CPU = "1" | ||
DEFAULT_MEMORY = "2Gi" | ||
DEFAULT_USE_GPU = False | ||
|
@@ -154,12 +155,70 @@ def to_list(self, verbose=False) -> List[Dict[str, str]]: | |
return [model.to_dict(verbose=verbose) for model in self.models] | ||
|
||
|
||
@dataclass | ||
class HealthCheck: | ||
spal1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
restart_check_delay_seconds: int = 0 | ||
restart_failure_threshold_seconds: int = MAX_FAILURE_THRESHOLD_SECONDS | ||
stop_traffic_failure_threshold_seconds: int = MAX_FAILURE_THRESHOLD_SECONDS | ||
|
||
def __post_init__(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we want to do these validations on the backend. That gives us more flexibility to not be stuck with these checks for old clients. Can you look into how that might work? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed offline--I'll use this as an opportunity to create a first version of a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand why we write so much boilerplate code for this here when pydantic can do most of this. - more a comment on pre-existing code though... I think we should refactor this whole file soon. It's also super error prone that you have to add key literals to |
||
if not (0 <= self.restart_check_delay_seconds <= MAX_FAILURE_THRESHOLD_SECONDS): | ||
raise ValidationError( | ||
f"restart_check_delay_seconds must be between 0 and {MAX_FAILURE_THRESHOLD_SECONDS} seconds." | ||
) | ||
if not ( | ||
MIN_FAILURE_THRESHOLD_SECONDS | ||
<= self.restart_failure_threshold_seconds | ||
<= MAX_FAILURE_THRESHOLD_SECONDS | ||
): | ||
raise ValidationError( | ||
f"restart_failure_threshold_seconds must be between {MIN_FAILURE_THRESHOLD_SECONDS} and {MAX_FAILURE_THRESHOLD_SECONDS} seconds." | ||
) | ||
if not ( | ||
MIN_FAILURE_THRESHOLD_SECONDS | ||
<= self.stop_traffic_failure_threshold_seconds | ||
<= MAX_FAILURE_THRESHOLD_SECONDS | ||
): | ||
raise ValidationError( | ||
f"stop_traffic_failure_threshold_seconds must be between {MIN_FAILURE_THRESHOLD_SECONDS} and {MAX_FAILURE_THRESHOLD_SECONDS} seconds." | ||
) | ||
|
||
if ( | ||
self.restart_check_delay_seconds + self.restart_failure_threshold_seconds | ||
> MAX_FAILURE_THRESHOLD_SECONDS | ||
): | ||
raise ValidationError( | ||
"The sum of restart_check_delay_seconds and max_failures_before_restart " | ||
f"must not exceed {MAX_FAILURE_THRESHOLD_SECONDS} seconds." | ||
) | ||
|
||
@staticmethod | ||
def from_dict(d): | ||
return HealthCheck( | ||
restart_check_delay_seconds=d.get("restart_check_delay_seconds", 0), | ||
restart_failure_threshold_seconds=d.get( | ||
"restart_failure_threshold_seconds", MAX_FAILURE_THRESHOLD_SECONDS | ||
), | ||
stop_traffic_failure_threshold_seconds=d.get( | ||
"stop_traffic_failure_threshold_seconds", MAX_FAILURE_THRESHOLD_SECONDS | ||
), | ||
) | ||
|
||
def to_dict(self): | ||
return { | ||
"restart_check_delay_seconds": self.restart_check_delay_seconds, | ||
"restart_failure_threshold_seconds": self.restart_failure_threshold_seconds, | ||
"stop_traffic_failure_threshold_seconds": self.stop_traffic_failure_threshold_seconds, | ||
} | ||
|
||
|
||
@dataclass | ||
class Runtime: | ||
predict_concurrency: int = DEFAULT_PREDICT_CONCURRENCY | ||
streaming_read_timeout: int = DEFAULT_STREAMING_RESPONSE_READ_TIMEOUT | ||
enable_tracing_data: bool = DEFAULT_ENABLE_TRACING_DATA | ||
enable_debug_logs: bool = False | ||
health_checks: HealthCheck = field(default_factory=HealthCheck) | ||
|
||
@staticmethod | ||
def from_dict(d): | ||
|
@@ -176,18 +235,21 @@ def from_dict(d): | |
"streaming_read_timeout", DEFAULT_STREAMING_RESPONSE_READ_TIMEOUT | ||
) | ||
enable_tracing_data = d.get("enable_tracing_data", DEFAULT_ENABLE_TRACING_DATA) | ||
health_checks = HealthCheck.from_dict(d.get("health_checks", {})) | ||
|
||
return Runtime( | ||
predict_concurrency=predict_concurrency, | ||
streaming_read_timeout=streaming_read_timeout, | ||
enable_tracing_data=enable_tracing_data, | ||
health_checks=health_checks, | ||
) | ||
|
||
def to_dict(self): | ||
return { | ||
"predict_concurrency": self.predict_concurrency, | ||
"streaming_read_timeout": self.streaming_read_timeout, | ||
"enable_tracing_data": self.enable_tracing_data, | ||
"health_checks": self.health_checks.to_dict(), | ||
} | ||
|
||
|
||
|
@@ -819,6 +881,10 @@ def obj_to_dict(obj, verbose: bool = False): | |
d["docker_auth"] = transform_optional( | ||
field_curr_value, lambda data: data.to_dict() | ||
) | ||
elif isinstance(field_curr_value, HealthCheck): | ||
d[field_name] = transform_optional( | ||
field_curr_value, lambda data: data.to_dict() | ||
) | ||
else: | ||
d[field_name] = field_curr_value | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Chains doesn't map 1:1 to the structure of truss config. I wonder if it would make sense to add this field to
ChainletOptions
instead of creating a new class.If you want to proceed with a new class, then there should be some differentiator, explaining which future fields go into
Runtime
and which ones go intoChainletOptions
.