diff --git a/frameworks/cassandra/universe/marathon.json.mustache b/frameworks/cassandra/universe/marathon.json.mustache index abe755e1ff7..c921146c6b0 100644 --- a/frameworks/cassandra/universe/marathon.json.mustache +++ b/frameworks/cassandra/universe/marathon.json.mustache @@ -145,6 +145,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/frameworks/elastic/universe/marathon.json.mustache b/frameworks/elastic/universe/marathon.json.mustache index 37b3df99c9a..11a9a00977e 100644 --- a/frameworks/elastic/universe/marathon.json.mustache +++ b/frameworks/elastic/universe/marathon.json.mustache @@ -86,6 +86,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/frameworks/hdfs/universe/marathon.json.mustache b/frameworks/hdfs/universe/marathon.json.mustache index c06a175db19..493b080ed6a 100644 --- a/frameworks/hdfs/universe/marathon.json.mustache +++ b/frameworks/hdfs/universe/marathon.json.mustache @@ -117,6 +117,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 
900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/frameworks/helloworld/universe/marathon.json.mustache b/frameworks/helloworld/universe/marathon.json.mustache index 1a9967c9e63..15aedbbfb79 100644 --- a/frameworks/helloworld/universe/marathon.json.mustache +++ b/frameworks/helloworld/universe/marathon.json.mustache @@ -55,6 +55,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/frameworks/kafka/universe/marathon.json.mustache b/frameworks/kafka/universe/marathon.json.mustache index ee8c6b24e16..97c198aaeeb 100644 --- a/frameworks/kafka/universe/marathon.json.mustache +++ b/frameworks/kafka/universe/marathon.json.mustache @@ -156,6 +156,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/frameworks/prototype/universe/marathon.json.mustache 
b/frameworks/prototype/universe/marathon.json.mustache index 3b6f31e2ca4..e1fbba55176 100644 --- a/frameworks/prototype/universe/marathon.json.mustache +++ b/frameworks/prototype/universe/marathon.json.mustache @@ -29,6 +29,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/frameworks/proxylite/universe/marathon.json.mustache b/frameworks/proxylite/universe/marathon.json.mustache index f120438ad62..bc3682925bf 100644 --- a/frameworks/proxylite/universe/marathon.json.mustache +++ b/frameworks/proxylite/universe/marathon.json.mustache @@ -30,6 +30,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/frameworks/spark/universe/marathon.json.mustache b/frameworks/spark/universe/marathon.json.mustache index e6e4d9d110c..68c1e285653 100644 --- a/frameworks/spark/universe/marathon.json.mustache +++ b/frameworks/spark/universe/marathon.json.mustache @@ -44,6 +44,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + 
"timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/frameworks/template/universe/marathon.json.mustache b/frameworks/template/universe/marathon.json.mustache index af9b26c02f8..727bdae15e1 100644 --- a/frameworks/template/universe/marathon.json.mustache +++ b/frameworks/template/universe/marathon.json.mustache @@ -51,6 +51,26 @@ "minimumHealthCapacity": 0, "maximumOverCapacity": 0 }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/v1/plans/deploy", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + }, + { + "protocol": "HTTP", + "path": "/v1/plans/recovery", + "gracePeriodSeconds": 900, + "intervalSeconds": 30, + "portIndex": 0, + "timeoutSeconds": 30, + "maxConsecutiveFailures": 0 + } + ], "portDefinitions": [ { "port": 0, diff --git a/sdk/common/src/main/java/com/mesosphere/sdk/offer/CommonTaskUtils.java b/sdk/common/src/main/java/com/mesosphere/sdk/offer/CommonTaskUtils.java index 4d9d835244d..e78fc8b9c82 100644 --- a/sdk/common/src/main/java/com/mesosphere/sdk/offer/CommonTaskUtils.java +++ b/sdk/common/src/main/java/com/mesosphere/sdk/offer/CommonTaskUtils.java @@ -360,16 +360,17 @@ public static Environment.Builder fromMapToEnvironment(Map envir } /** - * Invokes {@link #sendStatus(ExecutorDriver, TaskState, TaskID, SlaveID, ExecutorID, String, byte[])} with a null - * {@code data} value. + * Invokes {@link #sendStatus(ExecutorDriver, TaskState, TaskID, SlaveID, ExecutorID, String, boolean, Labels, byte[])} + * with null {@code labels} and {@code data} values.
*/ public static void sendStatus(ExecutorDriver driver, TaskState state, TaskID taskID, SlaveID slaveID, ExecutorID executorID, - String message) { - sendStatus(driver, state, taskID, slaveID, executorID, message, null, null); + String message, + boolean isHealthy) { + sendStatus(driver, state, taskID, slaveID, executorID, message, isHealthy, null, null); } /** @@ -381,6 +382,7 @@ public static void sendStatus(ExecutorDriver driver, SlaveID slaveID, ExecutorID executorID, String message, + boolean isHealthy, Labels labels, byte[] data) { final TaskStatus.Builder builder = TaskStatus.newBuilder(); @@ -391,6 +393,7 @@ public static void sendStatus(ExecutorDriver driver, builder.setSlaveId(slaveID); builder.setExecutorId(executorID); builder.setSource(TaskStatus.Source.SOURCE_EXECUTOR); + builder.setHealthy(isHealthy); if (data != null) { builder.setData(ByteString.copyFrom(data)); diff --git a/sdk/executor/src/main/java/com/mesosphere/sdk/executor/CustomExecutor.java b/sdk/executor/src/main/java/com/mesosphere/sdk/executor/CustomExecutor.java index cf6df9cfcd5..e3ac1efc0da 100644 --- a/sdk/executor/src/main/java/com/mesosphere/sdk/executor/CustomExecutor.java +++ b/sdk/executor/src/main/java/com/mesosphere/sdk/executor/CustomExecutor.java @@ -82,8 +82,10 @@ public void launchTask(final ExecutorDriver driver, final Protos.TaskInfo task) task.getTaskId(), task.getSlaveId(), task.getExecutor().getExecutorId(), - String.format("Exception launching task %s", - t.getMessage())); + String.format( + "Exception launching task %s", + t.getMessage()), + false); } } diff --git a/sdk/executor/src/main/java/com/mesosphere/sdk/executor/HealthCheckHandler.java b/sdk/executor/src/main/java/com/mesosphere/sdk/executor/HealthCheckHandler.java index f4eb03b8ec5..ef19c10de50 100644 --- a/sdk/executor/src/main/java/com/mesosphere/sdk/executor/HealthCheckHandler.java +++ b/sdk/executor/src/main/java/com/mesosphere/sdk/executor/HealthCheckHandler.java @@ -4,7 +4,6 @@ import 
com.google.protobuf.TextFormat; import com.mesosphere.sdk.offer.CommonTaskUtils; import com.mesosphere.sdk.offer.Constants; - import org.apache.mesos.ExecutorDriver; import org.apache.mesos.Protos; import org.slf4j.Logger; @@ -191,9 +190,26 @@ public void run() { private void handleHealthCheck() { if (healthCheckStats.getConsecutiveFailures() >= healthCheck.getConsecutiveFailures()) { + CommonTaskUtils.sendStatus( + executorDriver, + Protos.TaskState.TASK_FAILED, + taskInfo.getTaskId(), + taskInfo.getSlaveId(), + taskInfo.getExecutor().getExecutorId(), + "Health check failed ", + false); throw new HealthCheckRuntimeException( "Health check exceeded its maximum consecutive failures.", healthCheckStats); + } else if (healthCheckStats.getConsecutiveSuccesses() == 1) { + CommonTaskUtils.sendStatus( + executorDriver, + Protos.TaskState.TASK_RUNNING, + taskInfo.getTaskId(), + taskInfo.getSlaveId(), + taskInfo.getExecutor().getExecutorId(), + "Health check passed", + true); } } @@ -211,6 +227,7 @@ private void handleReadinessCheck() { taskInfo.getSlaveId(), taskInfo.getExecutor().getExecutorId(), "Readiness check passed", + true, labels, null); throw new HealthCheckRuntimeException("Readiness check passed.", healthCheckStats); diff --git a/sdk/executor/src/main/java/com/mesosphere/sdk/executor/ProcessTask.java b/sdk/executor/src/main/java/com/mesosphere/sdk/executor/ProcessTask.java index d22a297b1de..3216dcd0033 100644 --- a/sdk/executor/src/main/java/com/mesosphere/sdk/executor/ProcessTask.java +++ b/sdk/executor/src/main/java/com/mesosphere/sdk/executor/ProcessTask.java @@ -1,8 +1,8 @@ package com.mesosphere.sdk.executor; +import com.mesosphere.sdk.offer.CommonTaskUtils; import org.apache.mesos.ExecutorDriver; import org.apache.mesos.Protos; -import com.mesosphere.sdk.offer.CommonTaskUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -79,11 +79,12 @@ public void run() { final String errorMessage = "Empty command found for: " + taskInfo.getName(); 
CommonTaskUtils.sendStatus( driver, - Protos.TaskState.TASK_ERROR, + Protos.TaskState.TASK_FAILED, taskInfo.getTaskId(), taskInfo.getSlaveId(), taskInfo.getExecutor().getExecutorId(), - errorMessage); + errorMessage, + false); return; } @@ -96,7 +97,8 @@ public void run() { taskInfo.getTaskId(), taskInfo.getSlaveId(), taskInfo.getExecutor().getExecutorId(), - startMessage); + startMessage, + true); initialized.complete(true); LOGGER.info(startMessage); @@ -106,15 +108,18 @@ public void run() { exit.complete(exitValue); Protos.TaskState taskState; + boolean isHealthy = true; if (exitValue == 0) { taskState = Protos.TaskState.TASK_FINISHED; exitMessage += exitValue; } else if (exitValue > 128) { taskState = Protos.TaskState.TASK_KILLED; exitMessage += (exitValue - 128); + isHealthy = false; } else { - taskState = Protos.TaskState.TASK_ERROR; + taskState = Protos.TaskState.TASK_FAILED; exitMessage += exitValue; + isHealthy = false; } CommonTaskUtils.sendStatus( @@ -123,7 +128,8 @@ public void run() { taskInfo.getTaskId(), taskInfo.getSlaveId(), taskInfo.getExecutor().getExecutorId(), - exitMessage); + exitMessage, + isHealthy); LOGGER.info(exitMessage); if (exitOnTermination) { @@ -141,7 +147,8 @@ public void run() { taskInfo.getTaskId(), taskInfo.getSlaveId(), taskInfo.getExecutor().getExecutorId(), - e.getMessage()); + e.getMessage(), + false); if (exitOnTermination) { driver.abort(); } diff --git a/testing/sdk_install.py b/testing/sdk_install.py index 9fa0d7c84c6..a7ee84f6581 100644 --- a/testing/sdk_install.py +++ b/testing/sdk_install.py @@ -56,7 +56,7 @@ def is_deployment_finished(): print('Checking that deployment of {} has ended:\n- Deploying apps: {}'.format(service_name, deploying_apps)) return not '/{}'.format(service_name) in deploying_apps print("Waiting for marathon deployment to finish...") - sdk_spin.time_wait_noisy(is_deployment_finished, timeout_seconds=30) + sdk_spin.time_wait_noisy(is_deployment_finished) # 4. Ensure the framework is suppressed. #