Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add container memory limit #1169

Merged
merged 3 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion lzy/execution-env/src/main/java/ai/lzy/env/Environment.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,15 @@ public void signal(int sigValue) {}

InputStream err();

int waitFor() throws InterruptedException;
int waitFor() throws InterruptedException, OomKilledException;

void signal(int sigValue);
}


class OomKilledException extends RuntimeException {
public OomKilledException(String msg) {
super(msg);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ public record DockerEnvDescription(
String networkMode,
DockerClientConfig dockerClientConfig,
String user,
Set<String> allowedPlatforms // In format os/arch like "linux/amd64". Empty means all are allowed
Set<String> allowedPlatforms, // In format os/arch like "linux/amd64". Empty means all are allowed
@Nullable
Long memLimitMb
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These can be put on one line

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, no :)
we place annotations on separate lines

) {

public static Builder newBuilder() {
Expand Down Expand Up @@ -57,6 +59,7 @@ public static class Builder {
DockerClientConfig dockerClientConfig;
String user = ROOT_USER_UID;
Set<String> allowedPlatforms = new HashSet<>();
Long memLimitMb = null;

public Builder withName(String name) {
this.name = name;
Expand Down Expand Up @@ -108,12 +111,17 @@ public Builder withAllowedPlatforms(Collection<String> allowedPlatforms) {
return this;
}

public Builder setMemLimitMb(Long memLimitMb) {
this.memLimitMb = memLimitMb;
return this;
}

public DockerEnvDescription build() {
if (StringUtils.isBlank(name)) {
name = "job-" + RandomStringUtils.randomAlphanumeric(5);
}
return new DockerEnvDescription(name, image, mounts, gpu, envVars, networkMode, dockerClientConfig, user,
allowedPlatforms);
allowedPlatforms, memLimitMb);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.lang.management.ManagementFactory;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -114,6 +115,10 @@ public void install(LogStream outStream, LogStream errStream) throws Environment
hostConfig.withNetworkMode(config.networkMode());
}

if (config.memLimitMb() != null) {
hostConfig.withMemory(config.memLimitMb() * 1024 * 1024);
}

AtomicInteger containerCreatingAttempt = new AtomicInteger(0);
final var container = retry.executeSupplier(() -> {
LOG.info("Creating container {}... (attempt {}); image: {}, config: {}",
Expand Down Expand Up @@ -243,11 +248,21 @@ public InputStream err() {
}

@Override
public int waitFor() throws InterruptedException {
public int waitFor() throws InterruptedException, OomKilledException {
try {
feature.get();
return Math.toIntExact(retry.executeSupplier(() -> client.inspectExecCmd(exec.getId()).exec())
var rc = Math.toIntExact(retry.executeSupplier(() -> client.inspectExecCmd(exec.getId()).exec())
.getExitCodeLong());

if (rc == 0) {
return 0;
}

if (isOomKilled()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. you should check OOM before retcode
  2. right now you do inspect twice: on line 254 and line 385

throw new OomKilledException("Process exited with rc %s, and it was killed by OOM killer".formatted(rc));
}

return rc;
} catch (InterruptedException e) {
try {
startCmd.close();
Expand Down Expand Up @@ -361,4 +376,20 @@ private void checkPlatform(InspectImageResponse inspectImageResponse, LogStream
throw new RuntimeException(msg);
}
}

private boolean isOomKilled() {
if (containerId == null) {
return false;
}

var killed = client.inspectContainerCmd(containerId)
.exec()
.getState().getOOMKilled();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Weird formatting


if (killed != null) {
return killed;
}

return false;
}
}
Loading