-
Notifications
You must be signed in to change notification settings - Fork 227
Closed
Labels
Description
Intel MPI E2E tests failed in CI:
mpi-operator/test/e2e/mpi_job_test.go
Lines 207 to 272 in c738a83
ginkgo.Context("with Intel Implementation", func() {
	ginkgo.BeforeEach(func() {
		mpiJob.Spec.MPIImplementation = kubeflow.MPIImplementationIntel
		mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers = []corev1.Container{
			{
				Name: "launcher",
				Image: intelMPIImage,
				ImagePullPolicy: corev1.PullIfNotPresent, // use locally built image.
				Command: []string{}, // uses entrypoint.
				Args: []string{
					"mpirun",
					"-n",
					"2",
					"/home/mpiuser/pi",
				},
			},
		}
		mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers = []corev1.Container{
			{
				Name: "worker",
				Image: intelMPIImage,
				ImagePullPolicy: corev1.PullIfNotPresent, // use locally built image.
				Command: []string{}, // uses entrypoint.
				Args: []string{
					"/usr/sbin/sshd",
					"-De",
				},
				ReadinessProbe: &corev1.Probe{
					ProbeHandler: corev1.ProbeHandler{
						TCPSocket: &corev1.TCPSocketAction{
							Port: intstr.FromInt(2222),
						},
					},
					InitialDelaySeconds: 3,
				},
			},
		}
	})
	ginkgo.When("running as root", func() {
		ginkgo.It("should succeed", func() {
			mpiJob := createJobAndWaitForCompletion(mpiJob)
			expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
		})
	})
	ginkgo.When("running as non-root", func() {
		ginkgo.BeforeEach(func() {
			mpiJob.Spec.SSHAuthMountPath = "/home/mpiuser/.ssh"
			mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{
				RunAsUser: ptr.To[int64](1000),
			}
			workerContainer := &mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers[0]
			workerContainer.SecurityContext = &corev1.SecurityContext{
				RunAsUser: ptr.To[int64](1000),
			}
			workerContainer.Args = append(workerContainer.Args, "-f", "/home/mpiuser/.sshd_config")
		})
		ginkgo.It("should succeed", func() {
			mpiJob := createJobAndWaitForCompletion(mpiJob)
			expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
		})
	})
})
== BEGIN pi-launcher-jlll8 pod logs ==
:: initializing oneAPI environment ...
entrypoint.sh: BASH_VERSION = 5.2.15(1)-release
args: Using "$@" for setvars.sh arguments: mpirun -n 2 /home/mpiuser/pi
:: mpi -- latest
:: oneAPI environment initialized ::
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher... Retrying
Couldn't resolve pi-launcher
Resolved pi-worker-0.pi.e2e-89g8w.svc
Resolved pi-worker-1.pi.e2e-89g8w.svc
Rank 1 on host pi-worker-1
Workers: 2
Rank 0 on host pi-worker-0
== END pi-launcher-jlll8 pod logs ==
== BEGIN pi-worker-0 pod logs ==
:: initializing oneAPI environment ...
entrypoint.sh: BASH_VERSION = 5.2.15(1)-release
args: Using "$@" for setvars.sh arguments: /usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
:: mpi -- latest
:: oneAPI environment initialized ::
Server listening on 0.0.0.0 port 2222.
Server listening on :: port 2222.
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 56026
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 36592
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 34520
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 37484
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 60796
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 35102
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 51398
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 43206
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 38376
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 46258
Accepted publickey for mpiuser from 10.244.0.26 port 36268 ssh2: ECDSA SHA256:CNZ8oQS0SYjcnyrsp2ZlAKHnT+8J+ZSTFENzWrT+9vQ
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 38498
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 37542
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 38298
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 44600
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 41802
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 52118
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 32934
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 54520
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 48770
== END pi-worker-0 pod logs ==
== BEGIN pi-worker-1 pod logs ==
:: initializing oneAPI environment ...
entrypoint.sh: BASH_VERSION = 5.2.15(1)-release
args: Using "$@" for setvars.sh arguments: /usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
:: mpi -- latest
:: oneAPI environment initialized ::
Server listening on 0.0.0.0 port 2222.
Server listening on :: port 2222.
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 47954
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 58200
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 33168
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 42132
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 54286
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 56804
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 51658
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 56848
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 55746
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 55400
Accepted publickey for mpiuser from 10.244.0.26 port 42930 ssh2: ECDSA SHA256:CNZ8oQS0SYjcnyrsp2ZlAKHnT+8J+ZSTFENzWrT+9vQ
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 37518
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 35264
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 57206
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 46430
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 40660
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 36338
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 43074
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 53162
kex_exchange_identification: Connection closed by remote host
Connection closed by 10.244.0.1 port 47790
== END pi-worker-1 pod logs ==
[...]
Summarizing 2 Failures:
[Fail] MPIJob with Intel Implementation when running as root [It] should succeed
/home/runner/work/mpi-operator/mpi-operator/test/e2e/mpi_job_test.go:571
[Fail] MPIJob with Intel Implementation when running as non-root [It] should succeed
/home/runner/work/mpi-operator/mpi-operator/test/e2e/mpi_job_test.go:571
Ran 13 of 13 Specs in 836.549 seconds
FAIL! -- 11 Passed | 2 Failed | 0 Pending | 0 Skipped
--- FAIL: TestE2E (836.55s)
FAIL
FAIL github.com/kubeflow/mpi-operator/test/e2e 836.571s
FAIL
We have seen this problem in the following: