ci: delete znode tests (#10201)
djanicekpach authored Nov 13, 2024
1 parent 11a2581 commit e56dd5a
Showing 8 changed files with 2 additions and 588 deletions.
402 changes: 0 additions & 402 deletions .circleci/real_config.yml

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion e2e_tests/pytest.ini
@@ -20,7 +20,6 @@ markers =
e2e_pbs: end to end pbs integration tests
e2e_saml: tests for saml with okta
e2e_slurm: end to end slurm integration tests
e2e_slurm_restart: slurm integration tests that require restarting the master
e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
test_oauth: end to end test for oauth client, add, remove in EE.
test_model_registry_rbac: end to end test for RBAC model registry.
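With `e2e_slurm_restart` gone from the `markers` list, any test still carrying that mark would be reported as unknown (and rejected outright under pytest's `--strict-markers` option). A minimal sketch of how a registered marker gates test selection — the test names and the `--strict-markers` flag below are illustrative, not taken from this repository:

```python
import pytest


# Hypothetical tests illustrating marker-based selection; not from this repo.
@pytest.mark.e2e_slurm  # marker still registered in pytest.ini after this commit
def test_job_submits() -> None:
    assert True


@pytest.mark.e2e_slurm_restart  # no longer registered; an error under --strict-markers
def test_master_comes_back() -> None:
    assert True


# CI jobs typically select a suite by marker, e.g.:
#   pytest --strict-markers -m e2e_slurm e2e_tests/tests
```
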
4 changes: 0 additions & 4 deletions e2e_tests/tests/cluster/conftest.py
@@ -11,7 +11,3 @@
restartable_managed_cluster_multi_resource_pools,
)
from .managed_cluster_k8s import k8s_managed_cluster # noqa
from .managed_slurm_cluster import ( # noqa
managed_slurm_cluster_restarts,
managed_slurm_cluster_session,
)
110 changes: 0 additions & 110 deletions e2e_tests/tests/cluster/managed_slurm_cluster.py

This file was deleted.

54 changes: 1 addition & 53 deletions e2e_tests/tests/cluster/test_master_restart.py
@@ -1,6 +1,5 @@
import logging
import time
from typing import Iterator

import docker
import pytest
@@ -15,31 +14,13 @@
from tests import config as conf
from tests import detproc
from tests import experiment as exp
from tests.cluster import (
abstract_cluster,
managed_cluster,
managed_cluster_k8s,
managed_slurm_cluster,
utils,
)
from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils
from tests.experiment import noop
from tests.task import task

logger = logging.getLogger(__name__)


# Create a pytest fixture that returns a restartable instance of ManagedSlurmCluster.
@pytest.fixture
def restartable_managed_slurm_cluster(
managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster,
) -> Iterator[managed_slurm_cluster.ManagedSlurmCluster]:
try:
yield managed_slurm_cluster_restarts
except Exception:
managed_slurm_cluster_restarts.restart_master()
raise


@pytest.mark.managed_devcluster
def test_master_restart_ok(restartable_managed_cluster: managed_cluster.ManagedCluster) -> None:
_test_master_restart_ok(restartable_managed_cluster)
@@ -90,14 +71,6 @@ def test_master_restart_ok_k8s(k8s_managed_cluster: managed_cluster_k8s.ManagedK
_test_master_restart_ok(k8s_managed_cluster)


# Test to ensure master restarts successfully.
@pytest.mark.e2e_slurm_restart
def test_master_restart_ok_slurm(
managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster,
) -> None:
_test_master_restart_ok(managed_slurm_cluster_restarts)


def _test_master_restart_ok(managed_cluster: abstract_cluster.Cluster) -> None:
# - Kill master
# - Restart master
@@ -143,18 +116,6 @@ def test_master_restart_reattach_recover_experiment_k8s(
_test_master_restart_reattach_recover_experiment(k8s_managed_cluster, downtime)


# Test to ensure that master can reattach to the experiment and resume it, after the determined
# master has restarted.
@pytest.mark.e2e_slurm_restart
@pytest.mark.parametrize("downtime", [0, 20, 60])
def test_master_restart_reattach_recover_experiment_slurm(
managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster, downtime: int
) -> None:
_test_master_restart_reattach_recover_experiment(
managed_slurm_cluster_restarts, downtime, max_workload_ticks=500
)


@pytest.mark.managed_devcluster
def test_master_agent_restart_reattach_recover_experiment(
restartable_managed_cluster: managed_cluster.ManagedCluster,
@@ -544,19 +505,6 @@ def test_master_restart_cmd_k8s(
_test_master_restart_cmd(k8s_managed_cluster, slots, downtime)


# Test to ensure that master can recover and complete a command that was in running state
# when the master has restarted.
@pytest.mark.e2e_slurm_restart
@pytest.mark.parametrize("slots", [0, 1])
@pytest.mark.parametrize("downtime", [0, 20, 60])
def test_master_restart_cmd_slurm(
restartable_managed_slurm_cluster: managed_slurm_cluster.ManagedSlurmCluster,
slots: int,
downtime: int,
) -> None:
_test_master_restart_cmd(restartable_managed_slurm_cluster, slots, downtime)


def _test_master_restart_cmd(
managed_cluster: abstract_cluster.Cluster, slots: int, downtime: int
) -> None:
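The deleted `restartable_managed_slurm_cluster` fixture followed the same restart-on-failure pattern that the remaining devcluster and k8s fixtures use: yield the cluster to the test, and if the test raises while the master is down, restart it so later tests start from a healthy state. A minimal sketch of that pattern — the fixture and parameter names below are illustrative, and `restart_master` simply mirrors the call visible in the deleted code rather than a confirmed method of `abstract_cluster.Cluster`:

```python
from typing import Iterator

import pytest

from tests.cluster import abstract_cluster


@pytest.fixture
def restartable_cluster(
    some_managed_cluster: abstract_cluster.Cluster,  # hypothetical upstream fixture
) -> Iterator[abstract_cluster.Cluster]:
    try:
        # Hand the cluster to the test; the test may kill and restart the master itself.
        yield some_managed_cluster
    except Exception:
        # If the test failed while the master was down, bring it back up so the
        # failure does not cascade into every test that runs afterwards.
        some_managed_cluster.restart_master()  # assumed available on the cluster object
        raise
```
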
14 changes: 0 additions & 14 deletions e2e_tests/tests/cluster/test_slurm.py
@@ -170,20 +170,6 @@ def test_docker_login() -> None:
)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="no gpu available")
@pytest.mark.e2e_slurm
@pytest.mark.e2e_pbs
@api_utils.skipif_not_hpc()
def test_mnist_pytorch_distributed() -> None:
sess = api_utils.user_session()
config = conf.load_config(conf.tutorials_path("mnist_pytorch/distributed.yaml"))
assert "--epochs 1" in config["entrypoint"], "update test to match tutorial"
config["entrypoint"] = config["entrypoint"].replace("--epochs 1", "--batches 64")
config["max_restarts"] = 0

exp.run_basic_test_with_temp_config(sess, config, conf.fixtures_path("mnist_pytorch"), 1)


@pytest.mark.e2e_slurm
@pytest.mark.e2e_pbs
@api_utils.skipif_not_hpc()
1 change: 0 additions & 1 deletion e2e_tests/tests/conftest.py
@@ -34,7 +34,6 @@
"e2e_pbs",
"e2e_saml",
"e2e_slurm",
"e2e_slurm_restart",
"e2e_slurm_internet_connected_cluster",
"det_deploy_local",
"test_oauth",
4 changes: 1 addition & 3 deletions tools/slurm/README.md
@@ -148,9 +148,7 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
**On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**
The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
- `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
- `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.
## Important Workaround Explained
