Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ ignore =
W503,
# N818: exception name should be named with an Error suffix
N818
# B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`.
# Ignored because of a known false positive; see https://github.com/PyCQA/flake8-bugbear/issues/525
B042
exclude =
.tox,
.git,
Expand Down
25 changes: 20 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ jobs:
name:
- Python 3.9 Tests
- Python 3.10 Tests
- Python 3.9 Tests Coverage
- Python 3.11 Tests
- Python 3.12 Tests
- Python 3.13 Tests
- Python 3.12 Tests Coverage
- Code Checks
include:
- name: Python 3.9 Tests
Expand All @@ -39,12 +42,24 @@ jobs:
python: '3.10'
toxdir: cli
toxenv: py310-nocov
- name: Python 3.9 Tests Coverage
python: 3.9
- name: Python 3.11 Tests
python: '3.11'
toxdir: cli
toxenv: py311-nocov
- name: Python 3.12 Tests
python: '3.12'
toxdir: cli
toxenv: py312-nocov
- name: Python 3.13 Tests
python: '3.13'
toxdir: cli
toxenv: py39-cov
toxenv: py313-nocov
- name: Python 3.12 Tests Coverage
python: 3.12
toxdir: cli
toxenv: py312-cov
- name: Code Checks
python: 3.9
python: 3.12
toxdir: cli
toxenv: code-linters

Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG

This file is used to list changes made in each version of the aws-parallelcluster-node package.

3.15.0
------

**CHANGES**
- Direct users to the slurm_resume log to see EC2 error codes when no instances are launched.

3.14.0
------

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def read(fname):
"clustermgtd = slurm_plugin.clustermgtd:main",
"computemgtd = slurm_plugin.computemgtd:main",
]
version = "3.14.0"
version = "3.15.0"
requires = ["boto3>=1.7.55", "retrying>=1.3.3"]

setup(
Expand Down
3 changes: 2 additions & 1 deletion src/slurm_plugin/clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources(
return
log.info(
"The following compute resources are in down state due to insufficient capacity: %s, "
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired. "
"Check the slurm_resume log for EC2 error codes.",
self._insufficient_capacity_compute_resources,
self._config.insufficient_capacity_timeout,
)
Expand Down
6 changes: 5 additions & 1 deletion src/slurm_plugin/resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,11 @@ def _resume(arg_nodes, resume_config, slurm_resume):
print_with_count(failed_nodes),
)
for error_code, node_list in instance_manager.failed_nodes.items():
_handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes")
_handle_failed_nodes(
node_list,
reason=f"(Code:{error_code})Failure when resuming nodes - "
f"Check the slurm_resume log for EC2 error codes",
)

event_publisher = ClusterEventPublisher.create_with_default_publisher(
event_logger,
Expand Down
7 changes: 7 additions & 0 deletions tests/slurm_plugin/test_clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -3533,6 +3533,13 @@ def test_reset_timeout_expired_compute_resources(
assert_that(cluster_manager._insufficient_capacity_compute_resources).is_equal_to(
expected_insufficient_capacity_compute_resources
)

if expected_insufficient_capacity_compute_resources:
assert (
"compute resources will be reset after insufficient capacity timeout (20 seconds) expired. "
"Check the slurm_resume log for EC2 error codes."
) in caplog.text

if expected_power_save_node_list:
power_save_mock.assert_called_with(
expected_power_save_node_list, reason="Enabling node since insufficient capacity timeout expired"
Expand Down
6 changes: 5 additions & 1 deletion tests/slurm_plugin/test_resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,11 @@ def test_resume_launch(
if expected_failed_nodes:
for error_code, nodeset in expected_failed_nodes.items():
mock_handle_failed_nodes_calls.append(
call(nodeset, reason=f"(Code:{error_code})Failure when resuming nodes")
call(
nodeset,
reason=f"(Code:{error_code})Failure when resuming nodes - "
f"Check the slurm_resume log for EC2 error codes",
)
)
mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)
mock_terminate_instances.assert_called_with(ANY, mock_resume_config.terminate_max_batch_size)
Expand Down
3 changes: 2 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tox]
envlist =
py{39,310}-cov
py{39,310,311,312,313}-cov
code-linters

# Default testenv. Used to run tests on all python versions.
Expand All @@ -14,6 +14,7 @@ usedevelop =
allowlist_externals =
bash
deps =
setuptools
-r tests/requirements.txt
commands =
nocov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --ignore=src tests/
Expand Down