Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix seastar::resource::allocate() error on EC2 m7gd.16xlarge instance #2624

Merged
merged 1 commit into from
Jan 30, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 39 additions & 10 deletions src/core/resource.cc
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ optional<T> read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) {

namespace resource {

// Returns the machine memory size computed from sysconf(): the page size
// (_SC_PAGESIZE) multiplied by the number of physical pages (_SC_PHYS_PAGES).
// The sysconf() results are used as-is; no error checking is performed here.
static unsigned long get_machine_memory_from_sysconf() {
    const auto page_size = ::sysconf(_SC_PAGESIZE);
    const auto phys_pages = size_t(::sysconf(_SC_PHYS_PAGES));
    return page_size * phys_pages;
}

static
size_t
kernel_memory_reservation() {
Expand Down Expand Up @@ -305,13 +309,25 @@ size_t div_roundup(size_t num, size_t denom) {
return (num + denom - 1) / denom;
}

static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
// Returns the total_memory value recorded on the given hwloc object.
// The field is read from obj->total_memory when HWLOC_API_VERSION is
// at least 0x00020000, and from obj->memory.total_memory otherwise.
// NOTE(review): the two `local_memory = node->...` lines below reference a
// name (`node`) that is not a parameter of this function; they look like
// removed-side lines of the diff rendering rather than live code -- confirm
// against the actual file.
static hwloc_uint64_t get_memory_from_hwloc_obj(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
// FIXME: support nodes with multiple NUMA nodes, whatever that means
auto local_memory = node->total_memory;
auto total_memory = obj->total_memory;
#else
auto local_memory = node->memory.local_memory;
auto total_memory = obj->memory.total_memory;
#endif
return total_memory;
}

// Overwrites the total_memory value stored on an hwloc object with `memory`.
// The field lives at machine->memory.total_memory when HWLOC_API_VERSION is
// below 0x00020000 and at machine->total_memory otherwise.
static void set_memory_to_hwloc_obj(hwloc_obj_t machine, hwloc_uint64_t memory) {
#if HWLOC_API_VERSION < 0x00020000
    machine->memory.total_memory = memory;
#else
    machine->total_memory = memory;
#endif
}

static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
auto local_memory = get_memory_from_hwloc_obj(node);
auto taken = std::min(local_memory - used_mem[node], alloc);
if (taken) {
used_mem[node] += taken;
Expand Down Expand Up @@ -574,11 +590,13 @@ resources allocate(configuration& c) {
auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE);
assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1);
auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0);
#if HWLOC_API_VERSION >= 0x00020000
auto available_memory = machine->total_memory;
#else
auto available_memory = machine->memory.total_memory;
#endif
auto available_memory = get_memory_from_hwloc_obj(machine);
if (!available_memory) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could reuse get_local_memory_from_node() for retrieving the size of memory owned by the machine.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Applied

available_memory = get_machine_memory_from_sysconf();
set_memory_to_hwloc_obj(machine, available_memory);
seastar_logger.warn("hwloc failed to detect machine-wide memory size, using memory size fetched from sysconf");
}

size_t mem = calculate_memory(c, std::min(available_memory,
cgroup::memory_limit()));
// limit memory address to fit in 36-bit, see core/memory.cc:Memory map
Expand All @@ -592,6 +610,7 @@ resources allocate(configuration& c) {
std::vector<std::pair<cpu, size_t>> remains;

auto cpu_sets = distribute_objects(topology, procs);
auto num_nodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE);

for (auto&& cs : cpu_sets()) {
auto cpu_id = hwloc_bitmap_first(cs);
Expand All @@ -601,6 +620,16 @@ resources allocate(configuration& c) {
if (node == nullptr) {
orphan_pus.push_back(cpu_id);
} else {
if (!get_memory_from_hwloc_obj(node)) {
// If hwloc fails to detect the hardware topology, it falls back to treating
// the system as a single-node configuration. While this code supports
// multi-node setups, the fallback behavior is safe and will function
// correctly in this case.
assert(num_nodes == 1);
auto local_memory = get_machine_memory_from_sysconf();
set_memory_to_hwloc_obj(node, local_memory);
seastar_logger.warn("hwloc failed to detect NUMA node memory size, using memory size fetched from sysfs");
}
cpu_to_node[cpu_id] = node;
seastar_logger.debug("Assign CPU{} to NUMA{}", cpu_id, node->os_index);
}
Expand Down Expand Up @@ -730,7 +759,7 @@ allocate_io_queues(configuration c, std::vector<cpu> cpus) {
resources allocate(configuration& c) {
resources ret;

auto available_memory = ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES));
auto available_memory = get_machine_memory_from_sysconf();
auto mem = calculate_memory(c, available_memory);
auto procs = c.cpus;
ret.cpus.reserve(procs);
Expand Down