From 3a29fe0721eaba5870a8e44e667164752dfa1fb9 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Wed, 11 Mar 2026 15:59:22 +0800 Subject: [PATCH 1/3] feat: Mooncake backend implementation - Add MooncakeHybridRolloutTransfer with put_rollout/get_rollout (multi_buf) - Add MooncakeStoreConfig and get_data_transfer_backend in data_transfer.py - Add --transfer-backend (ray|mooncake|mooncake_legacy) and --mooncake-mount-segment-size - Wire Mooncake backend in FSDP/Megatron actors and rollout pipeline - Support DataTransferBackend in process_rollout_data for disaggregated training Made-with: Cursor --- slime/backends/megatron_utils/actor.py | 5 + slime/ray/rollout.py | 5 +- slime/utils/arguments.py | 20 + slime/utils/data.py | 24 +- slime/utils/data_transfer.py | 154 +++ slime/utils/rollout_hybrid_transfer.py | 1236 ++++++++++++++++++++++++ 6 files changed, 1440 insertions(+), 4 deletions(-) create mode 100644 slime/utils/data_transfer.py create mode 100644 slime/utils/rollout_hybrid_transfer.py diff --git a/slime/backends/megatron_utils/actor.py b/slime/backends/megatron_utils/actor.py index af4880cf49..b27127d61e 100644 --- a/slime/backends/megatron_utils/actor.py +++ b/slime/backends/megatron_utils/actor.py @@ -16,6 +16,7 @@ from slime.ray.train_actor import TrainRayActor from slime.utils import train_dump_utils from slime.utils.data import process_rollout_data +from slime.utils.data_transfer import get_data_transfer_backend from slime.utils.distributed_utils import get_gloo_group, init_process_group from slime.utils.logging_utils import init_tracking from slime.utils.memory_utils import clear_memory, print_memory @@ -75,6 +76,9 @@ def init( self.train_parallel_config = { "dp_size": mpu.get_data_parallel_world_size(with_context_parallel=False), } + + self.transfer_backend = get_data_transfer_backend(args) + dist.barrier(group=get_gloo_group()) if args.offload_train: @@ -191,6 +195,7 @@ def _get_rollout_data(self, rollout_data_ref: Box) -> RolloutBatch: rollout_data_ref, 
mpu.get_data_parallel_rank(with_context_parallel=False), mpu.get_data_parallel_world_size(with_context_parallel=False), + transfer_backend=self.transfer_backend, ) # TODO: this is ugly, move to somewhere else? # move tokens to GPU in advance diff --git a/slime/ray/rollout.py b/slime/ray/rollout.py index 7ea0d12b56..6afdf3f6df 100644 --- a/slime/ray/rollout.py +++ b/slime/ray/rollout.py @@ -18,6 +18,7 @@ from slime.backends.sglang_utils.sglang_engine import SGLangEngine from slime.rollout.base_types import call_rollout_fn from slime.utils import logging_utils +from slime.utils.data_transfer import get_data_transfer_backend from slime.utils.health_monitor import RolloutHealthMonitor from slime.utils.http_utils import _wrap_ipv6, find_available_port, get_host_info, init_http_client from slime.utils.logging_utils import configure_logger, init_tracking @@ -390,6 +391,8 @@ def __init__(self, args, pg): self._health_monitors.append(monitor) self._ci_fault_injection_pending = self.args.ci_test # Flag for CI fault injection + self.transfer_backend = get_data_transfer_backend(args) + def _try_ci_fault_injection(self): """Try to inject fault during generate (when health monitor is running).""" if not self._ci_fault_injection_pending: @@ -779,7 +782,7 @@ def _split_train_data_by_dp(self, data, dp_size): # Pass dynamic global_batch_size to training side if hasattr(self, "_dynamic_global_batch_size"): rollout_data["dynamic_global_batch_size"] = self._dynamic_global_batch_size - rollout_data_refs.append(Box(ray.put(rollout_data))) + rollout_data_refs.append(self.transfer_backend.put(rollout_data)) return rollout_data_refs diff --git a/slime/utils/arguments.py b/slime/utils/arguments.py index d8da881ea3..2370aa5273 100644 --- a/slime/utils/arguments.py +++ b/slime/utils/arguments.py @@ -322,6 +322,26 @@ def add_rollout_arguments(parser): default=False, help=("Whether to shuffle the prompts during rollout."), ) + parser.add_argument( + "--transfer-backend", + type=str, + 
choices=["ray", "mooncake", "mooncake_legacy"], + default="ray", + help=( + "Backend for transferring rollout data from rollout workers to training actors. " + "'ray' uses Ray Object Store (default). 'mooncake' uses Mooncake distributed store for disaggregated setups." + ), + ) + parser.add_argument( + "--mooncake-mount-segment-size", + type=int, + default=None, + help=( + "Mooncake segment size to mount (bytes). Default 1 GiB. " + "Set to 0 to disable segment mounting. " + "Also configurable via MOONCAKE_MOUNT_SEGMENT_SIZE or MOONCAKE_GLOBAL_SEGMENT_SIZE env." + ), + ) parser.add_argument( "--rollout-seed", type=int, diff --git a/slime/utils/data.py b/slime/utils/data.py index 4bb81e5677..b367e77f74 100644 --- a/slime/utils/data.py +++ b/slime/utils/data.py @@ -4,9 +4,9 @@ import os import random import re +from argparse import Namespace import numpy as np -import ray try: import pyarrow.parquet as pq @@ -15,6 +15,7 @@ from slime.utils.types import MultimodalTypes, Sample +from .data_transfer import DataTransferBackend from .timer import Timer __all__ = ["Dataset"] @@ -296,9 +297,26 @@ def get_minimum_num_micro_batch_size(total_lengths, max_tokens_per_gpu): return len(batches) -def process_rollout_data(args, rollout_data_ref, dp_rank, dp_size): +def process_rollout_data( + args: Namespace, + rollout_data_ref: list, + dp_rank: int, + dp_size: int, + transfer_backend: DataTransferBackend | None = None, +) -> dict: assert len(rollout_data_ref) == dp_size - rollout_data = ray.get(rollout_data_ref[dp_rank].inner) + if transfer_backend is not None: + rollout_data = transfer_backend.get(rollout_data_ref[dp_rank]) + else: + import ray + + from slime.utils.misc import Box + + ref = rollout_data_ref[dp_rank] + if isinstance(ref, Box): + rollout_data = ray.get(ref.inner) + else: + rollout_data = ray.get(ref) partition = rollout_data.pop("partition") total_lengths = rollout_data["total_lengths"] diff --git a/slime/utils/data_transfer.py b/slime/utils/data_transfer.py new file 
mode 100644 index 0000000000..f017104e6c --- /dev/null +++ b/slime/utils/data_transfer.py @@ -0,0 +1,154 @@ +import logging +import os +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any + +logger = logging.getLogger(__name__) + + +class DataTransferBackend(ABC): + """Abstract base class for data transfer backends.""" + + @abstractmethod + def put(self, data: Any) -> Any: + """ + Store data and return a handle/key. + """ + pass + + @abstractmethod + def get(self, handle: Any) -> Any: + """ + Retrieve data using the handle/key. + """ + pass + + def cleanup(self, handle: Any): # noqa: B027 + """ + Clean up data associated with the handle (optional). + """ + pass + + +class RayDataTransfer(DataTransferBackend): + """Default data transfer using Ray Object Store.""" + + def put(self, data: Any) -> Any: + import ray + + from slime.utils.misc import Box + + return Box(ray.put(data)) + + def get(self, handle: Any) -> Any: + import ray + + from slime.utils.misc import Box + + if isinstance(handle, Box): + return ray.get(handle.inner) + return ray.get(handle) + + +DEFAULT_MOUNT_SEGMENT_SIZE = 4 * 1024 * 1024 * 1024 # 4 GiB (larger for benchmark/training) +DEFAULT_LOCAL_BUFFER_SIZE = 2 * 1024 * 1024 * 1024 # 2 GiB + + +def _parse_segment_size(value) -> int: + if isinstance(value, int): + return value + if isinstance(value, str): + s = value.strip().lower() + if s.endswith("gb"): + num = s[:-2].strip() + if not num: + raise ValueError("Invalid segment size: missing number before 'gb'") + return int(num) * 1024 * 1024 * 1024 + return int(s) + return int(value) + + +@dataclass +class MooncakeStoreConfig: + local_hostname: str + metadata_server: str + mount_segment_size: int # Segment to mount (bytes). 0 = no mount. 
+ local_buffer_size: int + protocol: str + device_name: str | None + master_server_address: str + + @staticmethod + def load_from_env(overrides: dict | None = None) -> "MooncakeStoreConfig": + """Load config from environment variables. + + Required: MOONCAKE_MASTER + Optional: MOONCAKE_PROTOCOL (default tcp), MOONCAKE_DEVICE, + MOONCAKE_TE_META_DATA_SERVER (default P2PHANDSHAKE), + MOONCAKE_MOUNT_SEGMENT_SIZE or MOONCAKE_GLOBAL_SEGMENT_SIZE (default 4 GiB, 0 = no mount), + MOONCAKE_LOCAL_BUFFER_SIZE (default 2 GiB). + Set MOONCAKE_PROTOCOL=rdma for best performance (requires InfiniBand/RoCE). + """ + # MC_STORE_MEMCPY=0 for cross-node RDMA: LOCAL_MEMCPY path can SIGSEGV when + # Put buffers are not in Client's mounted segment. Use TRANSFER_ENGINE (RDMA). + os.environ.setdefault("MC_STORE_MEMCPY", "0") + if not os.getenv("MOONCAKE_MASTER"): + raise ValueError( + "Neither the environment variable 'MOONCAKE_CONFIG_PATH' nor 'MOONCAKE_MASTER' is set." + ) + local_hostname = os.getenv("MOONCAKE_LOCAL_HOSTNAME", "") + if not local_hostname or local_hostname in ("localhost", "127.0.0.1"): + try: + import ray + if ray.is_initialized(): + local_hostname = ray.util.get_node_ip_address() + else: + import socket + local_hostname = socket.gethostbyname(socket.gethostname()) + except Exception: + local_hostname = "127.0.0.1" + + overrides = overrides or {} + mount_sz = overrides.get("mount_segment_size") + if mount_sz is None: + env_val = os.getenv("MOONCAKE_MOUNT_SEGMENT_SIZE") or os.getenv( + "MOONCAKE_GLOBAL_SEGMENT_SIZE" + ) + mount_sz = ( + _parse_segment_size(env_val) + if (env_val is not None and env_val.strip()) + else DEFAULT_MOUNT_SEGMENT_SIZE + ) + else: + mount_sz = int(mount_sz) + + return MooncakeStoreConfig( + local_hostname=local_hostname, + metadata_server=os.getenv("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE"), + mount_segment_size=mount_sz, + local_buffer_size=_parse_segment_size( + os.getenv("MOONCAKE_LOCAL_BUFFER_SIZE", DEFAULT_LOCAL_BUFFER_SIZE) + ), + 
protocol=os.getenv("MOONCAKE_PROTOCOL", "tcp"), # use "rdma" for RDMA + device_name=os.getenv("MOONCAKE_DEVICE", ""), + master_server_address=os.getenv("MOONCAKE_MASTER"), + ) + + +def get_data_transfer_backend(args): + """Factory function to get the appropriate backend.""" + backend_name = getattr(args, "transfer_backend", "ray") + if backend_name in ("mooncake", "mooncake_legacy"): + from slime.utils.rollout_hybrid_transfer import MooncakeHybridRolloutTransfer + + use_legacy = backend_name == "mooncake_legacy" + mount_segment_size = getattr(args, "mooncake_mount_segment_size", None) + return MooncakeHybridRolloutTransfer( + tensor_min_bytes=1024 * 1024, + enable_auto_cleanup=True, + use_legacy_path=use_legacy, + mount_segment_size=mount_segment_size, + ) + else: + return RayDataTransfer() diff --git a/slime/utils/rollout_hybrid_transfer.py b/slime/utils/rollout_hybrid_transfer.py new file mode 100644 index 0000000000..591f05b81e --- /dev/null +++ b/slime/utils/rollout_hybrid_transfer.py @@ -0,0 +1,1236 @@ +""" +Mooncake Hybrid Rollout Transfer - Rewritten per design. 
+ +Design: +- Legacy multi-key (default): each tensor in separate key, more stable +- Single-Key SGL (optional): aggregate into one buffer, one key +- Mem pool: reuse get buffers (get_into + memoryview, 1 copy) +- put/get: DataTransferBackend adapters +- Pickle 5 OOB: extract tensor buffers, zero-copy deserialize +""" +import os +import pickle +import queue +import struct +import threading +import time +import uuid +import copyreg +import weakref +import logging +from dataclasses import dataclass + +import numpy as np +import torch + +from slime.utils.data_transfer import MooncakeStoreConfig + +logger = logging.getLogger(__name__) + +DTYPE_MAP = { + torch.float32: 0, + torch.float64: 1, + torch.int8: 2, + torch.uint8: 3, + torch.int16: 4, + torch.int32: 6, + torch.int64: 8, + torch.bool: 10, + torch.float16: 11, + torch.bfloat16: 12, +} + +# --- Pickle 5 OOB for PyTorch tensors (zero-copy) --- + + +def _reconstruct_torch_tensor(buf, dtype, shape): + if isinstance(buf, torch.Tensor): + t = buf + else: + try: + m = memoryview(buf) + arr = np.array(m, copy=False) + # PyTorch requires writable arrays; copy if read-only to avoid undefined behavior + if not arr.flags.writeable: + arr = np.array(arr, copy=True) + t = torch.from_numpy(arr) + except TypeError: + t = torch.from_numpy(buf) + if t.dtype != dtype: + t = t.view(dtype) + return t.reshape(shape) + + +def _reduce_torch_tensor(t: torch.Tensor): + if t.device.type != "cpu": + t = t.cpu() + t_contig = t.contiguous() + shape, dtype = t_contig.shape, t_contig.dtype + if t_contig.element_size() == 2 and dtype in (torch.bfloat16, torch.float16): + t_contig = t_contig.view(torch.int16) + return (_reconstruct_torch_tensor, (pickle.PickleBuffer(t_contig.numpy()), dtype, shape)) + + +copyreg.pickle(torch.Tensor, _reduce_torch_tensor) + + +@dataclass(frozen=True) +class HybridRolloutHandle: + meta_key: str + meta_size: int + tensor_keys: list[str] + tensor_sizes: list[int] + padded_sizes: list[int] | None = None # None = 
legacy multi-key + + +def _pack_ragged_1d_int32(list_of_arrays: list[np.ndarray]) -> tuple[torch.Tensor, torch.Tensor]: + lengths = np.array([a.shape[0] for a in list_of_arrays], dtype=np.int64) + offsets = np.zeros(len(lengths) + 1, dtype=np.int64) + np.cumsum(lengths, out=offsets[1:]) + flat = np.concatenate(list_of_arrays, axis=0, dtype=np.int32) + return torch.from_numpy(flat), torch.from_numpy(offsets) + + +def _pack_ragged_1d_float32(list_of_lists: list[list[float]]) -> tuple[torch.Tensor, torch.Tensor]: + lengths = np.array([len(x) for x in list_of_lists], dtype=np.int64) + offsets = np.zeros(len(lengths) + 1, dtype=np.int64) + np.cumsum(lengths, out=offsets[1:]) + flat = np.empty(int(offsets[-1]), dtype=np.float32) + pos = 0 + for xs in list_of_lists: + n = len(xs) + flat[pos : pos + n] = np.asarray(xs, dtype=np.float32) + pos += n + return torch.from_numpy(flat), torch.from_numpy(offsets) + + +def _unpack_ragged_1d_int32(flat: torch.Tensor, offsets: torch.Tensor) -> list[np.ndarray]: + flat_np = flat.cpu().numpy().astype(np.int32, copy=False) + off = offsets.cpu().numpy().astype(np.int64, copy=False) + return [flat_np[int(off[i]) : int(off[i + 1])] for i in range(len(off) - 1)] + + +def _unpack_ragged_1d_float32(flat: torch.Tensor, offsets: torch.Tensor) -> list[list[float]]: + flat_np = flat.cpu().numpy().astype(np.float32, copy=False) + off = offsets.cpu().numpy().astype(np.int64, copy=False) + return [flat_np[int(off[i]) : int(off[i + 1])].tolist() for i in range(len(off) - 1)] + + +def _unpack_ragged(data: dict) -> dict: + """Standalone unpack for ragged arrays (used by benchmarks).""" + if "tokens" in data and isinstance(data["tokens"], dict) and data["tokens"].get("__ragged_type__") == "1d_int32": + data["tokens"] = _unpack_ragged_1d_int32(data["tokens"]["flat"], data["tokens"]["offsets"]) + if "labels" in data and isinstance(data["labels"], dict) and data["labels"].get("__ragged_type__") == "1d_int32": + data["labels"] = 
_unpack_ragged_1d_int32(data["labels"]["flat"], data["labels"]["offsets"]) + if ( + "rollout_log_probs" in data + and isinstance(data["rollout_log_probs"], dict) + and data["rollout_log_probs"].get("__ragged_type__") == "1d_float32" + ): + data["rollout_log_probs"] = _unpack_ragged_1d_float32( + data["rollout_log_probs"]["flat"], + data["rollout_log_probs"]["offsets"], + ) + return data + + +# --- Numpy meta format (SLIME_USE_NUMPY_META=1): struct + numpy, no pickle --- +NUMPY_META_MAGIC = b"SLM1" +NUMPY_META_VERSION = 1 + + +def _serialize_rollout_meta_numpy(rollout: dict) -> bytes: + """Serialize scalar metadata to compact binary. Tensor refs not included.""" + n = len(rollout.get("partition", rollout.get("response_lengths", []))) + if n == 0: + n = len(rollout.get("total_lengths", [])) + has_labels = "labels" in rollout and isinstance(rollout["labels"], list) + has_routed = "rollout_routed_experts" in rollout and isinstance(rollout["rollout_routed_experts"], list) + num_layers = rollout.get("num_layers", 64) + moe_topk = rollout.get("moe_router_topk", 2) + + parts = [ + NUMPY_META_MAGIC, + struct.pack(" dict: + """Deserialize meta_bytes + ready tensors into rollout dict.""" + # Header: magic(4) + ver(2) + reserved(2) + n(4) + has_labels(1) + has_routed(1) + num_layers(4) + moe_topk(4) + reserved(4) = 26 + offset = 26 + partition = np.frombuffer(meta_bytes, dtype=np.int32, count=n, offset=offset) + offset += n * 4 + response_lengths = np.frombuffer(meta_bytes, dtype=np.int32, count=n, offset=offset) + offset += n * 4 + rewards = np.frombuffer(meta_bytes, dtype=np.float32, count=n, offset=offset) + offset += n * 4 + total_lengths = np.frombuffer(meta_bytes, dtype=np.int32, count=n, offset=offset) + offset += n * 4 + loss_lengths = np.frombuffer(meta_bytes, dtype=np.int32, count=n, offset=offset) + offset += n * 4 + loss_total = int(loss_lengths.sum()) + loss_packed = np.frombuffer(meta_bytes, dtype=np.int32, count=loss_total, offset=offset) + offset += loss_total 
* 4 + loss_masks = [] + pos = 0 + for L in loss_lengths: + loss_masks.append(loss_packed[pos : pos + int(L)].tolist()) + pos += int(L) + + idx = 0 + tokens_flat = ready[idx] + tokens_off = ready[idx + 1] + idx += 2 + log_probs_flat = ready[idx] + log_probs_off = ready[idx + 1] + idx += 2 + labels_flat, labels_off = None, None + if has_labels: + labels_flat = ready[idx] + labels_off = ready[idx + 1] + idx += 2 + routed: list[torch.Tensor] = [] + if has_routed: + for _ in range(n): + routed.append(ready[idx]) + idx += 1 + + data = { + "partition": partition.tolist(), + "response_lengths": response_lengths.tolist(), + "rewards": rewards.tolist(), + "loss_masks": loss_masks, + "total_lengths": total_lengths.tolist(), + "tokens": _unpack_ragged_1d_int32(tokens_flat, tokens_off), + "rollout_log_probs": _unpack_ragged_1d_float32(log_probs_flat, log_probs_off), + } + if has_labels and labels_flat is not None: + data["labels"] = _unpack_ragged_1d_int32(labels_flat, labels_off) + if has_routed and routed: + data["rollout_routed_experts"] = [r.cpu().numpy() for r in routed] + return data + + +def _prepare_rollout_for_numpy( + rollout: dict, profile_out: dict | None = None +) -> tuple[bytes, list[torch.Tensor]]: + """Pack ragged arrays, serialize meta with numpy format, return (meta_bytes, tensor_buffers). + Tensor order: tokens_flat, tokens_off, log_probs_flat, log_probs_off, [labels_flat, labels_off], [routed_i...] 
+ """ + t0 = time.perf_counter() + tensors: list[torch.Tensor] = [] + + if "tokens" in rollout and isinstance(rollout["tokens"], list): + flat, off = _pack_ragged_1d_int32(rollout["tokens"]) + tensors.extend([flat, off]) + if "rollout_log_probs" in rollout and isinstance(rollout["rollout_log_probs"], list): + flat, off = _pack_ragged_1d_float32(rollout["rollout_log_probs"]) + tensors.extend([flat, off]) + if "labels" in rollout and isinstance(rollout["labels"], list): + flat, off = _pack_ragged_1d_int32(rollout["labels"]) + tensors.extend([flat, off]) + if "rollout_routed_experts" in rollout and isinstance(rollout["rollout_routed_experts"], list): + for arr in rollout["rollout_routed_experts"]: + t = torch.from_numpy(np.asarray(arr, dtype=np.int32)) + tensors.append(t) + + meta_bytes = _serialize_rollout_meta_numpy(rollout) + if profile_out is not None: + profile_out["pack_ms"] = (time.perf_counter() - t0) * 1000 + profile_out["pickle_ms"] = 0 + profile_out["buffer_convert_ms"] = 0 + return meta_bytes, tensors + + +def _prepare_rollout_for_pickle( + rollout: dict, profile_out: dict | None = None +) -> tuple[bytes, list[torch.Tensor]]: + """Pack ragged arrays, pickle with OOB, return (meta_bytes, tensor_buffers).""" + t0 = time.perf_counter() + data = rollout.copy() + if "tokens" in data and isinstance(data["tokens"], list): + flat, off = _pack_ragged_1d_int32(data["tokens"]) + data["tokens"] = {"__ragged_type__": "1d_int32", "flat": flat, "offsets": off} + if "labels" in data and isinstance(data["labels"], list): + flat, off = _pack_ragged_1d_int32(data["labels"]) + data["labels"] = {"__ragged_type__": "1d_int32", "flat": flat, "offsets": off} + if "rollout_log_probs" in data and isinstance(data["rollout_log_probs"], list): + flat, off = _pack_ragged_1d_float32(data["rollout_log_probs"]) + data["rollout_log_probs"] = {"__ragged_type__": "1d_float32", "flat": flat, "offsets": off} + t_after_pack = time.perf_counter() + + buffers = [] + + def _cb(b): + 
buffers.append(b) + + meta_bytes = pickle.dumps(data, protocol=5, buffer_callback=_cb) + t_after_pickle = time.perf_counter() + tensors = [] + for b in buffers: + arr = np.array(memoryview(b), copy=False) + tensors.append(torch.from_numpy(arr)) + if profile_out is not None: + profile_out["pack_ms"] = (t_after_pack - t0) * 1000 + profile_out["pickle_ms"] = (t_after_pickle - t_after_pack) * 1000 + profile_out["buffer_convert_ms"] = (time.perf_counter() - t_after_pickle) * 1000 + return meta_bytes, tensors + + +def _pad8(x: int) -> int: + return (x + 7) // 8 * 8 + + +def _pack_ragged_1d_int32_into( + buf: np.ndarray, offset: int, list_of_arrays: list[np.ndarray] +) -> tuple[int, int, int]: + """Pack list of int32 arrays into buf at offset. Returns (flat_bytes, offsets_bytes, total_bytes). + Flat is padded to 8-byte boundary so offsets (int64) are aligned.""" + lengths = np.array([a.shape[0] for a in list_of_arrays], dtype=np.int64) + n_flat = int(lengths.sum()) + n_off = len(lengths) + 1 + flat_bytes = n_flat * 4 + flat_padded = _pad8(flat_bytes) + flat = buf.view(np.uint8)[offset : offset + flat_bytes].view(np.int32) + off_arr = buf.view(np.uint8)[offset + flat_padded : offset + flat_padded + n_off * 8].view(np.int64) + np.cumsum(lengths, out=off_arr[1:]) + off_arr[0] = 0 + flat[:] = np.concatenate(list_of_arrays, axis=0, dtype=np.int32) + return flat_bytes, n_off * 8, flat_padded + n_off * 8 + + +def _pack_ragged_1d_int32_into_split( + buf: np.ndarray, + offset_flat: int, + offset_off: int, + list_of_arrays: list[np.ndarray], +) -> tuple[int, int]: + """Pack flat and offsets at separate offsets. 
Returns (flat_bytes, off_bytes).""" + lengths = np.array([a.shape[0] for a in list_of_arrays], dtype=np.int64) + n_flat = int(lengths.sum()) + n_off = len(lengths) + 1 + flat_bytes = n_flat * 4 + off_bytes = n_off * 8 + flat = buf.view(np.uint8)[offset_flat : offset_flat + flat_bytes].view(np.int32) + off_arr = buf.view(np.uint8)[offset_off : offset_off + off_bytes].view(np.int64) + np.cumsum(lengths, out=off_arr[1:]) + off_arr[0] = 0 + flat[:] = np.concatenate(list_of_arrays, axis=0, dtype=np.int32) + return flat_bytes, off_bytes + + +def _pack_ragged_1d_float32_into_split( + buf: np.ndarray, + offset_flat: int, + offset_off: int, + list_of_lists: list[list[float]], +) -> tuple[int, int]: + """Pack flat and offsets at separate offsets. Returns (flat_bytes, off_bytes).""" + lengths = np.array([len(x) for x in list_of_lists], dtype=np.int64) + n_flat = int(lengths.sum()) + n_off = len(lengths) + 1 + flat_bytes = n_flat * 4 + off_bytes = n_off * 8 + flat = buf.view(np.uint8)[offset_flat : offset_flat + flat_bytes].view(np.float32) + off_arr = buf.view(np.uint8)[offset_off : offset_off + off_bytes].view(np.int64) + np.cumsum(lengths, out=off_arr[1:]) + off_arr[0] = 0 + pos = 0 + for xs in list_of_lists: + n = len(xs) + flat[pos : pos + n] = np.asarray(xs, dtype=np.float32) + pos += n + return flat_bytes, off_bytes + + +def _pack_ragged_1d_float32_into( + buf: np.ndarray, offset: int, list_of_lists: list[list[float]] +) -> tuple[int, int, int]: + """Pack list of float lists into buf at offset. Returns (flat_bytes, offsets_bytes, total_bytes). 
+ Flat is padded to 8-byte boundary so offsets (int64) are aligned.""" + lengths = np.array([len(x) for x in list_of_lists], dtype=np.int64) + n_flat = int(lengths.sum()) + n_off = len(lengths) + 1 + flat_bytes = n_flat * 4 + flat_padded = _pad8(flat_bytes) + flat = buf.view(np.uint8)[offset : offset + flat_bytes].view(np.float32) + off_arr = buf.view(np.uint8)[offset + flat_padded : offset + flat_padded + n_off * 8].view(np.int64) + np.cumsum(lengths, out=off_arr[1:]) + off_arr[0] = 0 + pos = 0 + for xs in list_of_lists: + n = len(xs) + flat[pos : pos + n] = np.asarray(xs, dtype=np.float32) + pos += n + return flat_bytes, n_off * 8, flat_padded + n_off * 8 + + +def _prepare_rollout_direct_pack( + rollout: dict, + store, + profile_out: dict | None = None, +) -> tuple[bytes, list[torch.Tensor], tuple[int, int] | None, object]: + """ + Scheme C: Pack directly into a single buffer, no intermediate tensor allocation. + Returns (meta_bytes, tensor_views, registered_range, buffer_holder). + registered_range=(ptr, size) if buffer was registered; None if from alloc_from_mem_pool. + buffer_holder keeps the buffer alive. 
+ """ + import ctypes + + t0 = time.perf_counter() + data = rollout.copy() + + def _pad64(x: int) -> int: + return (x + 63) // 64 * 64 + + # Compute sizes (flat padded to 8B for int64 offsets alignment) + sizes: list[int] = [] + if "tokens" in data and isinstance(data["tokens"], list): + n_flat = sum(a.shape[0] for a in data["tokens"]) + n_off = len(data["tokens"]) + 1 + sizes.append(_pad64(_pad8(n_flat * 4) + n_off * 8)) + if "labels" in data and isinstance(data["labels"], list): + n_flat = sum(a.shape[0] for a in data["labels"]) + n_off = len(data["labels"]) + 1 + sizes.append(_pad64(_pad8(n_flat * 4) + n_off * 8)) + if "rollout_log_probs" in data and isinstance(data["rollout_log_probs"], list): + n_flat = sum(len(x) for x in data["rollout_log_probs"]) + n_off = len(data["rollout_log_probs"]) + 1 + sizes.append(_pad64(_pad8(n_flat * 4) + n_off * 8)) + + total = sum(sizes) + if total == 0: + meta_bytes, tensors = _prepare_rollout_for_pickle(rollout, profile_out) + return meta_bytes, tensors, None, None + + # Allocate: try alloc_from_mem_pool first, fallback to torch.empty + ptr = 0 + buf_holder: object = None + registered_range: tuple[int, int] | None = None + if hasattr(store, "alloc_from_mem_pool"): + ptr = store.alloc_from_mem_pool(total) + if ptr == 0: + buf = torch.empty(total, dtype=torch.uint8) + ptr = buf.data_ptr() + store.register_buffer(ptr, total) + buf_holder = buf + registered_range = (ptr, total) + + if buf_holder is not None: + buf_np = buf_holder.numpy() + else: + buf_np = np.frombuffer((ctypes.c_byte * total).from_address(ptr), dtype=np.uint8, count=total) + cur = 0 + + if "tokens" in data and isinstance(data["tokens"], list): + flat_b, _, sz = _pack_ragged_1d_int32_into(buf_np, cur, data["tokens"]) + n_flat = flat_b // 4 + flat_padded = _pad8(flat_b) + flat_np = buf_np.view(np.int32)[cur // 4 : (cur + flat_b) // 4] + off_np = buf_np.view(np.int64)[(cur + flat_padded) // 8 : (cur + sz) // 8] + flat = torch.from_numpy(flat_np) + off = 
torch.from_numpy(off_np) + data["tokens"] = {"__ragged_type__": "1d_int32", "flat": flat, "offsets": off} + cur += _pad64(sz) + + if "labels" in data and isinstance(data["labels"], list): + flat_b, _, sz = _pack_ragged_1d_int32_into(buf_np, cur, data["labels"]) + flat_padded = _pad8(flat_b) + flat_np = buf_np.view(np.int32)[cur // 4 : (cur + flat_b) // 4] + off_np = buf_np.view(np.int64)[(cur + flat_padded) // 8 : (cur + sz) // 8] + flat = torch.from_numpy(flat_np) + off = torch.from_numpy(off_np) + data["labels"] = {"__ragged_type__": "1d_int32", "flat": flat, "offsets": off} + cur += _pad64(sz) + + if "rollout_log_probs" in data and isinstance(data["rollout_log_probs"], list): + flat_b, _, sz = _pack_ragged_1d_float32_into(buf_np, cur, data["rollout_log_probs"]) + flat_padded = _pad8(flat_b) + flat_np = buf_np.view(np.float32)[cur // 4 : (cur + flat_b) // 4] + off_np = buf_np.view(np.int64)[(cur + flat_padded) // 8 : (cur + sz) // 8] + flat = torch.from_numpy(flat_np) + off = torch.from_numpy(off_np) + data["rollout_log_probs"] = {"__ragged_type__": "1d_float32", "flat": flat, "offsets": off} + + buffers = [] + + def _cb(b): + buffers.append(b) + + meta_bytes = pickle.dumps(data, protocol=5, buffer_callback=_cb) + tensor_views = [] + for b in buffers: + arr = np.array(memoryview(b), copy=False) + tensor_views.append(torch.from_numpy(arr)) + if profile_out is not None: + profile_out["pack_ms"] = (time.perf_counter() - t0) * 1000 + profile_out["pickle_ms"] = 0 + profile_out["buffer_convert_ms"] = 0 + return meta_bytes, tensor_views, registered_range, buf_holder + + +class MooncakeHybridRolloutTransfer: + """ + Mooncake rollout transfer: Legacy multi-key (default) or Single-Key SGL. + Implements DataTransferBackend: put(data) -> handle, get(handle) -> data. 
+ """ + + def __init__( + self, + tensor_min_bytes: int = 1 * 1024 * 1024, + enable_auto_cleanup: bool = True, + use_legacy_path: bool | None = None, + mount_segment_size: int | None = None, + cleanup_delay_seconds: float = 5.0, + cleanup_batch_size: int = 100, + ring_buffer_size: int | None = None, + ring_buffer_count: int = 3, + ): + self.tensor_min_bytes = tensor_min_bytes + self.enable_auto_cleanup = enable_auto_cleanup + self.cleanup_delay_seconds = cleanup_delay_seconds + self.cleanup_batch_size = cleanup_batch_size + if ring_buffer_size is None: + ring_buffer_size = int( + os.environ.get("SLIME_RING_BUFFER_SIZE_MB", "2048") + ) * 1024 * 1024 + self._ring_buffer_size = ring_buffer_size + self._ring_buffer_count = ring_buffer_count + if use_legacy_path is None: + use_legacy_path = os.environ.get("SLIME_USE_LEGACY_TRANSFER", "").lower() in ("1", "true", "yes") + self._use_legacy = use_legacy_path + + overrides = {"mount_segment_size": mount_segment_size} if mount_segment_size is not None else None + cfg = MooncakeStoreConfig.load_from_env(overrides=overrides) + from mooncake.store import MooncakeDistributedStore + + self._store = MooncakeDistributedStore() + ret = self._store.setup( + cfg.local_hostname, + cfg.metadata_server, + cfg.mount_segment_size, + cfg.local_buffer_size, + cfg.protocol, + cfg.device_name or "", + cfg.master_server_address, + ) + if ret: + raise RuntimeError(f"Mooncake setup failed: {ret}") + + # Ring buffer: pre-allocated slots for single-key get (zero cold-start) + self._ring_slots: list[torch.Tensor] = [] + self._ring_available: list[int] = [] + self._buffer_origin: dict[int, int] = {} # ptr -> ring_slot_idx + if self._ring_buffer_size > 0 and self._ring_buffer_count > 0: + align = 4 * 1024 * 1024 + sz = ((self._ring_buffer_size + align - 1) // align) * align + for i in range(self._ring_buffer_count): + buf = torch.empty(sz, dtype=torch.uint8) + self._store.register_buffer(buf.data_ptr(), sz) + self._ring_slots.append(buf) + 
self._ring_available.append(i) + logger.info( + "Ring buffer: %d slots x %d MB (get cold-start eliminated)", + self._ring_buffer_count, + sz // (1024 * 1024), + ) + + # Mem pool: overflow / legacy path + self._get_pool: list[torch.Tensor] = [] + self._pool_max = 4096 + + # Per-put buffers (legacy/single-key need registered memory) + self._put_buffers: dict[int, torch.Tensor] = {} + self._registered_ptrs: set[int] = set() + + # Single buffer for meta + headers (one put at a time) + self._meta_cap = 32 * 1024 * 1024 + self._header_cap = 4096 * 40 + self._meta_buf = torch.empty(self._meta_cap, dtype=torch.uint8) + self._header_buf = torch.empty(self._header_cap, dtype=torch.uint8) + self._store.register_buffer(self._meta_buf.data_ptr(), self._meta_cap) + self._store.register_buffer(self._header_buf.data_ptr(), self._header_cap) + + # Async cleanup: delayed deletion after get_rollout (like MooncakeDataTransfer) + self._pending_deletion = queue.PriorityQueue() + self._cleanup_thread = None + self._cleanup_thread_lock = threading.Lock() + self._cleanup_stop_event = threading.Event() + if self.enable_auto_cleanup: + self._start_cleanup_thread() + + def _start_cleanup_thread(self) -> None: + """Start the background cleanup thread.""" + self._cleanup_thread = threading.Thread( + target=self._cleanup_worker, name="MooncakeHybridCleanupThread", daemon=True + ) + self._cleanup_thread.start() + logger.info("Mooncake hybrid cleanup thread started") + + def _cleanup_worker(self) -> None: + """Background worker that deletes keys when their deletion time is reached.""" + while not self._cleanup_stop_event.is_set(): + try: + keys_to_delete: list[str] = [] + sleep_until: float | None = None + while len(keys_to_delete) < self.cleanup_batch_size: + try: + deletion_time, key = self._pending_deletion.get(timeout=0.5) + except queue.Empty: + break + current_time = time.time() + if current_time >= deletion_time: + keys_to_delete.append(key) + else: + 
self._pending_deletion.put((deletion_time, key)) + sleep_until = deletion_time + break + if keys_to_delete: + for key in keys_to_delete: + try: + result = self._store.remove(key) + if result != 0: + logger.warning("Failed to delete key %s, error code: %s", key, result) + except Exception as e: + logger.warning("Exception while deleting key %s: %s", key, e) + if sleep_until is not None: + sleep_time = min(0.5, max(0.1, sleep_until - time.time())) + if sleep_time > 0: + time.sleep(sleep_time) + else: + time.sleep(0.5) + except Exception as e: + logger.error("Error in cleanup worker: %s", e, exc_info=True) + time.sleep(1.0) + + def _schedule_handle_deletion(self, handle: HybridRolloutHandle) -> None: + """Schedule all keys for a handle for deletion after the delay period.""" + deletion_time = time.time() + self.cleanup_delay_seconds + keys = [handle.meta_key] + list(handle.tensor_keys) + for key in keys: + self._pending_deletion.put((deletion_time, key)) + + def put(self, data: dict) -> HybridRolloutHandle: + return self.put_rollout(data) + + def get(self, handle: HybridRolloutHandle, auto_cleanup: bool | None = None) -> dict: + return self.get_rollout(handle, auto_cleanup=auto_cleanup) + + def _alloc_get_buffer( + self, size: int, profile: dict | None = None, use_ring: bool = True + ) -> torch.Tensor: + t0 = time.perf_counter() if profile is not None else None + # 1. Ring: single-key path, size fits, slot available + if ( + use_ring + and self._ring_slots + and size <= self._ring_slots[0].numel() + and self._ring_available + ): + idx = self._ring_available.pop() + buf = self._ring_slots[idx] + self._buffer_origin[buf.data_ptr()] = idx + if profile is not None: + profile["alloc_ms"] = (time.perf_counter() - t0) * 1000 + profile["alloc_from_pool"] = True # ring counts as "warm" + return buf + # 2. 
Pool: best-fit + best_i, best_sz = -1, float("inf") + for i, b in enumerate(self._get_pool): + if b.numel() >= size and b.numel() < best_sz: + best_sz, best_i = b.numel(), i + if best_i >= 0: + buf = self._get_pool.pop(best_i) + if profile is not None: + profile["alloc_ms"] = (time.perf_counter() - t0) * 1000 + profile["alloc_from_pool"] = True + return buf + # 3. Dynamic alloc + align = 4 * 1024 * 1024 + sz = ((size + align - 1) // align) * align + buf = torch.empty(sz, dtype=torch.uint8) + self._store.register_buffer(buf.data_ptr(), sz) + if profile is not None: + profile["alloc_ms"] = (time.perf_counter() - t0) * 1000 + profile["alloc_from_pool"] = False + return buf + + def _return_get_buffer(self, buf: torch.Tensor) -> None: + ptr = buf.data_ptr() + if ptr in self._buffer_origin: + idx = self._buffer_origin.pop(ptr) + self._ring_available.append(idx) + return + if len(self._get_pool) < self._pool_max: + self._get_pool.append(buf) + else: + try: + self._store.unregister_buffer(ptr) + except Exception: + pass + + def _get_put_buffer(self, idx: int, size: int) -> torch.Tensor: + if idx not in self._put_buffers or self._put_buffers[idx].numel() < size: + if idx in self._put_buffers: + old_ptr = self._put_buffers[idx].data_ptr() + self._registered_ptrs.discard(old_ptr) + try: + self._store.unregister_buffer(old_ptr) + except Exception: + pass + align = 4 * 1024 * 1024 + sz = ((size + align - 1) // align) * align + buf = torch.empty(sz, dtype=torch.uint8) + self._store.register_buffer(buf.data_ptr(), sz) + self._registered_ptrs.add(buf.data_ptr()) + self._put_buffers[idx] = buf + return self._put_buffers[idx] + + def put_rollout( + self, rollout: dict, profile_out: dict | None = None + ) -> HybridRolloutHandle: + t0 = time.perf_counter() + use_numpy_meta = os.environ.get("SLIME_USE_NUMPY_META", "").lower() in ("1", "true") + use_direct_pack = os.environ.get("SLIME_PACK_DIRECT_TO_BUFFER", "").lower() in ("1", "true") + if use_numpy_meta and not self._use_legacy: + 
meta_bytes, tensors = _prepare_rollout_for_numpy(rollout, profile_out) + registered_range = None + elif use_direct_pack and not self._use_legacy: + meta_bytes, tensors, registered_range, _buf_holder = _prepare_rollout_direct_pack( + rollout, self._store, profile_out + ) + else: + meta_bytes, tensors = _prepare_rollout_for_pickle(rollout, profile_out) + registered_range = None + t_prepare = (time.perf_counter() - t0) * 1000 + meta_size = len(meta_bytes) + rid = str(uuid.uuid4()) + + use_split_keys = os.environ.get("SLIME_META_TENSOR_SPLIT_KEYS", "").lower() in ("1", "true") + if self._use_legacy: + handle = self._put_legacy( + rid, meta_bytes, meta_size, tensors, profile_out + ) + elif use_split_keys and not use_numpy_meta: + handle = self._put_two_key( + rid, meta_bytes, meta_size, tensors, profile_out + ) + else: + handle = self._put_single_key( + rid, meta_bytes, meta_size, tensors, profile_out, + registered_range=registered_range, + ) + + if profile_out is not None: + profile_out["prepare_ms"] = t_prepare + profile_out["prepare_bytes"] = meta_size + profile_out["num_tensors"] = len(tensors) + profile_out["total_bytes"] = meta_size + sum( + t.numel() * t.element_size() for t in tensors + ) + profile_out["put_total_ms"] = (time.perf_counter() - t0) * 1000 + return handle + + def _put_legacy( + self, + rid: str, + meta_bytes: bytes, + meta_size: int, + tensors: list[torch.Tensor], + profile_out: dict | None = None, + ) -> HybridRolloutHandle: + """Legacy: meta + each tensor in separate key.""" + t0 = time.perf_counter() + if meta_size > self._meta_cap: + raise RuntimeError(f"Meta size {meta_size} > capacity {self._meta_cap}") + self._meta_buf[:meta_size] = torch.frombuffer(bytearray(meta_bytes), dtype=torch.uint8) + + keys = [] + ptrs_list = [] + sizes_list = [] + tensor_sizes = [] + + # Meta key: header(40) + meta + hdr = struct.pack("iiqqqq", 3, 1, meta_size, -1, -1, -1) + self._header_buf[:40] = torch.tensor(bytearray(hdr), dtype=torch.uint8) + 
keys.append(f"rollout:{rid}:meta") + ptrs_list.append([self._header_buf.data_ptr(), self._meta_buf.data_ptr()]) + sizes_list.append([40, meta_size]) + tensor_sizes.append(meta_size) + + for i, t in enumerate(tensors): + if not t.is_contiguous(): + t = t.contiguous() + ptr = t.data_ptr() + size = t.numel() * t.element_size() + if ptr not in self._registered_ptrs: + buf = self._get_put_buffer(i, size) + buf[:size].view(t.dtype).reshape(t.shape).copy_(t) + ptr = buf.data_ptr() + hdr = struct.pack( + "iiqqqq", + DTYPE_MAP.get(t.dtype, 3), + t.ndim, + *t.shape, + *([-1] * (4 - t.ndim)), + ) + self._header_buf[(i + 1) * 40 : (i + 2) * 40] = torch.tensor(bytearray(hdr), dtype=torch.uint8) + keys.append(f"rollout:{rid}:{i}") + ptrs_list.append([self._header_buf.data_ptr() + (i + 1) * 40, ptr]) + sizes_list.append([40, size]) + tensor_sizes.append(size) + + t_before_put = time.perf_counter() + ret = self._store.batch_put_from_multi_buffers(keys, ptrs_list, sizes_list) + if profile_out is not None: + profile_out["buffer_prep_ms"] = (t_before_put - t0) * 1000 + profile_out["batch_put_ms"] = (time.perf_counter() - t_before_put) * 1000 + for r in ret: + if r != 0: + raise RuntimeError(f"batch_put_from_multi_buffers failed: {ret}") + + return HybridRolloutHandle( + meta_key=keys[0], + meta_size=meta_size, + tensor_keys=keys[1:], + tensor_sizes=tensor_sizes[1:], + padded_sizes=None, + ) + + def _put_two_key( + self, + rid: str, + meta_bytes: bytes, + meta_size: int, + tensors: list[torch.Tensor], + profile_out: dict | None = None, + ) -> HybridRolloutHandle: + """Two-key: meta in one key, concatenated tensors in another (pickle OOB only).""" + t0 = time.perf_counter() + if meta_size > self._meta_cap: + raise RuntimeError(f"Meta size {meta_size} > capacity {self._meta_cap}") + self._meta_buf[:meta_size] = torch.frombuffer(bytearray(meta_bytes), dtype=torch.uint8) + + meta_key = f"rollout:{rid}:meta" + tensor_key = f"rollout:{rid}:tensors" + + hdr = struct.pack("iiqqqq", 3, 1, 
meta_size, -1, -1, -1) + self._header_buf[:40] = torch.tensor(bytearray(hdr), dtype=torch.uint8) + + ptrs_list = [] + sizes_list = [] + tensor_sizes = [] + + meta_ptrs = [self._header_buf.data_ptr(), self._meta_buf.data_ptr()] + meta_sizes = [40, meta_size] + ptrs_list.append(meta_ptrs) + sizes_list.append(meta_sizes) + + tensor_ptrs = [] + tensor_sizes_out = [] + for i, t in enumerate(tensors): + if not t.is_contiguous(): + t = t.contiguous() + ptr = t.data_ptr() + size = t.numel() * t.element_size() + if ptr not in self._registered_ptrs: + buf = self._get_put_buffer(i, size) + buf[:size].view(t.dtype).reshape(t.shape).copy_(t) + ptr = buf.data_ptr() + tensor_ptrs.append(ptr) + tensor_sizes_out.append(size) + + ptrs_list.append(tensor_ptrs) + sizes_list.append(tensor_sizes_out) + + keys = [meta_key, tensor_key] + t_before_put = time.perf_counter() + ret = self._store.batch_put_from_multi_buffers(keys, ptrs_list, sizes_list) + if profile_out is not None: + profile_out["buffer_prep_ms"] = (t_before_put - t0) * 1000 + profile_out["batch_put_ms"] = (time.perf_counter() - t_before_put) * 1000 + for r in ret: + if r != 0: + raise RuntimeError(f"batch_put_from_multi_buffers failed: {ret}") + + return HybridRolloutHandle( + meta_key=meta_key, + meta_size=meta_size, + tensor_keys=[tensor_key], + tensor_sizes=tensor_sizes_out, + padded_sizes=tensor_sizes_out, + ) + + def _put_single_key( + self, + rid: str, + meta_bytes: bytes, + meta_size: int, + tensors: list[torch.Tensor], + profile_out: dict | None = None, + registered_range: tuple[int, int] | None = None, + ) -> HybridRolloutHandle: + """Single-Key SGL: one key, one contiguous SGL.""" + t0 = time.perf_counter() + n = len(tensors) + 1 + if n > self._header_cap // 40: + raise RuntimeError(f"Too many tensors: {n}") + if meta_size > self._meta_cap: + raise RuntimeError(f"Meta size {meta_size} > capacity {self._meta_cap}") + + t_meta = time.perf_counter() + self._meta_buf[:meta_size] = 
torch.frombuffer(bytearray(meta_bytes), dtype=torch.uint8) + meta_t = self._meta_buf[:meta_size] + all_t = [meta_t] + tensors + t_after_meta = time.perf_counter() + + ptrs, sizes, padded, actual = [], [], [], [] + temp_registered: list[int] = [] + use_register_inplace = os.environ.get("SLIME_REGISTER_PICKLE_BUFFERS", "").lower() in ("1", "true") + rng_ptr, rng_size = registered_range or (0, 0) + t_copy_tensors = 0.0 + t_headers = 0.0 + for i, t in enumerate(all_t): + # Ensure contiguous: Mooncake reads raw bytes from ptr; non-contiguous tensors have gaps + if not t.is_contiguous(): + t = t.contiguous() + ptr = t.data_ptr() + sz = t.numel() * t.element_size() + in_registered_range = registered_range is not None and ptr >= rng_ptr and ptr + sz <= rng_ptr + rng_size + _t0 = time.perf_counter() + if i > 0 and ptr not in self._registered_ptrs and not in_registered_range: + if use_register_inplace: + try: + self._store.register_buffer(ptr, sz) + self._registered_ptrs.add(ptr) + temp_registered.append(ptr) + except Exception: + use_register_inplace = False + buf = self._get_put_buffer(i - 1, sz) + buf[:sz].view(t.dtype).reshape(t.shape).copy_(t) + ptr = buf.data_ptr() + else: + buf = self._get_put_buffer(i - 1, sz) + buf[:sz].view(t.dtype).reshape(t.shape).copy_(t) + ptr = buf.data_ptr() + t_copy_tensors += time.perf_counter() - _t0 + pad = (sz + 63) // 64 * 64 + padded.append(pad) + actual.append(sz) + _t1 = time.perf_counter() + hdr = struct.pack( + "iiqqqq", + DTYPE_MAP.get(t.dtype, 3), + t.ndim, + *t.shape, + *([-1] * (4 - t.ndim)), + ) + self._header_buf[i * 40 : (i + 1) * 40] = torch.tensor(bytearray(hdr), dtype=torch.uint8) + t_headers += time.perf_counter() - _t1 + ptrs.extend([self._header_buf.data_ptr() + i * 40, ptr]) + sizes.extend([40, pad]) + + t_before_put = time.perf_counter() + key = f"rollout:{rid}" + if os.environ.get("MC_DEBUG_LOCAL_MEMCPY") in ("1", "true"): + total_sz = sum(sizes) + logger.info( + "[MC_DEBUG] batch_put_from_multi_buffers: key=%s 
num_slices=%d total_bytes=%d ptrs=%s", + key, len(ptrs), total_sz, + [(hex(p), s) for p, s in zip(ptrs, sizes)][:6], + ) + if len(ptrs) > 6: + logger.info("[MC_DEBUG] ... and %d more slices", len(ptrs) - 6) + ret = self._store.batch_put_from_multi_buffers([key], [ptrs], [sizes]) + if profile_out is not None: + profile_out["buffer_prep_ms"] = (t_before_put - t0) * 1000 + profile_out["meta_copy_ms"] = (t_after_meta - t_meta) * 1000 + profile_out["copy_tensors_ms"] = t_copy_tensors * 1000 + profile_out["copy_headers_ms"] = t_headers * 1000 + profile_out["batch_put_ms"] = (time.perf_counter() - t_before_put) * 1000 + for r in ret: + if r != 0: + raise RuntimeError(f"batch_put_from_multi_buffers failed: {ret}") + for ptr in temp_registered: + self._registered_ptrs.discard(ptr) + try: + self._store.unregister_buffer(ptr) + except Exception: + pass + + return HybridRolloutHandle( + meta_key=key, + meta_size=meta_size, + tensor_keys=[], + tensor_sizes=actual[1:], + padded_sizes=padded, + ) + + def get_rollout( + self, + handle: HybridRolloutHandle, + return_packed: bool = False, + auto_cleanup: bool | None = None, + profile_out: list | None = None, + ) -> dict: + if handle.padded_sizes is not None and len(handle.tensor_keys) == 1: + data = self._get_two_key(handle, return_packed, profile_out) + elif handle.padded_sizes is not None: + data = self._get_single_key(handle, return_packed, profile_out) + else: + data = self._get_legacy(handle, return_packed, profile_out) + should_cleanup = auto_cleanup if auto_cleanup is not None else self.enable_auto_cleanup + if should_cleanup: + with self._cleanup_thread_lock: + if self._cleanup_thread is None or not self._cleanup_thread.is_alive(): + self._start_cleanup_thread() + self._schedule_handle_deletion(handle) + return data + + def _get_two_key( + self, handle: HybridRolloutHandle, return_packed: bool, profile_out: list | None = None + ) -> dict: + """Two-key: meta in one key, concatenated tensors in another (pickle OOB only).""" + 
profile = {} if profile_out is not None else None + meta_size = handle.meta_size + tensor_sizes = handle.tensor_sizes + total_meta = 40 + meta_size + total_tensors = sum(tensor_sizes) + + alloc_t0 = time.perf_counter() if profile is not None else None + buf_meta = self._alloc_get_buffer(total_meta, None, use_ring=False) + buf_tensors = self._alloc_get_buffer(total_tensors, None, use_ring=False) + if profile is not None: + profile["alloc_ms"] = (time.perf_counter() - alloc_t0) * 1000 + + batch_t0 = time.perf_counter() if profile is not None else None + rets = self._store.batch_get_into( + [handle.meta_key, handle.tensor_keys[0]], + [buf_meta.data_ptr(), buf_tensors.data_ptr()], + [total_meta, total_tensors], + ) + for r in rets: + if r < 0: + raise RuntimeError(f"batch_get_into failed: {rets}") + if profile is not None: + profile["batch_get_ms"] = (time.perf_counter() - batch_t0) * 1000 + + unpack_t0 = time.perf_counter() if profile is not None else None + meta_bytes = memoryview(buf_meta.numpy()[40 : 40 + meta_size]).tobytes() + self._return_get_buffer(buf_meta) + offset = 0 + ready = [] + for sz in tensor_sizes: + v = buf_tensors[offset : offset + sz] + ready.append(v) + offset += sz + + class _RefCount: + def __init__(self, n, pool, b): + self.n, self.pool, self.b = n, pool, b + + def dec(self): + self.n -= 1 + if self.n == 0: + self.pool(self.b) + + if ready: + rc = _RefCount(len(ready), self._return_get_buffer, buf_tensors) + for v in ready: + weakref.finalize(v, rc.dec) + else: + self._return_get_buffer(buf_tensors) + + data = pickle.loads(meta_bytes, buffers=[t.numpy() for t in ready]) + data = self._unpack_ragged(data) if not return_packed else data + if profile is not None: + profile["unpack_ms"] = (time.perf_counter() - unpack_t0) * 1000 + profile_out.append(profile) + return data + + def _get_single_key( + self, handle: HybridRolloutHandle, return_packed: bool, profile_out: list | None = None + ) -> dict: + if not handle.padded_sizes or 
len(handle.padded_sizes) != len(handle.tensor_sizes) + 1: + raise ValueError( + f"Invalid handle: padded_sizes len {len(handle.padded_sizes or [])} " + f"!= tensor_sizes len {len(handle.tensor_sizes)} + 1" + ) + profile = {} if profile_out is not None else None + total = sum(40 + p for p in handle.padded_sizes) + # Add 64KB margin: Mooncake may require slightly more; avoid -600 Buffer too small + total = total + 65536 + buf = self._alloc_get_buffer(total, profile, use_ring=True) + + max_retries = 30000 + retry_703 = 0 + batch_get_t0 = time.perf_counter() if profile is not None else None + for _ in range(max_retries): + rets = self._store.batch_get_into([handle.meta_key], [buf.data_ptr()], [total]) + ok = True + for r in rets: + if r < 0: + if r == -703: + retry_703 += 1 + time.sleep(0.001) + ok = False + break + raise RuntimeError(f"batch_get_into failed: {rets}") + if ok: + break + else: + raise RuntimeError(f"NOT_FOUND after {max_retries} retries: {handle.meta_key}") + if profile is not None: + profile["batch_get_ms"] = (time.perf_counter() - batch_get_t0) * 1000 + profile["retry_703"] = retry_703 + + unpack_t0 = time.perf_counter() if profile is not None else None + offset = 40 + if offset + handle.meta_size > total: + raise RuntimeError(f"Meta overrun: offset {offset} + meta_size {handle.meta_size} > total {total}") + meta_bytes = memoryview(buf.numpy()[offset : offset + handle.meta_size]).tobytes() + offset += handle.padded_sizes[0] + + ready = [] + for i, sz in enumerate(handle.tensor_sizes): + if offset + 40 > total: + raise RuntimeError(f"Header overrun at tensor {i}: offset {offset} + 40 > total {total}") + h = struct.unpack("iiqqqq", memoryview(buf.numpy()[offset : offset + 40])) + dtype = next((dt for dt, e in DTYPE_MAP.items() if e == h[0]), torch.uint8) + ndim = int(h[1]) + if ndim < 0 or ndim > 4: + raise RuntimeError(f"Invalid ndim {ndim} at tensor {i}") + shape = tuple(int(h[2 + j]) for j in range(ndim)) + offset += 40 + if offset + sz > total: + 
raise RuntimeError(f"Tensor {i} overrun: offset {offset} + sz {sz} > total {total}") + v = buf[offset : offset + sz].view(dtype).reshape(shape) + ready.append(v) + offset += handle.padded_sizes[i + 1] + + class _RefCount: + def __init__(self, n, pool, b): + self.n, self.pool, self.b = n, pool, b + + def dec(self): + self.n -= 1 + if self.n == 0: + self.pool(self.b) + + if ready: + rc = _RefCount(len(ready), self._return_get_buffer, buf) + for v in ready: + weakref.finalize(v, rc.dec) + else: + self._return_get_buffer(buf) + + if meta_bytes[:4] == NUMPY_META_MAGIC: + _, _, _, n, has_labels, has_routed = struct.unpack("<4sHHibb", meta_bytes[:14]) + data = _deserialize_rollout_meta_numpy(meta_bytes, ready, bool(has_labels), bool(has_routed), n) + else: + data = pickle.loads(meta_bytes, buffers=[t.numpy() for t in ready]) + data = self._unpack_ragged(data) if not return_packed else data + if profile is not None: + profile["unpack_ms"] = (time.perf_counter() - unpack_t0) * 1000 + profile_out.append(profile) + return data + + def _get_legacy( + self, handle: HybridRolloutHandle, return_packed: bool, profile_out: list | None = None + ) -> dict: + profile = {} if profile_out is not None else None + keys = [handle.meta_key] + handle.tensor_keys + sizes = [handle.meta_size] + handle.tensor_sizes + + alloc_t0 = time.perf_counter() if profile is not None else None + bufs = [] + ptrs, szs = [], [] + for s in sizes: + b = self._alloc_get_buffer(40 + s, None, use_ring=False) + bufs.append(b) + ptrs.append(b.data_ptr()) + szs.append(40 + s) + if profile is not None: + profile["alloc_ms"] = (time.perf_counter() - alloc_t0) * 1000 + profile["alloc_from_pool"] = False + + batch_t0 = time.perf_counter() if profile is not None else None + rets = self._store.batch_get_into(keys, ptrs, szs) + for r in rets: + if r < 0: + raise RuntimeError(f"batch_get_into failed: {rets}") + if profile is not None: + profile["batch_get_ms"] = (time.perf_counter() - batch_t0) * 1000 + profile["retry_703"] 
= 0 + + unpack_t0 = time.perf_counter() if profile is not None else None + meta_bytes = memoryview(bufs[0].numpy()[40 : 40 + handle.meta_size]).tobytes() + self._return_get_buffer(bufs[0]) + + ready = [] + for i, b in enumerate(bufs[1:]): + h = struct.unpack("iiqqqq", memoryview(b.numpy()[:40])) + dtype = next((dt for dt, e in DTYPE_MAP.items() if e == h[0]), torch.uint8) + ndim = h[1] + shape = tuple(h[2 : 2 + ndim]) + v = b[40 : 40 + handle.tensor_sizes[i]].view(dtype).reshape(shape) + weakref.finalize(v, self._return_get_buffer, b) + ready.append(v) + + if meta_bytes[:4] == NUMPY_META_MAGIC: + _, _, _, n, has_labels, has_routed = struct.unpack("<4sHHibb", meta_bytes[:14]) + data = _deserialize_rollout_meta_numpy(meta_bytes, ready, bool(has_labels), bool(has_routed), n) + else: + data = pickle.loads(meta_bytes, buffers=[t.numpy() for t in ready]) + data = self._unpack_ragged(data) if not return_packed else data + if profile is not None: + profile["unpack_ms"] = (time.perf_counter() - unpack_t0) * 1000 + profile_out.append(profile) + return data + + def _unpack_ragged(self, data: dict) -> dict: + return _unpack_ragged(data) + + def cleanup(self, handle: HybridRolloutHandle) -> None: + """Immediately remove all keys for this handle (bypasses delayed deletion).""" + self._store.remove(handle.meta_key) + for k in handle.tensor_keys: + self._store.remove(k) + + def shutdown(self) -> None: + """Stop the cleanup thread gracefully.""" + self._cleanup_stop_event.set() + if self._cleanup_thread is not None: + self._cleanup_thread.join(timeout=5.0) + if self._cleanup_thread.is_alive(): + logger.warning("Cleanup thread did not stop within timeout") + else: + logger.info("Cleanup thread stopped successfully") From 245848bd8e976c3b6957a2e5c3bfbff4994a9150 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Wed, 11 Mar 2026 16:04:20 +0800 Subject: [PATCH 2/3] test: Mooncake transfer test and two-node benchmark - Add test_mooncake_transfer_correctness: verify put_rollout/get_rollout 
roundtrip - Add mock_rollout utils (make_mock_rollout_data, get_serialized_size) - Add benchmark_ray_vs_mooncake_two_node: Ray vs Mooncake two-node perf Made-with: Cursor --- scripts/benchmark_ray_vs_mooncake_two_node.py | 492 ++++++++++++++++++ slime/utils/mock_rollout.py | 50 ++ tests/test_mooncake_transfer_correctness.py | 79 +++ 3 files changed, 621 insertions(+) create mode 100644 scripts/benchmark_ray_vs_mooncake_two_node.py create mode 100644 slime/utils/mock_rollout.py create mode 100644 tests/test_mooncake_transfer_correctness.py diff --git a/scripts/benchmark_ray_vs_mooncake_two_node.py b/scripts/benchmark_ray_vs_mooncake_two_node.py new file mode 100644 index 0000000000..676841280a --- /dev/null +++ b/scripts/benchmark_ray_vs_mooncake_two_node.py @@ -0,0 +1,492 @@ +#!/usr/bin/env python3 +""" +Two-node benchmark: Ray vs Mooncake rollout transfer using Ray actors. + +Timing (full dict->put->get->dict path): +- Put: wall time from dict to handle (prepare + pack + transfer to store) +- Get: wall time from handle to dict (fetch + unpack) + +Uses Ray actors for true cross-node placement: +- DataGenerator on put-node: creates rollout dict, runs put +- DataConsumer on get-node: runs get natively (handle passed via Ray, not SSH/JSON) + +Prerequisites: +- Ray cluster with at least 2 nodes +- Mooncake: master + clients, MOONCAKE_MASTER, MOONCAKE_PROTOCOL=rdma + +Usage: + export MOONCAKE_MASTER=192.168.22.70:50051 MOONCAKE_PROTOCOL=rdma + python scripts/benchmark_ray_vs_mooncake_two_node.py --put-node 192.168.22.70 --get-node 192.168.22.72 --data-size-mb 100 --num-rounds 20 + python scripts/benchmark_ray_vs_mooncake_two_node.py --backends ray mooncake ... 
+""" + +import argparse +import gc +import json +import os +import subprocess +import sys +import time +from pathlib import Path + +import numpy as np +import ray + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +from slime.utils.mock_rollout import make_mock_rollout_data, get_serialized_size +from slime.utils.data_transfer import MooncakeStoreConfig +from slime.utils.rollout_hybrid_transfer import MooncakeHybridRolloutTransfer + + +@ray.remote(num_cpus=1, num_gpus=0) +class DataGenerator: + def __init__( + self, + data_size_mb: float, + mount_segment_size: int | None, + mooncake_master: str = "", + ): + os.environ["MOONCAKE_PROTOCOL"] = os.environ.get("MOONCAKE_PROTOCOL", "rdma") + os.environ["MOONCAKE_MASTER"] = mooncake_master or os.environ.get("MOONCAKE_MASTER", "") + os.environ["MOONCAKE_LOCAL_HOSTNAME"] = ray.util.get_node_ip_address() + os.environ.setdefault("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE") + os.environ.setdefault("MC_STORE_MEMCPY", "0") + if mount_segment_size is not None: + os.environ["MOONCAKE_MOUNT_SEGMENT_SIZE"] = str(mount_segment_size) + self.data, self.actual_mb, self.batch, self.seq = self._make_rollout(data_size_mb) + print(f"Generator initialized on node {ray.util.get_node_ip_address()}") + print(f"Actual Data Size: {self.actual_mb:.2f} MB (batch={self.batch}, seq={self.seq})") + + def _make_rollout(self, target_mb: float): + seq = 2048 + base = make_mock_rollout_data(batch_size=16, seq_len=seq, use_routing_replay=True) + base_mb = get_serialized_size(base) / (1024 * 1024) + batch = max(16, int(16 * (target_mb / max(1e-6, base_mb)))) + data = make_mock_rollout_data(batch_size=batch, seq_len=seq, use_routing_replay=True) + actual_mb = get_serialized_size(data) / (1024 * 1024) + return data, actual_mb, batch, seq + + def get_data(self): + return self.data + + def get_info(self): + return {"mb": self.actual_mb, "batch": self.batch, "seq": self.seq} + + def 
generate_ray_put(self, rounds: int): + handles = [] + put_times = [] + for _ in range(rounds): + t0 = time.perf_counter() + h = ray.put(self.data) + put_times.append((time.perf_counter() - t0) * 1000) + handles.append(h) + return put_times, handles + + def generate_hybrid_put( + self, + rounds: int, + tensor_min_bytes: int, + mount_segment_size: int | None, + ): + if not hasattr(self, "_hybrid_backend"): + if "SLIME_PUT_FROM_SINGLE_BUFFER" in os.environ: + del os.environ["SLIME_PUT_FROM_SINGLE_BUFFER"] + if "SLIME_PUT_SINGLE_BUFFER_AS_MULTI" in os.environ: + del os.environ["SLIME_PUT_SINGLE_BUFFER_AS_MULTI"] + self._hybrid_backend = MooncakeHybridRolloutTransfer( + tensor_min_bytes=tensor_min_bytes, + enable_auto_cleanup=False, + mount_segment_size=mount_segment_size, + ) + backend = self._hybrid_backend + handles = [] + put_times = [] + for _ in range(rounds): + t0 = time.perf_counter() + h = backend.put_rollout(self.data) + put_times.append((time.perf_counter() - t0) * 1000) + handles.append(h) + return handles, put_times + + +@ray.remote(num_cpus=1, num_gpus=0) +class DataConsumer: + def __init__( + self, + mount_segment_size: int | None, + mooncake_master: str = "", + ): + os.environ["MOONCAKE_PROTOCOL"] = os.environ.get("MOONCAKE_PROTOCOL", "rdma") + os.environ["MOONCAKE_MASTER"] = mooncake_master or os.environ.get("MOONCAKE_MASTER", "") + os.environ["MOONCAKE_LOCAL_HOSTNAME"] = ray.util.get_node_ip_address() + os.environ.setdefault("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE") + os.environ.setdefault("MC_STORE_MEMCPY", "0") + if mount_segment_size is not None: + os.environ["MOONCAKE_MOUNT_SEGMENT_SIZE"] = str(mount_segment_size) + print(f"Consumer initialized on node {ray.util.get_node_ip_address()}") + + def warmup_ray(self, handles): + if isinstance(handles, list): + _ = ray.get(handles[0]) + else: + _ = ray.get(handles) + + def consume_ray_get(self, handles): + get_times = [] + for h in handles: + t0 = time.perf_counter() + _ = ray.get(h) + 
get_times.append((time.perf_counter() - t0) * 1000) + return get_times + + def warmup_hybrid( + self, handle, tensor_min_bytes: int, mount_segment_size: int | None + ): + if not hasattr(self, "_hybrid_backend"): + self._hybrid_backend = MooncakeHybridRolloutTransfer( + tensor_min_bytes=tensor_min_bytes, + enable_auto_cleanup=False, + mount_segment_size=mount_segment_size, + ) + _ = self._hybrid_backend.get_rollout(handle) + self._hybrid_backend.cleanup(handle) + + def consume_hybrid_get( + self, handles, tensor_min_bytes: int, mount_segment_size: int | None + ): + if not hasattr(self, "_hybrid_backend"): + self._hybrid_backend = MooncakeHybridRolloutTransfer( + tensor_min_bytes=tensor_min_bytes, + enable_auto_cleanup=False, + mount_segment_size=mount_segment_size, + ) + get_times = [] + for h in handles: + t0 = time.perf_counter() + _ = self._hybrid_backend.get_rollout(h) + get_times.append((time.perf_counter() - t0) * 1000) + self._hybrid_backend.cleanup(h) + return get_times + + +def _resolve_node_id(ip: str) -> str | None: + for n in ray.nodes(): + if not n.get("Alive"): + continue + if n.get("NodeManagerAddress") == ip: + return n["NodeID"] + return None + + +def main(): + ap = argparse.ArgumentParser( + description="Two-node benchmark: Ray vs Mooncake (Ray actors, handle via Ray)" + ) + ap.add_argument("--put-node", type=str, default=None) + ap.add_argument("--get-node", type=str, default=None) + ap.add_argument("--data-size-mb", type=float, default=100.0) + ap.add_argument("--num-rounds", type=int, default=30) + ap.add_argument( + "--backends", + nargs="+", + default=["ray", "mooncake"], + choices=["ray", "mooncake"], + ) + ap.add_argument("--tensor-min-mb", type=float, default=1.0) + ap.add_argument( + "--warm-up-rounds", + type=int, + default=24, + help="Warmup rounds before timed runs (higher reduces Ray variance)", + ) + ap.add_argument( + "--discard-first", + type=int, + default=5, + help="Discard first N samples after warmup (reduces cold-start 
effect)", + ) + ap.add_argument( + "--isolate-backends", + action="store_true", + default=True, + help="Run each backend in separate process to avoid memory/interference (default: True)", + ) + ap.add_argument( + "--no-isolate-backends", + action="store_false", + dest="isolate_backends", + help="Run both backends in same process (may increase Ray variance)", + ) + ap.add_argument("--mooncake-segment-size-gb", type=float, default=None) + ap.add_argument( + "--trim-fraction", + type=float, + default=0.15, + help="Trim top/bottom fraction for stats (0=no trim)", + ) + ap.add_argument( + "--output-format", + choices=["table", "json"], + default="table", + help="Output format (json for machine-readable)", + ) + args = ap.parse_args() + + # Isolate backends: run each in separate process to avoid memory/GC interference + if ( + args.isolate_backends + and len(args.backends) > 1 + and args.output_format != "json" + ): + script = Path(__file__).resolve() + base_cmd = [ + sys.executable, + str(script), + "--data-size-mb", + str(args.data_size_mb), + "--num-rounds", + str(args.num_rounds), + "--warm-up-rounds", + str(args.warm_up_rounds), + "--discard-first", + str(args.discard_first), + "--no-isolate-backends", + "--trim-fraction", + str(args.trim_fraction), + "--output-format", + "json", + ] + if args.put_node: + base_cmd += ["--put-node", args.put_node] + if args.get_node: + base_cmd += ["--get-node", args.get_node] + if args.mooncake_segment_size_gb is not None: + base_cmd += ["--mooncake-segment-size-gb", str(args.mooncake_segment_size_gb)] + + all_results = {} + for backend in args.backends: + cmd = base_cmd + ["--backends", backend] + env = os.environ.copy() + env.setdefault("SLIME_UNSAFE_PICKLE", "1") + proc = subprocess.run( + cmd, + env=env, + capture_output=True, + text=True, + cwd=script.parent.parent, + ) + if proc.returncode != 0: + print(f"Backend {backend} failed:\n{proc.stderr}", file=sys.stderr) + sys.exit(1) + for line in proc.stdout.strip().splitlines(): + 
if line.strip().startswith("{"): + data = json.loads(line) + all_results[data["backend"]] = data + break + + # Print merged table + print("\n" + "=" * 80) + print("Two-node benchmark (isolated processes for fair comparison)") + print("=" * 80) + print( + f"{'Backend':<12} {'Put (ms)':<20} {'Get (ms)':<20} {'End2End (ms)':<20}" + ) + print("-" * 80) + for name in args.backends: + d = all_results.get(name, {}) + pm, ps = d.get("put_mean", 0), d.get("put_std", 0) + gm, gs = d.get("get_mean", 0), d.get("get_std", 0) + tm, ts = d.get("e2e_mean", 0), d.get("e2e_std", 0) + print( + f"{name:<12} " + f"{pm:>7.2f} ± {ps:<6.2f} " + f"{gm:>7.2f} ± {gs:<6.2f} " + f"{tm:>7.2f} ± {ts:<6.2f}" + ) + print("-" * 80) + return + + os.environ.setdefault("SLIME_UNSAFE_PICKLE", "1") + os.environ.setdefault("MOONCAKE_PROTOCOL", "rdma") + os.environ.setdefault("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE") + os.environ.setdefault("MC_STORE_MEMCPY", "0") + + overrides = {} + if args.mooncake_segment_size_gb is not None: + overrides["mount_segment_size"] = int( + args.mooncake_segment_size_gb * 1024 * 1024 * 1024 + ) + cfg = MooncakeStoreConfig.load_from_env( + overrides=overrides if overrides else None + ) + mount_segment_size = cfg.mount_segment_size + + ray.init(address="auto", ignore_reinit_error=True, log_to_driver=False) + + quiet = args.output_format == "json" + log = (lambda *a, **kw: None) if quiet else print + + nodes = [n["NodeManagerAddress"] for n in ray.nodes() if n.get("Alive")] + log(f"Active Ray nodes: {nodes}") + if len(nodes) < 2: + raise RuntimeError("Need at least 2 Ray nodes for cross-machine benchmark") + + put_node = args.put_node or nodes[0] + get_node = args.get_node or nodes[1] + put_node_id = _resolve_node_id(put_node) + get_node_id = _resolve_node_id(get_node) + if put_node_id is None: + raise RuntimeError(f"Put node {put_node} not found in ray.nodes()") + if get_node_id is None: + raise RuntimeError(f"Get node {get_node} not found in ray.nodes()") + 
log(f"Generator -> {put_node}, Consumer -> {get_node}") + + from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + + mooncake_master = os.environ.get("MOONCAKE_MASTER", f"{put_node}:50051") + gen = DataGenerator.options( + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=put_node_id, soft=False + ), + ).remote(args.data_size_mb, mount_segment_size, mooncake_master) + con = DataConsumer.options( + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=get_node_id, soft=False + ), + ).remote(mount_segment_size, mooncake_master) + + info = ray.get(gen.get_info.remote()) + log( + f"Target {args.data_size_mb} MB -> actual {info['mb']:.2f} MB " + f"(batch={info['batch']}, seq={info['seq']}), rounds={args.num_rounds}" + ) + + tensor_min_bytes = int(args.tensor_min_mb * 1024 * 1024) + results: dict[str, tuple[np.ndarray, np.ndarray]] = {} + + if "ray" in args.backends: + # Warmup: full-size batches to reduce object store variance (cold start, eviction) + warmup_batch = min(args.warm_up_rounds, args.num_rounds) + for _ in range(max(2, args.warm_up_rounds // max(1, warmup_batch))): + put_times, handles = ray.get(gen.generate_ray_put.remote(warmup_batch)) + ray.get(con.consume_ray_get.remote(handles)) + time.sleep(1.0) # Let object store settle + + # Timed run: disable GC during measurement to reduce variance + total_rounds = args.num_rounds + args.discard_first + put_times, handles = ray.get(gen.generate_ray_put.remote(total_rounds)) + gc.disable() + try: + get_times = ray.get(con.consume_ray_get.remote(handles)) + finally: + gc.enable() + put_arr = np.array(put_times) + get_arr = np.array(get_times) + discard = args.discard_first + if discard > 0 and len(put_arr) > discard: + put_arr = put_arr[discard:] + get_arr = get_arr[discard:] + results["ray"] = (put_arr, get_arr) + + if "mooncake" in args.backends: + # Warmup + handles, _ = ray.get( + gen.generate_hybrid_put.remote( + 1, tensor_min_bytes, mount_segment_size + ) + ) + ray.get( 
+ con.warmup_hybrid.remote( + handles[0], tensor_min_bytes, mount_segment_size + ) + ) + for _ in range(args.warm_up_rounds - 1): + handles, _ = ray.get( + gen.generate_hybrid_put.remote( + 1, tensor_min_bytes, mount_segment_size + ) + ) + ray.get( + con.consume_hybrid_get.remote( + handles, tensor_min_bytes, mount_segment_size + ) + ) + + total_rounds = args.num_rounds + args.discard_first + handles, put_times = ray.get( + gen.generate_hybrid_put.remote( + total_rounds, tensor_min_bytes, mount_segment_size + ) + ) + get_times = ray.get( + con.consume_hybrid_get.remote( + handles, tensor_min_bytes, mount_segment_size + ) + ) + put_arr = np.array(put_times) + get_arr = np.array(get_times) + discard = args.discard_first + if discard > 0 and len(put_arr) > discard: + put_arr = put_arr[discard:] + get_arr = get_arr[discard:] + results["mooncake"] = (put_arr, get_arr) + + def _trimmed_stats(arr: np.ndarray, frac: float): + if frac <= 0 or len(arr) < 4: + return arr.mean(), arr.std() + k = max(1, int(len(arr) * frac)) + s = np.sort(arr) + trimmed = s[k:-k] if k > 0 else s + return float(trimmed.mean()), float(trimmed.std()) + + trim = args.trim_fraction + stats_per_backend = {} + for name, (p, g) in results.items(): + total = p + g + if trim > 0: + pm, ps = _trimmed_stats(p, trim) + gm, gs = _trimmed_stats(g, trim) + tm, ts = _trimmed_stats(total, trim) + else: + pm, ps = p.mean(), p.std() + gm, gs = g.mean(), g.std() + tm, ts = total.mean(), total.std() + stats_per_backend[name] = { + "put_mean": pm, + "put_std": ps, + "get_mean": gm, + "get_std": gs, + "e2e_mean": tm, + "e2e_std": ts, + } + + if args.output_format == "json": + for name, s in stats_per_backend.items(): + print(json.dumps({"backend": name, **s})) + return + + print("\n" + "=" * 80) + print("Two-node benchmark (Ray actors, handle via Ray)") + print("=" * 80) + print( + f"{'Backend':<12} {'Put (ms)':<20} {'Get (ms)':<20} {'End2End (ms)':<20}" + ) + print("-" * 80) + for name, s in 
stats_per_backend.items(): + pm, ps = s["put_mean"], s["put_std"] + gm, gs = s["get_mean"], s["get_std"] + tm, ts = s["e2e_mean"], s["e2e_std"] + print( + f"{name:<12} " + f"{pm:>7.2f} ± {ps:<6.2f} " + f"{gm:>7.2f} ± {gs:<6.2f} " + f"{tm:>7.2f} ± {ts:<6.2f}" + ) + print("-" * 80) + + +if __name__ == "__main__": + main() diff --git a/slime/utils/mock_rollout.py b/slime/utils/mock_rollout.py new file mode 100644 index 0000000000..48e54ba24c --- /dev/null +++ b/slime/utils/mock_rollout.py @@ -0,0 +1,50 @@ +"""Mock rollout data utilities for tests and benchmarks.""" + +import pickle + +import numpy as np + + +def make_mock_rollout_data( + batch_size: int = 16, + seq_len: int = 2048, + n_samples_per_prompt: int = 4, + use_routing_replay: bool = False, + num_layers: int = 64, + moe_router_topk: int = 2, +) -> dict: + """Create mock rollout data resembling real training data structure.""" + num_samples = batch_size * n_samples_per_prompt + total_lengths = [seq_len + np.random.randint(100, 500) for _ in range(num_samples)] + # response_length must be < total_length for rollout_log_probs shape + response_lengths = [ + min(np.random.randint(100, 500), tot - 1) for tot in total_lengths + ] + + tokens = [np.random.randint(0, 32000, size=l, dtype=np.int32) for l in total_lengths] + loss_masks = [[1] * (resp_len - 50) + [0] * 50 for resp_len in response_lengths] + rollout_log_probs = [ + np.random.randn(tot_len - resp_len).astype(np.float32).tolist() + for tot_len, resp_len in zip(total_lengths, response_lengths) + ] + + data = { + "partition": list(range(num_samples)), + "tokens": tokens, + "response_lengths": response_lengths, + "rewards": [1.0] * num_samples, + "loss_masks": loss_masks, + "rollout_log_probs": rollout_log_probs, + "total_lengths": total_lengths, + } + if use_routing_replay: + data["rollout_routed_experts"] = [ + np.random.randint(0, 8, size=(tot_len - 1, num_layers, moe_router_topk), dtype=np.int32) + for tot_len in total_lengths + ] + return data + + +def 
get_serialized_size(data: dict) -> int: + """Get size in bytes of serialized data.""" + return len(pickle.dumps(data, protocol=5)) diff --git a/tests/test_mooncake_transfer_correctness.py b/tests/test_mooncake_transfer_correctness.py new file mode 100644 index 0000000000..1277a3b4d9 --- /dev/null +++ b/tests/test_mooncake_transfer_correctness.py @@ -0,0 +1,79 @@ +""" +Test Mooncake hybrid rollout transfer correctness. + +Verifies that data put via MooncakeHybridRolloutTransfer.put_rollout() and +retrieved via get_rollout() matches the original (deep equal). + +Prerequisites: +- mooncake_master running (e.g. 127.0.0.1:50051) +- mooncake_client running with MC_STORE_LOCAL_HOT_CACHE_USE_SHM=1 (optional, for hot cache) +- MOONCAKE_MASTER, MOONCAKE_TE_META_DATA_SERVER env set (or defaults to 127.0.0.1) + +Run: pytest tests/test_mooncake_transfer_correctness.py -v +Or: python tests/test_mooncake_transfer_correctness.py +""" +import copy +import os +import sys +from pathlib import Path + +import numpy as np +import torch + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +from slime.utils.mock_rollout import make_mock_rollout_data +from slime.utils.rollout_hybrid_transfer import MooncakeHybridRolloutTransfer + + +def _assert_equal_recursive(orig, new, path: str = "root") -> None: + """Recursively assert orig == new with helpful error messages.""" + if isinstance(orig, dict): + assert isinstance(new, dict), f"Type mismatch at {path}: expected dict, got {type(new)}" + assert set(orig.keys()) == set(new.keys()), f"Keys mismatch at {path}" + for k in orig: + _assert_equal_recursive(orig[k], new[k], f"{path}['{k}']") + elif isinstance(orig, list): + assert isinstance(new, list), f"Type mismatch at {path}: expected list, got {type(new)}" + assert len(orig) == len(new), f"Length mismatch at {path}: {len(orig)} vs {len(new)}" + for i, (o, n) in enumerate(zip(orig, new)): + _assert_equal_recursive(o, n, f"{path}[{i}]") + 
elif isinstance(orig, tuple): + assert isinstance(new, tuple), f"Type mismatch at {path}: expected tuple, got {type(new)}" + assert len(orig) == len(new), f"Length mismatch at {path}" + for i, (o, n) in enumerate(zip(orig, new)): + _assert_equal_recursive(o, n, f"{path}[{i}]") + elif isinstance(orig, np.ndarray): + assert isinstance(new, np.ndarray), f"Type mismatch at {path}: expected ndarray, got {type(new)}" + np.testing.assert_array_equal(orig, new, err_msg=f"Array mismatch at {path}") + elif isinstance(orig, torch.Tensor): + assert isinstance(new, torch.Tensor), f"Type mismatch at {path}: expected Tensor, got {type(new)}" + torch.testing.assert_close(orig, new, msg=f"Tensor mismatch at {path}") + else: + assert type(orig) == type(new), f"Type mismatch at {path}: {type(orig)} vs {type(new)}" + assert orig == new, f"Value mismatch at {path}: {orig} vs {new}" + + +def test_mooncake_hybrid_rollout_transfer_correctness(): + """Verify Mooncake put_rollout/get_rollout roundtrip preserves data.""" + os.environ.setdefault("SLIME_UNSAFE_PICKLE", "1") + os.environ.setdefault("MOONCAKE_PROTOCOL", "tcp") + os.environ.setdefault("MOONCAKE_MASTER", "127.0.0.1:50051") + os.environ.setdefault("MOONCAKE_TE_META_DATA_SERVER", "http://127.0.0.1:8080/metadata") + os.environ.setdefault("MC_STORE_MEMCPY", "1") + + data = make_mock_rollout_data(batch_size=8, seq_len=512, use_routing_replay=True) + xfer = MooncakeHybridRolloutTransfer(tensor_min_bytes=1024 * 1024, enable_auto_cleanup=False) + + handle = xfer.put_rollout(data) + received = xfer.get_rollout(handle, return_packed=False) + + _assert_equal_recursive(data, received) + xfer.cleanup(handle) + + +if __name__ == "__main__": + test_mooncake_hybrid_rollout_transfer_correctness() + print("OK: Mooncake transfer correctness test passed.") From 8b46fd5fcb154b57fc2efddf066a1f0995eaf141 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Wed, 11 Mar 2026 16:04:21 +0800 Subject: [PATCH 3/3] docs: Mooncake backend guide and two-node benchmark 
results - Add MOONCAKE_BACKEND_GUIDE.md: setup, usage, segment size rule, benchmark summary - Add TWO_NODE_BENCHMARK_RESULTS.md: Ray vs Mooncake E2E results (100/200/500/1000 MB) - Update with stable benchmark results (isolate-backends, warmup=24, discard-first=5) Made-with: Cursor --- docs/MOONCAKE_BACKEND_GUIDE.md | 242 ++++++++++++++++++++++++++ scripts/TWO_NODE_BENCHMARK_RESULTS.md | 100 +++++++++++ 2 files changed, 342 insertions(+) create mode 100644 docs/MOONCAKE_BACKEND_GUIDE.md create mode 100644 scripts/TWO_NODE_BENCHMARK_RESULTS.md diff --git a/docs/MOONCAKE_BACKEND_GUIDE.md b/docs/MOONCAKE_BACKEND_GUIDE.md new file mode 100644 index 0000000000..a5401fe0af --- /dev/null +++ b/docs/MOONCAKE_BACKEND_GUIDE.md @@ -0,0 +1,242 @@ +# Mooncake Backend Guide + +This document covers configuration, data flow, storage modes, and troubleshooting for using Mooncake as Slime's rollout transfer backend. + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Configuration and Environment Variables](#2-configuration-and-environment-variables) +3. [Usage and Best Practices](#3-usage-and-best-practices) +4. [Data Flow and Single-Key Storage](#4-data-flow-and-single-key-storage) +5. [Memory Optimization](#5-memory-optimization) +6. [Benchmark Results](#6-benchmark-results) +7. [Troubleshooting](#7-troubleshooting) +8. [References](#8-references) + +--- + +## 1. Overview + +### 1.1 What It Is + +**Mooncake Hybrid** is Slime's rollout data transfer backend that uses Mooncake distributed store instead of Ray Object Store for moving data from rollout nodes to training nodes. Core logic lives in `slime/utils/rollout_hybrid_transfer.py`. 
+ +| Backend | Description | Best For | +|---------|-------------|----------| +| **ray** (default) | Ray Object Store | Single-node, colocated training and inference | +| **mooncake** | Mooncake Hybrid | Multi-node, disaggregated setups with RDMA | + +### 1.2 Storage Modes + +| Mode | Keys | Environment Variable | Description | +|------|------|----------------------|-------------| +| **Legacy** | 1 + N | `SLIME_USE_LEGACY_TRANSFER=1` | One meta key, one key per tensor | +| **Single-Key** | 1 | Default (non-Legacy) | All data in one contiguous value, one key | +| **Two-Key** | 2 | `SLIME_META_TENSOR_SPLIT_KEYS=1` | Meta in one key, tensors concatenated in another | + +--- + +## 2. Configuration and Environment Variables + +### 2.1 Required + +| Variable | Description | Example | +|----------|-------------|---------| +| `MOONCAKE_MASTER` | Master address (host:port) | `192.168.22.70:50051` | + +### 2.2 Optional (with defaults) + +| Variable | Default | Description | +|----------|---------|-------------| +| `MOONCAKE_PROTOCOL` | `tcp` | `tcp` or `rdma`; use `rdma` for multi-node | +| `MOONCAKE_DEVICE` | `""` | RDMA device name (e.g. 
`erdma_0`) | +| `MOONCAKE_LOCAL_HOSTNAME` | Auto-detected | Local hostname/IP | +| `MOONCAKE_TE_META_DATA_SERVER` | `P2PHANDSHAKE` | Transfer engine metadata server | +| `MOONCAKE_MOUNT_SEGMENT_SIZE` | 4 GiB | Segment size to mount | +| `MOONCAKE_LOCAL_BUFFER_SIZE` | 2 GiB | Local buffer size | + +### 2.3 Slime-Specific + +| Variable | Default | Description | +|----------|---------|-------------| +| `SLIME_USE_LEGACY_TRANSFER` | `0` | `1` to use Legacy multi-key mode | +| `SLIME_RING_BUFFER_SIZE_MB` | `2048` | Ring buffer size per slot (MB) for Get | +| `SLIME_UNSAFE_PICKLE` | `0` | `1` to disable restricted unpickler (trusted env only) | +| `MC_STORE_MEMCPY` | `0` | **Must be 0 for cross-node**; can be 1 for same-node | +| `SLIME_PACK_DIRECT_TO_BUFFER` | `0` | `1` to enable Direct Pack optimization | +| `SLIME_REGISTER_PICKLE_BUFFERS` | `0` | `1` to try zero-copy Put via register | +| `SLIME_META_TENSOR_SPLIT_KEYS` | `0` | `1` to enable Two-Key mode | +| `SLIME_USE_NUMPY_META` | `0` | `1` to use numpy-format meta | + +### 2.4 Example Configurations + +**Single-node (TCP):** +```bash +export MOONCAKE_MASTER=127.0.0.1:50051 +export MOONCAKE_PROTOCOL=tcp +export MC_STORE_MEMCPY=1 +``` + +**Two-node (RDMA):** +```bash +export MOONCAKE_MASTER=192.168.22.70:50051 +export MOONCAKE_PROTOCOL=rdma +export MOONCAKE_DEVICE=erdma_0 +export MC_STORE_MEMCPY=0 +``` + +--- + +## 3. Usage and Best Practices + +### 3.1 Command Line + +```bash +python -m slime.train --transfer-backend mooncake ... +``` + +### 3.2 Prerequisites + +1. Mooncake installation and cluster (master + clients on each node) +2. For RDMA multi-node: InfiniBand or RoCE configured correctly +3. 
For large payloads (500MB+): consider increasing `MOONCAKE_MOUNT_SEGMENT_SIZE` + +### 3.3 Best Practices + +- Use RDMA for multi-node: `MOONCAKE_PROTOCOL=rdma` +- `MC_STORE_MEMCPY=1` only when Put and Get are on the same node +- Segment size: `segment_size ≥ (num_rounds + discard_first) × data_size` when batching (the benchmark puts all rounds, including discarded ones, before any get) +- `SLIME_UNSAFE_PICKLE=1` only in trusted environments + +--- + +## 4. Data Flow and Single-Key Storage + +### 4.1 Data Flow Overview + +``` +rollout dict (scattered memory) + → Prepare (pack ragged + pickle OOB) + → meta_bytes + tensors + → Put (write to _meta_buf/_header_buf, prepare tensors) + → batch_put_from_multi_buffers (Mooncake RDMA reads ptrs) + → Mooncake store (concatenates into value by layout) + → Get: batch_get_into (read entire value into buf) + → Parse layout (meta + tensor views) + → pickle.loads(meta_bytes, buffers=[...]) + → rollout dict +``` + +### 4.2 Single-Key Value Layout + +The value is a contiguous byte stream: `Hdr0(40B) | Meta(pad0) | Hdr1(40B) | T0(pad1) | Hdr2(40B) | T1(pad2) | ...` + +- **Hdr**: 40B struct with `dtype_id | ndim | shape0..3` +- **Meta**: Pickle meta (OOB references tensors) +- **T0, T1...**: Raw tensor bytes, 64B-aligned + +### 4.3 Put: Scattered Memory → Contiguous Value + +Put-side data lives in separate buffers: `_header_buf`, `_meta_buf`, and per-tensor buffers. `batch_put_from_multi_buffers(key, ptrs, sizes)` receives interleaved ptr/size pairs; Mooncake RDMA-reads them in order and concatenates into the stored value. + +``` +ptrs = [hdr0_ptr, meta_ptr, hdr1_ptr, t0_ptr, hdr2_ptr, t1_ptr, ...] +sizes = [40, pad0, 40, pad1, 40, pad2, ...] +``` + +### 4.4 Get: Single Read + Parse + +1. `batch_get_into([key], [buf_ptr], [total])` reads the entire value +2. `meta_bytes = buf[40:40+meta_size]` +3. Loop over `padded_sizes` to extract tensor views +4. `pickle.loads(meta_bytes, buffers=[...])` reconstructs dict with zero-copy + +--- + +## 5. 
Memory Optimization + +### 5.1 register_buffer / alloc_from_mem_pool + +- `register_buffer(ptr, size)`: Registers user memory with Mooncake for RDMA +- `alloc_from_mem_pool(size)`: Allocates from Mooncake memory pool; often returns 0 under standard setup + +### 5.2 SLIME_REGISTER_PICKLE_BUFFERS + +When enabled, tries `register_buffer(pickle_tensor_ptr)`; on success, Put is zero-copy; on failure, copies to _put_buffers. + +### 5.3 Direct Pack (SLIME_PACK_DIRECT_TO_BUFFER=1) + +- Packs directly into buffer without intermediate allocations +- Tries `alloc_from_mem_pool` first, falls back to `torch.empty`+`register_buffer` +- Returns `registered_range`; Put skips copy when tensors fall within that range + +--- + +## 6. Benchmark Results + +### 6.1 Two-Node RDMA (Put node 70 → Get node 72) + +**运行时间**: 2026-03-11 +**环境**: Put 192.168.22.70, Get 192.168.22.72, RDMA, warmup=24, discard-first=5, trim=15%, isolate-backends + +| Data Size | Backend | Put (ms) | Get (ms) | E2E (ms) | +|-----------|---------|----------|----------|----------| +| **100 MB** (~99 MB) | Ray | 22.65 ± 3.06 | 41.53 ± 0.31 | 64.39 ± 2.89 | +| | **Mooncake** | 34.15 ± 0.36 | **14.71 ± 0.38** | **48.89 ± 0.26** | +| **200 MB** (~203 MB) | Ray | 42.11 ± 3.94 | 82.83 ± 2.62 | 126.13 ± 4.46 | +| | **Mooncake** | 75.93 ± 3.93 | **29.50 ± 0.53** | **105.51 ± 3.15** | +| **500 MB** (~507 MB) | Ray | 104.18 ± 9.42 | 208.60 ± 6.15 | 315.39 ± 10.89 | +| | **Mooncake** | 170.58 ± 1.12 | **83.82 ± 2.31** | **254.92 ± 2.13** | +| **1000 MB** (~1000 MB) | Ray | 206.07 ± 23.12 | 419.21 ± 10.52 | 627.89 ± 21.03 | +| | **Mooncake** | 374.37 ± 1.69 | **179.50 ± 1.20** | **553.78 ± 3.91** | + +**结论**: +- 100 MB: Mooncake E2E **49 ms** vs Ray **64 ms**,约快 **24%**;Get 端 Mooncake **14.7 ms** vs Ray **41.5 ms**,约快 **2.8×** +- 200 MB: Mooncake E2E **106 ms** vs Ray **126 ms**,约快 **16%** +- 500 MB: Mooncake E2E **255 ms** vs Ray **315 ms**,约快 **19%**;需 `--mooncake-segment-size-gb 16` +- 1000 MB: Mooncake E2E **554 ms** vs Ray 
**628 ms**,约快 **12%**;需 `--mooncake-segment-size-gb 16`(segment ≥ num_rounds × data_size) + +### 6.2 Running Benchmarks + +```bash +export MOONCAKE_MASTER=192.168.22.70:50051 MOONCAKE_PROTOCOL=rdma SLIME_UNSAFE_PICKLE=1 +python scripts/benchmark_ray_vs_mooncake_two_node.py \ + --put-node 192.168.22.70 --get-node 192.168.22.72 \ + --data-size-mb 100 --num-rounds 30 --warm-up-rounds 24 \ + --discard-first 5 --trim-fraction 0.15 \ + --backends ray mooncake +``` + +- 默认 `--isolate-backends`:Ray 与 Mooncake 分别在独立进程中运行,避免内存/GC 干扰,使 Ray 方差更稳定 +- 运行前确保 Mooncake 处于干净状态(无冲突进程);必要时手动 `pkill -f mooncake_master` 并重启 master +- `--mooncake-segment-size-gb N`: segment 需 ≥ num_rounds × data_size(benchmark 先 put 完再 get) +- 详细结果见 `scripts/TWO_NODE_BENCHMARK_RESULTS.md` + +--- + +## 7. Troubleshooting + +### 7.1 Mooncake Slower Than Ray + +- Confirm Put/Get nodes: should be cross-node (PUT≠GET) +- Use `benchmark_ray_vs_mooncake_two_node.py`; avoid cross_node (SSH) script +- Run Mooncake alone to rule out Ray interference +- Ensure `MC_STORE_MEMCPY=0` for cross-node + +### 7.2 "Overlapped memory region" Warning + +With Direct Pack, this indicates overlapping memory regions and may trigger a slower path. Try disabling `SLIME_PACK_DIRECT_TO_BUFFER`. + +### 7.3 alloc_from_mem_pool Returns 0 + +Common under standard setup; falls back to `torch.empty`+`register_buffer`. Functionality is unaffected. + +--- + +## 8. 
References + +- [Mooncake Documentation](https://kvcache-ai.github.io/Mooncake/) +- [Mooncake GitHub](https://github.com/kvcache-ai/Mooncake) +- Slime code: `slime/utils/data_transfer.py`, `slime/utils/rollout_hybrid_transfer.py` diff --git a/scripts/TWO_NODE_BENCHMARK_RESULTS.md b/scripts/TWO_NODE_BENCHMARK_RESULTS.md new file mode 100644 index 0000000000..a251314ad5 --- /dev/null +++ b/scripts/TWO_NODE_BENCHMARK_RESULTS.md @@ -0,0 +1,100 @@ +# 双机 Two-Node Benchmark 结果(Ray vs Mooncake) + +**运行时间**: 2026-03-11 +**环境**: Put 192.168.22.70, Get 192.168.22.72, RDMA, warmup=24, discard-first=5, trim=15%, isolate-backends + +--- + +## 1. 测试方法 + +- **脚本**: `scripts/benchmark_ray_vs_mooncake_two_node.py` +- **拓扑**: DataGenerator 在 70(Put),DataConsumer 在 72(Get),handle 经 Ray 传递 +- **时序**: Put = dict→handle 全流程;Get = handle→dict 全流程;E2E = Put + Get +- **稳定化**: `--isolate-backends` 各 backend 独立进程;warmup=24、discard-first=5、trim=15% 降低 Ray 方差 + +--- + +## 2. 最新结果(稳定版) + +### 2.1 100 MB + +| Backend | Put (ms) | Get (ms) | E2E (ms) | +|---------|----------|----------|----------| +| **ray** | 22.65 ± 3.06 | 41.53 ± 0.31 | 64.39 ± 2.89 | +| **mooncake** | 34.15 ± 0.36 | **14.71 ± 0.38** | **48.89 ± 0.26** | + +- 实际数据量: ~99 MB,30 轮 +- Mooncake E2E 约快 24%,Get 约快 2.8× + +### 2.2 200 MB + +| Backend | Put (ms) | Get (ms) | E2E (ms) | +|---------|----------|----------|----------| +| **ray** | 42.11 ± 3.94 | 82.83 ± 2.62 | 126.13 ± 4.46 | +| **mooncake** | 75.93 ± 3.93 | **29.50 ± 0.53** | **105.51 ± 3.15** | + +- 实际数据量: ~203 MB,30 轮,`--mooncake-segment-size-gb 8` +- Mooncake E2E 约快 16% + +### 2.3 500 MB + +| Backend | Put (ms) | Get (ms) | E2E (ms) | +|---------|----------|----------|----------| +| **ray** | 104.18 ± 9.42 | 208.60 ± 6.15 | 315.39 ± 10.89 | +| **mooncake** | 170.58 ± 1.12 | **83.82 ± 2.31** | **254.92 ± 2.13** | + +- 实际数据量: ~507 MB,30 轮,`--mooncake-segment-size-gb 16` +- Mooncake E2E 约快 19%,Get 约快 2.5× + +### 2.4 1000 MB + +| Backend | Put (ms) | Get (ms) | E2E (ms) | 
+|---------|----------|----------|----------| +| **ray** | 206.07 ± 23.12 | 419.21 ± 10.52 | 627.89 ± 21.03 | +| **mooncake** | 374.37 ± 1.69 | **179.50 ± 1.20** | **553.78 ± 3.91** | + +- 实际数据量: ~1000 MB,8 轮,`--mooncake-segment-size-gb 16` +- Mooncake E2E 约快 12% +- **注意**: segment 需 ≥ (num_rounds + discard_first) × data_size(先 put 完再 get) + +--- + +## 3. 复现命令 + +```bash +export PUT_NODE=192.168.22.70 GET_NODE=192.168.22.72 +export MOONCAKE_MASTER=192.168.22.70:50051 MOONCAKE_PROTOCOL=rdma +export SLIME_UNSAFE_PICKLE=1 + +# 100 MB +python scripts/benchmark_ray_vs_mooncake_two_node.py \ + --put-node $PUT_NODE --get-node $GET_NODE \ + --data-size-mb 100 --num-rounds 30 --warm-up-rounds 24 \ + --backends ray mooncake + +# 200 MB +python scripts/benchmark_ray_vs_mooncake_two_node.py \ + --put-node $PUT_NODE --get-node $GET_NODE \ + --data-size-mb 200 --num-rounds 30 --warm-up-rounds 24 \ + --backends ray mooncake --mooncake-segment-size-gb 8 + +# 500 MB +python scripts/benchmark_ray_vs_mooncake_two_node.py \ + --put-node $PUT_NODE --get-node $GET_NODE \ + --data-size-mb 500 --num-rounds 30 --warm-up-rounds 24 \ + --backends ray mooncake --mooncake-segment-size-gb 16 + +# 1000 MB(segment ≥ 16GB,因 8+5 rounds × 1GB) +python scripts/benchmark_ray_vs_mooncake_two_node.py \ + --put-node $PUT_NODE --get-node $GET_NODE \ + --data-size-mb 1000 --num-rounds 8 --warm-up-rounds 8 --discard-first 3 \ + --backends ray mooncake --mooncake-segment-size-gb 16 +``` + +--- + +## 4. 小结 + +- **100–1000 MB**: Mooncake 双机 E2E 均优于 Ray(约 12%–24%) +- **Ray 方差**: 通过 isolate-backends、warmup=24、discard-first=5、trim=15% 已显著降低 +- **Segment 配置**: segment_size ≥ (num_rounds + discard_first) × data_size(benchmark 先 put 完所有轮次再 get)