def is_port_available(port):
    """Return whether ``port`` can currently be bound for listening.

    The port is probed by binding and listening on the IPv4 wildcard
    address and, when the host has a usable IPv6 stack, on the IPv6
    wildcard address as well. ``SO_REUSEADDR`` is deliberately NOT set, so a
    port that is already bound by another socket is reported as busy.

    Hosts without IPv6 (support compiled out, or disabled at runtime) would
    otherwise fail the IPv6 probe for *every* port; in that case the IPv6
    probe is skipped and only the IPv4 result counts.

    Args:
        port: TCP port number to probe.

    Returns:
        True if every applicable probe succeeded, False if the port is in
        use, cannot be bound, or is outside the valid 0-65535 range.
    """
    import errno  # local import: not known to be imported at module level

    # errno values meaning "IPv6 is not usable on this host" as opposed to
    # "this particular port is taken".
    ipv6_unusable = (errno.EAFNOSUPPORT, errno.EADDRNOTAVAIL)

    probes = [(socket.AF_INET, "")]
    if socket.has_ipv6:
        probes.append((socket.AF_INET6, "::"))

    for family, host in probes:
        try:
            with socket.socket(family, socket.SOCK_STREAM) as sock:
                sock.bind((host, port))
                sock.listen(1)
        except OverflowError:
            # bind() raises OverflowError for ports outside 0-65535.
            return False
        except OSError as exc:
            if family == socket.AF_INET6 and exc.errno in ipv6_unusable:
                # IPv6 is disabled on this host; the IPv4 probe decides.
                continue
            return False
    return True