-
Notifications
You must be signed in to change notification settings - Fork 173
feat: allow uv-less execution and fingerprint the environment #1491
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,12 @@ | |
| import sys | ||
| from pathlib import Path | ||
|
|
||
| # Configure logging to show file location for warnings | ||
| logging.basicConfig( | ||
| format="%(levelname)s:%(name)s:%(filename)s:%(lineno)d: %(message)s", | ||
| level=logging.WARNING, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will set the level as WARNING for all jobs, is that expected? |
||
| ) | ||
|
|
||
| """ | ||
| This is a work around to ensure whenever NeMo RL is imported, that we | ||
| add Megatron-LM to the python path. This is because the only sub-package | ||
|
|
@@ -49,6 +55,159 @@ | |
| os.environ["RAY_ENABLE_UV_RUN_RUNTIME_ENV"] = "0" | ||
|
|
||
|
|
||
| def _is_build_isolation(): | ||
| """Detect if we're running in a uv build isolation environment. | ||
| When running uv lock/sync, uv creates a temporary isolated environment | ||
| in ~/.cache/uv/builds-v*/ to build packages and introspect metadata. | ||
| We skip the fingerprint check in this context since the user is updating dependencies. | ||
| Returns True if in build isolation, False otherwise. | ||
| """ | ||
| # Check if we're in uv's build isolation directory | ||
| # uv always uses paths like: /root/.cache/uv/builds-v0/.tmp*/ | ||
| return "/builds-v" in sys.prefix | ||
|
|
||
|
|
||
| def _check_container_fingerprint(): | ||
| """Check if container dependencies match the current code (container-only). | ||
| This check only runs when NRL_CONTAINER=1 is set (inside containers). | ||
| It compares the container's fingerprint (computed at build time) with | ||
| the current code's fingerprint to detect dependency drift. | ||
| This check is also skipped entirely if NRL_FORCE_REBUILD_VENVS=true is set, | ||
| since environment rebuilding will ensure dependencies are consistent regardless | ||
| of a mismatch. | ||
| If there's a mismatch, raises RuntimeError unless NRL_IGNORE_VERSION_MISMATCH is set. | ||
| """ | ||
| # Skip check if not in container or if we're going to force venv rebuild anyway | ||
| if not os.environ.get("NRL_CONTAINER"): | ||
| return | ||
| if os.environ.get("NRL_FORCE_REBUILD_VENVS", "").lower() == "true": | ||
| logging.info( | ||
| "Skipping container fingerprint check because NRL_FORCE_REBUILD_VENVS=true (venvs will be rebuilt anyway)" | ||
| ) | ||
| return | ||
|
|
||
| # Skip check if we're in a build isolation environment (e.g., during uv lock/sync) | ||
| if _is_build_isolation(): | ||
| logging.debug( | ||
| "Skipping container fingerprint check because we're in a build isolation environment" | ||
| ) | ||
| return | ||
|
|
||
| try: | ||
| import json | ||
| import runpy | ||
| import sys | ||
| from io import StringIO | ||
|
|
||
| # Get repo root (relative to this module) | ||
| repo_root = Path(__file__).parent.parent | ||
| fingerprint_script = repo_root / "tools" / "generate_fingerprint.py" | ||
|
|
||
| # Check if script exists | ||
| if not fingerprint_script.exists(): | ||
| logging.warning( | ||
| f"Fingerprint script not found at {fingerprint_script}, skipping version check" | ||
| ) | ||
| return | ||
|
|
||
| # Compute current code fingerprint using runpy (cleaner than subprocess) | ||
| old_stdout = sys.stdout | ||
| sys.stdout = captured_output = StringIO() | ||
| try: | ||
| runpy.run_path(str(fingerprint_script), run_name="__main__") | ||
| current_fingerprint_json = captured_output.getvalue().strip() | ||
| finally: | ||
| sys.stdout = old_stdout | ||
|
|
||
| if not current_fingerprint_json: | ||
| logging.warning("Failed to compute code fingerprint: empty output") | ||
| return | ||
|
|
||
| current_fingerprint = json.loads(current_fingerprint_json) | ||
|
|
||
| # Read container fingerprint | ||
| container_fingerprint_file = Path("/opt/nemo_rl_container_fingerprint") | ||
| if not container_fingerprint_file.exists(): | ||
| logging.warning( | ||
| "Container fingerprint file not found, skipping version check" | ||
| ) | ||
| return | ||
|
|
||
| container_fingerprint = json.loads( | ||
| container_fingerprint_file.read_text().strip() | ||
| ) | ||
|
|
||
| # Compare fingerprints and find differences | ||
| all_keys = set(current_fingerprint.keys()) | set(container_fingerprint.keys()) | ||
| differences = [] | ||
|
|
||
| for key in sorted(all_keys): | ||
| current_val = current_fingerprint.get(key, "missing") | ||
| container_val = container_fingerprint.get(key, "missing") | ||
|
|
||
| if current_val != container_val: | ||
| differences.append(f" - {key}:") | ||
| differences.append(f" Container: {container_val}") | ||
| differences.append(f" Current: {current_val}") | ||
|
|
||
| if differences: | ||
| diff_text = "\n".join(differences) | ||
| sep_line = "\n" + ("-" * 80) | ||
| warning_msg = ( | ||
| f"{sep_line}\n" | ||
| "WARNING: Container/Code Version Mismatch Detected!\n" | ||
| f"{sep_line}\n" | ||
| "Your container's dependencies do not match your current code.\n" | ||
| "\n" | ||
| "Differences found:\n" | ||
| f"{diff_text}\n" | ||
| "\n" | ||
| "This can lead to unexpected behavior or errors.\n" | ||
| "\n" | ||
| "Solutions:\n" | ||
| " 1. Rebuild the container to match your code\n" | ||
| " 2. Set NRL_FORCE_REBUILD_VENVS=true to rebuild virtual environments\n" | ||
| " (This forces Ray workers to recreate their venvs with updated dependencies)\n" | ||
| " 3. Update the container fingerprint to match your current code (for local dev):\n" | ||
| " python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint\n" | ||
| " 4. Set NRL_IGNORE_VERSION_MISMATCH=1 to bypass this check (not recommended)\n" | ||
| "\n" | ||
| "Learn more about dependency management:\n" | ||
| " https://github.com/NVIDIA-NeMo/RL/blob/main/docs/design-docs/dependency-management.md\n" | ||
| f"{sep_line}\n" | ||
| ) | ||
|
|
||
| # Check if user wants to ignore the mismatch | ||
| if os.environ.get("NRL_IGNORE_VERSION_MISMATCH"): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should the default be to warn instead of raise?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yea, upon reflection i can see this may be annoying. so basically log is default and the an env var can squelch the warning. wdyt? |
||
| logging.warning( | ||
| warning_msg | ||
| + "Proceeding anyway (NRL_IGNORE_VERSION_MISMATCH is set)..." | ||
| ) | ||
| else: | ||
| raise RuntimeError( | ||
| warning_msg | ||
| + "To proceed anyway, set: export NRL_IGNORE_VERSION_MISMATCH=1" | ||
| ) | ||
| else: | ||
| logging.debug("Container fingerprint matches code fingerprint") | ||
|
|
||
| except RuntimeError: | ||
| # Re-raise RuntimeError for version mismatches (user should see this) | ||
| raise | ||
| except Exception as e: | ||
| # Log other errors but don't crash on version check failures | ||
| logging.debug(f"Version check failed (non-fatal): {e}") | ||
|
|
||
|
|
||
| # Perform container version check | ||
| _check_container_fingerprint() | ||
|
|
||
|
|
||
| def _patch_nsight_file(): | ||
| """Patch the nsight.py file to fix the context.py_executable assignment. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,9 @@ | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| import os | ||
| import sys | ||
| from pathlib import Path | ||
|
|
||
| from nemo_rl.distributed.ray_actor_environment_registry import ( | ||
| ACTOR_ENVIRONMENT_REGISTRY, | ||
|
|
@@ -52,6 +54,100 @@ def prefetch_venvs(): | |
|
|
||
| print("\nVenv prefetching complete!") | ||
|
|
||
| # Create convenience python wrapper scripts for frozen environment support (container-only) | ||
| create_frozen_environment_symlinks(venv_configs) | ||
|
|
||
|
|
||
| def create_frozen_environment_symlinks(venv_configs): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC the executable is frozen, but the environment can still be modified by someone manually if they want?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| """Create python-{ClassName} wrapper scripts in /usr/local/bin for frozen environment support. | ||
|
|
||
| Only runs in container (when NRL_CONTAINER=1 is set). | ||
|
|
||
| Args: | ||
| venv_configs: Dictionary mapping py_executable to list of actor FQNs | ||
| """ | ||
| # Only create wrapper scripts in container | ||
| if not os.environ.get("NRL_CONTAINER"): | ||
| print( | ||
| "\nSkipping frozen environment wrapper script creation (not in container)" | ||
| ) | ||
| return | ||
|
|
||
| print("\nCreating frozen environment wrapper scripts...") | ||
|
|
||
| # Collect all wrapper mappings: class_name -> venv_path | ||
| wrapper_mappings = {} | ||
|
|
||
| for py_executable, actor_fqns in venv_configs.items(): | ||
| for actor_fqn in actor_fqns: | ||
| # Extract class name from FQN (last part) | ||
| # e.g., "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker" -> "MegatronPolicyWorker" | ||
| class_name = actor_fqn.split(".")[-1] | ||
|
|
||
| # Get the venv path that was created | ||
| try: | ||
| python_path = create_local_venv(py_executable, actor_fqn) | ||
|
|
||
| # Check for collisions | ||
| if class_name in wrapper_mappings: | ||
| existing_path = wrapper_mappings[class_name] | ||
| if existing_path != python_path: | ||
| raise RuntimeError( | ||
| f"Collision detected: Multiple venvs want to use name '{class_name}'\n" | ||
| f" Existing: {existing_path}\n" | ||
| f" New: {python_path}\n" | ||
| f"This indicates two different worker classes have the same name." | ||
| ) | ||
| else: | ||
| wrapper_mappings[class_name] = python_path | ||
| except Exception as e: | ||
| print(f" Warning: Could not get venv path for {actor_fqn}: {e}") | ||
| continue | ||
|
|
||
| # Create wrapper scripts | ||
| wrapper_dir = Path("/usr/local/bin") | ||
| created_wrappers = [] | ||
|
|
||
| for class_name, python_path in sorted(wrapper_mappings.items()): | ||
| wrapper_name = f"python-{class_name}" | ||
| wrapper_path = wrapper_dir / wrapper_name | ||
|
|
||
| # Get the venv directory path (parent of bin/python) | ||
| venv_path = Path(python_path).parent.parent | ||
|
|
||
| # Create wrapper script content | ||
| wrapper_content = f"""#!/bin/bash | ||
| VENV_PATH="{venv_path}" | ||
| export VIRTUAL_ENV="$VENV_PATH" | ||
| export PATH="$VENV_PATH/bin:$PATH" | ||
| exec "$VENV_PATH/bin/python" "$@" | ||
| """ | ||
|
|
||
| try: | ||
| # Remove existing wrapper if present | ||
| if wrapper_path.exists() or wrapper_path.is_symlink(): | ||
| wrapper_path.unlink() | ||
|
|
||
| # Write wrapper script | ||
| wrapper_path.write_text(wrapper_content) | ||
|
|
||
| # Make executable | ||
| wrapper_path.chmod(0o755) | ||
|
|
||
| created_wrappers.append(wrapper_name) | ||
| print(f" Created: {wrapper_name} -> {python_path}") | ||
| except Exception as e: | ||
| print(f" Warning: Could not create wrapper script {wrapper_name}: {e}") | ||
| continue | ||
|
|
||
| if created_wrappers: | ||
| print(f"\nCreated {len(created_wrappers)} frozen environment wrapper scripts") | ||
| print("Users can now use these python executables directly:") | ||
| for name in created_wrappers: | ||
| print(f" - {name}") | ||
| else: | ||
| print("\nNo frozen environment wrapper scripts were created") | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| prefetch_venvs() | ||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a way to override this to allow local context? i.e. I want build a custom docker image with code on my local machine that is not yet committed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yup, it's on line 6
is that what you are looking for? it basically hot swaps the build context as the layer