
Commit ea76678

garylvov and glvov-bdai authored
Clarifies Ray Documentation and Fixes Minor Issues (#1717)

# Description

This PR cleans up the Ray documentation to be clearer and fixes some small issues in the code.

- Moved local setup to an easier Docker-based workflow
- Added wget to the Docker container to fix an issue where the Ray head would fail its health check on GKE, leaving the workers unable to start
- Removed redundant information from the documentation
- Added a local quickstart
- Investigated whether HTTPS MLflow was possible; added the autoescape flag to the Jinja environment (@kellyguo11)
- Added better compatibility with other workflows to address #1703
- Avoided an early exit due to buffer overflow to address #1703 (thank you @giulioturrisi for helping find this)

## Type of change

- Bug fix (non-breaking change which fixes an issue)
- This change requires a documentation update

![image](https://github.com/user-attachments/assets/eb38b3c8-8e9c-438d-9218-8b0662146f96)

## Checklist

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format`
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my feature works
- [x] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file
- [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there

Signed-off-by: garylvov <67614381+garylvov@users.noreply.github.com>
Co-authored-by: Gary Lvov <glvov@theaiinstitute.com>
1 parent: 21173c3 · commit: ea76678

File tree: 12 files changed (+297 −230 lines)

docs/source/features/ray.rst

167 additions, 145 deletions (large diff not rendered by default)

source/standalone/workflows/ray/cluster_configs/Dockerfile

4 additions, 0 deletions:

```diff
@@ -1,5 +1,9 @@
 FROM isaac-lab-base:latest
 
+# WGet is needed so that GCS or other cloud providers can mark the container as ready.
+# Otherwise the Ray liveliness checks fail.
+RUN apt-get update && apt-get install wget
+
 # Set NVIDIA paths
 ENV PATH="/usr/local/nvidia/bin:$PATH"
 ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64"
```
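The wget dependency exists so the cloud provider's readiness/liveness probe can query the Ray head over HTTP. As a rough, hedged illustration of what such a probe reduces to — the URL is an assumption based on the `dashboard-port: "8265"` set in the cluster config below, not the actual probe GKE runs:

```python
# Hedged sketch: approximates a wget-style liveness probe against the Ray head.
# The URL is an assumption (dashboard-port 8265 from kuberay.yaml.jinja below).
import sys
import urllib.request


def ray_head_answers_http(url: str = "http://localhost:8265", timeout: float = 2.0) -> bool:
    """Return True if the Ray head responds, mimicking `wget -q -O- <url>`."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.status == 200
    except OSError:
        return False


if __name__ == "__main__":
    sys.exit(0 if ray_head_answers_http() else 1)
```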

source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja

2 additions, 3 deletions:

```diff
@@ -19,7 +19,6 @@ spec:
     block: "true"
     dashboard-host: 0.0.0.0
     dashboard-port: "8265"
-    node-ip-address: "0.0.0.0"
     port: "6379"
     include-dashboard: "true"
     ray-debugger-external: "true"
@@ -30,7 +29,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: head
+  name: {{ name }}-head
 spec:
   type: LoadBalancer
 template:
@@ -130,7 +129,7 @@ spec:
     volumeMounts:
       - mountPath: /tmp/ray
         name: ray-logs
-    command: ["/bin/bash", "-c", "ray start --address=head.{{ namespace }}.svc.cluster.local:6379 && tail -f /dev/null"]
+    command: ["/bin/bash", "-c", "ray start --address={{name}}-head.{{ namespace }}.svc.cluster.local:6379 && tail -f /dev/null"]
     - image: fluent/fluent-bit:1.9.6
       name: fluentbit
     resources:
```
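Renaming the head Service from a fixed `head` to `{{ name }}-head` (and pointing workers at the matching DNS name) lets several clusters coexist in one namespace without Service-name collisions. A minimal, hedged sketch of how the two changed lines render; the inline template below only mirrors those lines, and the `name`/`namespace` values are made up:

```python
# Hedged sketch: reproduces only the two changed template lines, not the full
# kuberay.yaml.jinja (which takes more variables, e.g. image and worker counts).
from jinja2 import Environment

env = Environment(keep_trailing_newline=True, autoescape=True)
template = env.from_string(
    "name: {{ name }}-head\n"
    "command: ray start --address={{ name }}-head.{{ namespace }}.svc.cluster.local:6379\n"
)
print(template.render(name="isaacray-0", namespace="default"))
# name: isaacray-0-head
# command: ray start --address=isaacray-0-head.default.svc.cluster.local:6379
```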

source/standalone/workflows/ray/grok_cluster_with_kubectl.py

8 additions, 8 deletions:

```diff
@@ -21,7 +21,7 @@
 
 .. code-block:: bash
 
-    ./isaaclab.sh -p source/standalone/workflows/ray/grok_cluster_with_kubectl.py
+    python3 source/standalone/workflows/ray/grok_cluster_with_kubectl.py
     # For options, supply -h arg
 """
 
@@ -67,9 +67,10 @@ def get_clusters(pods: list, cluster_name_prefix: str) -> set:
 
         match = re.match(r"(" + re.escape(cluster_name_prefix) + r"[-\w]+)", pod_name)
         if match:
-            # Get base name without head/worker suffix
-            base_name = match.group(1).split("-head")[0].split("-worker")[0]
-            clusters.add(base_name)
+            # Get base name without head/worker suffix (skip workers)
+            if "head" in pod_name:
+                base_name = match.group(1).split("-head")[0]
+                clusters.add(base_name)
     return sorted(clusters)
 
 
@@ -90,9 +91,7 @@ def get_mlflow_info(namespace: str = None, cluster_prefix: str = "isaacray") ->
     clusters = get_clusters(pods=pods, cluster_name_prefix=cluster_prefix)
     if len(clusters) > 1:
         raise ValueError("More than one cluster matches prefix, could not automatically determine mlflow info.")
-
-    base_name = cluster_prefix.split("-head")[0].split("-worker")[0]
-    mlflow_name = f"{base_name}-mlflow"
+    mlflow_name = f"{cluster_prefix}-mlflow"
 
     cmd = ["kubectl", "get", "svc", mlflow_name, "-n", namespace, "--no-headers"]
     try:
@@ -102,7 +101,8 @@ def get_mlflow_info(namespace: str = None, cluster_prefix: str = "isaacray") ->
         # Get cluster IP
         cluster_ip = fields[2]
         port = "5000"  # Default MLflow port
-
+        # This needs to be http to be resolved. HTTPS can't be resolved
+        # This should be fine as it is on a subnet on the cluster regardless
         return f"http://{cluster_ip}:{port}"
     except subprocess.CalledProcessError as e:
         raise ValueError(f"Could not grok MLflow: {e}")  # Fixed f-string
```

source/standalone/workflows/ray/launch.py

6 additions, 6 deletions:

```diff
@@ -8,29 +8,28 @@
 import subprocess
 import yaml
 
+import util
 from jinja2 import Environment, FileSystemLoader
 from kubernetes import config
 
-import source.standalone.workflows.ray.util as util
-
 """This script helps create one or more KubeRay clusters.
 
 Usage:
 
 .. code-block:: bash
     # If the head node is stuck on container creating, make sure to create a secret
-    ./isaaclab.sh -p source/standalone/workflows/ray/launch.py -h
+    python3 source/standalone/workflows/ray/launch.py -h
 
     # Examples
 
     # The following creates 8 GPUx1 nvidia l4 workers
-    ./isaaclab.sh -p source/standalone/workflows/ray/launch.py --cluster_host google_cloud \
+    python3 source/standalone/workflows/ray/launch.py --cluster_host google_cloud \
        --namespace <NAMESPACE> --image <YOUR_ISAAC_RAY_IMAGE> \
        --num_workers 8 --num_clusters 1 --worker_accelerator nvidia-l4 --gpu_per_worker 1
 
    # The following creates 1 GPUx1 nvidia l4 worker, 2 GPUx2 nvidia-tesla-t4 workers,
    # and 2 GPUx4 nvidia-tesla-t4 GPU workers
-    ./isaaclab.sh -p source/standalone/workflows/ray/launch.py --cluster_host google_cloud \
+    python3 source/standalone/workflows/ray/launch.py --cluster_host google_cloud \
        --namespace <NAMESPACE> --image <YOUR_ISAAC_RAY_IMAGE> \
        --num_workers 1 2 --num_clusters 1 \
        --worker_accelerator nvidia-l4 nvidia-tesla-t4 --gpu_per_worker 1 2 4
@@ -53,7 +52,7 @@ def apply_manifest(args: argparse.Namespace) -> None:
     # Set up Jinja2 environment for loading templates
     templates_dir = RAY_DIR / "cluster_configs" / args.cluster_host
     file_loader = FileSystemLoader(str(templates_dir))
-    jinja_env = Environment(loader=file_loader, keep_trailing_newline=True)
+    jinja_env = Environment(loader=file_loader, keep_trailing_newline=True, autoescape=True)
 
     # Define template filename
     template_file = "kuberay.yaml.jinja"
@@ -79,6 +78,7 @@ def apply_manifest(args: argparse.Namespace) -> None:
 
     # Apply the Kubernetes manifest using kubectl
     try:
+        print(cleaned_yaml_string)
         subprocess.run(["kubectl", "apply", "-f", "-"], input=cleaned_yaml_string, text=True, check=True)
     except subprocess.CalledProcessError as e:
         exit(f"An error occurred while running `kubectl`: {e}")
```

source/standalone/workflows/ray/submit_job.py

4 additions, 4 deletions:

```diff
@@ -40,16 +40,16 @@
 .. code-block:: bash
 
     # Example; submitting a tuning job
-    ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py \
+    python3 source/standalone/workflows/ray/submit_job.py \
         --aggregate_jobs /workspace/isaaclab/source/standalone/workflows/ray/tuner.py \
         --cfg_file hyperparameter_tuning/vision_cartpole_cfg.py \
-        --cfg_class CartpoleRGBNoTuneJobCfg --mlflow_uri <ML_FLOW_URI>
+        --cfg_class CartpoleTheiaJobCfg --mlflow_uri <ML_FLOW_URI>
 
     # Example: Submitting resource wrapped job
-    ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py --aggregate_jobs wrap_resources.py --sub_jobs ./isaaclab.sh -p source/standalone/workflows/rl_games/train.py --task Isaac-Cartpole-v0 --headless+./isaaclab.sh -p source/standalone/workflows/rl_games/train.py --task Isaac-Cartpole-RGB-Camera-Direct-v0 --headless --enable_cameras agent.params.config.max_epochs=150
+    python3 source/standalone/workflows/ray/submit_job.py --aggregate_jobs wrap_resources.py --test
 
     # For all command line arguments
-    ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py -h
+    python3 source/standalone/workflows/ray/submit_job.py -h
 """
 script_directory = os.path.dirname(os.path.abspath(__file__))
 CONFIG = {"working_dir": script_directory, "executable": "/workspace/isaaclab/isaaclab.sh -p"}
```
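`CONFIG` pairs a `working_dir` with an executable prefix, which is the same shape Ray's job-submission runtime environment expects. For orientation only, a hedged sketch using Ray's public `JobSubmissionClient`; the address and entrypoint are placeholders, and this is not necessarily how `submit_job.py` is implemented internally:

```python
# Hedged sketch: Ray's public job submission API, for orientation only.
# <RAY_HEAD_IP> is a placeholder; the entrypoint mirrors the docstring example.
from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://<RAY_HEAD_IP>:8265")  # dashboard port from the cluster config
job_id = client.submit_job(
    entrypoint=(
        "/workspace/isaaclab/isaaclab.sh -p tuner.py "
        "--cfg_file hyperparameter_tuning/vision_cartpole_cfg.py --cfg_class CartpoleTheiaJobCfg"
    ),
    runtime_env={"working_dir": "source/standalone/workflows/ray"},
)
print(job_id)
```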

source/standalone/workflows/ray/tuner.py

27 additions, 13 deletions:

```diff
@@ -17,8 +17,9 @@
 """
 This script breaks down an aggregate tuning job, as defined by a hyperparameter sweep configuration,
 into individual jobs (shell commands) to run on the GPU-enabled nodes of the cluster.
-By default, (unless combined as a sub-job in a resource-wrapped aggregate job), one worker is created
-for each GPU-enabled node in the cluster for each individual job.
+By default, one worker is created for each GPU-enabled node in the cluster for each individual job.
+To use more than one worker per node (likely the case for multi-GPU machines), supply the
+num_workers_per_node argument.
 
 Each hyperparameter sweep configuration should include the workflow,
 runner arguments, and hydra arguments to vary.
@@ -39,16 +40,15 @@
     ./isaaclab.sh -p source/standalone/workflows/ray/tuner.py -h
 
     # Examples
-    # Local (not within a docker container, when within a local docker container, do not supply run_mode argument)
+    # Local
     ./isaaclab.sh -p source/standalone/workflows/ray/tuner.py --run_mode local \
     --cfg_file source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py \
-    --cfg_class CartpoleRGBNoTuneJobCfg
-    # Local docker: start the ray server and run above command in the same running container without run_mode arg
+    --cfg_class CartpoleTheiaJobCfg
     # Remote (run grok cluster or create config file mentioned in :file:`submit_job.py`)
     ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py \
     --aggregate_jobs tuner.py \
     --cfg_file hyperparameter_tuning/vision_cartpole_cfg.py \
-    --cfg_class CartpoleRGBNoTuneJobCfg --mlflow_uri <MLFLOW_URI_FROM_GROK_OR_MANUAL>
+    --cfg_class CartpoleTheiaJobCfg --mlflow_uri <MLFLOW_URI_FROM_GROK_OR_MANUAL>
 
 """
@@ -74,7 +74,7 @@ def setup(self, config: dict) -> None:
         print(f"[INFO]: Recovered invocation with {self.invoke_cmd}")
         self.experiment = None
 
-    def reset_config(self, new_config):
+    def reset_config(self, new_config: dict):
         """Allow environments to be re-used by fetching a new invocation command"""
         self.setup(new_config)
         return True
@@ -95,15 +95,15 @@ def step(self) -> dict:
             self.proc = experiment["proc"]
             self.experiment_name = experiment["experiment_name"]
             self.isaac_logdir = experiment["logdir"]
-            self.tensorboard_logdir = self.isaac_logdir + f"/{self.experiment_name}/summaries"
+            self.tensorboard_logdir = self.isaac_logdir + "/" + self.experiment_name
             self.done = False
 
         if self.proc is None:
             raise ValueError("Could not start trial.")
-
-        if self.proc.poll() is not None:  # process finished, signal finish
+        proc_status = self.proc.poll()
+        if proc_status is not None:  # process finished, signal finish
             self.data["done"] = True
-            print("[INFO]: Process finished, returning...")
+            print(f"[INFO]: Process finished with {proc_status}, returning...")
         else:  # wait until the logs are ready or fresh
             data = util.load_tensorboard_logs(self.tensorboard_logdir)
 
@@ -220,10 +220,24 @@ class JobCfg:
     """To be compatible with :meth: invoke_tuning_run and :class:IsaacLabTuneTrainable,
     at a minimum, the tune job should inherit from this class."""
 
-    def __init__(self, cfg):
+    def __init__(self, cfg: dict):
+        """
+        Runner args include command line arguments passed to the task.
+        For example:
+        cfg["runner_args"]["headless_singleton"] = "--headless"
+        cfg["runner_args"]["enable_cameras_singleton"] = "--enable_cameras"
+        """
         assert "runner_args" in cfg, "No runner arguments specified."
+        """
+        Task is the desired task to train on. For example:
+        cfg["runner_args"]["--task"] = tune.choice(["Isaac-Cartpole-RGB-TheiaTiny-v0"])
+        """
         assert "--task" in cfg["runner_args"], "No task specified."
-        assert "hydra_args" in cfg, "No hypeparameters specified."
+        """
+        Hydra args define the hyperparameters varied within the sweep. For example:
+        cfg["hydra_args"]["agent.params.network.cnn.activation"] = tune.choice(["relu", "elu"])
+        """
+        assert "hydra_args" in cfg, "No hyperparameters specified."
         self.cfg = cfg
```
source/standalone/workflows/ray/util.py

69 additions, 47 deletions:

```diff
@@ -6,35 +6,42 @@
 import os
 import re
 import subprocess
+import threading
 from datetime import datetime
 from math import isclose
 
 import ray
+from tensorboard.backend.event_processing.directory_watcher import DirectoryDeletedError
 from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
 
 
 def load_tensorboard_logs(directory: str) -> dict:
-    """From a tensorboard directory, get the latest scalar values.
+    """From a tensorboard directory, get the latest scalar values. If the logs can't be
+    found, check the summaries sublevel.
 
     Args:
         directory: The directory of the tensorboard logging.
 
     Returns:
         The latest available scalar values.
     """
+
     # Initialize the event accumulator with a size guidance for only the latest entry
-    size_guidance = {"scalars": 1}  # Load only the latest entry for scalars
-    event_acc = EventAccumulator(directory, size_guidance=size_guidance)
-    event_acc.Reload()  # Load all data from the directory
+    def get_latest_scalars(path: str) -> dict:
+        event_acc = EventAccumulator(path, size_guidance={"scalars": 1})
+        try:
+            event_acc.Reload()
+            if event_acc.Tags()["scalars"]:
+                return {
+                    tag: event_acc.Scalars(tag)[-1].value
+                    for tag in event_acc.Tags()["scalars"]
+                    if event_acc.Scalars(tag)
+                }
+        except (KeyError, OSError, RuntimeError, DirectoryDeletedError):
+            return {}
 
-    # Extract the latest scalars logged
-    latest_scalars = {}
-    for tag in event_acc.Tags()["scalars"]:
-        events = event_acc.Scalars(tag)
-        if events:  # Check if there is at least one entry
-            latest_event = events[-1]  # Get the latest entry
-            latest_scalars[tag] = latest_event.value
-    return latest_scalars
+    scalars = get_latest_scalars(directory)
+    return scalars or get_latest_scalars(os.path.join(directory, "summaries"))
 
 
 def get_invocation_command_from_cfg(
```
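With this change, a missing or not-yet-written log directory yields an empty dict instead of an exception, and the loader retries one level down in `summaries` (where some workflows place their event files). A short, hedged usage sketch; the log path is hypothetical:

```python
# Hedged sketch: how a caller such as tuner.py can consume the new behavior.
# The log directory below is hypothetical.
from util import load_tensorboard_logs

scalars = load_tensorboard_logs("/workspace/isaaclab/logs/rl_games/cartpole_direct")
if scalars:
    for tag, value in scalars.items():
        print(f"{tag}: {value}")
else:
    print("Logs not ready yet; poll again on the next step.")
```

The second hunk of util.py, below, reworks `execute_job`'s stream handling: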
```diff
@@ -190,47 +197,62 @@ def execute_job(
     experiment_info_pattern = re.compile("Exact experiment name requested from command line: (.+)")
     logdir_pattern = re.compile(r"\[INFO\] Logging experiment in directory: (.+)$")
     err_pattern = re.compile("There was an error (.+)$")
-    with process.stdout as stdout:
-        for line in iter(stdout.readline, ""):
+
+    def stream_reader(stream, identifier_string, result_details):
+        for line in iter(stream.readline, ""):
             line = line.strip()
-            result_details.append(f"{identifier_string}: {line} \n")
+            result_details.append(f"{identifier_string}: {line}\n")
             if log_all_output:
                 print(f"{identifier_string}: {line}")
 
-            if extract_experiment:
-                exp_match = experiment_info_pattern.search(line)
-                log_match = logdir_pattern.search(line)
-                err_match = err_pattern.search(line)
-                if err_match:
-                    raise ValueError(f"Encountered an error during trial run. {' '.join(result_details)}")
-
-                if exp_match:
-                    experiment_name = exp_match.group(1)
-                if log_match:
-                    logdir = log_match.group(1)
-
-                if experiment_name and logdir:
-                    result = {
-                        "experiment_name": experiment_name,
-                        "logdir": logdir,
-                        "proc": process,
-                        "result": " ".join(result_details),
-                    }
-                    return result
-
-    with process.stderr as stderr:
-        for line in iter(stderr.readline, ""):
-            line = line.strip()
-            result_details.append(f"{identifier_string}: {line}")
+    # Read stdout until we find experiment info
+    # Do some careful handling prevent overflowing the pipe reading buffer with error 141
+    for line in iter(process.stdout.readline, ""):
+        line = line.strip()
+        result_details.append(f"{identifier_string}: {line} \n")
+        if log_all_output:
             print(f"{identifier_string}: {line}")
 
-    process.wait()  # Wait for the subprocess to finish naturally if not exited early
-
-    now = datetime.now().strftime("%H:%M:%S.%f")
-    completion_info = f"\n[INFO]: {identifier_string}: Job Started at {start_time}, completed at {now}\n"
-    print(completion_info)
-    result_details.append(completion_info)
-    return " ".join(result_details)
+        if extract_experiment:
+            exp_match = experiment_info_pattern.search(line)
+            log_match = logdir_pattern.search(line)
+            err_match = err_pattern.search(line)
+
+            if err_match:
+                raise ValueError(f"Encountered an error during trial run. {' '.join(result_details)}")
+
+            if exp_match:
+                experiment_name = exp_match.group(1)
+            if log_match:
+                logdir = log_match.group(1)
+
+            if experiment_name and logdir:
+                # Start stderr reader after finding experiment info
+                stderr_thread = threading.Thread(
+                    target=stream_reader, args=(process.stderr, identifier_string, result_details)
+                )
+                stderr_thread.daemon = True
+                stderr_thread.start()
+
+                # Start stdout reader to continue reading to flush buffer
+                stdout_thread = threading.Thread(
+                    target=stream_reader, args=(process.stdout, identifier_string, result_details)
+                )
+                stdout_thread.daemon = True
+                stdout_thread.start()
+
+                return {
+                    "experiment_name": experiment_name,
+                    "logdir": logdir,
+                    "proc": process,
+                    "result": " ".join(result_details),
+                }
+    process.wait()
+    now = datetime.now().strftime("%H:%M:%S.%f")
+    completion_info = f"\n[INFO]: {identifier_string}: Job Started at {start_time}, completed at {now}\n"
+    print(completion_info)
+    result_details.append(completion_info)
+    return " ".join(result_details)
 
 
 def get_gpu_node_resources(
```
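This restructuring is the heart of the #1703 fix: `execute_job` now returns as soon as the experiment metadata appears, while daemon threads keep draining stdout and stderr. Without a reader, the child keeps writing into a full OS pipe buffer and exits early (the error 141 the comment mentions). A standalone, hedged sketch of the drain-in-a-daemon-thread pattern; the subprocess command is a stand-in:

```python
# Hedged sketch of the pipe-draining pattern util.py now uses.
# The command is a stand-in for a long-running, chatty training job.
import subprocess
import threading


def drain(stream, sink: list) -> None:
    """Keep reading so the OS pipe buffer never fills and blocks the child."""
    for line in iter(stream.readline, ""):
        sink.append(line.rstrip())


process = subprocess.Popen(
    ["python3", "-c", "print('x' * 100000)"],  # stand-in workload
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
captured: list[str] = []
for stream in (process.stdout, process.stderr):
    threading.Thread(target=drain, args=(stream, captured), daemon=True).start()

# The caller can return or poll the process without deadlocking on full pipes.
process.wait()
```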
