Commit
[fine_tuning] ilab: correct throughput and new ilab image (#635)
openshift-merge-bot[bot] authored Jan 21, 2025
2 parents 7a521a2 + bcea53f commit 82a15ab
Showing 10 changed files with 152 additions and 12 deletions.
5 changes: 5 additions & 0 deletions docs/toolbox.generated/Fine_Tuning.run_fine_tuning_job.rst
@@ -108,6 +108,11 @@ Parameters
* If True, sets the 'limits' of the job with the same value as the request.


``shared_memory``

* Amount of shm (in GB) to give to each of the job pods


``prepare_only``

* If True, only prepare the environment but do not run the fine-tuning job.
69 changes: 64 additions & 5 deletions projects/fine_tuning/testing/config.yaml
@@ -364,21 +364,24 @@ ci_presets:
tests.fine_tuning.ilab.enabled: true
tests.fine_tuning.test_settings.name: ilab
tests.fine_tuning.test_settings.dataset_name: ilab_skills_data.jsonl
- tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3b-code-instruct@hf
+ tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_replication: null
matbench.workload: projects.fine_tuning.visualizations.ilab_training
matbench.prom_workload: projects.fine_tuning.visualizations.ilab_prom
matbench.config_file: ilab_training.yaml
matbench.lts.generate: false
tests.fine_tuning.test_settings.shared_memory: 20
tests.fine_tuning.test_settings.hyper_parameters:
num_epochs: null
max_batch_len: null
NCCL_SOCKET_NTHREADS: null
cpu_offload_optimizer: null
cpu_offload_params: null

ilab_scale:
extends: [ilab]

- tests.fine_tuning.test_settings.model_name: ibm-granite/granite-7b-base@hf
+ tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: [ilab_large_10000samples_skills_data.jsonl, ilab_large_knowledge_data.jsonl]

tests.fine_tuning.test_settings.pod_count: [1, 2, 4]
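
When a test_settings entry holds a list, as dataset_name and pod_count do above, the matbenchmarking layer (enabled via tests.fine_tuning.matbenchmarking.enabled in the presets below) expands it into one fine-tuning run per combination. A minimal sketch of that expansion, assuming a plain cross-product; the dict and loop below are illustrative, not the framework's internal API:

import itertools

# List-valued settings taken from the ilab_scale preset above.
settings = {
    "dataset_name": ["ilab_large_10000samples_skills_data.jsonl",
                     "ilab_large_knowledge_data.jsonl"],
    "pod_count": [1, 2, 4],
}

# One benchmark run per combination: 2 datasets x 3 pod counts = 6 runs.
for values in itertools.product(*settings.values()):
    print(dict(zip(settings.keys(), values)))
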
@@ -440,25 +443,79 @@ ci_presets:
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1
tests.fine_tuning.node_count_equal_pod_count: true

ilab_2x8xh100_secondary_nic:
extends: [ilab]

tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false

tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl

tests.fine_tuning.test_settings.gpu: 8
tests.fine_tuning.test_settings.pod_count: 2

tests.fine_tuning.test_settings.secondary_nic_prefix: "network-port-"
tests.fine_tuning.test_settings.secondary_nic_count: [1, 2, 4, 6, 8]

# tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1

ilab_2x8xh100_scale:
extends: [ilab]

tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false

- tests.fine_tuning.test_settings.model_name: ibm-granite/granite-7b-base@hf
+ tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl

tests.fine_tuning.test_settings.gpu: 8
tests.fine_tuning.test_settings.pod_count: 2

tests.fine_tuning.test_settings.secondary_nic_prefix: "subnet-port-"
- tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4]
+ tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4, 2, 1]

# tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1

ilab_2x8xh100_scale_rdma:
extends: [ilab]

tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false

tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl

tests.fine_tuning.test_settings.gpu: 8
tests.fine_tuning.test_settings.pod_count: 2

tests.fine_tuning.test_settings.secondary_nic_prefix: "subnet-rdma-port-"
tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4, 2, 1]

# tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1

ilab_1x8xh100_scale:
extends: [ilab]

tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false

tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl

tests.fine_tuning.test_settings.gpu: 8
tests.fine_tuning.test_settings.pod_count: 1

tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [20000, 60000]
# tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1

# ---

cluster_instructlab:
@@ -696,6 +753,7 @@ tests:
model_name: bigscience/bloom-560m@hf
dataset_name: twitter_complaints_small.json
gpu: 1
shared_memory: null
dataset_replication: 1
container_image: null
use_secondary_nic: null
@@ -724,7 +782,8 @@ tests:
image: quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26
ilab:
enabled: false
- image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e
+ image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:525ab53de3829cac1a9aabb73194f49e22da8fdcf12a01c56ece961300cdab0d
+ # instructlab 1.3
matbench:
preset: null
workload: projects.fine_tuning.visualizations.fine_tuning # actual workload must be a symlink to this dir
4 changes: 2 additions & 2 deletions projects/fine_tuning/toolbox/fine_tuning.py
@@ -34,7 +34,7 @@ def run_fine_tuning_job(
memory=10,
cpu=1,
request_equals_limits=False,
shared_memory=None,
prepare_only=False,
delete_other=False,

@@ -72,7 +72,7 @@ def run_fine_tuning_job(
memory: the number of RAM gigs to request for the fine-tuning job (in Gigs)
cpu: the number of CPU cores to request for the fine-tuning job (in cores)
request_equals_limits: if True, sets the 'limits' of the job with the same value as the request.
shared_memory: amount of shm (in GB) to give to each of the job pods
prepare_only: if True, only prepare the environment but do not run the fine-tuning job.
delete_other: if True, delete the other PyTorchJobs before running
@@ -57,6 +57,9 @@ fine_tuning_run_fine_tuning_job_cpu: 1
# if True, sets the 'limits' of the job with the same value as the request.
fine_tuning_run_fine_tuning_job_request_equals_limits: false

# amount of shm (in GB) to give to each of the job pods
fine_tuning_run_fine_tuning_job_shared_memory: null

# if True, only prepare the environment but do not run the fine-tuning job.
fine_tuning_run_fine_tuning_job_prepare_only: false

@@ -18,10 +18,34 @@ export TRITON_DUMP_DIR=$TRITON_HOME
export TRITON_CACHE_DIR=$TRITON_HOME
export TRITON_OVERRIDE_DIR=$TRITON_HOME


mkdir -p "$CACHE_DIR"

if [[ "${WITH_RDMA:-}" ]]; then
export NCCL_TOPO_FILE=/mnt/storage/topo.xml
num_rdma=$(ls /sys/class/infiniband/ | wc -l)
IFS=',' read -ra ADDR <<< "$NCCL_SOCKET_IFNAME" # Split by comma
length=${#ADDR[@]} # Get the length (number of elements in the array)
echo "Length of NCCL_SOCKET_IFNAME: $length"
NCCL_IB_HCA=''
for idx in $(seq $((num_rdma-1)) -1 $((num_rdma-length))); do
# Append the value to the NCCL_IB_HCA string
if [ -z "$NCCL_IB_HCA" ]; then
NCCL_IB_HCA="mlx5_$idx" # Initialize the string with the first value
else
NCCL_IB_HCA="$NCCL_IB_HCA,mlx5_$idx" # Append the next value with a comma
fi
done
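# Example with assumed values: num_rdma=8 and NCCL_SOCKET_IFNAME=net1,net2,net3
# make the loop above walk idx 7,6,5, yielding NCCL_IB_HCA=mlx5_7,mlx5_6,mlx5_5
# (the last $length HCAs, counted down from the highest device index).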
export NCCL_IB_HCA="$NCCL_IB_HCA"
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
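# GID index 3 commonly maps to the RoCE v2 / IPv4 entry on Mellanox NICs (an assumption about this cluster's GID table).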
export NCCL_DEBUG=info
echo "Using $length SR-IOV NIC’s with rdma"
fi

if [[ "${NCCL_SOCKET_IFNAME:-}" ]]; then


MAPPING="$(cat /mnt/nic-mapping/nodename_ip_mapping.yaml)"
for ifname in $(echo $NCCL_SOCKET_IFNAME | tr , " "); do
current_ip=$(ip route | grep "$ifname " | cut -d" " -f9)
@@ -1,4 +1,8 @@
---
- name: Store the flag for rdma use
set_fact:
rdma_nics: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic and fine_tuning_run_fine_tuning_job_use_secondary_nic | select('search', 'rdma') | list}}"

Check warning on line 4 in projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/tasks/main.yml (GitHub Actions / build, jinja[spacing]):

Jinja2 spacing could be improved: {{ fine_tuning_run_fine_tuning_job_use_secondary_nic and fine_tuning_run_fine_tuning_job_use_secondary_nic | select('search', 'rdma') | list}} -> {{ fine_tuning_run_fine_tuning_job_use_secondary_nic and fine_tuning_run_fine_tuning_job_use_secondary_nic | select('search', 'rdma') | list }}

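The rdma_nics fact above evaluates to the subset of secondary-NIC names containing "rdma" (and stays falsy when no secondary NICs are configured); the job template keys the WITH_RDMA env var and the nvidia.com/roce resources on it. Roughly, in Python terms (an approximation of Jinja's select('search', 'rdma'), not the Ansible runtime):

import re

use_secondary_nic = ["subnet-rdma-port-1", "subnet-rdma-port-2"]  # hypothetical input
rdma_nics = use_secondary_nic and [n for n in use_secondary_nic if re.search("rdma", n)]
print(rdma_nics)  # -> ['subnet-rdma-port-1', 'subnet-rdma-port-2']; [] when nothing matches
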
- name: Create the src directory
file:
path: "{{ artifact_extra_logs_dir }}/src/"
@@ -80,6 +80,10 @@ spec:
{% if fine_tuning_run_fine_tuning_job_sleep_forever %}
- name: SLEEP_FOREVER
value: "true"
{% endif %}
{% if rdma_nics %}
- name: WITH_RDMA
value: "true"
{% endif %}
- name: MODEL_NAME
value: "{{ fine_tuning_run_fine_tuning_job_model_name}}"
@@ -101,6 +105,12 @@ spec:
mountPath: /mnt/entrypoint
- name: config-volume
mountPath: /mnt/config
{% if fine_tuning_run_fine_tuning_job_shared_memory %}
- name: shm-volume
mountPath: /dev/shm
- name: shared-volume
mountPath: /mnt/shared
{% endif %}
- name: output-volume
mountPath: /mnt/output
{% if fine_tuning_run_fine_tuning_job_use_secondary_nic %}
@@ -114,16 +124,30 @@
{% endif %}
memory: "{{ fine_tuning_run_fine_tuning_job_memory }}Gi"
cpu: "{{ fine_tuning_run_fine_tuning_job_cpu }}"
{% if rdma_nics %}
nvidia.com/roce: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic | length }}"
{% endif %}
{% if fine_tuning_run_fine_tuning_job_request_equals_limits %}
limits: *request_block
{% elif fine_tuning_run_fine_tuning_job_gpu %}
limits:
nvidia.com/gpu: "{{ fine_tuning_run_fine_tuning_job_gpu }}"
{% if rdma_nics %}
nvidia.com/roce: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic | length }}"
{% endif %}
{% endif %}
volumes:
- name: storage-volume
persistentVolumeClaim:
claimName: {{ fine_tuning_run_fine_tuning_job_pvc_name }}
{% if fine_tuning_run_fine_tuning_job_shared_memory %}
- name: shm-volume
emptyDir:
medium: Memory
sizeLimit: "{{ fine_tuning_run_fine_tuning_job_shared_memory }}Gi"
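# note: a Memory-medium emptyDir is tmpfs, so files written under /dev/shm count against the pod's memory accounting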
- name: shared-volume
emptyDir: {}
{% endif %}
- name: config-volume
configMap:
name: {{ job_name_safe }}-config
@@ -17,8 +17,6 @@ save_samples: 0
log_level: INFO
max_batch_len: 20000
seed: 42
cpu_offload_optimizer: true
distributed_training_framework: fsdp
cpu_offload_params: true
is_granite: true
use_dolomite: true
#checkpoint_at_epoch: true
@@ -29,7 +29,7 @@ def __init__(self, flavor):
from ..store import parsers

self.summary_keys = parsers.SFT_TRAINER_SUMMARY_KEYS if flavor == "SFTTrainer" \
- else (parsers.ILAB_SUMMARY_KEYS | parsers.ILAB_PROGRESS_KEYS)
+ else parsers.ILAB_SUMMARY_KEYS

self.progress_keys = parsers.SFT_TRAINER_PROGRESS_KEYS if flavor == "SFTTrainer" \
else parsers.ILAB_PROGRESS_KEYS
25 changes: 24 additions & 1 deletion projects/fine_tuning/visualizations/fine_tuning/store/parsers.py
@@ -220,6 +220,7 @@ def parse_dataset_stats(data):

ILAB_SUMMARY_KEYS = {
"torchrun_exec_time": types.SimpleNamespace(lower_better=True, units="minutes", title="Execution wall-time"),
"average_throughput": types.SimpleNamespace(lower_better=False, units="samples/second", title="Average throughput"),
}

"""
@@ -260,12 +261,27 @@ def extract_torchrun_execution_time(line):

ilab_metrics.summary.torchrun_exec_time = int(time_str) / 60 # convert from seconds to minutes

- with open(register_important_file(dirname, artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.log")) as f:
def extract_num_of_samples(line):
if not line.startswith("Map (num_proc=8): 100%"):
return

_not_used, has_it, after = line.partition("Map (num_proc=8): 100%|██████████| ")
if not has_it: return

num_samples, has_it, after = after.partition("/")
if not has_it:
log.error(f"Invalid Map line :/ '{line}'")
return

ilab_metrics.summary.num_samples = int(num_samples)
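
# Example of the line this parses (illustrative text):
#   "Map (num_proc=8): 100%|██████████| 10000/10000 [00:12<00:00, 801.3 examples/s]"
# partition() on the prefix, then on "/", yields num_samples == 10000.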

+ with (open(register_important_file(dirname, artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.log")) as f):
# metrics lines are printed in green. Look them up.
in_green = False
current_json = ""
for line in f.readlines():
extract_torchrun_execution_time(line)
extract_num_of_samples(line)

if not in_green:
before, green_found, after = line.partition("[92m")
Expand All @@ -286,6 +302,13 @@ def extract_torchrun_execution_time(line):
current_json = ""
in_green = False

first_step_timestamp = datetime.datetime.fromisoformat(ilab_metrics.progress[0].timestamp)
last_step_timestamp = datetime.datetime.fromisoformat(ilab_metrics.progress[-1].timestamp)
period = (last_step_timestamp - first_step_timestamp).total_seconds()
num_samples = ilab_metrics.summary.num_samples - ilab_metrics.progress[0].batch_size
num_epochs = ilab_metrics.progress[-1].epoch
average_throughput = (num_samples+(ilab_metrics.summary.num_samples*num_epochs))/period
ilab_metrics.summary.average_throughput = average_throughput
return ilab_metrics

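The corrected average_throughput (the headline fix of this commit) divides the number of samples processed between the first and last progress records by the elapsed time. progress[0] is logged after the first batch, and the formula treats the epoch field as zero-based, so the numerator works out to num_samples * (num_epochs + 1) minus the first batch. A worked example with hypothetical numbers:

num_samples = 10_000  # per-epoch sample count, parsed from the "Map" line
batch_size = 96       # progress[0].batch_size
last_epoch = 1        # progress[-1].epoch, zero-based, i.e. two epochs in total
period = 3_600.0      # seconds between first and last progress timestamps

processed = (num_samples - batch_size) + num_samples * last_epoch  # 19_904 samples
print(processed / period)  # -> ~5.53 samples/second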
