diff --git a/docs/toolbox.generated/Fine_Tuning.run_fine_tuning_job.rst b/docs/toolbox.generated/Fine_Tuning.run_fine_tuning_job.rst
index 54a46905d4..914cd53266 100644
--- a/docs/toolbox.generated/Fine_Tuning.run_fine_tuning_job.rst
+++ b/docs/toolbox.generated/Fine_Tuning.run_fine_tuning_job.rst
@@ -108,6 +108,11 @@ Parameters
 
 * If True, sets the 'limits' of the job with the same value as the request.
 
+``shared_memory``
+
+* Amount of shm (in GB) to give to each of the job pods
+
+
 ``prepare_only``
 
 * If True, only prepare the environment but do not run the fine-tuning job.
diff --git a/projects/fine_tuning/testing/config.yaml b/projects/fine_tuning/testing/config.yaml
index e842bdd250..f7dd50c73f 100644
--- a/projects/fine_tuning/testing/config.yaml
+++ b/projects/fine_tuning/testing/config.yaml
@@ -364,21 +364,24 @@ ci_presets:
     tests.fine_tuning.ilab.enabled: true
     tests.fine_tuning.test_settings.name: ilab
     tests.fine_tuning.test_settings.dataset_name: ilab_skills_data.jsonl
-    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3b-code-instruct@hf
+    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
     tests.fine_tuning.test_settings.dataset_replication: null
     matbench.workload: projects.fine_tuning.visualizations.ilab_training
     matbench.prom_workload: projects.fine_tuning.visualizations.ilab_prom
     matbench.config_file: ilab_training.yaml
     matbench.lts.generate: false
+    tests.fine_tuning.test_settings.shared_memory: 20
     tests.fine_tuning.test_settings.hyper_parameters:
       num_epochs: null
       max_batch_len: null
       NCCL_SOCKET_NTHREADS: null
+      cpu_offload_optimizer: null
+      cpu_offload_params: null
 
   ilab_scale:
     extends: [ilab]
 
-    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-7b-base@hf
+    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
     tests.fine_tuning.test_settings.dataset_name: [ilab_large_10000samples_skills_data.jsonl, ilab_large_knowledge_data.jsonl]
     tests.fine_tuning.test_settings.pod_count: [1, 2, 4]
 
@@ -440,25 +443,79 @@ ci_presets:
     tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1
     tests.fine_tuning.node_count_equal_pod_count: true
 
+  ilab_2x8xh100_secondary_nic:
+    extends: [ilab]
+
+    tests.fine_tuning.matbenchmarking.enabled: true
+    tests.fine_tuning.matbenchmarking.stop_on_error: false
+
+    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
+    tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl
+
+    tests.fine_tuning.test_settings.gpu: 8
+    tests.fine_tuning.test_settings.pod_count: 2
+
+    tests.fine_tuning.test_settings.secondary_nic_prefix: "network-port-"
+    tests.fine_tuning.test_settings.secondary_nic_count: [1, 2, 4, 6, 8]
+
+    # tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
+    tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
+    tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1
+
   ilab_2x8xh100_scale:
     extends: [ilab]
 
     tests.fine_tuning.matbenchmarking.enabled: true
     tests.fine_tuning.matbenchmarking.stop_on_error: false
 
-    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-7b-base@hf
+    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
     tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl
 
     tests.fine_tuning.test_settings.gpu: 8
     tests.fine_tuning.test_settings.pod_count: 2
 
     tests.fine_tuning.test_settings.secondary_nic_prefix: "subnet-port-"
-    tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4]
+    tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4, 2, 1]
 
     # tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
     tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
     tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1
 
+  ilab_2x8xh100_scale_rdma:
+    extends: [ilab]
+
+    tests.fine_tuning.matbenchmarking.enabled: true
+    tests.fine_tuning.matbenchmarking.stop_on_error: false
+
+    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
+    tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl
+
+    tests.fine_tuning.test_settings.gpu: 8
+    tests.fine_tuning.test_settings.pod_count: 2
+
+    tests.fine_tuning.test_settings.secondary_nic_prefix: "subnet-rdma-port-"
+    tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4, 2, 1]
+
+    # tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
+    tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
+    tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1
+
+  ilab_1x8xh100_scale:
+    extends: [ilab]
+
+    tests.fine_tuning.matbenchmarking.enabled: true
+    tests.fine_tuning.matbenchmarking.stop_on_error: false
+
+    tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
+    tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl
+
+    tests.fine_tuning.test_settings.gpu: 8
+    tests.fine_tuning.test_settings.pod_count: 1
+
+    tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [20000, 60000]
+    # tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
+    tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1
+
   # ---
 
   cluster_instructlab:
@@ -696,6 +753,7 @@ tests:
       model_name: bigscience/bloom-560m@hf
       dataset_name: twitter_complaints_small.json
       gpu: 1
+      shared_memory: null
       dataset_replication: 1
       container_image: null
       use_secondary_nic: null
@@ -724,7 +782,8 @@ tests:
       image: quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26
     ilab:
       enabled: false
-      image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e
+      image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:525ab53de3829cac1a9aabb73194f49e22da8fdcf12a01c56ece961300cdab0d
+      # instructlab 1.3
 matbench:
   preset: null
   workload: projects.fine_tuning.visualizations.fine_tuning # actual workload must be a symlink to this dir
diff --git a/projects/fine_tuning/toolbox/fine_tuning.py b/projects/fine_tuning/toolbox/fine_tuning.py
index 7323c936c0..743491d2d4 100644
--- a/projects/fine_tuning/toolbox/fine_tuning.py
+++ b/projects/fine_tuning/toolbox/fine_tuning.py
@@ -34,7 +34,7 @@ def run_fine_tuning_job(
             memory=10,
             cpu=1,
             request_equals_limits=False,
-
+            shared_memory=None,
             prepare_only=False,
             delete_other=False,
 
@@ -72,7 +72,7 @@ def run_fine_tuning_job(
             memory: the number of RAM gigs to request for to the fine-tuning job (in Gigs)
             cpu: the number of CPU cores to request for the fine-tuning job (in cores)
             request_equals_limits: if True, sets the 'limits' of the job with the same value as the request.
-
+            shared_memory: amount of shm (in GB) to give to each of the job pods
            prepare_only: if True, only prepare the environment but do not run the fine-tuning job.
             delete_other: if True, delete the other PyTorchJobs before running
diff --git a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/defaults/main/config.yml b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/defaults/main/config.yml
index 47b32da14f..8c4a374dcf 100644
--- a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/defaults/main/config.yml
+++ b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/defaults/main/config.yml
@@ -57,6 +57,9 @@ fine_tuning_run_fine_tuning_job_cpu: 1
 # if True, sets the 'limits' of the job with the same value as the request.
 fine_tuning_run_fine_tuning_job_request_equals_limits: false
 
+# amount of shm (in GB) to give to each of the job pods
+fine_tuning_run_fine_tuning_job_shared_memory: null
+
 # if True, only prepare the environment but do not run the fine-tuning job.
 fine_tuning_run_fine_tuning_job_prepare_only: false
diff --git a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/files/entrypoint/run_ilab.sh b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/files/entrypoint/run_ilab.sh
index eaa87217eb..e7348d2142 100644
--- a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/files/entrypoint/run_ilab.sh
+++ b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/files/entrypoint/run_ilab.sh
@@ -18,10 +18,34 @@ export TRITON_DUMP_DIR=$TRITON_HOME
 export TRITON_CACHE_DIR=$TRITON_HOME
 export TRITON_OVERRIDE_DIR=$TRITON_HOME
 
+
 mkdir -p "$CACHE_DIR"
 
+if [[ "${WITH_RDMA:-}" ]]; then
+    export NCCL_TOPO_FILE=/mnt/storage/topo.xml
+    num_rdma=$(ls /sys/class/infiniband/ | wc -l)
+    IFS=',' read -ra ADDR <<< "$NCCL_SOCKET_IFNAME" # Split by comma
+    length=${#ADDR[@]} # Get the length (number of elements in the array)
+    echo "Length of NCCL_SOCKET_IFNAME: $length"
+    NCCL_IB_HCA=''
+    for idx in $(seq $((num_rdma-1)) -1 $((num_rdma-length))); do
+        # Append the value to the NCCL_IB_HCA string
+        if [ -z "$NCCL_IB_HCA" ]; then
+            NCCL_IB_HCA="mlx5_$idx" # Initialize the string with the first value
+        else
+            NCCL_IB_HCA="$NCCL_IB_HCA,mlx5_$idx" # Append the next value with a comma
+        fi
+    done
+    export NCCL_IB_HCA="$NCCL_IB_HCA"
+    export NCCL_IB_DISABLE=0
+    export NCCL_IB_GID_INDEX=3
+    export NCCL_DEBUG=info
+    echo "Using $length SR-IOV NICs with RDMA"
+fi
+
 if [[ "${NCCL_SOCKET_IFNAME:-}" ]]; then
+    MAPPING="$(cat /mnt/nic-mapping/nodename_ip_mapping.yaml)"
     for ifname in $(echo $NCCL_SOCKET_IFNAME | tr , " "); do
         current_ip=$(ip route | grep "$ifname " | cut -d" " -f9)
diff --git a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/tasks/main.yml b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/tasks/main.yml
index 8f7b6d6f42..a60cf4a280 100644
--- a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/tasks/main.yml
+++ b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/tasks/main.yml
@@ -1,4 +1,8 @@
 ---
+- name: Store the flag for RDMA use
+  set_fact:
+    rdma_nics: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic and fine_tuning_run_fine_tuning_job_use_secondary_nic | select('search', 'rdma') | list }}"
+
 - name: Create the src directory
   file:
     path: "{{ artifact_extra_logs_dir }}/src/"
diff --git a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/templates/fine_tuning_job.yaml.j2 b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/templates/fine_tuning_job.yaml.j2
index 0671504136..59e7cf1fc2 100644
--- a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/templates/fine_tuning_job.yaml.j2
+++ b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/templates/fine_tuning_job.yaml.j2
@@ -80,6 +80,10 @@ spec:
 {% if fine_tuning_run_fine_tuning_job_sleep_forever %}
           - name: SLEEP_FOREVER
             value: "true"
+{% endif %}
+{% if rdma_nics %}
+          - name: WITH_RDMA
+            value: "true"
 {% endif %}
           - name: MODEL_NAME
             value: "{{ fine_tuning_run_fine_tuning_job_model_name}}"
@@ -101,6 +105,12 @@ spec:
             mountPath: /mnt/entrypoint
           - name: config-volume
             mountPath: /mnt/config
+{% if fine_tuning_run_fine_tuning_job_shared_memory %}
+          - name: shm-volume
+            mountPath: /dev/shm
+          - name: shared-volume
+            mountPath: /mnt/shared
+{% endif %}
           - name: output-volume
             mountPath: /mnt/output
 {% if fine_tuning_run_fine_tuning_job_use_secondary_nic %}
@@ -114,16 +124,30 @@ spec:
 {% endif %}
             memory: "{{ fine_tuning_run_fine_tuning_job_memory }}Gi"
             cpu: "{{ fine_tuning_run_fine_tuning_job_cpu }}"
+{% if rdma_nics %}
+            nvidia.com/roce: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic | length }}"
+{% endif %}
 {% if fine_tuning_run_fine_tuning_job_request_equals_limits %}
           limits: *request_block
 {% elif fine_tuning_run_fine_tuning_job_gpu %}
           limits:
             nvidia.com/gpu: "{{ fine_tuning_run_fine_tuning_job_gpu }}"
+{% if rdma_nics %}
+            nvidia.com/roce: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic | length }}"
+{% endif %}
 {% endif %}
       volumes:
      - name: storage-volume
         persistentVolumeClaim:
           claimName: {{ fine_tuning_run_fine_tuning_job_pvc_name }}
+{% if fine_tuning_run_fine_tuning_job_shared_memory %}
+      - name: shm-volume
+        emptyDir:
+          medium: Memory
+          sizeLimit: "{{ fine_tuning_run_fine_tuning_job_shared_memory }}Gi"
+      - name: shared-volume
+        emptyDir: {}
+{% endif %}
       - name: config-volume
         configMap:
           name: {{ job_name_safe }}-config
diff --git a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/templates/ilab_base_config.yaml.j2 b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/templates/ilab_base_config.yaml.j2
index 1497d0c401..c635cafab9 100644
--- a/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/templates/ilab_base_config.yaml.j2
+++ b/projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/templates/ilab_base_config.yaml.j2
@@ -17,8 +17,6 @@ save_samples: 0
 log_level: INFO
 max_batch_len: 20000
 seed: 42
-cpu_offload_optimizer: true
 distributed_training_framework: fsdp
-cpu_offload_params: true
-is_granite: true
+use_dolomite: true
 #checkpoint_at_epoch: true
diff --git a/projects/fine_tuning/visualizations/fine_tuning/plotting/report.py b/projects/fine_tuning/visualizations/fine_tuning/plotting/report.py
index f7d348194e..5162af5cff 100644
--- a/projects/fine_tuning/visualizations/fine_tuning/plotting/report.py
+++ b/projects/fine_tuning/visualizations/fine_tuning/plotting/report.py
@@ -29,7 +29,7 @@ def __init__(self, flavor):
         from ..store import parsers
 
         self.summary_keys = parsers.SFT_TRAINER_SUMMARY_KEYS if flavor == "SFTTrainer" \
-            else (parsers.ILAB_SUMMARY_KEYS | parsers.ILAB_PROGRESS_KEYS)
+            else parsers.ILAB_SUMMARY_KEYS
 
         self.progress_keys = parsers.SFT_TRAINER_PROGRESS_KEYS if flavor == "SFTTrainer" \
             else parsers.ILAB_PROGRESS_KEYS
diff --git a/projects/fine_tuning/visualizations/fine_tuning/store/parsers.py b/projects/fine_tuning/visualizations/fine_tuning/store/parsers.py
index 9de6929045..577af0be14 100644
--- a/projects/fine_tuning/visualizations/fine_tuning/store/parsers.py
+++ b/projects/fine_tuning/visualizations/fine_tuning/store/parsers.py
@@ -220,6 +220,7 @@ def parse_dataset_stats(data):
 
 ILAB_SUMMARY_KEYS = {
     "torchrun_exec_time": types.SimpleNamespace(lower_better=True, units="minutes", title="Execution wall-time"),
+    "average_throughput": types.SimpleNamespace(lower_better=False, units="samples/second", title="Average throughput"),
 }
 
 """
@@ -260,12 +261,27 @@ def extract_torchrun_execution_time(line):
 
         ilab_metrics.summary.torchrun_exec_time = int(time_str) / 60 # convert from seconds to minutes
 
-    with open(register_important_file(dirname, artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.log")) as f:
+    def extract_num_of_samples(line):
+        if not line.startswith("Map (num_proc=8): 100%"):
+            return
+
+        _not_used, has_it, after = line.partition("Map (num_proc=8): 100%|██████████| ")
+        if not has_it: return
+
+        num_samples, has_it, after = after.partition("/")
+        if not has_it:
+            log.error(f"Invalid Map line: '{line}'")
+            return
+
+        ilab_metrics.summary.num_samples = int(num_samples)
+
+    with open(register_important_file(dirname, artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.log")) as f:
         # metrics lines are printed in green. Look them up.
         in_green = False
         current_json = ""
         for line in f.readlines():
             extract_torchrun_execution_time(line)
+            extract_num_of_samples(line)
 
             if not in_green:
                 before, green_found, after = line.partition("[92m")
@@ -286,6 +302,13 @@ def extract_torchrun_execution_time(line):
                 current_json = ""
                 in_green = False
 
+    first_step_timestamp = datetime.datetime.fromisoformat(ilab_metrics.progress[0].timestamp)
+    last_step_timestamp = datetime.datetime.fromisoformat(ilab_metrics.progress[-1].timestamp)
+    period = (last_step_timestamp - first_step_timestamp).total_seconds()
+    num_samples = ilab_metrics.summary.num_samples - ilab_metrics.progress[0].batch_size
+    num_epochs = ilab_metrics.progress[-1].epoch
+    average_throughput = (num_samples + (ilab_metrics.summary.num_samples * num_epochs)) / period
+    ilab_metrics.summary.average_throughput = average_throughput
 
     return ilab_metrics
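
Reviewer note, not part of the patch above: the new average_throughput summary metric is derived from the wall-clock period between the first and last progress timestamps, and from the number of samples processed in that window. A minimal standalone sketch of the arithmetic, with made-up timestamps and sample counts (the names mirror the patch; nothing below exists in the repository):

import datetime
import types

# Two fake progress entries, standing in for what the parser reads from the pod log.
progress = [
    types.SimpleNamespace(timestamp="2024-12-01T10:00:00", batch_size=128, epoch=0),
    types.SimpleNamespace(timestamp="2024-12-01T10:50:00", batch_size=128, epoch=1),
]
num_samples = 10000  # per-epoch sample count, parsed from the "Map (num_proc=8)" log line

first = datetime.datetime.fromisoformat(progress[0].timestamp)
last = datetime.datetime.fromisoformat(progress[-1].timestamp)
period = (last - first).total_seconds()  # 3000 seconds

# Samples processed between the two timestamps: the remainder of epoch 0
# (num_samples minus the first batch, already done at the first timestamp),
# plus num_samples for each additional epoch index.
samples_seen = (num_samples - progress[0].batch_size) + num_samples * progress[-1].epoch
print(f"{samples_seen / period:.2f} samples/second")  # -> 6.62

With these numbers, epoch 0 contributes 10000 - 128 = 9872 samples and epoch index 1 contributes another 10000, so the sketch prints 19872 / 3000 ≈ 6.62 samples/second, matching what the patch stores in ilab_metrics.summary.average_throughput.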