Commit
[fine_tuning] ilab: correct throughput and new ilab image (#635)
openshift-merge-bot[bot] authored Jan 21, 2025
2 parents 7a521a2 + bcea53f commit 82a15ab
Showing 10 changed files with 152 additions and 12 deletions.
5 changes: 5 additions & 0 deletions docs/toolbox.generated/Fine_Tuning.run_fine_tuning_job.rst
@@ -108,6 +108,11 @@ Parameters
* If True, sets the 'limits' of the job with the same value as the request.


``shared_memory``

* Amount of shm (in GB) to give to each of the job pods


``prepare_only``

* If True, only prepare the environment but do not run the fine-tuning job.
69 changes: 64 additions & 5 deletions projects/fine_tuning/testing/config.yaml
@@ -364,21 +364,24 @@ ci_presets:
tests.fine_tuning.ilab.enabled: true
tests.fine_tuning.test_settings.name: ilab
tests.fine_tuning.test_settings.dataset_name: ilab_skills_data.jsonl
- tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3b-code-instruct@hf
+ tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_replication: null
matbench.workload: projects.fine_tuning.visualizations.ilab_training
matbench.prom_workload: projects.fine_tuning.visualizations.ilab_prom
matbench.config_file: ilab_training.yaml
matbench.lts.generate: false
tests.fine_tuning.test_settings.shared_memory: 20
tests.fine_tuning.test_settings.hyper_parameters:
num_epochs: null
max_batch_len: null
NCCL_SOCKET_NTHREADS: null
cpu_offload_optimizer: null
cpu_offload_params: null

ilab_scale:
extends: [ilab]

- tests.fine_tuning.test_settings.model_name: ibm-granite/granite-7b-base@hf
+ tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: [ilab_large_10000samples_skills_data.jsonl, ilab_large_knowledge_data.jsonl]

tests.fine_tuning.test_settings.pod_count: [1, 2, 4]
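
When a test_settings entry holds a list, as dataset_name and pod_count do above, the matbenchmarking layer (enabled via tests.fine_tuning.matbenchmarking.enabled in the presets below) expands it into one fine-tuning run per combination. A minimal sketch of that expansion, assuming a plain cross-product; the dict and loop below are illustrative, not the framework's internal API:

import itertools

# List-valued settings taken from the ilab_scale preset above.
settings = {
    "dataset_name": ["ilab_large_10000samples_skills_data.jsonl",
                     "ilab_large_knowledge_data.jsonl"],
    "pod_count": [1, 2, 4],
}

# One benchmark run per combination: 2 datasets x 3 pod counts = 6 runs.
for values in itertools.product(*settings.values()):
    print(dict(zip(settings.keys(), values)))
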
@@ -440,25 +443,79 @@ ci_presets:
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1
tests.fine_tuning.node_count_equal_pod_count: true

ilab_2x8xh100_secondary_nic:
extends: [ilab]

tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false

tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl

tests.fine_tuning.test_settings.gpu: 8
tests.fine_tuning.test_settings.pod_count: 2

tests.fine_tuning.test_settings.secondary_nic_prefix: "network-port-"
tests.fine_tuning.test_settings.secondary_nic_count: [1, 2, 4, 6, 8]

# tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1

ilab_2x8xh100_scale:
extends: [ilab]

tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false

- tests.fine_tuning.test_settings.model_name: ibm-granite/granite-7b-base@hf
+ tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl

tests.fine_tuning.test_settings.gpu: 8
tests.fine_tuning.test_settings.pod_count: 2

tests.fine_tuning.test_settings.secondary_nic_prefix: "subnet-port-"
- tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4]
+ tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4, 2, 1]

# tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1

ilab_2x8xh100_scale_rdma:
extends: [ilab]

tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false

tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl

tests.fine_tuning.test_settings.gpu: 8
tests.fine_tuning.test_settings.pod_count: 2

tests.fine_tuning.test_settings.secondary_nic_prefix: "subnet-rdma-port-"
tests.fine_tuning.test_settings.secondary_nic_count: [8, 6, 4, 2, 1]

# tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [60000, 70000, 80000, 85000, 90000, 95000]
tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1

ilab_1x8xh100_scale:
extends: [ilab]

tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false

tests.fine_tuning.test_settings.model_name: ibm-granite/granite-3.0-8b-instruct@hf
tests.fine_tuning.test_settings.dataset_name: ilab_large_10000samples_skills_data.jsonl

tests.fine_tuning.test_settings.gpu: 8
tests.fine_tuning.test_settings.pod_count: 1

tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: [20000, 60000]
# tests.fine_tuning.test_settings.hyper_parameters.max_batch_len: 85000
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1

# ---

cluster_instructlab:
@@ -696,6 +753,7 @@ tests:
model_name: bigscience/bloom-560m@hf
dataset_name: twitter_complaints_small.json
gpu: 1
shared_memory: null
dataset_replication: 1
container_image: null
use_secondary_nic: null
@@ -724,7 +782,8 @@ tests:
image: quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26
ilab:
enabled: false
- image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e
+ image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:525ab53de3829cac1a9aabb73194f49e22da8fdcf12a01c56ece961300cdab0d
+ # instructlab 1.3
matbench:
preset: null
workload: projects.fine_tuning.visualizations.fine_tuning # actual workload must be a symlink to this dir
4 changes: 2 additions & 2 deletions projects/fine_tuning/toolbox/fine_tuning.py
@@ -34,7 +34,7 @@ def run_fine_tuning_job(
memory=10,
cpu=1,
request_equals_limits=False,
shared_memory=None,
prepare_only=False,
delete_other=False,

@@ -72,7 +72,7 @@ def run_fine_tuning_job(
memory: the number of RAM gigs to request for the fine-tuning job (in Gigs)
cpu: the number of CPU cores to request for the fine-tuning job (in cores)
request_equals_limits: if True, sets the 'limits' of the job with the same value as the request.
shared_memory: amount of shm (in GB) to give to each of the job pods
prepare_only: if True, only prepare the environment but do not run the fine-tuning job.
delete_other: if True, delete the other PyTorchJobs before running
@@ -57,6 +57,9 @@ fine_tuning_run_fine_tuning_job_cpu: 1
# if True, sets the 'limits' of the job with the same value as the request.
fine_tuning_run_fine_tuning_job_request_equals_limits: false

# amount of shm (in GB) to give to each of the job pods
fine_tuning_run_fine_tuning_job_shared_memory: null

# if True, only prepare the environment but do not run the fine-tuning job.
fine_tuning_run_fine_tuning_job_prepare_only: false

@@ -18,10 +18,34 @@ export TRITON_DUMP_DIR=$TRITON_HOME
export TRITON_CACHE_DIR=$TRITON_HOME
export TRITON_OVERRIDE_DIR=$TRITON_HOME


mkdir -p "$CACHE_DIR"

if [[ "${WITH_RDMA:-}" ]]; then
export NCCL_TOPO_FILE=/mnt/storage/topo.xml
num_rdma=$(ls /sys/class/infiniband/ | wc -l)
IFS=',' read -ra ADDR <<< "$NCCL_SOCKET_IFNAME" # Split by comma
length=${#ADDR[@]} # Get the length (number of elements in the array)
echo "Length of NCCL_SOCKET_IFNAME: $length"
NCCL_IB_HCA=''
for idx in $(seq $((num_rdma-1)) -1 $((num_rdma-length))); do
# Append the value to the NCCL_IB_HCA string
if [ -z "$NCCL_IB_HCA" ]; then
NCCL_IB_HCA="mlx5_$idx" # Initialize the string with the first value
else
NCCL_IB_HCA="$NCCL_IB_HCA,mlx5_$idx" # Append the next value with a comma
fi
done
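# Example with assumed values: num_rdma=8 and NCCL_SOCKET_IFNAME=net1,net2,net3
# make the loop above walk idx 7,6,5, yielding NCCL_IB_HCA=mlx5_7,mlx5_6,mlx5_5
# (the last $length HCAs, counted down from the highest device index).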
export NCCL_IB_HCA="$NCCL_IB_HCA"
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
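# GID index 3 commonly maps to the RoCE v2 / IPv4 entry on Mellanox NICs (an assumption about this cluster's GID table).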
export NCCL_DEBUG=info
echo "Using $length SR-IOV NIC’s with rdma"
fi

if [[ "${NCCL_SOCKET_IFNAME:-}" ]]; then


MAPPING="$(cat /mnt/nic-mapping/nodename_ip_mapping.yaml)"
for ifname in $(echo $NCCL_SOCKET_IFNAME | tr , " "); do
current_ip=$(ip route | grep "$ifname " | cut -d" " -f9)
@@ -1,4 +1,8 @@
---
- name: Store the flag for rdma use
set_fact:
rdma_nics: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic and fine_tuning_run_fine_tuning_job_use_secondary_nic | select('search', 'rdma') | list}}"

Check warning on line 4 in projects/fine_tuning/toolbox/fine_tuning_run_fine_tuning_job/tasks/main.yml (GitHub Actions / build, jinja[spacing]):

Jinja2 spacing could be improved: {{ fine_tuning_run_fine_tuning_job_use_secondary_nic and fine_tuning_run_fine_tuning_job_use_secondary_nic | select('search', 'rdma') | list}} -> {{ fine_tuning_run_fine_tuning_job_use_secondary_nic and fine_tuning_run_fine_tuning_job_use_secondary_nic | select('search', 'rdma') | list }}

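The rdma_nics fact above evaluates to the subset of secondary-NIC names containing "rdma" (and stays falsy when no secondary NICs are configured); the job template keys the WITH_RDMA env var and the nvidia.com/roce resources on it. Roughly, in Python terms (an approximation of Jinja's select('search', 'rdma'), not the Ansible runtime):

import re

use_secondary_nic = ["subnet-rdma-port-1", "subnet-rdma-port-2"]  # hypothetical input
rdma_nics = use_secondary_nic and [n for n in use_secondary_nic if re.search("rdma", n)]
print(rdma_nics)  # -> ['subnet-rdma-port-1', 'subnet-rdma-port-2']; [] when nothing matches
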
- name: Create the src directory
file:
path: "{{ artifact_extra_logs_dir }}/src/"
@@ -80,6 +80,10 @@ spec:
{% if fine_tuning_run_fine_tuning_job_sleep_forever %}
- name: SLEEP_FOREVER
value: "true"
{% endif %}
{% if rdma_nics %}
- name: WITH_RDMA
value: "true"
{% endif %}
- name: MODEL_NAME
value: "{{ fine_tuning_run_fine_tuning_job_model_name}}"
@@ -101,6 +105,12 @@ spec:
mountPath: /mnt/entrypoint
- name: config-volume
mountPath: /mnt/config
{% if fine_tuning_run_fine_tuning_job_shared_memory %}
- name: shm-volume
mountPath: /dev/shm
- name: shared-volume
mountPath: /mnt/shared
{% endif %}
- name: output-volume
mountPath: /mnt/output
{% if fine_tuning_run_fine_tuning_job_use_secondary_nic %}
@@ -114,16 +124,30 @@
{% endif %}
memory: "{{ fine_tuning_run_fine_tuning_job_memory }}Gi"
cpu: "{{ fine_tuning_run_fine_tuning_job_cpu }}"
{% if rdma_nics %}
nvidia.com/roce: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic | length }}"
{% endif %}
{% if fine_tuning_run_fine_tuning_job_request_equals_limits %}
limits: *request_block
{% elif fine_tuning_run_fine_tuning_job_gpu %}
limits:
nvidia.com/gpu: "{{ fine_tuning_run_fine_tuning_job_gpu }}"
{% if rdma_nics %}
nvidia.com/roce: "{{ fine_tuning_run_fine_tuning_job_use_secondary_nic | length }}"
{% endif %}
{% endif %}
volumes:
- name: storage-volume
persistentVolumeClaim:
claimName: {{ fine_tuning_run_fine_tuning_job_pvc_name }}
{% if fine_tuning_run_fine_tuning_job_shared_memory %}
- name: shm-volume
emptyDir:
medium: Memory
sizeLimit: "{{ fine_tuning_run_fine_tuning_job_shared_memory }}Gi"
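# note: a Memory-medium emptyDir is tmpfs, so files written under /dev/shm count against the pod's memory accounting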
- name: shared-volume
emptyDir: {}
{% endif %}
- name: config-volume
configMap:
name: {{ job_name_safe }}-config
@@ -17,8 +17,6 @@ save_samples: 0
log_level: INFO
max_batch_len: 20000
seed: 42
cpu_offload_optimizer: true
distributed_training_framework: fsdp
cpu_offload_params: true
is_granite: true
use_dolomite: true
#checkpoint_at_epoch: true
@@ -29,7 +29,7 @@ def __init__(self, flavor):
from ..store import parsers

self.summary_keys = parsers.SFT_TRAINER_SUMMARY_KEYS if flavor == "SFTTrainer" \
- else (parsers.ILAB_SUMMARY_KEYS | parsers.ILAB_PROGRESS_KEYS)
+ else parsers.ILAB_SUMMARY_KEYS

self.progress_keys = parsers.SFT_TRAINER_PROGRESS_KEYS if flavor == "SFTTrainer" \
else parsers.ILAB_PROGRESS_KEYS
25 changes: 24 additions & 1 deletion projects/fine_tuning/visualizations/fine_tuning/store/parsers.py
@@ -220,6 +220,7 @@ def parse_dataset_stats(data):

ILAB_SUMMARY_KEYS = {
"torchrun_exec_time": types.SimpleNamespace(lower_better=True, units="minutes", title="Execution wall-time"),
"average_throughput": types.SimpleNamespace(lower_better=False, units="samples/second", title="Average throughput"),
}

"""
@@ -260,12 +261,27 @@ def extract_torchrun_execution_time(line):

ilab_metrics.summary.torchrun_exec_time = int(time_str) / 60 # convert from seconds to minutes

- with open(register_important_file(dirname, artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.log")) as f:
def extract_num_of_samples(line):
if not line.startswith("Map (num_proc=8): 100%"):
return

_not_used, has_it, after = line.partition("Map (num_proc=8): 100%|██████████| ")
if not has_it: return

num_samples, has_it, after = after.partition("/")
if not has_it:
log.error(f"Invalid Map line :/ '{line}'")
return

ilab_metrics.summary.num_samples = int(num_samples)
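
# Example of the line this parses (illustrative text):
#   "Map (num_proc=8): 100%|██████████| 10000/10000 [00:12<00:00, 801.3 examples/s]"
# partition() on the prefix, then on "/", yields num_samples == 10000.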

+ with (open(register_important_file(dirname, artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.log")) as f):
# metrics lines are printed in green. Look them up.
in_green = False
current_json = ""
for line in f.readlines():
extract_torchrun_execution_time(line)
extract_num_of_samples(line)

if not in_green:
before, green_found, after = line.partition("[92m")
Expand All @@ -286,6 +302,13 @@ def extract_torchrun_execution_time(line):
current_json = ""
in_green = False

first_step_timestamp = datetime.datetime.fromisoformat(ilab_metrics.progress[0].timestamp)
last_step_timestamp = datetime.datetime.fromisoformat(ilab_metrics.progress[-1].timestamp)
period = (last_step_timestamp - first_step_timestamp).total_seconds()
num_samples = ilab_metrics.summary.num_samples - ilab_metrics.progress[0].batch_size
num_epochs = ilab_metrics.progress[-1].epoch
average_throughput = (num_samples+(ilab_metrics.summary.num_samples*num_epochs))/period
ilab_metrics.summary.average_throughput = average_throughput
return ilab_metrics

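The corrected average_throughput (the headline fix of this commit) divides the number of samples processed between the first and last progress records by the elapsed time. progress[0] is logged after the first batch, and the formula treats the epoch field as zero-based, so the numerator works out to num_samples * (num_epochs + 1) minus the first batch. A worked example with hypothetical numbers:

num_samples = 10_000  # per-epoch sample count, parsed from the "Map" line
batch_size = 96       # progress[0].batch_size
last_epoch = 1        # progress[-1].epoch, zero-based, i.e. two epochs in total
period = 3_600.0      # seconds between first and last progress timestamps

processed = (num_samples - batch_size) + num_samples * last_epoch  # 19_904 samples
print(processed / period)  # -> ~5.53 samples/second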
