From 231f34f0876fa7db2f00d07e1d490be99c5c1e2b Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Mon, 3 Feb 2025 22:17:48 -0600 Subject: [PATCH 01/26] Initial Script ready for review --- tools/scripts/CS_conf.sh | 177 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100755 tools/scripts/CS_conf.sh diff --git a/tools/scripts/CS_conf.sh b/tools/scripts/CS_conf.sh new file mode 100755 index 000000000..9066fa5d3 --- /dev/null +++ b/tools/scripts/CS_conf.sh @@ -0,0 +1,177 @@ +#!/bin/bash + + +function try() { + local func_name="$1" + local func_call="$2" + local file_output="${func_name}_output.txt" + local output + + # Shift to get past the label of the call/output file + shift + # Shift the arguments to pass the remaining ones to the called function + shift + + # Run the command/function and capture its output + output="$("${func_call}" "$@" 2>&1)" + local exit_status=$? + + # Check if the command/function succeeded or failed + if [ $exit_status -ne 0 ]; then + catch "${func_call}" "${output}" "${func_name}" + else + echo "${output}" > "${file_output}" + echo "${func_name} was successful. Output saved to ${file_output}" + fi +} + +function catch() { + local func_call="$1" + local error_message="$2" + local func_name="$3" + echo "An error occurred during ${func_call}" + echo "${error_message}" + echo "in step ${func_name}" +} + +# Function to get version info about ROCm +function rocmver() +{ + # Store the output of rocminfo in a variable + rocminfo_output=$(rocminfo) + + # Grep the variable content for lines containing 'version' + version_info=$(echo "$rocminfo_output" | grep -i "version") + echo "$version_info" +} + + +# Function to get AMD GPU driver version +function amdgpuver() +{ + + # Store the output of dkms in a variable + dkms_output=$(dkms status) + + # Grep the variable content for lines containing 'amdgpu' + amdgpu=$(echo "$dkms_output" | grep "amdgpu") + echo "$amdgpu" + +} + +# Function to Query ACS +function ACSinfo() +{ + + # Store the output of lspci in a variable + lspci_output=$(lspci -vvv) + + # Grep the variable content for lines containing 'ACSCtl' + acs=$(echo "$lspci_output" | grep ACSCtl) + echo "$acs" + +} + +# ROCm version +try "ROCm_version" rocmver +echo "" + +# GPU VRAM info +try "VRAM_info" rocm-smi --showmeminfo vram +echo "" + +# HIP version +try "hip_version" hipconfig --version +echo "" + +# echo "6. RCCL version" ############################################ TO DO + +# echo "" + +# echo "7. RCCL-Tests version" + +# echo "" ################################################################ END TO DO + +# UCX version +try "UCX_version" /opt/ucx/bin/ucx_info -v +echo "" + +# MPI version4 +try "MPI_version4" /opt/ompi4/bin/mpirun --version # the exact path might need to be removed in the context of debug +echo "" + +# MPI version4 +try "MPI_version5" /opt/ompi5/bin/mpirun --version +echo "" + +# OS version +try "OS_version" cat /etc/os-release +echo "" + +# Linux kernel version +try "Linux_Kernel_version" uname -r +echo "" + +# ulimit -a +try "System_resource_allocation" ulimit -a +echo "" + +# Environment Variable Config +try "Environment_Variable_Config" env +echo "" + +# Rdma link info +try "rdma_link" rdma link +echo "" + +# Query Numa balancing status +Try "Numa_Balancing" cat /proc/sys/kernel/numa_balancing +echo "" + + + +# Infiniband device info +# IB device status +try "IB_device_status" ibstatus +echo "" + +# IB device GUIDs +try "IB_devices" ibv_devices +echo "" + +# IB device info +try "IB_devinfo" ibv_devinfo +echo "" + +# IB device status alternate +try "IB_stat" ibstat +echo "" + +# DKMS module info +try "dkms_status" dkms status +echo "" + +# AMDKFD (GPU Driver version) +try "GPU_Driver_Version" amdgpuver +echo "" + + + +# Network information +# IP addresses +try "IP_address_info" ip a +echo "" + +# Network Interface state +try "IP_link_info" ip link +echo "" + +# Route table info +try "IP_route_info" ip route +echo "" + +# Access control service info +try "ACS_info" ACSinfo +echo "" + +# I think after I'm down I need to have all logs output to a folder, just a note to remind myself to do so \ No newline at end of file From 7314b26ff315a207bb6ea487b2a58649bb97062b Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Tue, 4 Feb 2025 13:59:35 -0600 Subject: [PATCH 02/26] Added RCCL-tests and RCCL versions --- tools/scripts/CS_conf.sh | 87 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 7 deletions(-) diff --git a/tools/scripts/CS_conf.sh b/tools/scripts/CS_conf.sh index 9066fa5d3..11f7216fe 100755 --- a/tools/scripts/CS_conf.sh +++ b/tools/scripts/CS_conf.sh @@ -1,6 +1,34 @@ #!/bin/bash +##################################################### README ##################################################### + + + + + + + + + + + + + + + + + + + + + + +##################################################### end README ##################################################### + + +##################################################### define necessary functions ##################################################### + function try() { local func_name="$1" local func_call="$2" @@ -72,6 +100,52 @@ function ACSinfo() } +# Function to get rccl and rccl-tests version +run_rccl_tests() { + + local rccl_tests_dir="$1" + + # Get rccl-tests branch and version information + local rccl_tests_branch=$(git -C "${rccl_tests_dir}" rev-parse --abbrev-ref HEAD) + local rccl_tests_version=$(git -C "${rccl_tests_dir}" log -1 --format="%H") + + # Set the flag to display RCCL version during the run + export NCCL_DEBUG=VERSION + + + # Run the rccl-tests + # Replace this line with the actual command to run rccl-tests in your environment + + local output_file="rccl_tests_output.txt" + + $1/build/all_reduce_perf -b 8 -e 16M -f 8 -g 2 > "${output_file}" + + # Unset the flag after execution + unset NCCL_DEBUG + + # Extract RCCL, HIP, and ROCm versions from the output file + local rccl_version=$(grep "RCCL version" "${output_file}" | awk '{print $4}') + local hip_version=$(grep "HIP version" "${output_file}" | awk -F ': ' '{print $2}') + local rocm_version=$(grep "ROCm version" "${output_file}" | awk -F ': ' '{print $2}') + + + # Display extracted version information + echo "RCCL Version: ${rccl_version}" + echo "HIP Version: ${hip_version}" + echo "ROCm Version: ${rocm_version}" + + # Display rccl-tests branch and version information + echo "RCCL-Tests Branch: ${rccl_tests_branch}" + echo "RCCL-Tests Version: ${rccl_tests_version}" +} + +##################################################### end define necessary functions ##################################################### + + + + +##################################################### query system with functions and commands for config info ##################################################### + # ROCm version try "ROCm_version" rocmver echo "" @@ -84,13 +158,9 @@ echo "" try "hip_version" hipconfig --version echo "" -# echo "6. RCCL version" ############################################ TO DO - -# echo "" - -# echo "7. RCCL-Tests version" - -# echo "" ################################################################ END TO DO +# RCCL version and RCCL tests version +try "RCCL_and_RCCL_tests_version" run_rccl_tests $1 +echo "" # UCX version try "UCX_version" /opt/ucx/bin/ucx_info -v @@ -174,4 +244,7 @@ echo "" try "ACS_info" ACSinfo echo "" +##################################################### end query system with functions and commands for config info ##################################################### + + # I think after I'm down I need to have all logs output to a folder, just a note to remind myself to do so \ No newline at end of file From f3e5ab1fc0ae0616b5572acca6f4ed96853d4bc6 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Tue, 4 Feb 2025 17:10:57 -0600 Subject: [PATCH 03/26] Added output folder and README --- tools/scripts/CS_conf.sh | 45 +++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tools/scripts/CS_conf.sh b/tools/scripts/CS_conf.sh index 11f7216fe..d960570e9 100755 --- a/tools/scripts/CS_conf.sh +++ b/tools/scripts/CS_conf.sh @@ -3,30 +3,29 @@ ##################################################### README ##################################################### +# This script only requires 1 input arguement, it is the path to the rccl-tests repo. EX: /path/to/rccl-tests/ +# Things to check before running this script, if things from the below list it is fine the script will still continue to +# run but it may produce an error on the missing parts. +# 1. UCX bin folder is on path or in default location under opt +# 2. OMPI bin folder is on path or in default location under opt +# 3. RCCL is either built and added to path or default loction is on path +# 4. RCCL-Tests are built +# 5. rocm-smi is on path +# 6. hipconfig is on path +# 7. rocminfo is on path +# 8. ibstatus is on path +# 9. ibv_devices is on path +# 10. ibv_devinfo is on path +# 11. ibstat device GUIDs - - - - - - - - - - - - - - - - - +# All output will be in a folder called conf-script-output that will be created in the same directory as the script ##################################################### end README ##################################################### + ##################################################### define necessary functions ##################################################### function try() { @@ -48,7 +47,7 @@ function try() { if [ $exit_status -ne 0 ]; then catch "${func_call}" "${output}" "${func_name}" else - echo "${output}" > "${file_output}" + echo "${output}" > "conf-script-output/${file_output}" echo "${func_name} was successful. Output saved to ${file_output}" fi } @@ -143,6 +142,13 @@ run_rccl_tests() { +##################################################### setup output folder ##################################################### + +mkdir conf-script-output + +##################################################### end setup output folder ##################################################### + + ##################################################### query system with functions and commands for config info ##################################################### @@ -245,6 +251,3 @@ try "ACS_info" ACSinfo echo "" ##################################################### end query system with functions and commands for config info ##################################################### - - -# I think after I'm down I need to have all logs output to a folder, just a note to remind myself to do so \ No newline at end of file From 24b755d5bb7731e1f4ba18265020dea96ceeca63 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Wed, 12 Feb 2025 17:02:04 -0600 Subject: [PATCH 04/26] Base format built --- tools/scripts/CS_conf.py | 80 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 tools/scripts/CS_conf.py diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py new file mode 100644 index 000000000..93b7599ac --- /dev/null +++ b/tools/scripts/CS_conf.py @@ -0,0 +1,80 @@ +import subprocess +import time +import os +import re + +# Function to run a CLI command and return its output +def run_cli_command(command): + try: + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) + return result + except Exception as e: + return f"Error: {str(e)}" + +# Get the status of a particular command +def status_check(summary, result): + # List of errors to check + error_list = [r'No such file or directory', r'Command not found', r'Permission denied', r'cannot access', r'error'] + status = "OK" + if summary == "Missing Data": + status = "WARN" + for error in error_list: + match = re.search(error, result.stderr, re.IGNORECASE) + if match: + status = "WARN" + break + return status + + +# Get OS version +def get_os_version(): + result = run_cli_command('cat /etc/os-release') + match = re.search(r'PRETTY_NAME="(.+)"', result.stdout) + if match: + summary = match.group(1) + else: + summary = "Missing Data" + return summary, result + + +def get_config(): + # Run the commands and store the command outputs + + # OS version + os_summary, os_result = get_os_version() + os_status = status_check(os_summary, os_result) + + + # Create the summary table + summary_table = ( + f"\n\n{'='*60}\n" + f"{'Component':<17}| {'Status':<13} | Value\n" + f"{'='*60}\n" + f"OS Version{' ':<7}| {os_status:<13} | {os_summary}\n" + f"{'='*60}\n\n\n" + ) + + # Combine details + details = ( + f"{'='*30} OS info {'='*30}\n\n" + f"{os_result.stdout}{os_result.stderr}\n\n" + f"{'='*30} Env vars {'='*30}\n\n" + ) + return summary_table, details + + +def main(): + hostname = os.uname().nodename + timestamp = time.strftime("%Y%m%d_%H%M%S") + file_name = f"config.{hostname}.{timestamp}.txt" + + summary_table, details = get_config() + + + # Write the summary table and details to the output file + with open(file_name, "w") as file: + file.write(summary_table) + file.write(details) + +if __name__ == '__main__': + main() \ No newline at end of file From fe016114c45a61da29cce32ee8433cca19c143b9 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Wed, 12 Feb 2025 18:22:44 -0600 Subject: [PATCH 05/26] Added ROCm version --- tools/scripts/CS_conf.py | 65 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 93b7599ac..86d39643f 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -36,6 +36,14 @@ def get_os_version(): summary = "Missing Data" return summary, result +# Get ROCm Version +def get_ROCm_version(): + result = run_cli_command('cat /opt/rocm/.info/version') + if result.stdout: + summary = result.stdout + else: + summary = "Missing Data" + return summary, result def get_config(): # Run the commands and store the command outputs @@ -44,6 +52,10 @@ def get_config(): os_summary, os_result = get_os_version() os_status = status_check(os_summary, os_result) + # ROCm Version + ROCm_summary, ROCm_result = get_ROCm_version() + ROCm_status = status_check(ROCm_summary, ROCm_result) + # Create the summary table summary_table = ( @@ -51,6 +63,7 @@ def get_config(): f"{'Component':<17}| {'Status':<13} | Value\n" f"{'='*60}\n" f"OS Version{' ':<7}| {os_status:<13} | {os_summary}\n" + f"ROCm Version{' ':<5}| {ROCm_status:<13} | {ROCm_summary}\n" f"{'='*60}\n\n\n" ) @@ -58,7 +71,8 @@ def get_config(): details = ( f"{'='*30} OS info {'='*30}\n\n" f"{os_result.stdout}{os_result.stderr}\n\n" - f"{'='*30} Env vars {'='*30}\n\n" + f"{'='*30} ROCm Version {'='*30}\n\n" + f"{ROCm_result.stdout}{ROCm_result.stderr}\n\n" ) return summary_table, details @@ -77,4 +91,51 @@ def main(): file.write(details) if __name__ == '__main__': - main() \ No newline at end of file + main() + + + +# list of stuff to add +# ROCm version +# GPU VRAM info +# HIP version + +# UCX version +# MPI version4 +# MPI version5 +# ^ +# Note from Nilesh applies to 3 above +# these need to change... the /opt/ paths are mostly unique to our setup... other users might have UCX/OMPI at different paths +# the key is that UCX and OMPI should be a part of PATH and LD_LIBRARY_PATH -- first this needs to be checked, and if true, you can simply query ucx_info -v and mpirun --version +# also, we don't need both OMPI4 and OMPI5 check -- usually there's only one of these as part of the env. + +# Linux kernel version +# ulimit -a +# Environment Variable Config +# Rdma link info +# Query Numa balancing status + + +# Infiniband device info + +# ibstatus +# ibv_devices +# IB_devinfo +# ibstat +# AMDKFD (GPU Driver version) for this one just use DKMS status and put the remainder in the details section + + +# Network information + +# ip a + +# ip link + +# ip route + +# ACSinfo + +# rocminfo +# Another note from Nilesh +# rocminfo you need to parse three things -- no. of GPUs, GPU type (gfx___), and Compute Unit count +# -- we can then use this info to parse in the summary one line like "Found 8 MI300X GPUs" or "Found 8 MI308 GPUs" From 6f8d8cde4f4ce9d01beca3e4fc5718a6e9eb13b2 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Wed, 12 Feb 2025 18:46:23 -0600 Subject: [PATCH 06/26] Added function to center titles and Vram information --- tools/scripts/CS_conf.py | 58 +++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 86d39643f..23431432a 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -3,6 +3,13 @@ import os import re + +# Function to center the titles in the detailed section +def centered_title(title, width, fill_char=" "): + padding_width = (width - len(title)) // 2 + return f'{fill_char*padding_width}{title}{fill_char*padding_width}\n' + + # Function to run a CLI command and return its output def run_cli_command(command): try: @@ -40,11 +47,24 @@ def get_os_version(): def get_ROCm_version(): result = run_cli_command('cat /opt/rocm/.info/version') if result.stdout: - summary = result.stdout + summary = result.stdout.strip() else: summary = "Missing Data" return summary, result + +# Get Vram Version +def get_Vram_version(): + result = run_cli_command('rocm-smi --showmeminfo vram') + if result.stdout: + summary = "Memory Usage in Vram Information section" + else: + summary = "Missing Data" + return summary, result + + + +# Gather all data and build summary table and detailed output format def get_config(): # Run the commands and store the command outputs @@ -56,23 +76,36 @@ def get_config(): ROCm_summary, ROCm_result = get_ROCm_version() ROCm_status = status_check(ROCm_summary, ROCm_result) + # Vram info + vram_summary, vram_result = get_Vram_version() + vram_status = status_check(vram_summary, vram_result) + + + # Create the summary table summary_table = ( - f"\n\n{'='*60}\n" + f"\n\n{'='*80}\n" f"{'Component':<17}| {'Status':<13} | Value\n" - f"{'='*60}\n" + f"{'='*80}\n" f"OS Version{' ':<7}| {os_status:<13} | {os_summary}\n" f"ROCm Version{' ':<5}| {ROCm_status:<13} | {ROCm_summary}\n" - f"{'='*60}\n\n\n" + f"Vram Version{' ':<5}| {vram_status:<13} | {vram_summary}\n" + f"{'='*80}\n\n\n" ) + + # Combine details + details_width = 120 details = ( - f"{'='*30} OS info {'='*30}\n\n" - f"{os_result.stdout}{os_result.stderr}\n\n" - f"{'='*30} ROCm Version {'='*30}\n\n" - f"{ROCm_result.stdout}{ROCm_result.stderr}\n\n" + f"Detailed Output:\n" + f"{centered_title('OS info', details_width, '=')}\n" + f"{os_result.stdout.strip()}{os_result.stderr.strip()}\n\n" + f"{centered_title('ROCm Version', details_width, '=')}\n" + f"{ROCm_result.stdout.strip()}{ROCm_result.stderr.strip()}\n\n" + f"{centered_title('Vram Information', details_width, '=')}\n\n" + f"{vram_result.stdout.strip()}{vram_result.stderr.strip()}\n\n" ) return summary_table, details @@ -92,11 +125,12 @@ def main(): if __name__ == '__main__': main() - - - + + + # list of stuff to add -# ROCm version +# OS version done +# ROCm version done # GPU VRAM info # HIP version From 199dc22142aabdf2e8426680fdb6cb929d7f42d0 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Wed, 12 Feb 2025 19:22:39 -0600 Subject: [PATCH 07/26] Added HIP version --- tools/scripts/CS_conf.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 23431432a..729b80595 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -52,9 +52,17 @@ def get_ROCm_version(): summary = "Missing Data" return summary, result +# Get HIP Version +def get_HIP_version(): + result = run_cli_command('hipconfig --version') + if result.stdout: + summary = result.stdout.strip() + else: + summary = "Missing Data" + return summary, result -# Get Vram Version -def get_Vram_version(): +# Get Vram Information +def get_Vram_info(): result = run_cli_command('rocm-smi --showmeminfo vram') if result.stdout: summary = "Memory Usage in Vram Information section" @@ -64,6 +72,7 @@ def get_Vram_version(): + # Gather all data and build summary table and detailed output format def get_config(): # Run the commands and store the command outputs @@ -76,21 +85,24 @@ def get_config(): ROCm_summary, ROCm_result = get_ROCm_version() ROCm_status = status_check(ROCm_summary, ROCm_result) + # HIP Version + HIP_summary, HIP_result = get_HIP_version() + HIP_status = status_check(HIP_summary, HIP_result) + # Vram info - vram_summary, vram_result = get_Vram_version() + vram_summary, vram_result = get_Vram_info() vram_status = status_check(vram_summary, vram_result) - - # Create the summary table summary_table = ( f"\n\n{'='*80}\n" - f"{'Component':<17}| {'Status':<13} | Value\n" + f"{'Component':<20}| {'Status':<13} | Value\n" f"{'='*80}\n" - f"OS Version{' ':<7}| {os_status:<13} | {os_summary}\n" - f"ROCm Version{' ':<5}| {ROCm_status:<13} | {ROCm_summary}\n" - f"Vram Version{' ':<5}| {vram_status:<13} | {vram_summary}\n" + f"OS Version{' ':<10}| {os_status:<13} | {os_summary}\n" + f"ROCm Version{' ':<8}| {ROCm_status:<13} | {ROCm_summary}\n" + f"HIP Version{' ':<9}| {HIP_status:<13} | {HIP_summary}\n" + f"Vram Information{' ':<4}| {vram_status:<13} | {vram_summary}\n" f"{'='*80}\n\n\n" ) @@ -104,8 +116,10 @@ def get_config(): f"{os_result.stdout.strip()}{os_result.stderr.strip()}\n\n" f"{centered_title('ROCm Version', details_width, '=')}\n" f"{ROCm_result.stdout.strip()}{ROCm_result.stderr.strip()}\n\n" - f"{centered_title('Vram Information', details_width, '=')}\n\n" - f"{vram_result.stdout.strip()}{vram_result.stderr.strip()}\n\n" + f"{centered_title('HIP Version', details_width, '=')}\n" + f"{HIP_result.stdout.strip()}{HIP_result.stderr.strip()}\n\n" + f"{centered_title('Vram Information', details_width, '=')}\n" + f"{vram_result.stdout.strip()}{vram_result.stderr.strip()}\n\n" ) return summary_table, details From ba959683888c6caa7db64a6b79b3a15420961491 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Wed, 12 Feb 2025 19:30:29 -0600 Subject: [PATCH 08/26] Cleaned formatting --- tools/scripts/CS_conf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 729b80595..15d293b80 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -96,14 +96,14 @@ def get_config(): # Create the summary table summary_table = ( - f"\n\n{'='*80}\n" + f"\n\n{'='*119}\n" f"{'Component':<20}| {'Status':<13} | Value\n" - f"{'='*80}\n" + f"{'='*119}\n" f"OS Version{' ':<10}| {os_status:<13} | {os_summary}\n" f"ROCm Version{' ':<8}| {ROCm_status:<13} | {ROCm_summary}\n" f"HIP Version{' ':<9}| {HIP_status:<13} | {HIP_summary}\n" f"Vram Information{' ':<4}| {vram_status:<13} | {vram_summary}\n" - f"{'='*80}\n\n\n" + f"{'='*119}\n\n\n" ) @@ -145,8 +145,8 @@ def main(): # list of stuff to add # OS version done # ROCm version done -# GPU VRAM info -# HIP version +# GPU VRAM info done +# HIP version done # UCX version # MPI version4 From b0cc44ded92b28e38bc6f90be6728d631887a4b2 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Wed, 12 Feb 2025 22:12:54 -0600 Subject: [PATCH 09/26] UCX version and MPI version --- tools/scripts/CS_conf.py | 83 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 15d293b80..8d4d9e5e9 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -4,6 +4,11 @@ import re +class CommandResult: + def __init__(self, stdout, stderr): + self.stdout = stdout + self.stderr = stderr + # Function to center the titles in the detailed section def centered_title(title, width, fill_char=" "): padding_width = (width - len(title)) // 2 @@ -32,6 +37,20 @@ def status_check(summary, result): break return status +# Check if a directory is on path or LD_LIBRARY_PATH +def PATH_and_LD_LIBRARY_PATH(dir): + try: + path = os.environ.get('PATH') + LD_path = os.environ.get('LD_LIBRARY_PATH') + except Exception as e: + return False + pattern = re.escape(dir) + match_path = re.search(pattern, path) + match_LD_path = re.search(pattern, LD_path) + if match_LD_path and match_path: + return True + return False + # Get OS version def get_os_version(): @@ -70,6 +89,41 @@ def get_Vram_info(): summary = "Missing Data" return summary, result +# Get UCX version +def ucx_version(): + path_check = PATH_and_LD_LIBRARY_PATH(dir="ucx") + if path_check: + result = run_cli_command('ucx_info -v') + match = re.search(r"Library version: (\d+\.\d+\.\d+)", result.stdout) + if match: + summary = match.group(1) + else: + summary = "Missing Data" + return summary, result + else: + stdout = "" + stderr = "Error: UCX not on PATH or LD_LIBRARY_PATH" + result = CommandResult(stdout=stdout,stderr=stderr) + summary = "UCX not on PATH or LD_LIBRARY_PATH" + return summary, result + +# Get MPI version +def mpi_version(): + path_check = PATH_and_LD_LIBRARY_PATH(dir="ompi") + if path_check: + result = run_cli_command('mpirun --version') + match = re.search(r"mpirun \(Open MPI\) \d+\.\d+\.\d+", result.stdout) + if match: + summary = match.group() + else: + summary = "Missing Data" + return summary, result + else: + stdout = "" + stderr = "Error: ompi4 or ompi5 (only 1 is required) not on PATH or LD_LIBRARY_PATH" + result = CommandResult(stdout=stdout,stderr=stderr) + summary = "ompi4 or ompi5 (only 1 is required) not on PATH or LD_LIBRARY_PATH" + return summary, result @@ -90,8 +144,16 @@ def get_config(): HIP_status = status_check(HIP_summary, HIP_result) # Vram info - vram_summary, vram_result = get_Vram_info() - vram_status = status_check(vram_summary, vram_result) + Vram_summary, Vram_result = get_Vram_info() + Vram_status = status_check(Vram_summary, Vram_result) + + # UCX Version + ucx_summary, ucx_result = ucx_version() + ucx_status = status_check(ucx_summary, ucx_result) + + # MPI Version + mpi_summary, mpi_result = mpi_version() + mpi_status = status_check(mpi_summary, mpi_result) # Create the summary table @@ -102,7 +164,9 @@ def get_config(): f"OS Version{' ':<10}| {os_status:<13} | {os_summary}\n" f"ROCm Version{' ':<8}| {ROCm_status:<13} | {ROCm_summary}\n" f"HIP Version{' ':<9}| {HIP_status:<13} | {HIP_summary}\n" - f"Vram Information{' ':<4}| {vram_status:<13} | {vram_summary}\n" + f"Vram Information{' ':<4}| {Vram_status:<13} | {Vram_summary}\n" + f"UCX Version{' ':<9}| {ucx_status:<13} | {ucx_summary}\n" + f"MPI Version{' ':<9}| {mpi_status:<13} | {mpi_summary}\n" f"{'='*119}\n\n\n" ) @@ -119,7 +183,11 @@ def get_config(): f"{centered_title('HIP Version', details_width, '=')}\n" f"{HIP_result.stdout.strip()}{HIP_result.stderr.strip()}\n\n" f"{centered_title('Vram Information', details_width, '=')}\n" - f"{vram_result.stdout.strip()}{vram_result.stderr.strip()}\n\n" + f"{Vram_result.stdout.strip()}{Vram_result.stderr.strip()}\n\n" + f"{centered_title('UCX Version', details_width, '=')}\n" + f"{ucx_result.stdout.strip()}{ucx_result.stderr.strip()}\n\n" + f"{centered_title('MPI Version', details_width, '=')}\n" + f"{mpi_result.stdout.strip()}{mpi_result.stderr.strip()}\n\n" ) return summary_table, details @@ -148,9 +216,10 @@ def main(): # GPU VRAM info done # HIP version done -# UCX version -# MPI version4 -# MPI version5 +# PATH +# UCX version done +# MPI version4 done +# MPI version5 done # ^ # Note from Nilesh applies to 3 above # these need to change... the /opt/ paths are mostly unique to our setup... other users might have UCX/OMPI at different paths From ae81ab649684368c6704a40b4f2347cc56305846 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Wed, 12 Feb 2025 23:23:24 -0600 Subject: [PATCH 10/26] Added NUMA balancing --- tools/scripts/CS_conf.py | 106 +++++++++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 14 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 8d4d9e5e9..57fa39b77 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -8,7 +8,7 @@ class CommandResult: def __init__(self, stdout, stderr): self.stdout = stdout self.stderr = stderr - + # Function to center the titles in the detailed section def centered_title(title, width, fill_char=" "): padding_width = (width - len(title)) // 2 @@ -84,7 +84,7 @@ def get_HIP_version(): def get_Vram_info(): result = run_cli_command('rocm-smi --showmeminfo vram') if result.stdout: - summary = "Memory Usage in Vram Information section" + summary = "Memory Usage is detailed in the Vram Information section" else: summary = "Missing Data" return summary, result @@ -125,6 +125,50 @@ def mpi_version(): summary = "ompi4 or ompi5 (only 1 is required) not on PATH or LD_LIBRARY_PATH" return summary, result +# Get Linux kernel version +def get_Linux_kernel_version(): + result = run_cli_command('uname -r') + if result.stdout: + summary = result.stdout.strip() + else: + summary = "Missing Data" + return summary, result + +# Get Resource limits +def get_resource_limits_info(): + result = run_cli_command('ulimit -a') + if result.stdout: + summary = "Output is detailed in the Resource limits section" + else: + summary = "Missing Data" + return summary, result + +# Get Environment config +def get_Environment_config_info(): + result = run_cli_command('env') + if result.stdout: + summary = "Output is detailed in the Environment Config section" + else: + summary = "Missing Data" + return summary, result + +# Get Rdma link info +def get_rdma_link_info(): + result = run_cli_command('rdma link') + if result.stdout: + summary = "Output is detailed in the rdma link section" + else: + summary = "Missing Data" + return summary, result + +# Get NUMA Balancing +def get_NUMA_balancing_info(): + result = run_cli_command('cat /proc/sys/kernel/numa_balancing') + if result.stdout: + summary = result.stdout + else: + summary = "Missing Data" + return summary, result # Gather all data and build summary table and detailed output format @@ -155,18 +199,42 @@ def get_config(): mpi_summary, mpi_result = mpi_version() mpi_status = status_check(mpi_summary, mpi_result) + # Linux kernel version + Lkv_summary, Lkv_result = get_Linux_kernel_version() + Lkv_status = status_check(Lkv_summary, Lkv_result) + + # Resource limits + rlv_summary, rlv_result = get_resource_limits_info() + rlv_status = status_check(rlv_summary, rlv_result) + + # Environment config + env_summary, env_result = get_Environment_config_info() + env_status = status_check(env_summary, env_result) + + # Rdma link info + rdl_summary, rdl_result = get_rdma_link_info() + rdl_status = status_check(rdl_summary, rdl_result) + + # NUMA Balancing info + nb_summary, nb_result = get_NUMA_balancing_info() + nb_status = status_check(nb_summary, nb_result) # Create the summary table summary_table = ( f"\n\n{'='*119}\n" - f"{'Component':<20}| {'Status':<13} | Value\n" + f"{'Component':<30}| {'Status':<13} | Value\n" f"{'='*119}\n" - f"OS Version{' ':<10}| {os_status:<13} | {os_summary}\n" - f"ROCm Version{' ':<8}| {ROCm_status:<13} | {ROCm_summary}\n" - f"HIP Version{' ':<9}| {HIP_status:<13} | {HIP_summary}\n" - f"Vram Information{' ':<4}| {Vram_status:<13} | {Vram_summary}\n" - f"UCX Version{' ':<9}| {ucx_status:<13} | {ucx_summary}\n" - f"MPI Version{' ':<9}| {mpi_status:<13} | {mpi_summary}\n" + f"OS Version{' ':<20}| {os_status:<13} | {os_summary}\n" + f"ROCm Version{' ':<18}| {ROCm_status:<13} | {ROCm_summary}\n" + f"HIP Version{' ':<19}| {HIP_status:<13} | {HIP_summary}\n" + f"Vram Information{' ':<14}| {Vram_status:<13} | {Vram_summary}\n" + f"UCX Version{' ':<19}| {ucx_status:<13} | {ucx_summary}\n" + f"MPI Version{' ':<19}| {mpi_status:<13} | {mpi_summary}\n" + f"Linux Kernel Version{' ':<10}| {Lkv_status:<13} | {Lkv_summary}\n" + f"Resource limits{' ':<15}| {rlv_status:<13} | {rlv_summary}\n" + f"Environment Configuration{' ':<5}| {env_status:<13} | {env_summary}\n" + f"RDMA Link Information{' ':<9}| {rdl_status:<13} | {rdl_summary}\n" + f"NUMA Balancing Information{' ':<4}| {nb_status:<13} | {nb_summary}\n" f"{'='*119}\n\n\n" ) @@ -188,6 +256,16 @@ def get_config(): f"{ucx_result.stdout.strip()}{ucx_result.stderr.strip()}\n\n" f"{centered_title('MPI Version', details_width, '=')}\n" f"{mpi_result.stdout.strip()}{mpi_result.stderr.strip()}\n\n" + f"{centered_title('Linux Kernel Version', details_width, '=')}\n" + f"{Lkv_result.stdout.strip()}{Lkv_result.stderr.strip()}\n\n" + f"{centered_title('Resource limits', details_width, '=')}\n" + f"{rlv_result.stdout.strip()}{rlv_result.stderr.strip()}\n\n" + f"{centered_title('Environment Configuration', details_width, '=')}\n" + f"{env_result.stdout.strip()}{env_result.stderr.strip()}\n\n" + f"{centered_title('RDMA Link Information', details_width, '=')}\n" + f"{rdl_result.stdout.strip()}{rdl_result.stderr.strip()}\n\n" + f"{centered_title('NUMA Balancing Information', details_width, '=')}\n" + f"{nb_result.stdout.strip()}{nb_result.stderr.strip()}\n\n" ) return summary_table, details @@ -226,11 +304,11 @@ def main(): # the key is that UCX and OMPI should be a part of PATH and LD_LIBRARY_PATH -- first this needs to be checked, and if true, you can simply query ucx_info -v and mpirun --version # also, we don't need both OMPI4 and OMPI5 check -- usually there's only one of these as part of the env. -# Linux kernel version -# ulimit -a -# Environment Variable Config -# Rdma link info -# Query Numa balancing status +# Linux kernel version done +# ulimit -a done +# Environment Variable Config done +# Rdma link info done +# Query Numa balancing status done # Infiniband device info From da0caa1c9d773d8ebcf03167b4506c749472806d Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Thu, 13 Feb 2025 02:28:34 -0600 Subject: [PATCH 11/26] Added rocminfo --- tools/scripts/CS_conf.py | 240 +++++++++++++++++++++++++++++++++++---- 1 file changed, 216 insertions(+), 24 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 57fa39b77..c0ad0e1ee 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -28,7 +28,7 @@ def status_check(summary, result): # List of errors to check error_list = [r'No such file or directory', r'Command not found', r'Permission denied', r'cannot access', r'error'] status = "OK" - if summary == "Missing Data": + if summary == "Unable to detect": status = "WARN" for error in error_list: match = re.search(error, result.stderr, re.IGNORECASE) @@ -59,7 +59,7 @@ def get_os_version(): if match: summary = match.group(1) else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result # Get ROCm Version @@ -68,7 +68,7 @@ def get_ROCm_version(): if result.stdout: summary = result.stdout.strip() else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result # Get HIP Version @@ -77,7 +77,7 @@ def get_HIP_version(): if result.stdout: summary = result.stdout.strip() else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result # Get Vram Information @@ -86,7 +86,7 @@ def get_Vram_info(): if result.stdout: summary = "Memory Usage is detailed in the Vram Information section" else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result # Get UCX version @@ -98,7 +98,7 @@ def ucx_version(): if match: summary = match.group(1) else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result else: stdout = "" @@ -116,7 +116,7 @@ def mpi_version(): if match: summary = match.group() else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result else: stdout = "" @@ -131,7 +131,7 @@ def get_Linux_kernel_version(): if result.stdout: summary = result.stdout.strip() else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result # Get Resource limits @@ -140,7 +140,7 @@ def get_resource_limits_info(): if result.stdout: summary = "Output is detailed in the Resource limits section" else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result # Get Environment config @@ -149,7 +149,7 @@ def get_Environment_config_info(): if result.stdout: summary = "Output is detailed in the Environment Config section" else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result # Get Rdma link info @@ -158,16 +158,125 @@ def get_rdma_link_info(): if result.stdout: summary = "Output is detailed in the rdma link section" else: - summary = "Missing Data" + summary = "Unable to detect" return summary, result # Get NUMA Balancing def get_NUMA_balancing_info(): result = run_cli_command('cat /proc/sys/kernel/numa_balancing') if result.stdout: - summary = result.stdout + summary = result.stdout.strip() + else: + summary = "Unable to detect" + return summary, result + +# Get IB status ########################## UPDATE to NIC STATUS in summary and include data in value +def get_ib_status(): + result = run_cli_command('ibstatus') + if result.stdout: + summary = "Output is detailed in the IBstatus section" + else: + summary = "Unable to detect" + return summary, result + +# Get Device GUIDs +def get_device_GUIDs(): + result = run_cli_command('ibv_devices') + if result.stdout: + summary = "Output is detailed in the IBdevices section" + else: + summary = "Unable to detect" + return summary, result + +# Get IB device info +def get_ib_devinfo(): + result = run_cli_command('ibv_devinfo') + if result.stdout: + summary = "Output is detailed in the IBdevinfo section" + else: + summary = "Unable to detect" + return summary, result + +# Get IBstat info +def get_ibstat(): + result = run_cli_command('ibstat') + if result.stdout: + summary = "Output is detailed in the IBstat section" else: - summary = "Missing Data" + summary = "Unable to detect" + return summary, result + +# Get AMDKFD (GPU Driver version) +def get_gpu_driver(): + result = run_cli_command('dkms status | grep "amdgpu"') + if result.stdout: + pattern = r"^.*amdgpu.*$" + matching_lines = re.findall(pattern, result.stdout, flags=re.MULTILINE) + summary = matching_lines[0] + ", WARN = maybe >1 driver check below" + else: + summary = "Unable to detect" + return summary, result + +# Get DKMS module info +def get_dkms_status(): + result = run_cli_command('dkms status') + if result.stdout: + summary = "DKMS information is detailed in the DKMS Status section" + else: + summary = "Unable to detect" + return summary, result + +# Get IP A +def get_IP_addr(): + result = run_cli_command('ip a') + if result.stdout: + summary = "IP address information is detailed in the IP Addr section" + else: + summary = "Unable to detect" + return summary, result + +# Get IP Link +def get_IP_link(): + result = run_cli_command('ip link') + if result.stdout: + summary = "IP link information is detailed in the IP Link section" + else: + summary = "Unable to detect" + return summary, result + +# Get IP route +def get_IP_route(): + result = run_cli_command('ip route') + if result.stdout: + summary = "IP Route information is detailed in the IP Route section" + else: + summary = "Unable to detect" + return summary, result + +# Get ACS info ####################### no output for this command ask about it +def get_acs_info(): + result = run_cli_command('lspci -vvv | grep ACSCtl') + if result.stdout: + summary = "ACS information is detailed in the ACS section" + else: + summary = "Unable to detect" + return summary, result + +# Get rocminfo ################# Ask how to differentiate which GPUs are which by CU and name +def get_rocminfo(): + result = run_cli_command('rocminfo') + if result.stdout: + gpu_names = [] + compute_units = [] + gpu_pattern = re.compile(r"Name:\s+(gfx\d+).*?Compute Unit:\s+(\d+)", re.DOTALL) + matches = gpu_pattern.findall(result.stdout) + for match in matches: + gpu_names.append(match[0]) + compute_units.append(int(match[1])) + num_gpus = len(gpu_names) + summary = f"Found {num_gpus} GPUs" + else: + summary = "Unable to detect" return summary, result @@ -214,11 +323,61 @@ def get_config(): # Rdma link info rdl_summary, rdl_result = get_rdma_link_info() rdl_status = status_check(rdl_summary, rdl_result) - + # NUMA Balancing info nb_summary, nb_result = get_NUMA_balancing_info() nb_status = status_check(nb_summary, nb_result) + # IB status info + ibs_summary, ibs_result = get_ib_status() + ibs_status = status_check(ibs_summary, ibs_result) + + # Device GUIDs + GUIDs_summary, GUIDs_result = get_device_GUIDs() + GUIDs_status = status_check(GUIDs_summary, GUIDs_result) + + # IB device info + ib_dev_summary, ib_dev_result = get_ib_devinfo() + ib_dev_status = status_check(ib_dev_summary, ib_dev_result) + + # IBstat info + ib_stat_summary, ib_stat_result = get_ibstat() + ib_stat_status = status_check(ib_stat_summary, ib_stat_result) + + # AMD GPU driver version + GPU_driver_summary, GPU_driver_result = get_gpu_driver() + pattern = r"^.*amdgpu.*$" + matching_lines = re.findall(pattern, GPU_driver_result.stdout, flags=re.MULTILINE) + if len(matching_lines) > 1: + GPU_driver_status = "WARN" + else: + GPU_driver_status = status_check(GPU_driver_summary, GPU_driver_result) + + # DKMS module info + dkms_summary, dkms_result = get_dkms_status() + dkms_status = status_check(dkms_summary, dkms_result) + + # IP addr info + ip_addr_summary, ip_addr_result = get_IP_addr() + ip_addr_status = status_check(ip_addr_summary, ip_addr_result) + + # IP link info + ip_link_summary, ip_link_result = get_IP_link() + ip_link_status = status_check(ip_link_summary, ip_link_result) + + # IP route info + ip_route_summary, ip_route_result = get_IP_route() + ip_route_status = status_check(ip_route_summary, ip_route_result) + + # IP ACS info + acs_summary, acs_result = get_acs_info() + acs_status = status_check(acs_summary, acs_result) + + # ROCM info + rocm_info_summary, rocm_info_result = get_rocminfo() + rocm_info_status = status_check(rocm_info_summary, rocm_info_result) + + # Create the summary table summary_table = ( f"\n\n{'='*119}\n" @@ -235,6 +394,17 @@ def get_config(): f"Environment Configuration{' ':<5}| {env_status:<13} | {env_summary}\n" f"RDMA Link Information{' ':<9}| {rdl_status:<13} | {rdl_summary}\n" f"NUMA Balancing Information{' ':<4}| {nb_status:<13} | {nb_summary}\n" + f"IBstatus Information{' ':<10}| {ibs_status:<13} | {ibs_summary}\n" + f"Device GUIDs Information{' ':<6}| {GUIDs_status:<13} | {GUIDs_summary}\n" + f"IB device Information{' ':<9}| {ib_dev_status:<13} | {ib_dev_summary}\n" + f"IBstat Information{' ':<12}| {ib_stat_status:<13} | {ib_stat_summary}\n" + f"AMD GPU driver version{' ':<8}| {GPU_driver_status:<13} | {GPU_driver_summary}\n" + f"DKMS Module Information{' ':<7}| {dkms_status:<13} | {dkms_summary}\n" + f"IP Address Information{' ':<8}| {ip_addr_status:<13} | {ip_addr_summary}\n" + f"IP Link Information{' ':<11}| {ip_link_status:<13} | {ip_link_summary}\n" + f"IP Route Information{' ':<10}| {ip_route_status:<13} | {ip_route_summary}\n" + f"ACS Disabled{' ':<18}| {acs_status:<13} | {acs_summary}\n" + f"Node Status{' ':<19}| {rocm_info_status:<13} | {rocm_info_summary}\n" f"{'='*119}\n\n\n" ) @@ -266,6 +436,28 @@ def get_config(): f"{rdl_result.stdout.strip()}{rdl_result.stderr.strip()}\n\n" f"{centered_title('NUMA Balancing Information', details_width, '=')}\n" f"{nb_result.stdout.strip()}{nb_result.stderr.strip()}\n\n" + f"{centered_title('IBstatus Information', details_width, '=')}\n" + f"{ibs_result.stdout.strip()}{ibs_result.stderr.strip()}\n\n" + f"{centered_title('IBdevices', details_width, '=')}\n" + f"{GUIDs_result.stdout.strip()}{GUIDs_result.stderr.strip()}\n\n" + f"{centered_title('IBdevinfo', details_width, '=')}\n" + f"{ib_dev_result.stdout.strip()}{ib_dev_result.stderr.strip()}\n\n" + f"{centered_title('IBstat', details_width, '=')}\n" + f"{ib_stat_result.stdout.strip()}{ib_stat_result.stderr.strip()}\n\n" + f"{centered_title('GPU Driver Version', details_width, '=')}\n" + f"{GPU_driver_result.stdout.strip()}{GPU_driver_result.stderr.strip()}\n\n" + f"{centered_title('DKMS Status', details_width, '=')}\n" + f"{dkms_result.stdout.strip()}{dkms_result.stderr.strip()}\n\n" + f"{centered_title('IP Addr', details_width, '=')}\n" + f"{ip_addr_result.stdout.strip()}{ip_addr_result.stderr.strip()}\n\n" + f"{centered_title('IP Link', details_width, '=')}\n" + f"{ip_link_result.stdout.strip()}{ip_link_result.stderr.strip()}\n\n" + f"{centered_title('IP Route', details_width, '=')}\n" + f"{ip_route_result.stdout.strip()}{ip_route_result.stderr.strip()}\n\n" + f"{centered_title('ACS', details_width, '=')}\n" + f"{acs_result.stdout.strip()}{acs_result.stderr.strip()}\n\n" + f"{centered_title('ROCm Information', details_width, '=')}\n" + f"{rocm_info_result.stdout.strip()}{rocm_info_result.stderr.strip()}\n\n" ) return summary_table, details @@ -311,24 +503,24 @@ def main(): # Query Numa balancing status done -# Infiniband device info +# Network Interface Controller (NIC) info -# ibstatus -# ibv_devices -# IB_devinfo -# ibstat -# AMDKFD (GPU Driver version) for this one just use DKMS status and put the remainder in the details section +# ibstatus done +# ibv_devices done +# IB_devinfo done +# ibstat done +# AMDKFD (GPU Driver version) for this one just use DKMS status and put the remainder in the details section done # Network information -# ip a +# ip a done -# ip link +# ip link done -# ip route +# ip route done -# ACSinfo +# ACSinfo done # rocminfo # Another note from Nilesh From 4f303ea31bc6d6d252f9f7acd4e6a4ca0000c907 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Thu, 13 Feb 2025 03:00:20 -0600 Subject: [PATCH 12/26] Removed notes --- tools/scripts/CS_conf.py | 67 +++++++++------------------------------- 1 file changed, 15 insertions(+), 52 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index c0ad0e1ee..9b1e5ec95 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -42,6 +42,9 @@ def PATH_and_LD_LIBRARY_PATH(dir): try: path = os.environ.get('PATH') LD_path = os.environ.get('LD_LIBRARY_PATH') + pattern = re.escape(dir) + match_path = re.search(pattern, path) + match_LD_path = re.search(pattern, LD_path) except Exception as e: return False pattern = re.escape(dir) @@ -170,11 +173,19 @@ def get_NUMA_balancing_info(): summary = "Unable to detect" return summary, result -# Get IB status ########################## UPDATE to NIC STATUS in summary and include data in value +# Get IB status ########################## Ask what the output of Broadcom nic looks like and if it will work with this regex def get_ib_status(): result = run_cli_command('ibstatus') if result.stdout: - summary = "Output is detailed in the IBstatus section" + pattern = r"Infiniband device '[^']+' port \d+ status:\s+default gid:\s+[^ ]+\s+base lid:\s+[^ ]+\s+sm lid:\s+[^ ]+\s+state:\s+\d+: ACTIVE\s+phys state:\s+\d+: LinkUp\s+rate:\s+(\d+) Gb/sec \([^)]+\)\s+link_layer:\s+InfiniBand" + matches = re.findall(pattern, result.stdout) + num_ib_devices = len(matches) + rate_same = all(x == matches[0] for x in matches) + if rate_same: + summary = f"Detected {num_ib_devices} active IB devices running at {matches[0]} Gb/sec" + else: + summary = f"Detected {num_ib_devices} active IB devices running at various rates the peak being {max(matches)} Gb/sec" + else: summary = "Unable to detect" return summary, result @@ -394,7 +405,7 @@ def get_config(): f"Environment Configuration{' ':<5}| {env_status:<13} | {env_summary}\n" f"RDMA Link Information{' ':<9}| {rdl_status:<13} | {rdl_summary}\n" f"NUMA Balancing Information{' ':<4}| {nb_status:<13} | {nb_summary}\n" - f"IBstatus Information{' ':<10}| {ibs_status:<13} | {ibs_summary}\n" + f"NIC Status{' ':<20}| {ibs_status:<13} | {ibs_summary}\n" f"Device GUIDs Information{' ':<6}| {GUIDs_status:<13} | {GUIDs_summary}\n" f"IB device Information{' ':<9}| {ib_dev_status:<13} | {ib_dev_summary}\n" f"IBstat Information{' ':<12}| {ib_stat_status:<13} | {ib_stat_summary}\n" @@ -436,7 +447,7 @@ def get_config(): f"{rdl_result.stdout.strip()}{rdl_result.stderr.strip()}\n\n" f"{centered_title('NUMA Balancing Information', details_width, '=')}\n" f"{nb_result.stdout.strip()}{nb_result.stderr.strip()}\n\n" - f"{centered_title('IBstatus Information', details_width, '=')}\n" + f"{centered_title('Network Interface Controller (NIC) Information', details_width, '=')}\n" f"{ibs_result.stdout.strip()}{ibs_result.stderr.strip()}\n\n" f"{centered_title('IBdevices', details_width, '=')}\n" f"{GUIDs_result.stdout.strip()}{GUIDs_result.stderr.strip()}\n\n" @@ -478,51 +489,3 @@ def main(): if __name__ == '__main__': main() - - -# list of stuff to add -# OS version done -# ROCm version done -# GPU VRAM info done -# HIP version done - -# PATH -# UCX version done -# MPI version4 done -# MPI version5 done -# ^ -# Note from Nilesh applies to 3 above -# these need to change... the /opt/ paths are mostly unique to our setup... other users might have UCX/OMPI at different paths -# the key is that UCX and OMPI should be a part of PATH and LD_LIBRARY_PATH -- first this needs to be checked, and if true, you can simply query ucx_info -v and mpirun --version -# also, we don't need both OMPI4 and OMPI5 check -- usually there's only one of these as part of the env. - -# Linux kernel version done -# ulimit -a done -# Environment Variable Config done -# Rdma link info done -# Query Numa balancing status done - - -# Network Interface Controller (NIC) info - -# ibstatus done -# ibv_devices done -# IB_devinfo done -# ibstat done -# AMDKFD (GPU Driver version) for this one just use DKMS status and put the remainder in the details section done - - -# Network information - -# ip a done - -# ip link done - -# ip route done - -# ACSinfo done - -# rocminfo -# Another note from Nilesh -# rocminfo you need to parse three things -- no. of GPUs, GPU type (gfx___), and Compute Unit count -# -- we can then use this info to parse in the summary one line like "Found 8 MI300X GPUs" or "Found 8 MI308 GPUs" From 400df8ebf7dceb917ac6741c3255a3183d2dff84 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Thu, 13 Feb 2025 11:51:58 -0600 Subject: [PATCH 13/26] Changed regex for broadcom Nic --- tools/scripts/CS_conf.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 9b1e5ec95..5246ab1c1 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -173,13 +173,16 @@ def get_NUMA_balancing_info(): summary = "Unable to detect" return summary, result -# Get IB status ########################## Ask what the output of Broadcom nic looks like and if it will work with this regex +# Get IB status def get_ib_status(): result = run_cli_command('ibstatus') if result.stdout: - pattern = r"Infiniband device '[^']+' port \d+ status:\s+default gid:\s+[^ ]+\s+base lid:\s+[^ ]+\s+sm lid:\s+[^ ]+\s+state:\s+\d+: ACTIVE\s+phys state:\s+\d+: LinkUp\s+rate:\s+(\d+) Gb/sec \([^)]+\)\s+link_layer:\s+InfiniBand" + pattern = r"Infiniband device '[^']+' port \d+ status:\s+default gid:\s+[^ ]+\s+base lid:\s+[^ ]+\s+sm lid:\s+[^ ]+\s+state:\s+\d+: ACTIVE\s+phys state:\s+\d+: LinkUp\s+rate:\s+(\d+) Gb/sec \([^)]+\)\s+link_layer:\s+" matches = re.findall(pattern, result.stdout) num_ib_devices = len(matches) + if num_ib_devices == 0: + summary = f"Detected {num_ib_devices} active IB devices running" + return summary, result rate_same = all(x == matches[0] for x in matches) if rate_same: summary = f"Detected {num_ib_devices} active IB devices running at {matches[0]} Gb/sec" @@ -223,6 +226,9 @@ def get_gpu_driver(): if result.stdout: pattern = r"^.*amdgpu.*$" matching_lines = re.findall(pattern, result.stdout, flags=re.MULTILINE) + if len(matching_lines) == 0: + summary ="No gpu driver detected" + return summary, result summary = matching_lines[0] + ", WARN = maybe >1 driver check below" else: summary = "Unable to detect" @@ -266,9 +272,14 @@ def get_IP_route(): # Get ACS info ####################### no output for this command ask about it def get_acs_info(): - result = run_cli_command('lspci -vvv | grep ACSCtl') + result = run_cli_command('sudo lspci -vvv | grep ACSCtl') if result.stdout: - summary = "ACS information is detailed in the ACS section" + pattern = r"SrcValid\+" + matches = re.findall(pattern, result.stdout) + if len(matches) != 0: + summary = "ACS has not been disabled" + else: + summary= "ACS has been disabled" else: summary = "Unable to detect" return summary, result From 5e9d4f8315ef1b545310d3445e9bd54b984be351 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Thu, 13 Feb 2025 18:19:19 -0600 Subject: [PATCH 14/26] Removed note by the ACS info --- tools/scripts/CS_conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 5246ab1c1..5f6f431be 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -270,7 +270,7 @@ def get_IP_route(): summary = "Unable to detect" return summary, result -# Get ACS info ####################### no output for this command ask about it +# Get ACS info def get_acs_info(): result = run_cli_command('sudo lspci -vvv | grep ACSCtl') if result.stdout: From 07e98fc84d4d86dd34a433da9460a4d53d211558 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Thu, 13 Feb 2025 18:50:10 -0600 Subject: [PATCH 15/26] Added Hostname to summary and details --- tools/scripts/CS_conf.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 5f6f431be..f53841f57 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -54,6 +54,14 @@ def PATH_and_LD_LIBRARY_PATH(dir): return True return False +# Get hostname +def get_hostname(): + result = run_cli_command('hostname') + if result.stdout: + summary = result.stdout.strip() + else: + summary = "Unable to detect" + return summary, result # Get OS version def get_os_version(): @@ -306,6 +314,11 @@ def get_rocminfo(): def get_config(): # Run the commands and store the command outputs + + # Hostname + hostname_summary, hostname_result = get_hostname() + hostname_status = status_check(hostname_summary, hostname_result) + # OS version os_summary, os_result = get_os_version() os_status = status_check(os_summary, os_result) @@ -405,6 +418,7 @@ def get_config(): f"\n\n{'='*119}\n" f"{'Component':<30}| {'Status':<13} | Value\n" f"{'='*119}\n" + f"Host Name{' ':<21}| {hostname_status:<13} | {hostname_summary}\n" f"OS Version{' ':<20}| {os_status:<13} | {os_summary}\n" f"ROCm Version{' ':<18}| {ROCm_status:<13} | {ROCm_summary}\n" f"HIP Version{' ':<19}| {HIP_status:<13} | {HIP_summary}\n" @@ -436,6 +450,8 @@ def get_config(): details_width = 120 details = ( f"Detailed Output:\n" + f"{centered_title('Host Name', details_width, '=')}\n" + f"{hostname_result.stdout.strip()}{hostname_result.stderr.strip()}\n\n" f"{centered_title('OS info', details_width, '=')}\n" f"{os_result.stdout.strip()}{os_result.stderr.strip()}\n\n" f"{centered_title('ROCm Version', details_width, '=')}\n" From b383f6cea9088fc98d223ef2586b21aaaf30e078 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Thu, 13 Feb 2025 19:04:07 -0600 Subject: [PATCH 16/26] Print summary to terminal --- tools/scripts/CS_conf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index f53841f57..969ce670f 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -507,7 +507,9 @@ def main(): summary_table, details = get_config() - + # Print summary out to cli + print(summary_table) + # Write the summary table and details to the output file with open(file_name, "w") as file: file.write(summary_table) From 4889d11c01b719ff7760025bb6b8c00d4ad8d1bc Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Thu, 13 Feb 2025 20:57:27 -0600 Subject: [PATCH 17/26] Added argparse --- tools/scripts/CS_conf.py | 34 ++++++ tools/scripts/CS_conf.sh | 253 --------------------------------------- 2 files changed, 34 insertions(+), 253 deletions(-) delete mode 100755 tools/scripts/CS_conf.sh diff --git a/tools/scripts/CS_conf.py b/tools/scripts/CS_conf.py index 969ce670f..47fcd5072 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/CS_conf.py @@ -1,7 +1,26 @@ +## README + + + + + + + + + + + + + + + + + import subprocess import time import os import re +import argparse class CommandResult: @@ -9,6 +28,20 @@ def __init__(self, stdout, stderr): self.stdout = stdout self.stderr = stderr +# Function to Parse arguements + +def parse_arguments(): + parser = argparse.ArgumentParser(description='') + + # Add option flags + parser.add_argument('--ACS', help='Check for ACS status (requires root access):', required=False) + + return parser.parse_args() + + + + + # Function to center the titles in the detailed section def centered_title(title, width, fill_char=" "): padding_width = (width - len(title)) // 2 @@ -501,6 +534,7 @@ def get_config(): def main(): + args = parse_arguments() hostname = os.uname().nodename timestamp = time.strftime("%Y%m%d_%H%M%S") file_name = f"config.{hostname}.{timestamp}.txt" diff --git a/tools/scripts/CS_conf.sh b/tools/scripts/CS_conf.sh deleted file mode 100755 index d960570e9..000000000 --- a/tools/scripts/CS_conf.sh +++ /dev/null @@ -1,253 +0,0 @@ -#!/bin/bash - - -##################################################### README ##################################################### - -# This script only requires 1 input arguement, it is the path to the rccl-tests repo. EX: /path/to/rccl-tests/ - -# Things to check before running this script, if things from the below list it is fine the script will still continue to -# run but it may produce an error on the missing parts. - -# 1. UCX bin folder is on path or in default location under opt -# 2. OMPI bin folder is on path or in default location under opt -# 3. RCCL is either built and added to path or default loction is on path -# 4. RCCL-Tests are built -# 5. rocm-smi is on path -# 6. hipconfig is on path -# 7. rocminfo is on path -# 8. ibstatus is on path -# 9. ibv_devices is on path -# 10. ibv_devinfo is on path -# 11. ibstat device GUIDs - -# All output will be in a folder called conf-script-output that will be created in the same directory as the script - -##################################################### end README ##################################################### - - - -##################################################### define necessary functions ##################################################### - -function try() { - local func_name="$1" - local func_call="$2" - local file_output="${func_name}_output.txt" - local output - - # Shift to get past the label of the call/output file - shift - # Shift the arguments to pass the remaining ones to the called function - shift - - # Run the command/function and capture its output - output="$("${func_call}" "$@" 2>&1)" - local exit_status=$? - - # Check if the command/function succeeded or failed - if [ $exit_status -ne 0 ]; then - catch "${func_call}" "${output}" "${func_name}" - else - echo "${output}" > "conf-script-output/${file_output}" - echo "${func_name} was successful. Output saved to ${file_output}" - fi -} - -function catch() { - local func_call="$1" - local error_message="$2" - local func_name="$3" - echo "An error occurred during ${func_call}" - echo "${error_message}" - echo "in step ${func_name}" -} - -# Function to get version info about ROCm -function rocmver() -{ - # Store the output of rocminfo in a variable - rocminfo_output=$(rocminfo) - - # Grep the variable content for lines containing 'version' - version_info=$(echo "$rocminfo_output" | grep -i "version") - echo "$version_info" -} - - -# Function to get AMD GPU driver version -function amdgpuver() -{ - - # Store the output of dkms in a variable - dkms_output=$(dkms status) - - # Grep the variable content for lines containing 'amdgpu' - amdgpu=$(echo "$dkms_output" | grep "amdgpu") - echo "$amdgpu" - -} - -# Function to Query ACS -function ACSinfo() -{ - - # Store the output of lspci in a variable - lspci_output=$(lspci -vvv) - - # Grep the variable content for lines containing 'ACSCtl' - acs=$(echo "$lspci_output" | grep ACSCtl) - echo "$acs" - -} - -# Function to get rccl and rccl-tests version -run_rccl_tests() { - - local rccl_tests_dir="$1" - - # Get rccl-tests branch and version information - local rccl_tests_branch=$(git -C "${rccl_tests_dir}" rev-parse --abbrev-ref HEAD) - local rccl_tests_version=$(git -C "${rccl_tests_dir}" log -1 --format="%H") - - # Set the flag to display RCCL version during the run - export NCCL_DEBUG=VERSION - - - # Run the rccl-tests - # Replace this line with the actual command to run rccl-tests in your environment - - local output_file="rccl_tests_output.txt" - - $1/build/all_reduce_perf -b 8 -e 16M -f 8 -g 2 > "${output_file}" - - # Unset the flag after execution - unset NCCL_DEBUG - - # Extract RCCL, HIP, and ROCm versions from the output file - local rccl_version=$(grep "RCCL version" "${output_file}" | awk '{print $4}') - local hip_version=$(grep "HIP version" "${output_file}" | awk -F ': ' '{print $2}') - local rocm_version=$(grep "ROCm version" "${output_file}" | awk -F ': ' '{print $2}') - - - # Display extracted version information - echo "RCCL Version: ${rccl_version}" - echo "HIP Version: ${hip_version}" - echo "ROCm Version: ${rocm_version}" - - # Display rccl-tests branch and version information - echo "RCCL-Tests Branch: ${rccl_tests_branch}" - echo "RCCL-Tests Version: ${rccl_tests_version}" -} - -##################################################### end define necessary functions ##################################################### - - - -##################################################### setup output folder ##################################################### - -mkdir conf-script-output - -##################################################### end setup output folder ##################################################### - - - -##################################################### query system with functions and commands for config info ##################################################### - -# ROCm version -try "ROCm_version" rocmver -echo "" - -# GPU VRAM info -try "VRAM_info" rocm-smi --showmeminfo vram -echo "" - -# HIP version -try "hip_version" hipconfig --version -echo "" - -# RCCL version and RCCL tests version -try "RCCL_and_RCCL_tests_version" run_rccl_tests $1 -echo "" - -# UCX version -try "UCX_version" /opt/ucx/bin/ucx_info -v -echo "" - -# MPI version4 -try "MPI_version4" /opt/ompi4/bin/mpirun --version # the exact path might need to be removed in the context of debug -echo "" - -# MPI version4 -try "MPI_version5" /opt/ompi5/bin/mpirun --version -echo "" - -# OS version -try "OS_version" cat /etc/os-release -echo "" - -# Linux kernel version -try "Linux_Kernel_version" uname -r -echo "" - -# ulimit -a -try "System_resource_allocation" ulimit -a -echo "" - -# Environment Variable Config -try "Environment_Variable_Config" env -echo "" - -# Rdma link info -try "rdma_link" rdma link -echo "" - -# Query Numa balancing status -Try "Numa_Balancing" cat /proc/sys/kernel/numa_balancing -echo "" - - - -# Infiniband device info -# IB device status -try "IB_device_status" ibstatus -echo "" - -# IB device GUIDs -try "IB_devices" ibv_devices -echo "" - -# IB device info -try "IB_devinfo" ibv_devinfo -echo "" - -# IB device status alternate -try "IB_stat" ibstat -echo "" - -# DKMS module info -try "dkms_status" dkms status -echo "" - -# AMDKFD (GPU Driver version) -try "GPU_Driver_Version" amdgpuver -echo "" - - - -# Network information -# IP addresses -try "IP_address_info" ip a -echo "" - -# Network Interface state -try "IP_link_info" ip link -echo "" - -# Route table info -try "IP_route_info" ip route -echo "" - -# Access control service info -try "ACS_info" ACSinfo -echo "" - -##################################################### end query system with functions and commands for config info ##################################################### From 84fb2e5b4129eed6968d664ad3d8af1ad305da06 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 09:46:11 -0600 Subject: [PATCH 18/26] Added flags and readme --- ..._conf.py => rccl_system_info_collector.py} | 58 +++++++++++++------ 1 file changed, 40 insertions(+), 18 deletions(-) rename tools/scripts/{CS_conf.py => rccl_system_info_collector.py} (86%) diff --git a/tools/scripts/CS_conf.py b/tools/scripts/rccl_system_info_collector.py similarity index 86% rename from tools/scripts/CS_conf.py rename to tools/scripts/rccl_system_info_collector.py index 47fcd5072..22dcbba8e 100644 --- a/tools/scripts/CS_conf.py +++ b/tools/scripts/rccl_system_info_collector.py @@ -1,19 +1,20 @@ ## README +# This script gathers configuration information from your system to help identify and debug any issues when running the ROCm Communication Collectives Library (RCCL). Please ensure that python3 is installed on your system and added to your system's PATH environment variable. +# Prerequisites +# python3 (make sure it's added to the PATH) +# Sudo access on the system if you want ACS info +# Usage +# To run the script and gather the configuration information, execute the following command: +# default +# python3 rccl_system_info_collector.py +# when running with acs flag +# sudo python3 rccl_system_info_collector.py --acs +# Note: Running the script without sudo will not check if ACS is disable or not, sudo is needed to gather all system configuration information. - - - - - - - - - - - +# The script will gather essential system configuration information, such as OS information, network information, driver versions, etc., to help with debugging RCCL issues. It will generate a report in a readable format, which you can share with the support team or use for troubleshooting. import subprocess @@ -21,6 +22,7 @@ import os import re import argparse +import textwrap class CommandResult: @@ -31,10 +33,24 @@ def __init__(self, stdout, stderr): # Function to Parse arguements def parse_arguments(): - parser = argparse.ArgumentParser(description='') + readme = '''\ +This script gathers configuration information from your system to help identify and debug any issues when running the ROCm Communication Collectives Library (RCCL). Please ensure that python3 is installed on your system and added to your system's PATH environment variable.\n +Prerequisites\n +- python3 (make sure it's added to the PATH)\n +- Sudo access on the system if you want ACS info\n +Usage\n +To run the script and gather the configuration information, execute the following command:\n +- default\n + python3 rccl_system_info_collector.py\n +- when running with acs flag\n + sudo python3 rccl_system_info_collector.py --acs\n +Note: Running the script without sudo will not check if ACS is disabled or not, sudo is needed to gather all system configuration information.\n +The script will gather essential system configuration information, such as OS information, network information, driver versions, etc., to help with debugging RCCL issues. It will generate a report in a readable format, which you can share with the support team or use for troubleshooting.\n + ''' + parser = argparse.ArgumentParser(description=textwrap.dedent(readme), formatter_class=argparse.RawDescriptionHelpFormatter) # Add option flags - parser.add_argument('--ACS', help='Check for ACS status (requires root access):', required=False) + parser.add_argument('-a','--acs', help='Check for ACS status (requires root access)', required=False, action='store_true') return parser.parse_args() @@ -344,7 +360,7 @@ def get_rocminfo(): # Gather all data and build summary table and detailed output format -def get_config(): +def get_config(root_enabled): # Run the commands and store the command outputs @@ -437,9 +453,14 @@ def get_config(): ip_route_summary, ip_route_result = get_IP_route() ip_route_status = status_check(ip_route_summary, ip_route_result) - # IP ACS info - acs_summary, acs_result = get_acs_info() - acs_status = status_check(acs_summary, acs_result) + # ACS info + if root_enabled: + acs_summary, acs_result = get_acs_info() + acs_status = status_check(acs_summary, acs_result) + else: + acs_summary = "This field require the acs flag to be set when running the script and root access" + acs_result = CommandResult(stdout="",stderr="Error: " + acs_summary) + acs_status = "SKIPPED" # ROCM info rocm_info_summary, rocm_info_result = get_rocminfo() @@ -535,11 +556,12 @@ def get_config(): def main(): args = parse_arguments() + root_enabled = args.acs hostname = os.uname().nodename timestamp = time.strftime("%Y%m%d_%H%M%S") file_name = f"config.{hostname}.{timestamp}.txt" - summary_table, details = get_config() + summary_table, details = get_config(root_enabled) # Print summary out to cli print(summary_table) From 241e64859a11e68d8f1195e592a27fcacb5fe394 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 10:52:16 -0600 Subject: [PATCH 19/26] Added GPU ID --- tools/scripts/rccl_system_info_collector.py | 28 ++++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/tools/scripts/rccl_system_info_collector.py b/tools/scripts/rccl_system_info_collector.py index 22dcbba8e..0299edcb4 100644 --- a/tools/scripts/rccl_system_info_collector.py +++ b/tools/scripts/rccl_system_info_collector.py @@ -341,19 +341,29 @@ def get_acs_info(): summary = "Unable to detect" return summary, result -# Get rocminfo ################# Ask how to differentiate which GPUs are which by CU and name +# Get rocminfo def get_rocminfo(): result = run_cli_command('rocminfo') if result.stdout: - gpu_names = [] - compute_units = [] - gpu_pattern = re.compile(r"Name:\s+(gfx\d+).*?Compute Unit:\s+(\d+)", re.DOTALL) + gpu_pattern = re.compile(r"Name:\s+(gfx\d+)(?:.*?Marketing Name:\s+([^\n]+))?.*?Compute Unit:\s+(\d+)", re.DOTALL) matches = gpu_pattern.findall(result.stdout) - for match in matches: - gpu_names.append(match[0]) - compute_units.append(int(match[1])) - num_gpus = len(gpu_names) - summary = f"Found {num_gpus} GPUs" + num_gpus = len(matches) + valid_marketing_names = ["MI300X", "MI200", "MI300A", "MI308"] + gpu_name = "" + for name in valid_marketing_names: + if name in matches[0][1]: + gpu_name = name + break + if gpu_name == "": + if "gfx942" == matches[0][0] and 304 == int(matches[0][2]): + gpu_name = "MI300X" + elif "gfx942" == matches[0][0] and 80 == int(matches[0][2]): + gpu_name = "MI308" + elif "gfx942" == matches[0][0] and 228 == int(matches[0][2]): + gpu_name = "MI300A" + elif "gfx90a" == matches[0][0] and 228 == int(matches[0][2]): + gpu_name = "MI200" + summary = f"Found {num_gpus} {gpu_name} GPUs" else: summary = "Unable to detect" return summary, result From a170bec4cbd8ce944e07b6aaf1efb5d545218fa3 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 10:56:28 -0600 Subject: [PATCH 20/26] fixed spelling --- tools/scripts/rccl_system_info_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/rccl_system_info_collector.py b/tools/scripts/rccl_system_info_collector.py index 0299edcb4..3da0b9f0c 100644 --- a/tools/scripts/rccl_system_info_collector.py +++ b/tools/scripts/rccl_system_info_collector.py @@ -468,7 +468,7 @@ def get_config(root_enabled): acs_summary, acs_result = get_acs_info() acs_status = status_check(acs_summary, acs_result) else: - acs_summary = "This field require the acs flag to be set when running the script and root access" + acs_summary = "This field requires the acs flag to be set when running the script and root access" acs_result = CommandResult(stdout="",stderr="Error: " + acs_summary) acs_status = "SKIPPED" From f77811fabc799bb898b3158651f543fb3c8fd6a4 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 12:21:23 -0600 Subject: [PATCH 21/26] renamed script again --- .../scripts/{rccl_system_info_collector.py => rcclDiagnostics.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tools/scripts/{rccl_system_info_collector.py => rcclDiagnostics.py} (100%) diff --git a/tools/scripts/rccl_system_info_collector.py b/tools/scripts/rcclDiagnostics.py similarity index 100% rename from tools/scripts/rccl_system_info_collector.py rename to tools/scripts/rcclDiagnostics.py From 59d560a43749575069be75f02587b960867dfdb4 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 14:42:13 -0600 Subject: [PATCH 22/26] Added file descriptor and locked mem checks --- tools/scripts/rcclDiagnostics.py | 80 ++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tools/scripts/rcclDiagnostics.py b/tools/scripts/rcclDiagnostics.py index 3da0b9f0c..52715a7b0 100644 --- a/tools/scripts/rcclDiagnostics.py +++ b/tools/scripts/rcclDiagnostics.py @@ -368,6 +368,79 @@ def get_rocminfo(): summary = "Unable to detect" return summary, result +def checklimits_from_file(): + summary = "" + try: + with open('/etc/security/limits.conf', 'r') as file: + lines = file.read().splitlines() + + # Reverse lines list to check for the last occurrence (to avoid overwriting) + lines.reverse() + + limit_soft_nofile_line = '* soft nofile 1048576' + limit_hard_nofile_line = '* hard nofile 1048576' + limit_soft_memlock_line = '* soft memlock unlimited' + limit_hard_memlock_line = '* hard memlock unlimited' + + lines_to_check = [ + limit_soft_nofile_line, + limit_hard_nofile_line, + limit_soft_memlock_line, + limit_hard_memlock_line, + ] + + missing_lines = [] + + for line in lines_to_check: + if line not in lines: + missing_lines.append(line) + + if missing_lines: + summary = "Limits not set" + error = "" + for missing_line in missing_lines: + error += missing_line + "\n" + results = CommandResult(stdout="",stderr="Error: The following lines are missing in /etc/security/limits.conf:" + error) + return summary, results + else: + print("All required lines are present in /etc/security/limits.conf.") + summary = "Limits set correctly" + results = CommandResult(stdout="All required lines are present in /etc/security/limits.conf.",stderr="") + return summary, results + + except FileNotFoundError: + summary = "Unable to detect" + results = CommandResult(stdout="",stderr="Error: File /etc/security/limits.conf not found on this system.") + return summary, results + except Exception as e: + summary = "Unable to detect" + results = CommandResult(stdout="",stderr=f"Error opening or reading /etc/security/limits.conf: {str(e)}") + return summary, results + + + +# Check max file descriptors and max lock memory +def checklimits(): + result = run_cli_command('ulimit -n') + result2 = run_cli_command('ulimit -l') + if result.stdout and result2.stdout: + file_descriptors = int(result.stdout) + locked_mem = str(result2.stdout).strip() + if file_descriptors >= 1048576 and locked_mem == "unlimited": + summary = "Max file descriptors and locked memory are set correctly" + stdout = "ulimit -n output:\n" + result.stdout + "\n" + "ulimit -l output:\n" + result2.stdout + results = CommandResult(stdout=stdout, stderr="") + return summary, results + else: + summary, results = checklimits_from_file() + return summary, results + + + else: + summary, results = checklimits_from_file() + return summary, results + + # Gather all data and build summary table and detailed output format def get_config(root_enabled): @@ -475,6 +548,10 @@ def get_config(root_enabled): # ROCM info rocm_info_summary, rocm_info_result = get_rocminfo() rocm_info_status = status_check(rocm_info_summary, rocm_info_result) + + # Check max file descriptors and max lock memory + limits_summary, limits_result = checklimits() + limits_status = status_check(limits_summary, limits_result) # Create the summary table @@ -505,6 +582,7 @@ def get_config(root_enabled): f"IP Route Information{' ':<10}| {ip_route_status:<13} | {ip_route_summary}\n" f"ACS Disabled{' ':<18}| {acs_status:<13} | {acs_summary}\n" f"Node Status{' ':<19}| {rocm_info_status:<13} | {rocm_info_summary}\n" + f"File Descriptor Information{' ':<3}| {limits_status:<13} | {limits_summary}\n" f"{'='*119}\n\n\n" ) @@ -560,6 +638,8 @@ def get_config(root_enabled): f"{acs_result.stdout.strip()}{acs_result.stderr.strip()}\n\n" f"{centered_title('ROCm Information', details_width, '=')}\n" f"{rocm_info_result.stdout.strip()}{rocm_info_result.stderr.strip()}\n\n" + f"{centered_title('File Descriptor Limits', details_width, '=')}\n" + f"{limits_result.stdout.strip()}{limits_result.stderr.strip()}\n\n" ) return summary_table, details From 173614c27d293121af5ca490838ad281dcf65b89 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 14:42:51 -0600 Subject: [PATCH 23/26] Added file descriptor and locked mem checks --- tools/scripts/rcclDiagnostics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/rcclDiagnostics.py b/tools/scripts/rcclDiagnostics.py index 52715a7b0..1811c8e42 100644 --- a/tools/scripts/rcclDiagnostics.py +++ b/tools/scripts/rcclDiagnostics.py @@ -427,7 +427,7 @@ def checklimits(): file_descriptors = int(result.stdout) locked_mem = str(result2.stdout).strip() if file_descriptors >= 1048576 and locked_mem == "unlimited": - summary = "Max file descriptors and locked memory are set correctly" + summary = "Limits set correctly" stdout = "ulimit -n output:\n" + result.stdout + "\n" + "ulimit -l output:\n" + result2.stdout results = CommandResult(stdout=stdout, stderr="") return summary, results From 518513a82d7996a2abd6d0822c36d4ed1416f227 Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 15:02:45 -0600 Subject: [PATCH 24/26] Removed extra spaces from summary table --- tools/scripts/rcclDiagnostics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/scripts/rcclDiagnostics.py b/tools/scripts/rcclDiagnostics.py index 1811c8e42..ff0065a72 100644 --- a/tools/scripts/rcclDiagnostics.py +++ b/tools/scripts/rcclDiagnostics.py @@ -583,7 +583,7 @@ def get_config(root_enabled): f"ACS Disabled{' ':<18}| {acs_status:<13} | {acs_summary}\n" f"Node Status{' ':<19}| {rocm_info_status:<13} | {rocm_info_summary}\n" f"File Descriptor Information{' ':<3}| {limits_status:<13} | {limits_summary}\n" - f"{'='*119}\n\n\n" + f"{'='*119}" ) @@ -591,7 +591,7 @@ def get_config(root_enabled): # Combine details details_width = 120 details = ( - f"Detailed Output:\n" + f"\n\n\nDetailed Output:\n" f"{centered_title('Host Name', details_width, '=')}\n" f"{hostname_result.stdout.strip()}{hostname_result.stderr.strip()}\n\n" f"{centered_title('OS info', details_width, '=')}\n" From b798a3e827ff234cce0fa58a81c420cce358900b Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 15:08:48 -0600 Subject: [PATCH 25/26] printing output file location --- tools/scripts/rcclDiagnostics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/scripts/rcclDiagnostics.py b/tools/scripts/rcclDiagnostics.py index ff0065a72..0c0d8472e 100644 --- a/tools/scripts/rcclDiagnostics.py +++ b/tools/scripts/rcclDiagnostics.py @@ -655,7 +655,8 @@ def main(): # Print summary out to cli print(summary_table) - + current_directory = os.getcwd() + print("Detailed output file is at: " + current_directory + "/" + file_name) # Write the summary table and details to the output file with open(file_name, "w") as file: file.write(summary_table) From e252cea68ae002dbc963afc078ce3513d30d24be Mon Sep 17 00:00:00 2001 From: Nikhil-Nunna Date: Fri, 14 Feb 2025 17:24:20 -0600 Subject: [PATCH 26/26] Removed sudo in code and ACS flag --- tools/scripts/rcclDiagnostics.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/scripts/rcclDiagnostics.py b/tools/scripts/rcclDiagnostics.py index 0c0d8472e..8b256afce 100644 --- a/tools/scripts/rcclDiagnostics.py +++ b/tools/scripts/rcclDiagnostics.py @@ -10,9 +10,9 @@ # default # python3 rccl_system_info_collector.py -# when running with acs flag -# sudo python3 rccl_system_info_collector.py --acs -# Note: Running the script without sudo will not check if ACS is disable or not, sudo is needed to gather all system configuration information. +# when you require acs output +# sudo python3 rccl_system_info_collector.py +# Note: Running the script without sudo will not check if ACS is disabled or not, sudo is needed to complete system configuration information but the script will skip what it can't get. # The script will gather essential system configuration information, such as OS information, network information, driver versions, etc., to help with debugging RCCL issues. It will generate a report in a readable format, which you can share with the support team or use for troubleshooting. @@ -42,15 +42,14 @@ def parse_arguments(): To run the script and gather the configuration information, execute the following command:\n - default\n python3 rccl_system_info_collector.py\n -- when running with acs flag\n - sudo python3 rccl_system_info_collector.py --acs\n -Note: Running the script without sudo will not check if ACS is disabled or not, sudo is needed to gather all system configuration information.\n +- when you require acs output\n + sudo python3 rccl_system_info_collector.py\n +Note: Running the script without sudo will not check if ACS is disabled or not, sudo is needed to complete system configuration information but the script will skip what it can't get.\n The script will gather essential system configuration information, such as OS information, network information, driver versions, etc., to help with debugging RCCL issues. It will generate a report in a readable format, which you can share with the support team or use for troubleshooting.\n ''' parser = argparse.ArgumentParser(description=textwrap.dedent(readme), formatter_class=argparse.RawDescriptionHelpFormatter) # Add option flags - parser.add_argument('-a','--acs', help='Check for ACS status (requires root access)', required=False, action='store_true') return parser.parse_args() @@ -329,7 +328,7 @@ def get_IP_route(): # Get ACS info def get_acs_info(): - result = run_cli_command('sudo lspci -vvv | grep ACSCtl') + result = run_cli_command('lspci -vvv | grep ACSCtl') if result.stdout: pattern = r"SrcValid\+" matches = re.findall(pattern, result.stdout) @@ -541,7 +540,7 @@ def get_config(root_enabled): acs_summary, acs_result = get_acs_info() acs_status = status_check(acs_summary, acs_result) else: - acs_summary = "This field requires the acs flag to be set when running the script and root access" + acs_summary = "Requires script to be run with root access" acs_result = CommandResult(stdout="",stderr="Error: " + acs_summary) acs_status = "SKIPPED" @@ -646,11 +645,13 @@ def get_config(root_enabled): def main(): args = parse_arguments() - root_enabled = args.acs hostname = os.uname().nodename timestamp = time.strftime("%Y%m%d_%H%M%S") file_name = f"config.{hostname}.{timestamp}.txt" - + if os.geteuid() == 0: + root_enabled = True + else: + root_enabled = False summary_table, details = get_config(root_enabled) # Print summary out to cli