From c00f6d222d2c3d0807dadd13d83c1d8accb22f7e Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Tue, 14 Jan 2025 10:13:49 -0800 Subject: [PATCH] CI: PyTorch Surrogate Example (#621) * CI: PyTorch Surrogate Example Cover our PyTorch surrogate example in CI. * CI: Add Extra Example Requirements (CPU) * Silence Torch Warning And anticipate default change in future releases. * PyTorch Threading Mixed with AMReX is icky Issues as soon as we use MPI+OMP and add our `Drift` element. * CTest: Skip Analysis/Plot if Run Failed ... to produce output * CTest: Define 42 as Skip Return Code Better than passing the test https://cmake.org/cmake/help/latest/prop_test/SKIP_RETURN_CODE.html --- .github/workflows/dependencies/gcc-openmpi.sh | 2 ++ .github/workflows/dependencies/gcc.sh | 2 ++ examples/CMakeLists.txt | 25 +++++++++++++++++++ .../run_ml_surrogate_15_stage.py | 18 +++++++------ .../surrogate_model_definitions.py | 4 +-- examples/requirements_torch_cpu.txt | 6 +++++ 6 files changed, 47 insertions(+), 10 deletions(-) create mode 100644 examples/requirements_torch_cpu.txt diff --git a/.github/workflows/dependencies/gcc-openmpi.sh b/.github/workflows/dependencies/gcc-openmpi.sh index a53e8395c..f68cf260c 100755 --- a/.github/workflows/dependencies/gcc-openmpi.sh +++ b/.github/workflows/dependencies/gcc-openmpi.sh @@ -32,4 +32,6 @@ python3 -m pip install -U -r src/python/impactx/dashboard/requirements.txt python3 -m pip install -U -r examples/requirements.txt python3 -m pip install -U -r tests/python/requirements.txt +# extra tests +python3 -m pip install -U -r examples/requirements_torch_cpu.txt python3 -m pip install -U openPMD-validator diff --git a/.github/workflows/dependencies/gcc.sh b/.github/workflows/dependencies/gcc.sh index c39caa33a..657ec93e4 100755 --- a/.github/workflows/dependencies/gcc.sh +++ b/.github/workflows/dependencies/gcc.sh @@ -30,4 +30,6 @@ python3 -m pip install -U -r src/python/impactx/dashboard/requirements.txt python3 -m pip install -U -r 
examples/requirements.txt python3 -m pip install -U -r tests/python/requirements.txt +# extra tests +python3 -m pip install -U -r examples/requirements_torch_cpu.txt python3 -m pip install -U openPMD-validator diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ce1704b09..93b429087 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -116,6 +116,8 @@ function(add_impactx_test name input is_mpi analysis_script plot_script) else() set_property(TEST ${name}.run APPEND PROPERTY ENVIRONMENT "OMP_NUM_THREADS=2") endif() + # special return code for skipped tests (e.g., runtime prerequisite fails) + set_tests_properties(${name}.run PROPERTIES SKIP_RETURN_CODE 42) # analysis and plots set(THIS_Python_SCRIPT_EXE) @@ -131,6 +133,11 @@ function(add_impactx_test name input is_mpi analysis_script plot_script) # make HDF5 I/O more robust on various filesystems set_property(TEST ${name}.analysis APPEND PROPERTY ENVIRONMENT "HDF5_USE_FILE_LOCKING=FALSE") + + # run test failed? Mark this as skipped + set_property(TEST ${name}.analysis PROPERTY SKIP_REGULAR_EXPRESSION + "Supplied directory is not valid: diags" + ) endif() if(plot_script) add_test(NAME ${name}.plot @@ -141,6 +148,11 @@ function(add_impactx_test name input is_mpi analysis_script plot_script) # make HDF5 I/O more robust on various filesystems set_property(TEST ${name}.plot APPEND PROPERTY ENVIRONMENT "HDF5_USE_FILE_LOCKING=FALSE") + + # run test failed? 
Mark this as skipped + set_property(TEST ${name}.plot PROPERTY SKIP_REGULAR_EXPRESSION + "ValueError: No objects to concatenate" + ) endif() endfunction() @@ -1000,6 +1012,7 @@ add_impactx_test(spectrometer.py OFF # no plot script yet ) + # Chicane with CSR ########################################################### # if(ImpactX_FFT) @@ -1097,6 +1110,7 @@ add_impactx_test(linac-segment.py OFF # no plot script yet ) + # Iteration of a linear one-turn map ######################################### # # w/o space charge @@ -1112,3 +1126,14 @@ add_impactx_test(linear-map.py examples/linear_map/analysis_map.py OFF # no plot script yet ) + + +# PyTorch Surrogate: Staged LPA ############################################## +# +add_impactx_test(pytorch_surrogate_model + examples/pytorch_surrogate_model/run_ml_surrogate_15_stage.py + OFF # ImpactX MPI-parallel + examples/pytorch_surrogate_model/analyze_ml_surrogate_15_stage.py + examples/pytorch_surrogate_model/visualize_ml_surrogate_15_stage.py +) +label_impactx_test(pytorch_surrogate_model slow) diff --git a/examples/pytorch_surrogate_model/run_ml_surrogate_15_stage.py b/examples/pytorch_surrogate_model/run_ml_surrogate_15_stage.py index 3539993ae..85d62fbc2 100644 --- a/examples/pytorch_surrogate_model/run_ml_surrogate_15_stage.py +++ b/examples/pytorch_surrogate_model/run_ml_surrogate_15_stage.py @@ -37,7 +37,7 @@ print("Warning: Cannot import PyTorch. 
Skipping test.") import sys - sys.exit(0) + sys.exit(42) # ImpactX special return code for skipped tests import zipfile from urllib import request @@ -100,18 +100,19 @@ def download_and_unzip(url, data_dir): data_url = "https://zenodo.org/records/10810754/files/models.zip?download=1" download_and_unzip(data_url, "models.zip") -# It was found that the PyTorch multithreaded defaults interfere with MPI-enabled AMReX -# when initializing the models: https://github.com/AMReX-Codes/pyamrex/issues/322 +# It was found that the PyTorch multithreaded defaults interfere with AMReX OpenMP +# when initializing the models or iterating elements: +# https://github.com/AMReX-Codes/pyamrex/issues/322 +# https://github.com/ECP-WarpX/impactx/issues/773#issuecomment-2585043099 # So we manually set the number of threads to serial (1). -if Config.have_mpi: - n_threads = torch.get_num_threads() - torch.set_num_threads(1) +# Torch threading is not a problem with GPUs and might work when MPI is disabled. +# Could also just be a mixing of OpenMP libraries (gomp and llvm omp) when using the +# pre-built PyTorch pip packages. 
+torch.set_num_threads(1) model_list = [ surrogate_model(f"models/beam_stage_{stage_i}_model.pt", device=device) for stage_i in range(N_stage) ] -if Config.have_mpi: - torch.set_num_threads(n_threads) pp_amrex = amr.ParmParse("amrex") pp_amrex.add("the_arena_init_size", 0) @@ -328,6 +329,7 @@ def set_lens(self, pc, step, period): lpa = LPASurrogateStage(i, model_list[i], L_surrogate, L_stage_period * i) lpa.nslice = n_slice lpa.ds = L_surrogate + lpa.threadsafe = False lpa_stages.append(lpa) monitor = elements.BeamMonitor("monitor") diff --git a/examples/pytorch_surrogate_model/surrogate_model_definitions.py b/examples/pytorch_surrogate_model/surrogate_model_definitions.py index 4819c9d49..bf37ff9a1 100644 --- a/examples/pytorch_surrogate_model/surrogate_model_definitions.py +++ b/examples/pytorch_surrogate_model/surrogate_model_definitions.py @@ -106,9 +106,9 @@ class surrogate_model: def __init__(self, model_file, device=None): self.device = device if device is None: - model_dict = torch.load(model_file, map_location="cpu") + model_dict = torch.load(model_file, map_location="cpu", weights_only=False) else: - model_dict = torch.load(model_file, map_location=device) + model_dict = torch.load(model_file, map_location=device, weights_only=False) self.source_means = torch.tensor( model_dict["source_means"], device=self.device, dtype=torch.float64 ) diff --git a/examples/requirements_torch_cpu.txt b/examples/requirements_torch_cpu.txt new file mode 100644 index 000000000..8f90fd677 --- /dev/null +++ b/examples/requirements_torch_cpu.txt @@ -0,0 +1,6 @@ +# This is for CPU CI tests with extra requirements. +# +# For PyTorch, see alternative packages, e.g., for GPU here: +# https://pytorch.org/get-started/locally/ +--extra-index-url https://download.pytorch.org/whl/cpu +torch