Stochastic Physics CPU and GPU Optimizations - NGARCH #65
base: main

Changes from all commits: 6dec22a, bb5e3d3, 65efd75, aaa25ec, 2628ff8, 854d5cb, 70f44a1, 459268f, c4e6c4f, 42ba72b
Changes to the contributors file (`@@ -1,15 +1,16 @@`): the table columns are widened and a new contributor from the Bureau of Meteorology is added. The updated table reads:

# Contributors

| GitHub user | Real Name | Affiliation | Date |
| --------------- | ------------------ | -------------------------------- | ---------- |
| james-bruten-mo | James Bruten | Met Office | 2025-12-09 |
| jedbakerMO | Jed Baker | Met Office | 2025-12-29 |
| jennyhickson | Jenny Hickson | Met Office | 2025-12-10 |
| mike-hobson | Mike Hobson | Met Office | 2025-12-17 |
| mo-marqh | mark Hedley | Met Office | 2025-12-11 |
| yaswant | Yaswant Pradhan | Met Office | 2025-12-16 |
| oakleybrunt | Oakley Brunt | Met Office | 2025-12-19 |
| harry-shepherd | Harry Shepherd | Met Office | 2026-01-08 |
| DrTVockerodtMO | Terence Vockerodt | Met Office | 2026-01-08 |
| MetBenjaminWent | Benjamin Went | Met Office | 2026-01-15 |
| timgraham-Met | Tim Graham | Met Office | 2026-01-15 |
| mo-alistairp | Alistair Pirrie | Met Office | 2026-01-19 |
| jasonjunweilyu | Junwei (Jason) Lyu | Bureau of Meteorology, Australia | 2025-12-17 |
New file (`@@ -0,0 +1,228 @@`): the PSyclone transformation script below is added.
```python
##############################################################################
# (C) Crown copyright Met Office. All rights reserved.
# The file LICENCE, distributed with this code, contains details of the terms
# under which the code may be used.
##############################################################################

'''PSyclone transformation script for physics_constants_mod to apply colouring
and GPU offloading/CPU parallelization. Also adds redundant computation to
the level-1 halo for setval_* generically. This is based on
https://github.com/stfc/PSyclone/blob/master/examples/lfric/scripts/gpu_offloading.py .
'''

import os
import sys
from psyclone.domain.lfric import LFRicConstants
from psyclone.psyir.nodes import Directive, Loop, Routine
from psyclone.psyir.transformations import (
    ACCKernelsTrans, TransformationError, OMPTargetTrans)
from psyclone.transformations import (
    Dynamo0p3ColourTrans, Dynamo0p3OMPLoopTrans,
    Dynamo0p3RedundantComputationTrans, OMPParallelTrans,
    ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans,
    OMPDeclareTargetTrans, OMPLoopTrans)
from psyclone.domain.common.transformations import KernelModuleInlineTrans


# Names of any invokes to which we won't add any GPU offloading
INVOKE_EXCLUSIONS = [
]

# Names of any kernels to which we won't add parallelization
KERNEL_EXCLUSIONS = ["get_Pnm_star_code", ]
# get_Pnm_star_code has data dependencies in its loops
# and has been tested to be unsuitable for parallelization

# Names of any kernels that we won't offload to GPU
GPU_KERNEL_EXCLUSIONS = [
]

OFFLOAD_DIRECTIVES = os.getenv('LFRIC_OFFLOAD_DIRECTIVES', "none")


def trans(psyir):
    '''Applies PSyclone colouring and GPU offloading transformations. Any
    kernels that cannot be offloaded to GPU are parallelised using OpenMP
    on the CPU if they can be parallelised. Any setval_* kernels are
    transformed so as to compute into the L1 halos.

    :param psyir: the PSyIR of the PSy-layer.
    :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer`
    '''
    inline_trans = KernelModuleInlineTrans()
    rtrans = Dynamo0p3RedundantComputationTrans()
    ctrans = Dynamo0p3ColourTrans()
    otrans = Dynamo0p3OMPLoopTrans()
    const = LFRicConstants()
    cpu_parallel = OMPParallelTrans()

    if OFFLOAD_DIRECTIVES == "omp":
        # Use OpenMP offloading
        loop_offloading_trans = OMPLoopTrans(
            omp_directive="teamsdistributeparalleldo",
            omp_schedule="none"
        )
        # OpenMP does not have a kernels parallelism directive equivalent
        # to OpenACC 'kernels'
        kernels_trans = None
        gpu_region_trans = OMPTargetTrans()
        gpu_annotation_trans = OMPDeclareTargetTrans()
    elif OFFLOAD_DIRECTIVES == "acc":
        # Use OpenACC offloading
        loop_offloading_trans = ACCLoopTrans()
        kernels_trans = ACCKernelsTrans()
        gpu_region_trans = ACCParallelTrans(default_present=False)
        gpu_annotation_trans = ACCRoutineTrans()
    elif OFFLOAD_DIRECTIVES == "none":
        pass
    else:
        print(
            f"The PSyclone transformation script expects the "
            f"LFRIC_OFFLOAD_DIRECTIVES environment variable to be set to "
            f"'omp', 'acc' or 'none' but found '{OFFLOAD_DIRECTIVES}'."
        )
        sys.exit(-1)

    print(f"PSy name = '{psyir.name}'")

    for subroutine in psyir.walk(Routine):

        print("Transforming invoke '{0}' ...".format(subroutine.name))

        # Make setval_* compute redundantly to the level 1 halo if it
        # is in its own loop
        for loop in subroutine.loops():
            if loop.iteration_space == "dof":
                if len(loop.kernels()) == 1:
                    if loop.kernels()[0].name in ["setval_c"]:
                        rtrans.apply(loop, options={"depth": 1})
```

**Contributor:** Like above, could this functionality be nested inside a function to reduce the length of this script?

**Contributor:** This is

**Author:** Yes, I can add this in a new PR if needed. It is indeed doing the same thing as
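For reference, a minimal sketch of what the reviewer's suggestion might look like for the setval_* redundant-computation pass shown above. The helper name `apply_setval_redundant_computation` is illustrative only and is not part of this PR; the logic simply mirrors the loop in the script:

```python
def apply_setval_redundant_computation(subroutine, rtrans):
    '''Illustrative helper (not in this PR): make any setval_c kernel that
    sits alone in a dof-loop compute redundantly into the level-1 halo.'''
    for loop in subroutine.loops():
        if loop.iteration_space == "dof":
            kernels = loop.kernels()
            # Only transform loops containing a single setval_c kernel,
            # mirroring the checks in the script above.
            if len(kernels) == 1 and kernels[0].name in ["setval_c"]:
                rtrans.apply(loop, options={"depth": 1})
```

With such a helper, the first loop in `trans()` would reduce to a single call, e.g. `apply_setval_redundant_computation(subroutine, rtrans)`.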
```python
        if (
            psyir.name.lower() in INVOKE_EXCLUSIONS
            or OFFLOAD_DIRECTIVES == "none"
        ):
            print(
                f"Not adding GPU offloading to invoke '{subroutine.name}'"
            )
            offload = False
        else:
            offload = True

        # Keep a record of any kernels we fail and succeed to offload
        succeeded_offload = set()
        failed_to_offload = set()

        # Colour loops over cells unless they are on discontinuous spaces
        # (alternatively we could annotate the kernels with atomics)
        for loop in subroutine.loops():
            if loop.iteration_space.endswith("cell_column"):
                if (loop.field_space.orig_name not in
                        const.VALID_DISCONTINUOUS_NAMES):
                    ctrans.apply(loop)
```

**Contributor:** Like above, could this functionality be nested inside a function to reduce the length of this script?

**Contributor:** This is

**Author:** Yes, it is
```python
        # Mark kernels inside the loops over cells as GPU-enabled
        # and inline them.
        for loop in subroutine.loops():
            if loop.iteration_space.endswith("cell_column"):
                if offload:
                    for kern in loop.kernels():
                        if kern.name.lower() in (
                                GPU_KERNEL_EXCLUSIONS + KERNEL_EXCLUSIONS +
                                list(succeeded_offload)
                        ):
                            continue

                        try:
                            gpu_annotation_trans.apply(
                                kern, options={'force': True}
                            )
                            print(f"GPU-annotated kernel '{kern.name}'")

                            try:
                                inline_trans.apply(kern)
                                print(f"Module-inlined kernel '{kern.name}'")
                                succeeded_offload.add(kern.name.lower())
                            except TransformationError as err:
                                print(
                                    f"Failed to module-inline '{kern.name}'"
                                    f" due to:\n{err.value}"
                                )
                        except TransformationError as err:
                            failed_to_offload.add(kern.name.lower())
                            print(
                                f"Failed to annotate '{kern.name}' with "
                                f"GPU-enabled directive due to:\n{err.value}"
                            )
                        # For annotated or inlined kernels we could attempt to
                        # provide compile-time dimensions for the temporary
                        # arrays and convert unsupported intrinsics to code.
```

**Contributor:** Like above, could this functionality be nested inside a function to reduce the length of this script?

**Author:** Yes, I can add this in a new PR if needed.

**Contributor (on the inner `try`):** I'm not sure how much I trust nested try statements, but given the dependency between annotating the kernel, then in-lining it, this doesn't seem unreasonable.
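On the reviewer's point about nested try statements, here is a hedged sketch of one way the annotate-then-inline dependency could be expressed without nesting, using an early return instead. It is purely illustrative (the PR keeps the nested form); the helper name `annotate_and_inline` is made up, and the sketch assumes the `TransformationError` import and the transformation objects defined earlier in the script:

```python
def annotate_and_inline(kern, gpu_annotation_trans, inline_trans,
                        succeeded_offload, failed_to_offload):
    '''Illustrative helper (not in this PR): annotate a kernel for GPU and,
    only if that succeeds, module-inline it, without nesting try blocks.'''
    try:
        gpu_annotation_trans.apply(kern, options={'force': True})
        print(f"GPU-annotated kernel '{kern.name}'")
    except TransformationError as err:
        failed_to_offload.add(kern.name.lower())
        print(f"Failed to annotate '{kern.name}' with "
              f"GPU-enabled directive due to:\n{err.value}")
        return  # no point inlining a kernel we could not annotate

    try:
        inline_trans.apply(kern)
        print(f"Module-inlined kernel '{kern.name}'")
        succeeded_offload.add(kern.name.lower())
    except TransformationError as err:
        print(f"Failed to module-inline '{kern.name}' due to:\n{err.value}")
```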
```python
        # Add GPU offloading to loops unless they are over colours or are null.
        for loop in subroutine.walk(Loop):
            kernel_names = [k.name.lower() for k in loop.kernels()]
            if offload and all(
                name not in (
                    list(failed_to_offload) + GPU_KERNEL_EXCLUSIONS +
                    KERNEL_EXCLUSIONS
                )
                for name in kernel_names
            ):
                try:
                    if loop.loop_type == "colours":
                        pass
                    if loop.loop_type == "colour":
                        loop_offloading_trans.apply(
                            loop, options={"independent": True})
                        gpu_region_trans.apply(loop.ancestor(Directive))
                    if loop.loop_type == "":
                        loop_offloading_trans.apply(
                            loop, options={"independent": True}
                        )
                        gpu_region_trans.apply(loop.ancestor(Directive))
                    if loop.loop_type == "dof":
                        # Loops over dofs can contain reductions
                        if kernels_trans:
                            # If kernel offloading is available it should
                            # manage them
                            kernels_trans.apply(loop)
                        else:
                            # Otherwise, if reductions exist, they will be
                            # detected by the dependency analysis and raise
                            # a TransformationError captured below
                            loop_offloading_trans.apply(
                                loop, options={"independent": True})
                            gpu_region_trans.apply(loop.ancestor(Directive))
                        # Alternatively we could use loop parallelism with
                        # reduction clauses
                    print(f"Successfully offloaded loop with {kernel_names}")
                except TransformationError as err:
                    print(f"Failed to offload loop with {kernel_names} "
                          f"because: {err}")
```

**Contributor:** Like above, could this functionality be nested inside a function to reduce the length of this script?

**Author:** Yes, I can add this in a new PR if needed.

**Contributor (on the `try` block):** However here, could the if's and the try wrapper around the choices be reversed? Have the ifs for the options, and then trys for each option?

**Author:** Yes, I can verify this in the new PR.
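A hedged sketch of the restructuring the reviewer asks about: branch on `loop_type` first and wrap each offloading choice in its own try block, so a failure is reported against the specific transformation that raised it. This is illustrative only (the PR keeps the single outer try); the helper name `offload_loop` is made up, and the sketch assumes the imports and transformation objects defined earlier in the script:

```python
def offload_loop(loop, kernel_names, loop_offloading_trans,
                 gpu_region_trans, kernels_trans):
    '''Illustrative helper (not in this PR): choose the offloading strategy
    from loop_type first, then wrap each choice in its own try block.'''
    if loop.loop_type in ("colours", "null"):
        return  # nothing to offload at this level

    if loop.loop_type == "dof" and kernels_trans:
        try:
            # Kernel-level offloading manages any reductions itself
            kernels_trans.apply(loop)
        except TransformationError as err:
            print(f"Failed to offload dof loop with {kernel_names}: {err}")
        return

    # "colour", "" and (without kernels_trans) "dof" loops all use loop
    # offloading followed by a GPU region around the new directive.
    try:
        loop_offloading_trans.apply(loop, options={"independent": True})
        gpu_region_trans.apply(loop.ancestor(Directive))
        print(f"Successfully offloaded loop with {kernel_names}")
    except TransformationError as err:
        print(f"Failed to offload loop with {kernel_names}: {err}")
```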
```python
        # Apply OpenMP thread parallelism for any kernels we've not been able
        # to offload to GPU.
        for loop in subroutine.walk(Loop):
            if any(
                kern.name.lower() in KERNEL_EXCLUSIONS
                for kern in loop.kernels()
            ):
                continue

            if (
                not offload
                or any(
                    kern.name.lower() in (
                        list(failed_to_offload) + GPU_KERNEL_EXCLUSIONS
                    )
                    for kern in loop.kernels()
                )
            ):
                if loop.loop_type not in ["colours", "null"]:
                    cpu_parallel.apply(loop)
                    otrans.apply(loop, options={"reprod": True})

        print(subroutine.view())
```

**Contributor:** Like above, could this functionality be nested inside a function to reduce the length of this script?

**Author:** Yes, I can add this in a new PR if needed.
**Contributor:** This seems like a good way to ensure this is set; however, could these checks be nested inside a function to reduce the length of this script? For the PSYKAL LFRic API, these would be added here: https://github.com/MetOffice/lfric_core/blob/main/infrastructure/build/psyclone/psyclone_tools.py

**Author:** Yes, I can add this in a new PR if needed.
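To illustrate that suggestion, a hedged sketch of how the LFRIC_OFFLOAD_DIRECTIVES check and the associated transformation set-up could be gathered into one function. The helper name `select_offload_transformations` and the returned dictionary layout are illustrative, not part of this PR, and where such a helper should live (for example psyclone_tools.py, as the reviewer notes) is for the follow-up PR to decide:

```python
def select_offload_transformations(directives):
    '''Illustrative helper (not in this PR): validate the
    LFRIC_OFFLOAD_DIRECTIVES value and return the transformations to use,
    or None when offloading is disabled.'''
    if directives == "omp":
        return {
            "loop": OMPLoopTrans(omp_directive="teamsdistributeparalleldo",
                                 omp_schedule="none"),
            "kernels": None,  # OpenMP has no 'kernels'-style directive
            "region": OMPTargetTrans(),
            "annotate": OMPDeclareTargetTrans(),
        }
    if directives == "acc":
        return {
            "loop": ACCLoopTrans(),
            "kernels": ACCKernelsTrans(),
            "region": ACCParallelTrans(default_present=False),
            "annotate": ACCRoutineTrans(),
        }
    if directives == "none":
        return None
    # Any other value is a configuration error: report it and stop.
    sys.exit(f"LFRIC_OFFLOAD_DIRECTIVES must be 'omp', 'acc' or 'none', "
             f"not '{directives}'.")
```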