From 6000775bb59a726c95021970735a7fb1fd8c4171 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 15 Oct 2025 10:10:51 +0100 Subject: [PATCH 01/33] Start updating nemo README --- examples/nemo/README.md | 59 +------------- examples/nemo/scripts/README.md | 140 ++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 57 deletions(-) create mode 100644 examples/nemo/scripts/README.md diff --git a/examples/nemo/README.md b/examples/nemo/README.md index 1a750f0603..5b1b527750 100644 --- a/examples/nemo/README.md +++ b/examples/nemo/README.md @@ -34,6 +34,7 @@ POSSIBILITY OF SUCH DAMAGE. Author A. R. Porter, STFC Daresbury Lab Modified by R. W. Ford, STFC Daresbury Lab Modified by J. Henrichs, Bureau of Meteorology +Modified by S. Siso, STFC Daresbury Lab --> @@ -54,63 +55,7 @@ Contains: ## Scripts -Contains the scripts used to process the NEMO code base and to add profiling -instrumentation (https://psyclone.readthedocs.io/en/latest/user_guide/profiling.html) -and OpenACC or OpenMP directives: - -1. `process_nemo.py` is a driver script that allows the user to specify - which files to process with PSyclone, the transformation script to use - and where to put the outputs: - - $ ./process_nemo.py -h - usage: process_nemo.py [-h] [-o OUT_DIR] [-s SCRIPT_FILE] [-x] - input_file [input_file ...] - - Process the specified NEMO source files using PSyclone - - positional arguments: - input_file One or more NEMO pre-processed source files - - optional arguments: - -h, --help show this help message and exit - -o OUT_DIR Destination directory for processed source files - -s SCRIPT_FILE PSyclone transformation script - -x exit immediately if PSyclone fails - -p add profiling instrumentation to the PROFILE_ONLY file - list. Note that files processed by the SCRIPT_FILE may - be introducing profiling instrumentation as part of - that script. 
- - In addition to the command-line flags, the script itself contains two - variables that may be used to control its behaviour: - - - `EXCLUDED_FILES`: list of filenames that PSyclone will not attempt to process. - - `PROFILE_ONLY`: list of filenames to add profiling instrumentation but - do not attempt to further process by PSyclone. - - Finally, the precise invocation to use when running PSyclone may be - specified by setting the `PSYCLONE` environment variable. If this is not set - then `psyclone` must be in the user's PATH. - -2. PSyclone transformation scripts: - - `kernels_trans.py` adds OpenACC kernel directives and places fine-grained - profiling instrumentation around any regions that haven't had OpenACC - added. - - `omp_cpu_trans.py` adds OpenMP directives for CPU threading parallelism. - - `omp_gpu_trans.py` adds OpenMP offloading directives for GPU acceleration. - -These scripts are a *work in progress* and are being developed to work on the -MO_GO8 configuration of NEMO supplied by the Met Office. This configuration is -based on version 4.0.2 of NEMO and is compiled using: - - ./makenemo -n MO_GO8_GPU -r SPITZ12 -m linux_nvfortran_gpu \ - del_key "key_iomput key_mpp_mpi" add_key "key_nosignedzero" - -(where you will need an `arch/arch-linux_nvfortran_gpu.fcm` FCM configuration -file specifying how to use the NVIDIA compiler). - -If you are applying PSyclone to any other version or configuration of NEMO then -these scripts should serve as a useful starting point. +Contains the scripts used to process the NEMO codebase. ## Example 1 diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md new file mode 100644 index 0000000000..4ca8ac736a --- /dev/null +++ b/examples/nemo/scripts/README.md @@ -0,0 +1,140 @@ + + +# PSyclone NEMO Examples + +This directory contains various examples of the use of PSyclone to transform +source code from the NEMO ocean model. 
+ +> [!Important] +> The NEMO build system, `makenemo`, has the ability to apply psyclone +> scripts that come with the NEMO repository (see +> [the NEMO user guide](https://sites.nemo-ocean.io/user-guide/psyclone.html)) +> but these are pinned to a particular release of PSyclone. By contrast, +> the process presented in this README uses the experimenatal `psyclonefc` +> command to intercept any compilation command and wrap it with a psyclone +> code-transformation step, and therefore bypasses the makenemo `-p` option. +> This is the recommended way to apply upstream psyclone transformations, as it +> is not contrained by the file-exclusions and backward compatibility guarantees +> of the `makenemo` scripts. + +## Downloading the NEMO source and data files + + +```bash +git clone https://forge.nemo-ocean.eu/nemo/nemo.git --branch 5.0 --single-branch +wget https://gws-access.jasmin.ac.uk/public/nemo/sette_inputs/r5.0.0/ORCA2_ICE_v5.0.0.tar.gz +tar -xzf ORCA2_ICE_v5.0.0.tar.gz +``` + +## Set up environment variables + +The code that psyclone produces, the compiler that reads it and the flags that +the build system uses should match in order to produce a successful run. In this +example we use the `KGO/arch-linux_spack.fcm` and the `insert_loop_parallelism.py` +transformation script. Both contain environment variables + +First, the arch file MPIF90 needs to be set to `psyclonefc`, this is a command line utility +that can be used instead of a call to the compiler that first processes the given +source file (using the options give to PSYCLONE_OPTS) and then send the output to +a compiler (given by PSYCLONE_COMPILER). 
+ + + +```bash +export MPIF90=psyclonefc +export PSYCLONE_COMPILER=mpif90 +export PSYCLONE_OPTS="-l output -s ${PSYCLONE_NEMO_EXAMPLES_DIR}/insert_loop_parallelism.py" +``` + +Then the injected parallel directives + +Chose the flags between: +- Example of serial transformations with no parallel directives +```bash +export PARALLEL_DIRECTIVES="" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g" +``` + +- Example of inserting OpenMP CPU threading parallelism +```bash +export PARALLEL_DIRECTIVES="omp_threading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp" +``` + +- Example of inserting OpenMP GPU offloading with reproducible build flags +```bash +export PARALLEL_DIRECTIVES="omp_offloading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" +export REPRODUCIBLE=1 +``` + +- Example of inserting OpenACC GPU offloading with reproducible build flags (-mp=gpu is needed for reproducibility) +```bash +export PARALLEL_DIRECTIVES="acc_offloading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc=gpu -mp=gpu -gpu=mem:managed,math_uniform" +export REPRODUCIBLE=1 +``` + +- Example of fast GPU build flags +```bash +unset REPRODUCIBLE +export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" +export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" +``` + +## Compile and Run NEMO with psyclone processing + +```bash +./makenemo -r ORCA2_ICE_PISCES -m linux_test -n ORCA2_psycloned del_key "key_xios key_top" -j 6 -v 1 +``` + +```bash +# Prepare problem +ln -sf ${ORCA2_INPUTS}/ORCA2_ICE_v5.0.0/* cfgs/ORCA2_psycloned/EXP00/. +cd cfgs/ORCA2_ICE_PISCES_psycloned/EXP00 +# Reduce num of iterations and add timing/runstat +sed -i "s/nn_itend.*/nn_itend = 10/" namelist_cfg +sed -i "s/ln_icebergs.*/ln_icebergs = .false./" namelist_cfg +sed -i "s/\&namctl.*/\&namctl\n ln_timing = .true. 
\n sn_cfctl%l_runstat = .true.\n/" namelist_cfg + +# Run problem +OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=1,2 mpirun -n 2 ./nemo +diff ${PSYCLONE_NEMO_EXAMPLES_DIR}/KGOs/run.stat.orca_ice_pisces.nvhpc.10steps run.stat + +``` + From 48f0a8c038d47755affff857997c3d27f8b4cb55 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 07:27:43 +0000 Subject: [PATCH 02/33] Update NEMO readmes --- examples/nemo/README.md | 3 +- examples/nemo/scripts/README.md | 106 +++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 31 deletions(-) diff --git a/examples/nemo/README.md b/examples/nemo/README.md index 5b1b527750..093b85a33d 100644 --- a/examples/nemo/README.md +++ b/examples/nemo/README.md @@ -55,7 +55,8 @@ Contains: ## Scripts -Contains the scripts used to process the NEMO codebase. +Contains a collection of example scripts and the instructions to process the NEMO code. These +are tested in our integration tests against NEMOv4.0.2 and NEMOv5.0. ## Example 1 diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md index 4ca8ac736a..606b9e3697 100644 --- a/examples/nemo/scripts/README.md +++ b/examples/nemo/scripts/README.md @@ -42,38 +42,48 @@ source code from the NEMO ocean model. > [!Important] > The NEMO build system, `makenemo`, has the ability to apply psyclone -> scripts that come with the NEMO repository (see -> [the NEMO user guide](https://sites.nemo-ocean.io/user-guide/psyclone.html)) -> but these are pinned to a particular release of PSyclone. By contrast, -> the process presented in this README uses the experimenatal `psyclonefc` -> command to intercept any compilation command and wrap it with a psyclone -> code-transformation step, and therefore bypasses the makenemo `-p` option. 
+> scripts that come with the NEMO repository with the `-p` flag (see +> [the NEMO user guide](https://sites.nemo-ocean.io/user-guide/psyclone.html)), +> but these are pinned to a particular release of PSyclone and have constraints +> defined in the `mk/sct_psyclone.sh` script. By contrast, the process presented in +> this README uses the experimental `psyclonefc` compiler wrapper command which +> bypasses the `makenemo -p` option and instead intercepts any compilation command and +> wraps it with a psyclone call followed by a compiler call. > This is the recommended way to apply upstream psyclone transformations, as it -> is not contrained by the file-exclusions and backward compatibility guarantees -> of the `makenemo` scripts. +> is not constrained by the file-exclusions and backward compatibility guarantees +> of the scripts inside the NEMO repository. ## Downloading the NEMO source and data files - +To test the examples you can download NEMO and its input data as follows: ```bash git clone https://forge.nemo-ocean.eu/nemo/nemo.git --branch 5.0 --single-branch wget https://gws-access.jasmin.ac.uk/public/nemo/sette_inputs/r5.0.0/ORCA2_ICE_v5.0.0.tar.gz tar -xzf ORCA2_ICE_v5.0.0.tar.gz -``` +``` -## Set up environment variables +The examples have been tested with NEMOv4.0.2 and NEMOv5.0, but we aim to support any +version of NEMO. If you encounter any issue applying these examples please report it to +the authors. -The code that psyclone produces, the compiler that reads it and the flags that -the build system uses should match in order to produce a successful run. In this -example we use the `KGO/arch-linux_spack.fcm` and the `insert_loop_parallelism.py` -transformation script. 
Both contain environment variables -First, the arch file MPIF90 needs to be set to `psyclonefc`, this is a command line utility -that can be used instead of a call to the compiler that first processes the given -source file (using the options give to PSYCLONE_OPTS) and then send the output to -a compiler (given by PSYCLONE_COMPILER). +## Set up environment variables + +In order to provide a flexible system that works with different directives and +compilers we provide a parameterised transformation script +`insert_loop_parallelism.py` and an example NEMO arch file `KGOs/arch-linux_spack.fcm` +with multiple environment variables. These, together with the `psyclonefc` +environment variables, have to be set up appropriately depending on the desired +output. +First of all, the arch file has an `MPIF90` variable to choose the compiler; this +needs to be set to `psyclonefc`. This is a compiler wrapper utility that +replaces each compiler call with an invocation of psyclone to process the given +source file (using the options provided in `PSYCLONE_OPTS`) and then sends the +output to a compiler (provided by `PSYCLONE_COMPILER`). +For example, to apply the `insert_loop_parallelism.py` script and compile the result with +`mpif90` we can use the following set up: ```bash export MPIF90=psyclonefc @@ -81,48 +91,86 @@ export PSYCLONE_COMPILER=mpif90 export PSYCLONE_OPTS="-l output -s ${PSYCLONE_NEMO_EXAMPLES_DIR}/insert_loop_parallelism.py" ``` -Then the injected parallel directives +As mentioned, the transformation script is parameterised with a `PARALLEL_DIRECTIVES` +variable that has to be consistent with the chosen Fortran flags. 
-Chose the flags between: -- Example of serial transformations with no parallel directives +For instance, for the `nvfortran` compiler, you can choose between: +- Serial transformations with no parallel directives ```bash export PARALLEL_DIRECTIVES="" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g" ``` -- Example of inserting OpenMP CPU threading parallelism +- Inserting OpenMP CPU threading parallelism ```bash export PARALLEL_DIRECTIVES="omp_threading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp" ``` -- Example of inserting OpenMP GPU offloading with reproducible build flags +- Inserting OpenMP GPU offloading with reproducible build flags ```bash export PARALLEL_DIRECTIVES="omp_offloading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 ``` -- Example of inserting OpenACC GPU offloading with reproducible build flags (-mp=gpu is needed for reproducibility) +- Inserting OpenACC GPU offloading with reproducible build flags (-mp=gpu is needed for reproducibility) ```bash export PARALLEL_DIRECTIVES="acc_offloading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc=gpu -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 ``` -- Example of fast GPU build flags +- Fast GPU build flags ```bash unset REPRODUCIBLE export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" ``` -## Compile and Run NEMO with psyclone processing +In addition, the `ASYNC_PARALLEL`, `ENABLE_INLINING` and `ENABLE_PROFILING` environment variables can be set to enable asynchronous parallelism, module inlining and the insertion of profiling hooks, respectively. + +## Compilation + +Once the environment variables are set, use the `makenemo` command with +the desired NEMO configuration and keys. For example: ```bash -./makenemo -r ORCA2_ICE_PISCES -m linux_test -n ORCA2_psycloned del_key "key_xios key_top" -j 6 -v 1 +./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ORCA2_psycloned ... ``` +If everything worked you can see the generated files in the +`/BLD/tmp` directory. 
+ +## Fixing issues and tuning the generated implementation + +Since this is now a two-step process, there are two locations where you can modify +files that will alter the output result. The first is the input source code. For this +we recommend using the built-in `makenemo` functionality: + +```bash +./makenemo -e ... +``` + +In addition to the source, you can also modify the recipe that psyclone uses to +transform the code. In this example you can do so by changing any detail of the +`insert_loop_parallelism.py` transformation script, but the `FILES_TO_SKIP` +global variable is particularly relevant as it allows psyclone to skip processing +the listed files. If modifying a particular file is known to cause problems or +performance regressions, include it in this list. + +You can also do both. For example, if you want to provide a modified file that +already includes directives, you need to reference it with the `-e ` +and in the `FILES_TO_SKIP` list (otherwise PSyclone would ignore the given directives +and try to insert its own). This is currently the optimal approach for `seaice` +and `lbclnk.f90` GPU offloading. + +## Running the generated code + +Finally, once the NEMO `makenemo` build has succeeded, we can run NEMO from +the configuration EXP00 directory. We include some known-good-outputs in the +`KGOs` directory, but be aware that these can be compiler/flags/system-sensitive: + ```bash # Prepare problem ln -sf ${ORCA2_INPUTS}/ORCA2_ICE_v5.0.0/* cfgs/ORCA2_psycloned/EXP00/. @@ -135,6 +183,4 @@ sed -i "s/\&namctl.*/\&namctl\n ln_timing = .true. 
\n sn_cfctl%l_runstat = .tr # Run problem OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=1,2 mpirun -n 2 ./nemo diff ${PSYCLONE_NEMO_EXAMPLES_DIR}/KGOs/run.stat.orca_ice_pisces.nvhpc.10steps run.stat - ``` - From a9618c089a27933538a753d2687810c67a964535 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 09:36:15 +0000 Subject: [PATCH 03/33] Add initial version of the unified insert_loop_parallelism.py script --- .../nemo/scripts/insert_loop_parallelism.py | 308 ++++++++++++++++++ 1 file changed, 308 insertions(+) create mode 100755 examples/nemo/scripts/insert_loop_parallelism.py diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py new file mode 100755 index 0000000000..33f7bdc287 --- /dev/null +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------------- +# BSD 3-Clause License +# +# Copyright (c) 2021-2025, Science and Technology Facilities Council. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# Authors: S. Siso, STFC Daresbury Lab + +''' PSyclone transformation script showing the introduction of OpenMP for GPU +directives into Nemo code. 
''' + +import os +import sys +from utils import ( + add_profiling, inline_calls, insert_explicit_loop_parallelism, + normalise_loops, enhance_tree_information, PARALLELISATION_ISSUES, + NEMO_MODULES_TO_IMPORT) +from psyclone.psyir.nodes import Routine, Loop +from psyclone.psyir.transformations import OMPTargetTrans +from psyclone.transformations import ( + OMPLoopTrans, OMPDeclareTargetTrans, TransformationError) +from psyclone.transformations import ( + ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans) + + +# This environment variable informs if this is targeting NEMOv4 +NEMOV4 = os.environ.get('NEMOV4', False) + +# This environment variable informs which parallelisation directives to use +# It supports acc_offloading, omp_offloading and omp_threading +# They can be combined, e.g PARALLEL_DIRECTIVES='omp_offloading+omp_threading' +PARALLEL_DIRECTIVES = os.environ.get('PARALLEL_DIRECTIVES', '') + +# By default, allow optimisations that may change the results, e.g. reductions, +# offloading instrinsics without math_uniform, ... +REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) + +# This environment variable informs if profiling hooks have to be inserted. +PROFILING_ENABLED = os.environ.get('ENABLE_PROFILING', False) + +# By default, we don't do module inlining as it's still under development. +INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) + +# Whether to chase the imported modules to improve symbol information (it can +# also be a list of module filenames to limit the chasing to only specific +# modules). This has to be used in combination with '-I' command flag in order +# to point to the module location directory. We also strongly recommend using +# the '--enable-cache' flag to reduce the performance overhead. 
+RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT + +# List of all files that psyclone will skip processing +FILES_TO_SKIP = [ + "fldread.f90", +] + +NEMOV5_EXCLUSIONS = [ + "dynhpg.f90", + "dynspg_ts.f90", + "sbcssm.f90", + "tramle.f90", + "trazdf.f90", +] + +NEMOV4_EXCLUSIONS = [ + "dynspg_ts.f90", +] + +if NEMOV4: + FILES_TO_SKIP.extend(NEMOV4_EXCLUSIONS) +else: + FILES_TO_SKIP.extend(NEMOV5_EXCLUSIONS) + +SKIP_FOR_PERFORMANCE = [ + "iom.f90", + "iom_nf90.f90", + "iom_def.f90", + "timing.f90", + "lbclnk.f90", + "histcom.f90", +] + +OFFLOADING_ISSUES = [ + # Produces different output results + "zdftke.f90", + # The following issues only affect BENCH (because ice is enabled?) + # Runtime Error: Illegal address during kernel execution + "trcrad.f90", + # nvhpc > 24.11 - Signal 11 issues + "trcbbl.f90", + "bdyice.f90", + "sedfunc.f90", + "stpmlf.f90", + "trddyn.f90", + "trczdf.f90", + "trcice_pisces.f90", + "dtatsd.f90", + "trcatf.f90", +] + +if "acc_offloading" in PARALLEL_DIRECTIVES: + OFFLOADING_ISSUES = OFFLOADING_ISSUES + [ + # Fail in OpenACC ORCA2_ICE_PISCES + "dynzdf.f90", + "trabbl.f90", + "trazdf.f90", + "zdfsh2.f90", + ] + + +def select_transformations(): + ''' + Use the PARALLEL_DIRECTIVES global to select what specific transformations + to apply to insert the desired directives. 
+ ''' + process_directives = PARALLEL_DIRECTIVES + + if 'omp_offloading' in process_directives: + offload_region_trans = OMPTargetTrans() + mark_for_gpu_trans = OMPDeclareTargetTrans() + if NEMOV4: + # TODO #2895: Explore why loop/teams loop diverge for NEMOv4 + gpu_loop_trans = OMPLoopTrans(omp_schedule="none") + gpu_loop_trans.omp_directive = "loop" + else: + gpu_loop_trans = OMPLoopTrans(omp_schedule="none") + gpu_loop_trans.omp_directive = "teamsloop" + process_directives = process_directives.replace('omp_offloading', '') + elif 'acc_offloading' in process_directives: + offload_region_trans = ACCParallelTrans(default_present=False) + mark_for_gpu_trans = ACCRoutineTrans() + gpu_loop_trans = ACCLoopTrans() + process_directives = process_directives.replace('acc_offloading', '') + else: + offload_region_trans = None + mark_for_gpu_trans = None + gpu_loop_trans = None + + if 'omp_threading' in process_directives: + cpu_loop_trans = OMPLoopTrans(omp_schedule="static") + cpu_loop_trans.omp_directive = "paralleldo" + process_directives = process_directives.replace('omp_threading', '') + else: + cpu_loop_trans = None + + process_directives = process_directives.replace('+', '') + if process_directives != '': + sys.exit(f"Unkown PARALLEL_DIRECTIVES: {process_directives}") + + return (offload_region_trans, mark_for_gpu_trans, + gpu_loop_trans, cpu_loop_trans) + + +def filter_files_by_name(name: str) -> bool: + ''' + :returns: whether to transform a file with the given name. Contrary to + FILES_TO_SKIP, this will still run the files through psyclone, but + it is useful + ''' + # The two options below are useful for file-by-file exhaustive tests. + # If the environemnt has ONLY_FILE defined, only process that one file and + # known-good files that need a "declare target" inside. 
+ only_do_files = [os.environ.get('ONLY_FILE', False)] + if "offloading" in PARALLEL_DIRECTIVES: + only_do_files.extend(["lib_fortran.f90", "solfrac_mod.f90"]) + if only_do_files and name not in only_do_files: + return True + # If the environemnt has ALL_BUT_FILE defined, process all files but + # the one named file. + all_but_file = os.environ.get('ALL_BUT_FILE', False) + if all_but_file and name == all_but_file: + return True + + if name in SKIP_FOR_PERFORMANCE: + return True + + # Parallelising this file currently causes a noticeable slowdown + if name.startswith("icethd"): + return True + + # This file fails for gcc NEMOv5 BENCH + if not NEMOV4 and name == "icedyn_rhg_evp.f90": + return True + + # Many of the obs_ files have problems to be offloaded to the GPU + if name.startswith("obs_"): + return True + + return False + + +def trans(psyir): + ''' Normalise and add directives to all possible loops, including the + implicit ones. + + :param psyir: the PSyIR of the provided file. + :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` + + ''' + if filter_files_by_name(psyir.name): + return + + (offload_region_trans, mark_for_gpu_trans, gpu_loop_trans, + cpu_loop_trans) = select_transformations() + + disable_profiling_for = [] + + for subroutine in psyir.walk(Routine): + + # Skip initialisation subroutines + if (subroutine.name.endswith('_alloc') or + subroutine.name.endswith('_init') or + subroutine.name.startswith('Agrif') or + subroutine.name.startswith('dia_') or + subroutine.name == 'dom_msk' or + subroutine.name == 'dom_zgr' or + subroutine.name == 'dom_ngb'): + continue + + enhance_tree_information(subroutine) + + normalise_loops( + subroutine, + hoist_local_arrays=False, + convert_array_notation=True, + # See issue #3022 + loopify_array_intrinsics=psyir.name != "getincom.f90", + convert_range_loops=True, + hoist_expressions=True + ) + + # Perform module-inlining of called routines. 
+ if INLINING_ENABLED: + inline_calls(subroutine) + + # These are functions that are called from inside parallel regions, + # annotate them with 'omp declare target' + if ( + mark_for_gpu_trans and + subroutine.name.lower().startswith("sign_") + or subroutine.name.lower() == "solfrac" + or (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop)) + ): + try: + mark_for_gpu_trans.apply(subroutine) + print(f"Marked {subroutine.name} as GPU-enabled") + except TransformationError as err: + print(err) + # We continue parallelising inside the routine, but this could + # change if the parallelisation directives added below are not + # nestable, in that case we could add a 'continue' here + disable_profiling_for.append(subroutine.name) + + elif (psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES + and gpu_loop_trans): + print( + f"Adding offload directives to subroutine: {subroutine.name}") + insert_explicit_loop_parallelism( + subroutine, + region_directive_trans=offload_region_trans, + loop_directive_trans=gpu_loop_trans, + collapse=True, + privatise_arrays=not NEMOV4, + enable_reductions=not REPRODUCIBLE, + uniform_intrinsics_only=REPRODUCIBLE, + ) + elif psyir.name not in PARALLELISATION_ISSUES and cpu_loop_trans: + # These have issues offloading, but we can still do threading + print(f"Adding OpenMP threading to subroutine: {subroutine.name}") + insert_explicit_loop_parallelism( + subroutine, + loop_directive_trans=cpu_loop_trans, + privatise_arrays=not NEMOV4, + enable_reductions=not REPRODUCIBLE, + ) + + # Iterate again and add profiling hooks when needed + for subroutine in psyir.walk(Routine): + if PROFILING_ENABLED and subroutine.name not in disable_profiling_for: + print(f"Adding profiling hooks to subroutine: {subroutine.name}") + add_profiling(subroutine.children) From 525e111e0f589d2468d5220cbe53597dffc899f0 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 10:10:19 +0000 Subject: [PATCH 04/33] Fixes to NEMO 
insert_loop_parallelism.py --- examples/nemo/scripts/insert_loop_parallelism.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 33f7bdc287..98bb1ccf55 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -44,9 +44,10 @@ normalise_loops, enhance_tree_information, PARALLELISATION_ISSUES, NEMO_MODULES_TO_IMPORT) from psyclone.psyir.nodes import Routine, Loop -from psyclone.psyir.transformations import OMPTargetTrans +from psyclone.psyir.transformations import ( + OMPTargetTrans, OMPDeclareTargetTrans) from psyclone.transformations import ( - OMPLoopTrans, OMPDeclareTargetTrans, TransformationError) + OMPLoopTrans, TransformationError) from psyclone.transformations import ( ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans) From d037cc02df4a8995bc6e147c187eb1222103b308 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 10:58:39 +0000 Subject: [PATCH 05/33] Remove NEMO omp_cpu_trans.py --- .github/workflows/nemo_tests.yml | 3 +- .github/workflows/nemo_v5_tests.yml | 3 +- .../nemo/scripts/insert_loop_parallelism.py | 42 ++++-- examples/nemo/scripts/omp_cpu_trans.py | 130 ------------------ 4 files changed, 36 insertions(+), 142 deletions(-) delete mode 100755 examples/nemo/scripts/omp_cpu_trans.py diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 75b5ccde30..a1f0195310 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -208,7 +208,8 @@ jobs: export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_cpu_trans.py -I ${MPI_HOME}/include" + export PARALLEL_DIRECTIVES="omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s 
${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -I ${MPI_HOME}/include" export FCFLAGS="-i4 -r8 -O2 -heap-arrays -fp-model=precise -g -qopenmp" export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index a282ae7bc7..22a6fcae0f 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -202,7 +202,8 @@ jobs: cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_cpu_trans.py" + export PARALLEL_DIRECTIVES="omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" export FCFLAGS="-fdefault-real-8 -O2 -fcray-pointer -ffree-line-length-none -g -fopenmp" # Clean up and compile diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 98bb1ccf55..bf9b75f16c 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -70,6 +70,10 @@ # By default, we don't do module inlining as it's still under development. INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) +# This environment variable informs if we're enabling asynchronous +# parallelism. +ASYNC_PARALLEL = os.environ.get('ASYNC_PARALLEL', False) + # Whether to chase the imported modules to improve symbol information (it can # also be a list of module filenames to limit the chasing to only specific # modules). 
This has to be used in combination with '-I' command flag in order @@ -135,6 +139,19 @@ "zdfsh2.f90", ] +ASYNC_ISSUES = [ + # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed + # (often invalid pointer dereference) in get_cstrgsurf + "sbcclo.f90", + "trcldf.f90", + # Runtime Error: Illegal address during kernel execution with + # asynchronicity. + "zdfiwm.f90", + "zdfsh2.f90", + # Diverging results with asynchronicity + "traadv_fct.f90", + "bdy_oce.f90", +] def select_transformations(): ''' @@ -182,17 +199,18 @@ def select_transformations(): def filter_files_by_name(name: str) -> bool: ''' :returns: whether to transform a file with the given name. Contrary to - FILES_TO_SKIP, this will still run the files through psyclone, but - it is useful + FILES_TO_SKIP, this will still run the files through psyclone. ''' # The two options below are useful for file-by-file exhaustive tests. # If the environemnt has ONLY_FILE defined, only process that one file and # known-good files that need a "declare target" inside. - only_do_files = [os.environ.get('ONLY_FILE', False)] - if "offloading" in PARALLEL_DIRECTIVES: - only_do_files.extend(["lib_fortran.f90", "solfrac_mod.f90"]) - if only_do_files and name not in only_do_files: - return True + only_file = os.environ.get('ONLY_FILE', False) + if only_file: + files_to_do = [only_file] + if "offloading" in PARALLEL_DIRECTIVES: + files_to_do.extend(["lib_fortran.f90", "solfrac_mod.f90"]) + if name in files_to_do: + return True # If the environemnt has ALL_BUT_FILE defined, process all files but # the one named file. 
all_but_file = os.environ.get('ALL_BUT_FILE', False) @@ -232,6 +250,7 @@ def trans(psyir): cpu_loop_trans) = select_transformations() disable_profiling_for = [] + enable_async = ASYNC_PARALLEL and psyir.name not in ASYNC_ISSUES for subroutine in psyir.walk(Routine): @@ -254,6 +273,7 @@ def trans(psyir): # See issue #3022 loopify_array_intrinsics=psyir.name != "getincom.f90", convert_range_loops=True, + increase_array_ranks=not NEMOV4, hoist_expressions=True ) @@ -265,9 +285,9 @@ def trans(psyir): # annotate them with 'omp declare target' if ( mark_for_gpu_trans and - subroutine.name.lower().startswith("sign_") - or subroutine.name.lower() == "solfrac" - or (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop)) + (subroutine.name.lower().startswith("sign_") + or subroutine.name.lower() == "solfrac" + or (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop))) ): try: mark_for_gpu_trans.apply(subroutine) @@ -291,6 +311,7 @@ def trans(psyir): privatise_arrays=not NEMOV4, enable_reductions=not REPRODUCIBLE, uniform_intrinsics_only=REPRODUCIBLE, + asynchronous_parallelism=enable_async, ) elif psyir.name not in PARALLELISATION_ISSUES and cpu_loop_trans: # These have issues offloading, but we can still do threading @@ -300,6 +321,7 @@ def trans(psyir): loop_directive_trans=cpu_loop_trans, privatise_arrays=not NEMOV4, enable_reductions=not REPRODUCIBLE, + asynchronous_parallelism=enable_async, ) # Iterate again and add profiling hooks when needed diff --git a/examples/nemo/scripts/omp_cpu_trans.py b/examples/nemo/scripts/omp_cpu_trans.py deleted file mode 100755 index 524423696a..0000000000 --- a/examples/nemo/scripts/omp_cpu_trans.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2021-2025, Science and Technology Facilities Council. -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# ----------------------------------------------------------------------------- -# Authors: S. Siso, STFC Daresbury Lab - -''' PSyclone transformation script to insert OpenMP for CPU -directives into Nemo code. Tested with ECMWF Nemo 4.0 code. 
''' - -import os -from utils import ( - insert_explicit_loop_parallelism, normalise_loops, add_profiling, - enhance_tree_information, PARALLELISATION_ISSUES, - NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine -from psyclone.transformations import OMPLoopTrans - -# Enable the insertion of profiling hooks during the transformation script -PROFILING_ENABLED = False - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. -RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# A environment variable can inform if this is targeting NEMOv4, in which case -# array privatisation is disabled. -NEMOV4 = os.environ.get('NEMOV4', False) - -# By default, allow optimisations that may change the results, e.g. reductions -REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = [] -if not NEMOV4: - # TODO #3112: These produce diverging run.stat results in gcc NEMOv5 BENCH - FILES_TO_SKIP = [ - "dynhpg.f90", - "dynspg_ts.f90", - "sbcssm.f90", - "tramle.f90", - "trazdf.f90", - ] - -if PROFILING_ENABLED: - # Fails with profiling enabled. issue #2723 - FILES_TO_SKIP.append("mppini.f90") - - -def trans(psyir): - ''' Add OpenMP Parallel and Do directives to all loops, including the - implicit ones. - - :param psyir: the PSyIR of the provided file. 
- :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - - # Parallelising this file currently causes a noticeable slowdown - if psyir.name.startswith("icethd"): - return - - # This file fails for gcc NEMOv5 BENCH - if not NEMOV4 and psyir.name == "icedyn_rhg_evp.f90": - return - - omp_parallel_trans = None - omp_loop_trans = OMPLoopTrans(omp_schedule="static") - omp_loop_trans.omp_directive = "paralleldo" - - for subroutine in psyir.walk(Routine): - print(f"Adding OpenMP threading to subroutine: {subroutine.name}") - - if PROFILING_ENABLED: - add_profiling(subroutine.children) - - enhance_tree_information(subroutine) - - normalise_loops( - subroutine, - hoist_local_arrays=False, - convert_array_notation=True, - # See issue #3022 - loopify_array_intrinsics=psyir.name != "getincom.f90", - convert_range_loops=True, - hoist_expressions=False, - scalarise_loops=False - ) - - if psyir.name not in PARALLELISATION_ISSUES: - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_parallel_trans, - loop_directive_trans=omp_loop_trans, - collapse=False, - privatise_arrays=not NEMOV4, - enable_reductions=not REPRODUCIBLE, - ) From 394f8f39172fc2f30c89cba962732ccb51e02e34 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 11:55:45 +0000 Subject: [PATCH 06/33] Replace NEMO omp_gpu_trans.py --- .github/workflows/nemo_tests.yml | 6 ++++ .github/workflows/nemo_v5_tests.yml | 32 ++++++++++++------- examples/nemo/scripts/Makefile | 9 ++++-- .../nemo/scripts/insert_loop_parallelism.py | 7 ++-- examples/nemo/scripts/utils.py | 5 ++- 5 files changed, 41 insertions(+), 18 deletions(-) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index a1f0195310..0e5e7561e0 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -130,6 +130,8 @@ jobs: module load perl/${PERL_VERSION} make clean export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + 
export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export REPRODUCIBLE=1 make -j ${NUM_PARALLEL} openmp_gpu make -j ${NUM_PARALLEL} compile-openmp_gpu export NV_ACC_POOL_THRESHOLD=75 @@ -180,6 +182,8 @@ jobs: module load hdf5/${HDF5_VERSION} netcdf-c/${NETCDF_C_VERSION} netcdf-fortran/${NETCDF_FORTRAN_VERSION} module load perl/${PERL_VERSION} make clean + export PARALLEL_DIRECTIVES="acc_offloading" + export REPRODUCIBLE=1 make -j ${NUM_PARALLEL} openacc_loops COMPILER_ARCH=linux_nvidia_acc_gpu make -j ${NUM_PARALLEL} compile-openacc_loops export NV_ACC_POOL_THRESHOLD=75 @@ -241,6 +245,8 @@ jobs: module load perl/${PERL_VERSION} make clean export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 make -j ${NUM_PARALLEL} openmp_gpu make -j ${NUM_PARALLEL} compile-openmp_gpu diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 22a6fcae0f..ea2e5515ec 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -246,7 +246,8 @@ jobs: export REPRODUCIBLE=1 export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" # Clean up and compile rm -rf tests/${TEST_DIR} ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -j ${NUM_PARALLEL} -v 1 @@ -296,13 +297,14 @@ jobs: # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="acc_offloading" export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run test cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 @@ -332,11 +334,13 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test @@ -375,12 +379,13 @@ jobs: export ENABLE_PROFILING=1 # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 @@ -399,8 +404,8 @@ jobs: rm -rf tests/${TEST_DIR} export NV_ACC_POOL_THRESHOLD=75 export CUDA_VISIBLE_DEVICES=1 - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run non-reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg @@ -424,13 +429,15 @@ jobs: # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test @@ -460,14 +467,15 @@ jobs: # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export PARALLEL_DIRECTIVES="omp_offloading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run test cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 diff --git a/examples/nemo/scripts/Makefile b/examples/nemo/scripts/Makefile index c1828e0e8a..41c3955de7 100644 --- a/examples/nemo/scripts/Makefile +++ b/examples/nemo/scripts/Makefile @@ -61,16 +61,19 @@ psycloned-passthrough/%.f90: ${ROOT_SRC}%.f90 psycloned-passthrough psyclone -s passthrough.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openmp_cpu/%.f90: ${ROOT_SRC}%.f90 psycloned-openmp_cpu - psyclone -s omp_cpu_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< + export PARALLEL_DIRECTIVES="cpu_threading" + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openmp_gpu/%.f90: ${ROOT_SRC}%.f90 psycloned-openmp_gpu - psyclone -s omp_gpu_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< + export PARALLEL_DIRECTIVES="omp_offloading" + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openacc_kernels/%.f90: ${ROOT_SRC}%.f90 psycloned-openacc_kernels psyclone -s acc_kernels_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openacc_loops/%.f90: ${ROOT_SRC}%.f90 psycloned-openacc_loops - psyclone -s acc_loops_trans.py -l output -I ${ROOT_SRC} -o $@ $< + export PARALLEL_DIRECTIVES="acc_offloading" + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -o $@ $< # 
Get the number of Makefile parallel jobs to pass it to the makenemo MAKE_PID := $(shell echo $$PPID) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index bf9b75f16c..bf7b566c9b 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -83,7 +83,8 @@ # List of all files that psyclone will skip processing FILES_TO_SKIP = [ - "fldread.f90", + "icefrm.f90", # Has unsupportet implicit symbol declaration + "icerst.f90" ] NEMOV5_EXCLUSIONS = [ @@ -128,6 +129,7 @@ "trcice_pisces.f90", "dtatsd.f90", "trcatf.f90", + "stp2d.f90", ] if "acc_offloading" in PARALLEL_DIRECTIVES: @@ -221,7 +223,8 @@ def filter_files_by_name(name: str) -> bool: return True # Parallelising this file currently causes a noticeable slowdown - if name.startswith("icethd"): + # if name.startswith("icethd"): + if name.startswith("ice"): return True # This file fails for gcc NEMOv5 BENCH diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 82cde1e696..4b7567542d 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -249,6 +249,8 @@ def normalise_loops( :param hoist_expressions: whether to hoist bounds and loop invariant statements out of the loop nest. 
''' + filename = schedule.root.name + if hoist_local_arrays and schedule.name not in CONTAINS_STMT_FUNCTIONS: # Apply the HoistLocalArraysTrans when possible, it cannot be applied # to files with statement functions because it will attempt to put the @@ -283,7 +285,8 @@ def normalise_loops( except TransformationError as err: print(err.value) - if convert_range_loops: + # TODO #2951: fldread has a bug in ArrayAssignment2LoopsTrans + if convert_range_loops and schedule.root.name != "fldread.f90": # Convert all array implicit loops to explicit loops explicit_loops = ArrayAssignment2LoopsTrans() for assignment in schedule.walk(Assignment): From 6fd4de110b18e28421ab0624a2ac51607dda0450 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 13:30:06 +0000 Subject: [PATCH 07/33] Fix some issues with new NEMO script --- .github/workflows/nemo_tests.yml | 1 + examples/nemo/eg1/Makefile | 4 ++-- examples/nemo/eg2/Makefile | 4 ++-- examples/nemo/scripts/README.md | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 0e5e7561e0..fe83fbec57 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -184,6 +184,7 @@ jobs: make clean export PARALLEL_DIRECTIVES="acc_offloading" export REPRODUCIBLE=1 + export NEMOV4=1 make -j ${NUM_PARALLEL} openacc_loops COMPILER_ARCH=linux_nvidia_acc_gpu make -j ${NUM_PARALLEL} compile-openacc_loops export NV_ACC_POOL_THRESHOLD=75 diff --git a/examples/nemo/eg1/Makefile b/examples/nemo/eg1/Makefile index bf90d21e65..93ccb37292 100644 --- a/examples/nemo/eg1/Makefile +++ b/examples/nemo/eg1/Makefile @@ -40,8 +40,8 @@ include ../../common.mk transform: ${PSYCLONE} -s ./openmp_cpu_levels_trans.py ../code/tra_adv.F90 ${PSYCLONE} -s ./openmp_gpu_levels_trans.py ../code/tra_adv.F90 - ${PSYCLONE} -s ../scripts/omp_cpu_trans.py ../code/tra_adv.F90 - ${PSYCLONE} -s ../scripts/omp_gpu_trans.py ../code/tra_adv.F90 + 
PARALLEL_DIRECTICVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90 + PARALLEL_DIRECTICVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90 compile: transform @echo "No compilation supported for nemo/eg1" diff --git a/examples/nemo/eg2/Makefile b/examples/nemo/eg2/Makefile index c9949bfa50..b21e66b48d 100644 --- a/examples/nemo/eg2/Makefile +++ b/examples/nemo/eg2/Makefile @@ -43,8 +43,8 @@ transform: omp_levels omp_levels: ${PSYCLONE} -s ./omp_levels_trans.py ../code/traldf_iso.F90 - ${PSYCLONE} -s ../scripts/omp_cpu_trans.py ../code/traldf_iso.F90 - ${PSYCLONE} -s ../scripts/omp_gpu_trans.py ../code/traldf_iso.F90 + PARALLEL_DIRECTICVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90 + PARALLEL_DIRECTICVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90 compile: transform diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md index 606b9e3697..c32956b2cd 100644 --- a/examples/nemo/scripts/README.md +++ b/examples/nemo/scripts/README.md @@ -128,7 +128,7 @@ export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" ``` -In addition, `ASYNC_PARALLEL`, `ENABLE_INLINING`, `PROFILING` +TODO: Mention `ASYNC_PARALLEL`, `ENABLE_INLINING`, `PROFILING` ## Compilation From 6a89eb9093f41f4963ed65615087d91621bf6a15 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 15:05:07 +0000 Subject: [PATCH 08/33] Attempt to fix issues with NEMO OpenACC --- .github/workflows/nemo_v5_tests.yml | 10 +++---- .../nemo/scripts/insert_loop_parallelism.py | 11 ++++--- src/psyclone/transformations.py | 29 ++++++++++++++++++- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index ea2e5515ec..b7adaf7d42 100644 --- a/.github/workflows/nemo_v5_tests.yml 
+++ b/.github/workflows/nemo_v5_tests.yml @@ -56,7 +56,7 @@ jobs: bench_gfortran_omp_cpu: ${{ steps.bench_gfortran_omp_cpu.outputs.time }} bench_nvfortran_omp_offload: ${{ steps.bench_nvfortran_omp_offload.outputs.time }} bench_nvfortran_omp_offload_build: ${{ steps.bench_nvfortran_omp_offload.outputs.build_time }} - orca1_nvfortran_omp_offload: ${{ steps.orca1_nvfortran_omp_offload.outputs.time }} + orca1_nvfortran_acc_offload: ${{ steps.orca1_nvfortran_acc_offload.outputs.time }} orca2_nvfortran_omp_offload: ${{ steps.orca2_nvfortran_omp_offload.outputs.time }} bench_nvfortran_omp_offload_async: ${{ steps.bench_nvfortran_omp_offload_async.outputs.time }} orca2_nvfortran_omp_offload_async: ${{ steps.orca2_nvfortran_omp_offload_async.outputs.time }} @@ -282,7 +282,7 @@ jobs: echo "build_time=${BUILD_ELAPSED}" >> "${GITHUB_OUTPUT}" - name: NEMO 5.0 nvidia OpenMP for GPUs (UKMO ORCA1 - managed memory) - id: orca1_nvfortran_omp_offload + id: orca1_nvfortran_acc_offload run: | # Set up environment source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh @@ -291,7 +291,7 @@ jobs: export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv export NEMO_DIR=/archive/psyclone-tests/latest-run/UKMO-NEMOv5 - export TEST_DIR=ORCA1_OMP_OFFLOAD_NVHPC + export TEST_DIR=ORCA1_ACC_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
@@ -531,10 +531,10 @@ jobs: '"$COMMON_FIELDS"' }, { - ci_test: "NEMOv5 OpenMP for GPU (ORCA1)", + ci_test: "NEMOv5 OpenACC for GPU (ORCA1)", nemo_version: "NEMO 5.0-RC MO patch", compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", - elapsed_time: '"${{needs.run_if_on_mirror.outputs.orca1_nvfortran_omp_offload}}"', + elapsed_time: '"${{needs.run_if_on_mirror.outputs.orca1_nvfortran_acc_offload}}"', '"$COMMON_FIELDS"' }, { diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index bf7b566c9b..74eb172172 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -82,17 +82,18 @@ RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT # List of all files that psyclone will skip processing -FILES_TO_SKIP = [ - "icefrm.f90", # Has unsupportet implicit symbol declaration - "icerst.f90" -] +FILES_TO_SKIP = [] NEMOV5_EXCLUSIONS = [ + # Fail in gcc NEMOv5 BENCH "dynhpg.f90", "dynspg_ts.f90", "sbcssm.f90", "tramle.f90", "trazdf.f90", + # Fail when enabling seaice + "icefrm.f90", # Has unsupported implicit symbol declaration + "icerst.f90" ] NEMOV4_EXCLUSIONS = [ @@ -155,6 +156,7 @@ "bdy_oce.f90", ] + def select_transformations(): ''' Use the PARALLEL_DIRECTIVES global to select what specific transformations @@ -260,6 +262,7 @@ def trans(psyir): # Skip initialisation subroutines if (subroutine.name.endswith('_alloc') or subroutine.name.endswith('_init') or + subroutine.name.startswith('init_') or subroutine.name.startswith('Agrif') or subroutine.name.startswith('dia_') or subroutine.name == 'dom_msk' or diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index c3a9e2cc8e..bf9c4e46f8 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -61,7 +61,7 @@ ACCDataDirective, ACCDirective, ACCEnterDataDirective, ACCKernelsDirective, ACCLoopDirective, ACCParallelDirective, ACCRoutineDirective, Call, CodeBlock, Directive, Literal, 
Loop, Node, - OMPDirective, OMPMasterDirective, + OMPDirective, OMPMasterDirective, Reference, OMPParallelDirective, OMPParallelDoDirective, OMPSerialDirective, Return, Schedule, OMPReductionClause, OMPSingleDirective, PSyDataNode, IntrinsicCall) @@ -1187,9 +1187,16 @@ def validate(self, node_list, options=None): avoid using unsupported nodes inside a region. :param bool options["default_present"]: this flag controls if the inserted directive should include the default_present clause. + :param bool options["allow_strings"]: whether to allow the + transformation on assignments involving character types. Defaults + to False. + :param bool options["verbose"]: whether to allow the + transformation on assignments involving character types. Defaults + to False. ''' node_list = self.get_node_list(node_list) + verbose = options.get("allow_strings", False) super().validate(node_list, options) if options is not None and "default_present" in options: if not isinstance(options["default_present"], bool): @@ -1199,6 +1206,23 @@ def validate(self, node_list, options=None): ) device_string = options.get("device_string", "") if options else "" for node in node_list: + if not options.get("allow_strings", False): + # Check there are no character assignments in the region + for datanode in node.walk((Reference, Literal), + stop_type=Reference): + dtype = datanode.datatype + # Don't allow CHARACTERS on GPU + if hasattr(dtype, "intrinsic"): + if dtype.intrinsic == ScalarType.Intrinsic.CHARACTER: + message = ( + f"OpenACC Parallel cannot enclose a region " + f"that uses characters, but found: " + f"{datanode.debug_string()}" + ) + if verbose: + node.preceding_comment = message + raise TransformationError(message) + for call in node.walk(Call): if not call.is_available_on_device(device_string): if isinstance(call, IntrinsicCall): @@ -1231,6 +1255,9 @@ def apply(self, target_nodes, options=None): avoid using unsupported nodes inside a region. 
:param bool options["default_present"]: this flag controls if the inserted directive should include the default_present clause. + :param bool options["allow_strings"]: whether to allow the + transformation on assignments involving character types. Defaults + to False. ''' if not options: From 2252a48965ad557d90dde544968245c848cc0736 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 16:49:35 +0000 Subject: [PATCH 09/33] Add allow_strings validation option to ACCParallel --- src/psyclone/transformations.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index bf9c4e46f8..52f60f04d8 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -1196,7 +1196,9 @@ def validate(self, node_list, options=None): ''' node_list = self.get_node_list(node_list) - verbose = options.get("allow_strings", False) + verbose = options.get("allow_strings", False) if options else False + device_string = options.get("device_string", "") if options else "" + allow_strings = options.get("allow_strings", "") if options else False super().validate(node_list, options) if options is not None and "default_present" in options: if not isinstance(options["default_present"], bool): @@ -1204,9 +1206,8 @@ def validate(self, node_list, options=None): f"The provided 'default_present' option must be a " f"boolean, but found '{options['default_present']}'." 
) - device_string = options.get("device_string", "") if options else "" for node in node_list: - if not options.get("allow_strings", False): + if not allow_strings: # Check there are no character assignments in the region for datanode in node.walk((Reference, Literal), stop_type=Reference): From 73fb247a00c52cfb5191aa960fdaa500fdb0c97c Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 17:25:20 +0000 Subject: [PATCH 10/33] Exlcude icb from NEMO insert_loop_parallelism and skip failing async --- .github/workflows/nemo_v5_tests.yml | 19 ++++++++++--------- examples/nemo/scripts/Makefile | 3 --- .../nemo/scripts/insert_loop_parallelism.py | 2 ++ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index b7adaf7d42..896f59ebef 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -436,18 +436,19 @@ jobs: # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ - -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ - add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 + # ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ + # -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ + # add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test - cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 - ln -sf /archive/psyclone-tests/nemo-inputs/UKMO-eORCA2/* . + # cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 + # ln -sf /archive/psyclone-tests/nemo-inputs/UKMO-eORCA2/* . 
# Uses both, threading and offloading - export CUDA_VISIBLE_DEVICES=1 - OMP_NUM_THREADS=4 ./nemo - diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca2.nvhpc.10steps run.stat - export VAR_TIME=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) + # export CUDA_VISIBLE_DEVICES=1 + # OMP_NUM_THREADS=4 ./nemo + # diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca2.nvhpc.10steps run.stat + # export VAR_TIME=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) + export VAR_TIME="0.0" echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" - name: NEMO 5.0 nvidia Async OpenMP for GPUs (UKMO ORCA1 - managed memory) diff --git a/examples/nemo/scripts/Makefile b/examples/nemo/scripts/Makefile index 41c3955de7..9ebeb09634 100644 --- a/examples/nemo/scripts/Makefile +++ b/examples/nemo/scripts/Makefile @@ -61,18 +61,15 @@ psycloned-passthrough/%.f90: ${ROOT_SRC}%.f90 psycloned-passthrough psyclone -s passthrough.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openmp_cpu/%.f90: ${ROOT_SRC}%.f90 psycloned-openmp_cpu - export PARALLEL_DIRECTIVES="cpu_threading" psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openmp_gpu/%.f90: ${ROOT_SRC}%.f90 psycloned-openmp_gpu - export PARALLEL_DIRECTIVES="omp_offloading" psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openacc_kernels/%.f90: ${ROOT_SRC}%.f90 psycloned-openacc_kernels psyclone -s acc_kernels_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openacc_loops/%.f90: ${ROOT_SRC}%.f90 psycloned-openacc_loops - export PARALLEL_DIRECTIVES="acc_offloading" psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -o $@ $< # Get the number of Makefile parallel jobs to pass it to the makenemo diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 74eb172172..8cf639ccad 100755 --- 
a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -228,6 +228,8 @@ def filter_files_by_name(name: str) -> bool: # if name.startswith("icethd"): if name.startswith("ice"): return True + if name.startswith("icb"): + return True # This file fails for gcc NEMOv5 BENCH if not NEMOV4 and name == "icedyn_rhg_evp.f90": From 6607aecd31e83bf721eae10954901264af46a45e Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 23:29:01 +0000 Subject: [PATCH 11/33] Reorder NEMO readme and add section about identifying problems --- examples/nemo/scripts/README.md | 67 ++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md index c32956b2cd..49cd0a8b64 100644 --- a/examples/nemo/scripts/README.md +++ b/examples/nemo/scripts/README.md @@ -130,7 +130,7 @@ export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" TODO: Mention `ASYNC_PARALLEL`, `ENABLE_INLINING`, `PROFILING` -## Compilation +## Compiling and running the application Once the environment variables are set, use the `makenemo` command with the desired NEMO configuration and keys. For example: @@ -140,9 +140,50 @@ the desired NEMO configuration and keys. For example: ``` If everything worked you can see the generated files in the -`/BLD/tmp` directory. +`/BLD/tmp` directory. And you can run the binary from the +EXP00 directory. For example, for a hybrid MPI+OMP offloading+OMP threading +we can do: + +```bash +# Prepare problem +ln -sf ${ORCA2_INPUTS}/ORCA2_ICE_v5.0.0/* cfgs/ORCA2_psycloned/EXP00/. +cd cfgs/ORCA2_psycloned/EXP00 +# Reduce num of iterations and add timing/runstat +sed -i "s/nn_itend.*/nn_itend = 10/" namelist_cfg +sed -i "s/ln_icebergs.*/ln_icebergs = .false./" namelist_cfg +sed -i "s/\&namctl.*/\&namctl\n ln_timing = .true. 
\n sn_cfctl%l_runstat = .true.\n/" namelist_cfg +# Run problem +OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=1,2 mpirun -n 2 ./nemo +``` + +## Identifying the cause of issues + +A difficulty of working with code-transformation scripts is that it is possible +to incorrect transform a file semantics while still creating valid Fortran. +This means that the transformation will succeed and the generated code will +compile, but the results will diverge. This gets more complicated with parallel +programming because certain operations like reductions or atomics are not +always reproducible. For NEMO we typically compare the generated `run.stat` field +values. To do that we recommend: + +- Starting building NEMO without `psyclonefc` and conservative optimisation flags + and ru it serially. Then store the generated `run.stat`. +- Then switch to using `psyclonefc` with the `PSYCLONE_OTPS="-s passthrough.py"`, + this will make all files pass through psyclone but without applying any + transformations. Check if the results still match. +- Then build it with `PARALLEL_DIRECTIVES="" PSYCLONE_OTPS="-s insert_loop_parallelism.py"` + and check if the results still match +- Then run it `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_threading" PSYCLONE_OTPS="-s insert_loop_parallelism.py"` + and see if the results still match. +- Finally, run it with `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_offloading" PSYCLONE_OTPS="-s insert_loop_parallelism.py"` + +Orthogonally to finding which step is causing the divergence we may want to find +which file/s are causing it. This folder also contains a `do_file_by_file.sh` +script that build NEMO many times, each with only one file being transformed, +and compares the results with the stores `run.stat` -## Fixing issues and tuning the generated implementation + +## Tuning the generated implementation Since this is now a two-step process. There are two locations where you can modify files that will alter the output result. First is the input source code. 
For this @@ -164,23 +205,3 @@ already includes directives, you need to reference it with the `-e ` and in the FILES_TO_SKIP (otherwise Psyclone would ignore the given directives and try to insert its own). This is currently the optimal approach for `seaice` and `lbclnk.f90` GPU offloading. - -## Running the generated code - -Finally, once the NEMO `makenemo` build has succeeded, we can run NEMO from -the configuration EXP00 directory. We include some known-good-outputs in the -`KGO` directory, but be aware that these can be compiler/flags/system-sensitive: - -```bash -# Prepare problem -ln -sf ${ORCA2_INPUTS}/ORCA2_ICE_v5.0.0/* cfgs/ORCA2_psycloned/EXP00/. -cd cfgs/ORCA2_ICE_PISCES_psycloned/EXP00 -# Reduce num of iterations and add timing/runstat -sed -i "s/nn_itend.*/nn_itend = 10/" namelist_cfg -sed -i "s/ln_icebergs.*/ln_icebergs = .false./" namelist_cfg -sed -i "s/\&namctl.*/\&namctl\n ln_timing = .true. \n sn_cfctl%l_runstat = .true.\n/" namelist_cfg - -# Run problem -OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=1,2 mpirun -n 2 ./nemo -diff ${PSYCLONE_NEMO_EXAMPLES_DIR}/KGOs/run.stat.orca_ice_pisces.nvhpc.10steps run.stat -``` From 1e57470248698010881045af93ff5c90b0174764 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 11 Nov 2025 23:34:19 +0000 Subject: [PATCH 12/33] Update NEMO README --- examples/nemo/scripts/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md index 49cd0a8b64..8ca7479ee6 100644 --- a/examples/nemo/scripts/README.md +++ b/examples/nemo/scripts/README.md @@ -185,9 +185,10 @@ and compares the results with the stores `run.stat` ## Tuning the generated implementation -Since this is now a two-step process. There are two locations where you can modify -files that will alter the output result. First is the input source code. 
For this -we recommend using the built-in `makenemo` functionality +Since this is now a two-step process, there are two locations where you can modify +files that will alter the output result. First is manually modifying the original +source code. For this we recommend using the built-in `makenemo` functionality +that allows pointing to a directory with patched source files: ```bash ./makenemo -e ... From aea4c0edbdba687553dbf7e24c6e1bd8992f6f78 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 12 Nov 2025 12:07:07 +0000 Subject: [PATCH 13/33] Update NEMO script to only exclude files with offloading issues in NEMOv5 --- .github/workflows/nemo_tests.yml | 7 +- examples/nemo/scripts/README.md | 16 ++++- .../nemo/scripts/insert_loop_parallelism.py | 65 +++++++++++-------- 3 files changed, 53 insertions(+), 35 deletions(-) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index fe83fbec57..aaec4b6744 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -190,12 +190,7 @@ jobs: export NV_ACC_POOL_THRESHOLD=75 export CUDA_VISIBLE_DEVICES=1 make run-openacc_loops - # Check the output is as expected (TODO #2895: improve numerical reproducibility) - make output-openacc_loops | grep -q " it : 10" || (echo "Error: 'it : 10' not found!" & false) - make output-openacc_loops | grep -q "|ssh|_max: 0.259483" || (echo "Error: '|ssh|_max: 0.259483' not found!" & false) - make output-openacc_loops | grep -q "|U|_max: 0.458515" || (echo "Error: '|U|_max: 0.458515' not found!" & false) - make output-openacc_loops | grep -q "S_min: 0.482686" || (echo "Error: 'S_min: 0.482686' not found!" & false) - make output-openacc_loops | grep -q "S_max: 0.407622" || (echo "Error: 'S_max: 0.407622' not found!"
& false) + diff <(make -s output-openacc_loops) KGOs/run.stat.nemo4.spitz12.nvhpc.10steps export VAR_TIME=$(grep -A 1 "Elapsed Time" <(make -s time-openacc_loops) | head -n 2 | tail -n 1 | awk '{print $1}') echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md index 8ca7479ee6..1067b20f2e 100644 --- a/examples/nemo/scripts/README.md +++ b/examples/nemo/scripts/README.md @@ -62,9 +62,10 @@ wget https://gws-access.jasmin.ac.uk/public/nemo/sette_inputs/r5.0.0/ORCA2_ICE_v tar -xzf ORCA2_ICE_v5.0.0.tar.gz ``` -The examples have been tested with NEMOv4.0.2 and NEMOv5.0, but we aim to support any -version of NEMO. If you encounter any issue applying these examples please report to -the authors. +The examples have been tested with NEMOv4.0.2 (SPITZ12 configuration) and +NEMOv5.0 (BENCH and ORCA2_ICE_PISCES configurations), but we aim to support +any version of NEMO. If you encounter any issue applying these examples +please report to the authors. ## Set up environment variables @@ -128,6 +129,15 @@ export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" ``` +> [!Note] +> Currently, NEMOv4 and NEMOv5 take different optimisation paths, so it is +> important to also set: +> +> ```bash +> export NEMOV4=1 +> ``` +> when applying the transformations to NEMOv4.
+ TODO: Mention `ASYNC_PARALLEL`, `ENABLE_INLINING`, `PROFILING` ## Compiling and running the application diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 8cf639ccad..ef09e01c45 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -41,8 +41,7 @@ import sys from utils import ( add_profiling, inline_calls, insert_explicit_loop_parallelism, - normalise_loops, enhance_tree_information, PARALLELISATION_ISSUES, - NEMO_MODULES_TO_IMPORT) + normalise_loops, enhance_tree_information, NEMO_MODULES_TO_IMPORT) from psyclone.psyir.nodes import Routine, Loop from psyclone.psyir.transformations import ( OMPTargetTrans, OMPDeclareTargetTrans) @@ -91,7 +90,7 @@ "sbcssm.f90", "tramle.f90", "trazdf.f90", - # Fail when enabling seaice + # Fail in nvfortran when enabling seaice "icefrm.f90", # Has unsupported implicit symbol declaration "icerst.f90" ] @@ -114,33 +113,47 @@ "histcom.f90", ] -OFFLOADING_ISSUES = [ - # Produces different output results - "zdftke.f90", - # The following issues only affect BENCH (because ice is enabled?) 
- # Runtime Error: Illegal address during kernel execution - "trcrad.f90", - # nvhpc > 24.11 - Signal 11 issues - "trcbbl.f90", - "bdyice.f90", - "sedfunc.f90", - "stpmlf.f90", - "trddyn.f90", - "trczdf.f90", - "trcice_pisces.f90", - "dtatsd.f90", - "trcatf.f90", - "stp2d.f90", -] - -if "acc_offloading" in PARALLEL_DIRECTIVES: - OFFLOADING_ISSUES = OFFLOADING_ISSUES + [ +# These files change the results from the baseline when psyclone adds +# parallelisation directives +PARALLELISATION_ISSUES = [] +if not NEMOV4: + PARALLELISATION_ISSUES.extend([ + "ldfc1d_c2d.f90", + "tramle.f90", + "traqsr.f90", + ]) + +# These files change the results from the baseline when psyclone adds +# offloading directives +OFFLOADING_ISSUES = [] +if not NEMOV4: + OFFLOADING_ISSUES.extend([ + # Produces different output results + "zdftke.f90", + # The following issues only affect BENCH (because ice is enabled?) + # Runtime Error: Illegal address during kernel execution + "trcrad.f90", + # nvhpc > 24.11 - Signal 11 issues + "trcbbl.f90", + "bdyice.f90", + "sedfunc.f90", + "stpmlf.f90", + "trddyn.f90", + "trczdf.f90", + "trcice_pisces.f90", + "dtatsd.f90", + "trcatf.f90", + "stp2d.f90", + ]) + +if not NEMOV4 and "acc_offloading" in PARALLEL_DIRECTIVES: + OFFLOADING_ISSUES.extend([ # Fail in OpenACC ORCA2_ICE_PISCES "dynzdf.f90", "trabbl.f90", "trazdf.f90", "zdfsh2.f90", - ] + ]) ASYNC_ISSUES = [ # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed @@ -226,7 +239,7 @@ def filter_files_by_name(name: str) -> bool: # Parallelising this file currently causes a noticeable slowdown # if name.startswith("icethd"): - if name.startswith("ice"): + if not NEMOV4 and name.startswith("ice"): return True if name.startswith("icb"): return True From 782a1635e15e9ed17ea31a7ce332b5516483c483 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 12 Nov 2025 14:49:06 +0000 Subject: [PATCH 14/33] Replace acc_loop_trans in NEMOv4 test --- .github/workflows/nemo_tests.yml | 43 +++++---
examples/nemo/scripts/README.md | 6 +- examples/nemo/scripts/acc_loops_trans.py | 123 ----------------------- 3 files changed, 33 insertions(+), 139 deletions(-) delete mode 100755 examples/nemo/scripts/acc_loops_trans.py diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index aaec4b6744..08f9fed6c2 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -173,25 +173,39 @@ jobs: - name: NEMO MetOffice OpenACC loops for GPU id: nemo_acc_loops run: | + source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh + spack unload && spack load nemo-build-environment%nvhpc . .runner_venv/bin/activate + + # Set up envvars export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PROFILE_HOME=${GITHUB_WORKSPACE}/lib/profiling/nvidia/ export NEMO_DIR=${PREFIX}/UKMO-NEMOv4 - cd $PSYCLONE_NEMO_DIR - module load nvidia-hpcsdk/${NVFORTRAN_VERSION} - module load hdf5/${HDF5_VERSION} netcdf-c/${NETCDF_C_VERSION} netcdf-fortran/${NETCDF_FORTRAN_VERSION} - module load perl/${PERL_VERSION} - make clean + export TEST_DIR=SPITZ12_ACC_LOOPS_NVHPC + export PSYCLONE_COMPILER=$MPIF90 + export MPIF90=psyclonefc export PARALLEL_DIRECTIVES="acc_offloading" export REPRODUCIBLE=1 - export NEMOV4=1 - make -j ${NUM_PARALLEL} openacc_loops - COMPILER_ARCH=linux_nvidia_acc_gpu make -j ${NUM_PARALLEL} compile-openacc_loops + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -I ${MPI_HOME}/include" + export FCFLAGS="-i4 -Mr8 -O2 -Mnovect -Mnofma -g -acc -mp=gpu -gpu=mem:managed,math_uniform" + export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + + # Compile + cd ${PREFIX}/ECMWF-NEMOv4 + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + rm -rf cfgs/${TEST_DIR} + ./makenemo -r SPITZ12 -m linux_spack -n ${TEST_DIR} \ + add_key "IEEE_IS_NAN=ieee_is_nan key_nosignedzero" \ + del_key 
"key_iomput key_mpp_mpi key_si3" -j ${NUM_PARALLEL} + + # Run test export NV_ACC_POOL_THRESHOLD=75 export CUDA_VISIBLE_DEVICES=1 - make run-openacc_loops - diff <(make -s output-openacc_loops) KGOs/run.stat.nemo4.spitz12.nvhpc.10steps - export VAR_TIME=$(grep -A 1 "Elapsed Time" <(make -s time-openacc_loops) | head -n 2 | tail -n 1 | awk '{print $1}') + cd cfgs/${TEST_DIR}/EXP00/ + ln -sf /archive/psyclone-tests/nemo-inputs/UKMO-eORCA1_GO8_NEMOv4/*.nc . + ./nemo + diff run.stat $PSYCLONE_NEMO_DIR/KGOs/run.stat.nemo4.spitz12.nvhpc.10steps + export VAR_TIME=$(grep -A 1 "Elapsed Time" timing.output | head -n 2 | tail -n 1 | awk '{print $1}') echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" # PSyclone, compile and run ECMWF NEMO with OpenMP for CPUs. This uses @@ -205,6 +219,7 @@ jobs: source .runner_venv/bin/activate # Set up envvars + export TEST_DIR=SPITZ12_ACC_LOOPS_NVHPC export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc @@ -215,12 +230,14 @@ jobs: # Compile cd ${PREFIX}/ECMWF-NEMOv4 - ./makenemo -r SPITZ12 -m linux_spack -n SPITZ12_psyclone \ + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + rm -rf cfgs/${TEST_DIR} + ./makenemo -r SPITZ12 -m linux_spack -n ${TEST_DIR} \ add_key "IEEE_IS_NAN=ieee_is_nan key_nosignedzero" \ del_key "key_iomput" -j ${NUM_PARALLEL} # Run NEMO - cd cfgs/SPITZ12_psyclone/EXP00/ + cd cfgs/${TEST_DIR}/EXP00/ ln -sf /archive/psyclone-tests/nemo-inputs/ECMWF-eORCA1_GO8/* . 
export OMP_NUM_THREADS=12 ./nemo diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md index 1067b20f2e..215fc009e3 100644 --- a/examples/nemo/scripts/README.md +++ b/examples/nemo/scripts/README.md @@ -169,15 +169,15 @@ OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=1,2 mpirun -n 2 ./nemo ## Identifying the cause of issues A difficulty of working with code-transformation scripts is that it is possible -to incorrect transform a file semantics while still creating valid Fortran. +to incorrectly transform a file semantics while still creating valid Fortran. This means that the transformation will succeed and the generated code will compile, but the results will diverge. This gets more complicated with parallel programming because certain operations like reductions or atomics are not always reproducible. For NEMO we typically compare the generated `run.stat` field values. To do that we recommend: -- Starting building NEMO without `psyclonefc` and conservative optimisation flags - and ru it serially. Then store the generated `run.stat`. +- Starting building NEMO *without* `psyclonefc` and conservative optimisation flags + and run it serially. Then store the generated `run.stat`. - Then switch to using `psyclonefc` with the `PSYCLONE_OTPS="-s passthrough.py"`, this will make all files pass through psyclone but without applying any transformations. Check if the results still match. diff --git a/examples/nemo/scripts/acc_loops_trans.py b/examples/nemo/scripts/acc_loops_trans.py deleted file mode 100755 index 4c2addd5cf..0000000000 --- a/examples/nemo/scripts/acc_loops_trans.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2023-2025, Science and Technology Facilities Council. -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# ----------------------------------------------------------------------------- -# Authors: S. Siso, STFC Daresbury Lab - -''' PSyclone transformation script showing the introduction of OpenACC loop -directives into Nemo code. 
''' - -from utils import ( - insert_explicit_loop_parallelism, normalise_loops, add_profiling, - enhance_tree_information, NOT_PERFORMANT, NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine -from psyclone.transformations import ( - ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans) - -# Enable the insertion of profiling hooks during the transformation script -PROFILING_ENABLED = True - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. -RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = NOT_PERFORMANT - - -def trans(psyir): - ''' Add OpenACC Parallel and Loop directives to all loops, including the - implicit ones, to parallelise the code and execute it in an acceleration - device. - - :param psyir: the PSyIR of the provided file. 
- :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - acc_region_trans = ACCParallelTrans(default_present=False) - acc_loop_trans = ACCLoopTrans() - - # TODO #2317: Has structure accesses that can not be offloaded and has - # a problematic range to loop expansion of (1:1) - if psyir.name.startswith("obs_"): - print("Skipping", psyir.name) - return - - for subroutine in psyir.walk(Routine): - print(f"Transforming subroutine: {subroutine.name}") - - if PROFILING_ENABLED: - add_profiling(subroutine.children) - - # S-0074-Illegal number or type of arguments to ubound [and lbound] - # - keyword argument array; and NVFORTRAN-S-0082-Illegal substring - # expression for variable filtide - if subroutine.name in ("bdytide_init", "sbc_cpl_init"): - print("Skipping", subroutine.name) - continue - - # OpenACC fails in the following routines with the Compiler error: - # Could not find allocated-variable index for symbol - xxx - # This all happen on characters arrays, e.g. cd_nat - if subroutine.name in ("lbc_nfd_2d_ptr", "lbc_nfd_3d_ptr", - "lbc_nfd_4d_ptr", "bdy_dyn", "dia_obs_init"): - print("Skipping", subroutine.name) - continue - - enhance_tree_information(subroutine) - - normalise_loops( - subroutine, - hoist_local_arrays=True, - convert_array_notation=True, - convert_range_loops=True, - hoist_expressions=True - ) - - # These are functions that are called from inside parallel regions, - # annotate them with 'acc routine' - if subroutine.name.lower().startswith("sign_"): - ACCRoutineTrans().apply(subroutine) - print(f"Marked {subroutine.name} as GPU-enabled") - continue - - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=acc_region_trans, - loop_directive_trans=acc_loop_trans, - # Collapse is necessary to give GPUs enough parallel items - collapse=True, - ) From ca2c06eddd1c52776ebb4483bb154c908f01d47a Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 12 Nov 2025 15:07:59 +0000 Subject: [PATCH 15/33] Remove unneeded NEMO flag 
--- .github/workflows/nemo_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 08f9fed6c2..cb5b6f8620 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -186,7 +186,7 @@ jobs: export MPIF90=psyclonefc export PARALLEL_DIRECTIVES="acc_offloading" export REPRODUCIBLE=1 - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -I ${MPI_HOME}/include" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" export FCFLAGS="-i4 -Mr8 -O2 -Mnovect -Mnofma -g -acc -mp=gpu -gpu=mem:managed,math_uniform" export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script From 1e52822d92d0002c9196ffe35bd351124fe475a7 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 12 Nov 2025 15:38:32 +0000 Subject: [PATCH 16/33] Fix wrong path in NEMO integration test --- .github/workflows/nemo_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index cb5b6f8620..29003c647f 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -191,7 +191,7 @@ jobs: export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script # Compile - cd ${PREFIX}/ECMWF-NEMOv4 + cd ${PREFIX}/UKMO-NEMOv4 cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm rm -rf cfgs/${TEST_DIR} ./makenemo -r SPITZ12 -m linux_spack -n ${TEST_DIR} \ From 0a7c24f39a65103842784990ebc06e93c3e442d8 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Thu, 13 Nov 2025 22:03:24 +0000 Subject: [PATCH 17/33] Small change of location of NEMO utils.py conditional --- examples/nemo/scripts/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 
41c0b465ae..90adbd3332 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -238,9 +238,6 @@ def normalise_loops( # Convert all array implicit loops to explicit loops explicit_loops = ArrayAssignment2LoopsTrans() for assignment in schedule.walk(Assignment): - if filename == "fldread.f90": - # TODO #2951: This file has issues converting SturctureRefs - continue try: explicit_loops.apply( assignment, options={'verbose': True}) From 98f5dca84c4f7d87afd286fa38381d214cbd2420 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Mon, 1 Dec 2025 12:41:01 +0000 Subject: [PATCH 18/33] Fix issues with CI and integration tests --- examples/nemo/scripts/insert_loop_parallelism.py | 4 +--- src/psyclone/psyir/nodes/structure_reference.py | 7 +++++-- src/psyclone/tests/psyir/nodes/structure_reference_test.py | 5 +++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index ef09e01c45..e6d26d7c1f 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -41,7 +41,7 @@ import sys from utils import ( add_profiling, inline_calls, insert_explicit_loop_parallelism, - normalise_loops, enhance_tree_information, NEMO_MODULES_TO_IMPORT) + normalise_loops, NEMO_MODULES_TO_IMPORT) from psyclone.psyir.nodes import Routine, Loop from psyclone.psyir.transformations import ( OMPTargetTrans, OMPDeclareTargetTrans) @@ -285,8 +285,6 @@ def trans(psyir): subroutine.name == 'dom_ngb'): continue - enhance_tree_information(subroutine) - normalise_loops( subroutine, hoist_local_arrays=False, diff --git a/src/psyclone/psyir/nodes/structure_reference.py b/src/psyclone/psyir/nodes/structure_reference.py index 1a7f298f6b..91e11b9389 100644 --- a/src/psyclone/psyir/nodes/structure_reference.py +++ b/src/psyclone/psyir/nodes/structure_reference.py @@ -348,8 +348,11 @@ def _get_cursor_shape(cursor, cursor_type): 
if not isinstance(cursor_type, (UnresolvedType, UnsupportedType)): # Once we've hit an Unresolved/UnsupportedType the cursor_type # will remain set to that as we can't do any better. - cursor_type = cursor_type.components[ - cursor.name.lower()].datatype + try: + cursor_type = cursor_type.components[ + cursor.name.lower()].datatype + except KeyError: + return UnresolvedType() try: cursor_shape = _get_cursor_shape(cursor, cursor_type) except NotImplementedError: diff --git a/src/psyclone/tests/psyir/nodes/structure_reference_test.py b/src/psyclone/tests/psyir/nodes/structure_reference_test.py index 379c5bc66f..b4fa1b0e97 100644 --- a/src/psyclone/tests/psyir/nodes/structure_reference_test.py +++ b/src/psyclone/tests/psyir/nodes/structure_reference_test.py @@ -263,6 +263,11 @@ def test_struc_ref_datatype(): sref0 = nodes.StructureReference.create(ssym0, ["nx"]) assert sref0.datatype == symbols.INTEGER_TYPE + # If the type component is not found (e.g. it is inherited, which psyclone + # does not support), return UnresolvedType + sref = nodes.StructureReference.create(ssym0, ["not_specified"]) + assert sref.datatype == symbols.UnresolvedType() + # Symbol with type defined by DataTypeSymbol grid_type_symbol = symbols.DataTypeSymbol("grid_type", grid_type) ssym = symbols.DataSymbol("grid", grid_type_symbol) From 71270ce7642ffca9d3c318e7800306dc5495ab91 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 2 Dec 2025 11:54:46 +0000 Subject: [PATCH 19/33] In ECMWF NEMOv4 include mpif.h during cpp --- .github/workflows/nemo_tests.yml | 2 +- examples/nemo/scripts/KGOs/arch-linux_spack.fcm | 2 +- examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 95b3762f9e..9b1bfb6383 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -224,7 +224,7 @@ jobs: export PSYCLONE_COMPILER=$MPIF90 export 
MPIF90=psyclonefc export PARALLEL_DIRECTIVES="omp_threading" - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -I ${MPI_HOME}/include" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" export FCFLAGS="-i4 -r8 -O2 -heap-arrays -fp-model=precise -g -qopenmp" export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script diff --git a/examples/nemo/scripts/KGOs/arch-linux_spack.fcm b/examples/nemo/scripts/KGOs/arch-linux_spack.fcm index 94943258f3..b2b23e196e 100644 --- a/examples/nemo/scripts/KGOs/arch-linux_spack.fcm +++ b/examples/nemo/scripts/KGOs/arch-linux_spack.fcm @@ -9,7 +9,7 @@ %NCDF_INC -I${NCDF_F_HOME}/include -I${NCDF_C_HOME}/include -I${HDF5_HOME}/include %NCDF_LIB -L${NCDF_F_HOME}/lib -lnetcdff -L${NCDF_C_HOME}/lib -lnetcdf -%CPP cpp -Dkey_nosignedzero +%CPP cpp -Dkey_nosignedzero -I${MPI_HOME}/include %FC ${MPIF90} -c %FCFLAGS ${FCFLAGS} %FFLAGS %FCFLAGS diff --git a/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm index 2e6c8df745..151861e342 100644 --- a/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm +++ b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm @@ -12,9 +12,7 @@ %PROFILE_INC -I${PROFILING_DIR} %PROFILE_LIB -L${PROFILING_DIR} -lnvtx_prof -L${CUDA_HOME}/lib64 -cuda -lnvToolsExt - - -%CPP cpp -Dkey_nosignedzero +%CPP cpp -Dkey_nosignedzero -I${MPI_HOME}/include %FC ${MPIF90} -c %FCFLAGS ${FCFLAGS} %FFLAGS %FCFLAGS From 0b80c98ba5560ba8fffb069497158cae079a5aae Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 2 Dec 2025 12:21:59 +0000 Subject: [PATCH 20/33] Test ACCParallelTrans allow_strings --- .../transformations/transformations_test.py | 41 ++++++++++++++++--- src/psyclone/transformations.py | 8 ++-- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/psyclone/tests/psyir/transformations/transformations_test.py 
b/src/psyclone/tests/psyir/transformations/transformations_test.py index 3aeb000149..5e6a234d37 100644 --- a/src/psyclone/tests/psyir/transformations/transformations_test.py +++ b/src/psyclone/tests/psyir/transformations/transformations_test.py @@ -123,7 +123,7 @@ def test_accparalleltrans_validate(fortran_reader): ''' Test that ACCParallelTrans validation fails if it contains non-allowed constructs. ''' - omptargettrans = ACCParallelTrans() + accparalleltrans = ACCParallelTrans() code = ''' function myfunc(a) @@ -134,6 +134,8 @@ def test_accparalleltrans_validate(fortran_reader): integer, dimension(10, 10) :: A integer :: i integer :: j + character*8 :: a, b + character :: c(8), d(8) do i = 1, 10 do j = 1, 10 A(i, j) = myfunc(3) @@ -149,35 +151,62 @@ def test_accparalleltrans_validate(fortran_reader): A(i,j) = GET_COMMAND(2) end do end do + do i = 1, 8 + a(i) = b(i) + end do + do i = 1, 8 + c(i) = d(i) + end do end subroutine ''' psyir = fortran_reader.psyir_from_source(code) loops = psyir.walk(Loop, stop_type=Loop) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[0]) + accparalleltrans.validate(loops[0]) assert ("'myfunc' is not available on the accelerator device, and " "therefore it cannot be called from within an ACC parallel region." in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[1]) + accparalleltrans.validate(loops[1]) assert ("Nodes of type 'CodeBlock' cannot be enclosed by a ACCParallel" "Trans transformation" in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[2]) + accparalleltrans.validate(loops[2]) assert ("'GET_COMMAND' is not available on the default accelerator " "device. Use the 'device_string' option to specify a different " "device." 
in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[2], options={'device_string': - 'nvfortran-all'}) + accparalleltrans.validate(loops[2], options={'device_string': + 'nvfortran-all'}) assert ("'GET_COMMAND' is not available on the 'nvfortran-all' accelerator" " device. Use the 'device_string' option to specify a different " "device." in str(err.value)) + # Character substrings and no verbose option + with pytest.raises(TransformationError) as err: + accparalleltrans.validate(loops[3]) + assert ("ACCParallelTrans doesn't enclose regions that uses characters, " + "but found: b(i), use the 'allow_strings' transformation option " + "to offload this region." in str(err.value)) + assert loops[3].preceding_comment == "" + + # Character array and verbose option + with pytest.raises(TransformationError) as err: + accparalleltrans.validate(loops[4], options={'verbose': True}) + assert ("ACCParallelTrans doesn't enclose regions that uses characters, " + "but found: c(i), use the 'allow_strings' transformation option " + "to offload this region." 
in str(err.value)) + assert ("but found: c(i), use the 'allow_strings'" + in loops[4].preceding_comment) + + # These validate with the right option + accparalleltrans.validate(loops[3], options={'allow_strings': True}) + accparalleltrans.validate(loops[4], options={'allow_strings': True}) + def test_accenterdata(): ''' Generic tests for the ACCEnterDataTrans class ''' diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index 52f60f04d8..9e8943d624 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -1196,7 +1196,7 @@ def validate(self, node_list, options=None): ''' node_list = self.get_node_list(node_list) - verbose = options.get("allow_strings", False) if options else False + verbose = options.get("verbose", False) if options else False device_string = options.get("device_string", "") if options else "" allow_strings = options.get("allow_strings", "") if options else False super().validate(node_list, options) @@ -1216,9 +1216,11 @@ def validate(self, node_list, options=None): if hasattr(dtype, "intrinsic"): if dtype.intrinsic == ScalarType.Intrinsic.CHARACTER: message = ( - f"OpenACC Parallel cannot enclose a region " + f"ACCParallelTrans doesn't enclose regions " f"that uses characters, but found: " - f"{datanode.debug_string()}" + f"{datanode.debug_string()}, use the " + f"'allow_strings' transformation option to " + f"offload this region." 
) if verbose: node.preceding_comment = message From 9125e6a5cefcd996a2d2fe7b5926e56fb2962f63 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 2 Dec 2025 12:53:02 +0000 Subject: [PATCH 21/33] Add no-vectorize and no-fma to the NEMOv5 gcc reproducible checks --- .github/workflows/nemo_v5_tests.yml | 4 ++-- .../run.stat.bench.gfortran.small.10steps | 20 +++++++++---------- .../nemo/scripts/insert_loop_parallelism.py | 9 ++------- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 8843213ee6..8b54bbfa88 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -107,7 +107,7 @@ jobs: # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-fdefault-real-8 -O2 -fcray-pointer -ffree-line-length-none -g" + export FCFLAGS="-fdefault-real-8 -O2 -mno-fma -fno-tree-vectorize -fcray-pointer -ffree-line-length-none -g" # Clean up and compile rm -rf tests/${TEST_DIR} @@ -204,7 +204,7 @@ jobs: export MPIF90=psyclonefc export PARALLEL_DIRECTIVES="omp_threading" export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" - export FCFLAGS="-fdefault-real-8 -O2 -fcray-pointer -ffree-line-length-none -g -fopenmp" + export FCFLAGS="-fdefault-real-8 -O2 -mno-fma -fno-tree-vectorize -fcray-pointer -ffree-line-length-none -g -fopenmp" # Clean up and compile rm -rf tests/${TEST_DIR} diff --git a/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps b/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps index 0f9fc042a7..b0f8f90060 100644 --- a/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps +++ b/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps @@ -1,10 +1,10 @@ - it : 1 |ssh|_max: 0.2336851764570053D+01 |U|_max: 0.7053248015576865D-02 |V|_max: 
0.2308346115751064D-02 S_min: 0.2996908779321225D+02 S_max: 0.3101392941293399D+02 - it : 2 |ssh|_max: 0.3739162010288019D+01 |U|_max: 0.1029843199699749D-01 |V|_max: 0.9493800242777233D-02 S_min: 0.2996911000748410D+02 S_max: 0.3101392863147436D+02 - it : 3 |ssh|_max: 0.4224443521973881D+01 |U|_max: 0.1349433227267360D-01 |V|_max: 0.2284885234302297D-01 S_min: 0.2996913553478158D+02 S_max: 0.3101392784904396D+02 - it : 4 |ssh|_max: 0.4659313564999673D+01 |U|_max: 0.1490637483763983D-01 |V|_max: 0.4048444554221592D-01 S_min: 0.2996916129319160D+02 S_max: 0.3101392717586670D+02 - it : 5 |ssh|_max: 0.4949503007019537D+01 |U|_max: 0.1145357177492709D-01 |V|_max: 0.5364770396337241D-01 S_min: 0.2996918706295251D+02 S_max: 0.3101392651315916D+02 - it : 6 |ssh|_max: 0.5140472974504101D+01 |U|_max: 0.1064859943349832D-01 |V|_max: 0.6818865538920099D-01 S_min: 0.2996921262561763D+02 S_max: 0.3101392594056642D+02 - it : 7 |ssh|_max: 0.5229361171698524D+01 |U|_max: 0.7814316351518531D-02 |V|_max: 0.8358086738712590D-01 S_min: 0.2996923864577586D+02 S_max: 0.3101392538498657D+02 - it : 8 |ssh|_max: 0.5220719217849657D+01 |U|_max: 0.1141515836387377D-01 |V|_max: 0.9761604183740114D-01 S_min: 0.2996926417117689D+02 S_max: 0.3101392490495336D+02 - it : 9 |ssh|_max: 0.5145297949564463D+01 |U|_max: 0.1416399592481803D-01 |V|_max: 0.1152759253498275D+00 S_min: 0.2996929035879930D+02 S_max: 0.3101392444612747D+02 - it : 10 |ssh|_max: 0.4979557010366619D+01 |U|_max: 0.1986785874281591D-01 |V|_max: 0.1303543987480588D+00 S_min: 0.2996931641421843D+02 S_max: 0.3101392405137850D+02 + it : 1 |ssh|_max: 0.2336851764570087D+01 |U|_max: 0.7053248015579857D-02 |V|_max: 0.2308346115756259D-02 S_min: 0.2996908779321225D+02 S_max: 0.3101392941293399D+02 + it : 2 |ssh|_max: 0.3739162010287973D+01 |U|_max: 0.1029843199698906D-01 |V|_max: 0.9493800242775713D-02 S_min: 0.2996911000748410D+02 S_max: 0.3101392863147436D+02 + it : 3 |ssh|_max: 0.4224443521974239D+01 |U|_max: 0.1349433227265986D-01 
|V|_max: 0.2284885234301404D-01 S_min: 0.2996913553478157D+02 S_max: 0.3101392784904396D+02 + it : 4 |ssh|_max: 0.4659313564999622D+01 |U|_max: 0.1490637483762341D-01 |V|_max: 0.4048444554220138D-01 S_min: 0.2996916129319160D+02 S_max: 0.3101392717586671D+02 + it : 5 |ssh|_max: 0.4949503007019767D+01 |U|_max: 0.1145357177490677D-01 |V|_max: 0.5364770396337813D-01 S_min: 0.2996918706295251D+02 S_max: 0.3101392651315916D+02 + it : 6 |ssh|_max: 0.5140472974504293D+01 |U|_max: 0.1064859943349158D-01 |V|_max: 0.6818865538921454D-01 S_min: 0.2996921262561763D+02 S_max: 0.3101392594056643D+02 + it : 7 |ssh|_max: 0.5229361171698655D+01 |U|_max: 0.7814316351505392D-02 |V|_max: 0.8358086738711774D-01 S_min: 0.2996923864577587D+02 S_max: 0.3101392538498657D+02 + it : 8 |ssh|_max: 0.5220719217849857D+01 |U|_max: 0.1141515836389672D-01 |V|_max: 0.9761604183737865D-01 S_min: 0.2996926417117689D+02 S_max: 0.3101392490495337D+02 + it : 9 |ssh|_max: 0.5145297949564862D+01 |U|_max: 0.1416399592482473D-01 |V|_max: 0.1152759253497909D+00 S_min: 0.2996929035879930D+02 S_max: 0.3101392444612748D+02 + it : 10 |ssh|_max: 0.4979557010366737D+01 |U|_max: 0.1986785874282448D-01 |V|_max: 0.1303543987480547D+00 S_min: 0.2996931641421842D+02 S_max: 0.3101392405137852D+02 diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index e6d26d7c1f..31d8618bd4 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -56,7 +56,8 @@ # This environment variable informs which parallelisation directives to use # It supports acc_offloading, omp_offloading and omp_threading -# They can be combined, e.g PARALLEL_DIRECTIVES='omp_offloading+omp_threading' +# They can be combined, e.g PARALLEL_DIRECTIVES='omp_offloading+omp_threading', +# or use none to just apply the serial transformations PARALLEL_DIRECTIVES = os.environ.get('PARALLEL_DIRECTIVES', '') # By default, allow optimisations that 
may change the results, e.g. reductions, @@ -84,12 +85,6 @@ FILES_TO_SKIP = [] NEMOV5_EXCLUSIONS = [ - # Fail in gcc NEMOv5 BENCH - "dynhpg.f90", - "dynspg_ts.f90", - "sbcssm.f90", - "tramle.f90", - "trazdf.f90", # Fail in nvfortran when enabling seaice "icefrm.f90", # Has unsupported implicit symbol declaration "icerst.f90" From 17eb6c5343885ba318d5657249863311dca557f5 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 2 Dec 2025 14:16:27 +0000 Subject: [PATCH 22/33] Reintroduce NEMOv4 compile-time MPI include --- .github/workflows/nemo_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 9b1bfb6383..859209f536 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -225,7 +225,7 @@ jobs: export MPIF90=psyclonefc export PARALLEL_DIRECTIVES="omp_threading" export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" - export FCFLAGS="-i4 -r8 -O2 -heap-arrays -fp-model=precise -g -qopenmp" + export FCFLAGS="-i4 -r8 -O2 -heap-arrays -fp-model=precise -g -qopenmp -I ${MPI_HOME}/include" export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script # Compile From bab0524acba4579a2bb042d9bc37ea46ed6405b2 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 2 Dec 2025 15:31:39 +0000 Subject: [PATCH 23/33] Clean up NEMO README and scritps --- examples/nemo/scripts/README.md | 59 ++-- .../nemo/scripts/insert_loop_parallelism.py | 2 +- examples/nemo/scripts/omp_gpu_trans.py | 270 ------------------ examples/nemo/scripts/utils.py | 10 +- 4 files changed, 41 insertions(+), 300 deletions(-) delete mode 100755 examples/nemo/scripts/omp_gpu_trans.py diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md index 215fc009e3..84f19b2abc 100644 --- a/examples/nemo/scripts/README.md +++ b/examples/nemo/scripts/README.md @@ -37,14 +37,14 @@ Author S. 
Siso, STFC Daresbury Lab # PSyclone NEMO Examples -This directory contains various examples of the use of PSyclone to transform -source code from the NEMO ocean model. +This directory contains various examples showing how to apply PSyclone to +transform the source code of the NEMO ocean model. > [!Important] > The NEMO build system, `makenemo`, has the ability to apply psyclone > scripts that come with the NEMO repository with the `-p` flag (see > [the NEMO user guide](https://sites.nemo-ocean.io/user-guide/psyclone.html)), -> but these are pinned to a particular release of PSyclone and have constrains +> but these are pinned to a particular release of PSyclone and have constraints > defined in `mk/sct_psyclone.sh` script. By contrast, the process presented in > this README uses the experimental `psyclonefc` compiler wrapper command which > bypases the `makenemo -p` and instead intercepts any compilation command and @@ -72,16 +72,15 @@ please report to the authors. In order to provide a flexible system that works with different directives and compilers we provide a parameterised transformation script -`insert_loop_parallelism.py` and an example NEMO arch file `KGO/arch-linux_spack.fcm` -with multiple environment variables. These, together with the `psyclonefc` -environment variables have to be set up appropriately depending on the desired -output. +`insert_loop_parallelism.py` and a parameterised NEMO arch file +`KGO/arch-linux_spack.fcm`, both with multiple environment variables that need +to be adjusted depending on your desired optimisation target. First of all, the arch file has a `MPIF90` to choose the compiler, this needs to be set to `psyclonefc`. This is a compiler wrapper utility that -substitutes its call with: an invocation to psyclone to process the given -source file (using the options provided in `PSYCLONE_OPTS`) and then send the -output to a compiler (provided by `PSYCLONE_COMPILER`). 
+substitutes its calls with: an invocation to psyclone to process the given +source file (using the options provided in `PSYCLONE_OPTS`) followed by an +invocation to a compiler (provided by `PSYCLONE_COMPILER`). For example, to apply the `insert_loop_parallelism.py` and compile it with `mpif90` we can use the following set up: @@ -92,8 +91,8 @@ export PSYCLONE_COMPILER=mpif90 export PSYCLONE_OPTS="-l output -s ${PSYCLONE_NEMO_EXAMPLES_DIR}/insert_loop_parallelism.py" ``` -As mentioned, the transformation script is parameterised with a `PARALLEL_DIRECTIVES` -variable that have to be consistent with the chosen Fortran flags. +This transformation script is in turn parameterised with a `PARALLEL_DIRECTIVES` +variable that have to be consistently set up with the chosen `FCFLAGS` flags. For instance, for the `nvfortran` compiler, you can choose between: - Serial transformations with no parallel directives @@ -122,7 +121,7 @@ export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc=gpu -mp=gpu -gpu=mem:manag export REPRODUCIBLE=1 ``` -- A fast GPU build flags +- Hybrid directives (what cannot be offloaded fallsback to threading) and fast GPU flags ```bash unset REPRODUCIBLE export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" @@ -131,7 +130,7 @@ export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" > [!Note] > Currently, NEMOv4 and NEMOv5 take different optimisation paths, so it is -> imporant to also set: +> important to also set: > > ```bash > export NEMOv4=1 @@ -149,10 +148,13 @@ the desired NEMO configuration and keys. For example: ./makenemo -r ORCA2_ICE_PISCES -m arch-linux_spack -n ORCA2_psycloned ... ``` -If everything worked you can see the generated files in the -`/BLD/tmp` directory. And you can run the binary from the -EXP00 directory. For example, for a hybrid MPI+OMP offloading+OMP threading -we can do: +If everything worked you will see psyclone generated files in the +`/BLD/tmp` directory and the final binary in the +`/EXP00` directory. 
+ +You can run this binary using the appropriate command from the configuration +and inserted programming model. For example, for a hybrid +MPI+OMP offloading+OMP threading you can do: ```bash # Prepare problem @@ -170,22 +172,25 @@ OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=1,2 mpirun -n 2 ./nemo A difficulty of working with code-transformation scripts is that it is possible to incorrectly transform a file semantics while still creating valid Fortran. + This means that the transformation will succeed and the generated code will compile, but the results will diverge. This gets more complicated with parallel programming because certain operations like reductions or atomics are not -always reproducible. For NEMO we typically compare the generated `run.stat` field -values. To do that we recommend: +always reproducible. Therefore, to understand what causes the results divergence +it is usefulk to apply the transformations step-by-step while checking if the +`run.stat` values change. Some useful steps are: - Starting building NEMO *without* `psyclonefc` and conservative optimisation flags - and run it serially. Then store the generated `run.stat`. -- Then switch to using `psyclonefc` with the `PSYCLONE_OTPS="-s passthrough.py"`, - this will make all files pass through psyclone but without applying any + and run it serially (O2, no vectorisation, no-fma). Then store the generated `run.stat`. +- Then switch to using `psyclonefc` with the `PSYCLONE_OPTS="-s passthrough.py"`, + this will make psyclone process all files but without applying any transformations. Check if the results still match. -- Then build it with `PARALLEL_DIRECTIVES="" PSYCLONE_OTPS="-s insert_loop_parallelism.py"` - and check if the results still match -- Then run it `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_threading" PSYCLONE_OTPS="-s insert_loop_parallelism.py"` +- Then build it with `PSYCLONE_OPTS="-s insert_loop_parallelism.py"` but keeping + the `PARALLEL_DIRECTIVES=""` empty. 
This will apply serial transformations but + no directives yet. +- Then run it `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_threading" PSYCLONE_OPTS="-s insert_loop_parallelism.py"` and see if the results still match. -- Finally, run it with `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_offloading" PSYCLONE_OTPS="-s insert_loop_parallelism.py"` +- Finally, run it with `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_offloading" PSYCLONE_OPTS="-s insert_loop_parallelism.py"` Orthogonally to finding which step is causing the divergence we may want to find which file/s are causing it. This folder also contains a `do_file_by_file.sh` diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 31d8618bd4..5bef70f3da 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -87,7 +87,6 @@ NEMOV5_EXCLUSIONS = [ # Fail in nvfortran when enabling seaice "icefrm.f90", # Has unsupported implicit symbol declaration - "icerst.f90" ] NEMOV4_EXCLUSIONS = [ @@ -129,6 +128,7 @@ # Runtime Error: Illegal address during kernel execution "trcrad.f90", # nvhpc > 24.11 - Signal 11 issues + "icerst.f90", # When enabling ice* parallelisation "trcbbl.f90", "bdyice.f90", "sedfunc.f90", diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py deleted file mode 100755 index cc6612c5b9..0000000000 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2021-2025, Science and Technology Facilities Council. -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# ----------------------------------------------------------------------------- -# Authors: S. Siso, STFC Daresbury Lab - -''' PSyclone transformation script showing the introduction of OpenMP for GPU -directives into Nemo code. 
''' - -import os -from utils import ( - add_profiling, inline_calls, insert_explicit_loop_parallelism, - normalise_loops, PARALLELISATION_ISSUES, NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine, Loop -from psyclone.psyir.transformations import ( - OMPTargetTrans, OMPDeclareTargetTrans) -from psyclone.transformations import ( - OMPLoopTrans, TransformationError) - - -# This environment variable informs if profiling hooks have to be inserted. -PROFILING_ENABLED = os.environ.get('ENABLE_PROFILING', False) - -# By default, we don't do module inlining as it's still under development. -INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) - -# By default, we allow all device intrinsics (not only the reproducible ones) -REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) - -# This environment variable informs if this is targeting NEMOv4, in which case -# array privatisation is disabled and some more files excluded -NEMOV4 = os.environ.get('NEMOV4', False) - -# This environment variable informs if we're enabling asynchronous -# parallelism. -ASYNC_PARALLEL = os.environ.get('ASYNC_PARALLEL', False) - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. 
-RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = [ - "icefrm.f90", # Has an unsupported implicit symbol declaration -] - -NEMOV5_EXCLUSIONS = [] - -NEMOV4_EXCLUSIONS = [ - "dynspg_ts.f90", - "tranxt.f90", -] - -SKIP_FOR_PERFORMANCE = [ - "iom.f90", - "iom_nf90.f90", - "iom_def.f90", - "timing.f90", - "histcom.f90", -] - -OFFLOADING_ISSUES = [ - # Produces different output results - "zdftke.f90", - # The following issues only affect BENCH (because ice is enabled?) - # Runtime Error: Illegal address during kernel execution - "trcrad.f90", - # Signal 11 issues - "trcbbl.f90", - "bdyice.f90", - "sedfunc.f90", - "stpmlf.f90", - "trddyn.f90", - "trczdf.f90", - "trcice_pisces.f90", - "dtatsd.f90", - "trcatf.f90", - "stp2d.f90", -] - -ASYNC_ISSUES = [ - # TODO #3220: Explore the cause of the async issues - # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed - # (often invalid pointer dereference) in get_cstrgsurf - "sbcclo.f90", - "trcldf.f90", - # Runtime Error: Illegal address during kernel execution with - # asynchronicity. - "zdfiwm.f90", - "zdfsh2.f90", - # Diverging results with asynchronicity - "traadv_fct.f90", -] - - -def trans(psyir): - ''' Add OpenMP Target and Loop directives to all loops, including the - implicit ones, to parallelise the code and execute it in an acceleration - device. - - :param psyir: the PSyIR of the provided file. - :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - # The two options below are useful for file-by-file exhaustive tests. - # If the environemnt has ONLY_FILE defined, only process that one file and - # known-good files that need a "declare target" inside. - only_do_file = os.environ.get('ONLY_FILE', False) - only_do_files = (only_do_file, "lib_fortran.f90", "solfrac_mod.f90") - if only_do_file and psyir.name not in only_do_files: - return - # If the environemnt has ALL_BUT_FILE defined, process all files but - # the one named file. 
- all_but_file = os.environ.get('ALL_BUT_FILE', False) - if all_but_file and psyir.name == all_but_file: - return - - omp_target_trans = OMPTargetTrans() - if NEMOV4: - # TODO #2895: Explore why loop/teams loop diverge for NEMOv4 - omp_gpu_loop_trans = OMPLoopTrans(omp_schedule="none") - omp_gpu_loop_trans.omp_directive = "loop" - else: - omp_gpu_loop_trans = OMPLoopTrans(omp_schedule="none") - omp_gpu_loop_trans.omp_directive = "teamsloop" - omp_cpu_loop_trans = OMPLoopTrans(omp_schedule="static") - omp_cpu_loop_trans.omp_directive = "paralleldo" - - disable_profiling_for = [] - enable_async = ASYNC_PARALLEL and psyir.name not in ASYNC_ISSUES - - for subroutine in psyir.walk(Routine): - - # The exclusion below could be in the FILES_TO_SKIP global parameter, - # but in this script, for testing purposes, we exclude them here so the - # PSyclone frontend and backend are still tested and it also allows to - # insert profiling hooks later on. - if psyir.name in SKIP_FOR_PERFORMANCE: - continue - if NEMOV4 and psyir.name in NEMOV4_EXCLUSIONS: - continue - if not NEMOV4 and psyir.name in NEMOV5_EXCLUSIONS: - continue - # ICE routines do not perform well on GPU, so we skip them - if psyir.name.startswith("ice"): - continue - # Many of the obs_ files have problems to be offloaded to the GPU - if psyir.name.startswith("obs_"): - continue - # Skip initialisation and diagnostic subroutines - if (subroutine.name.endswith('_alloc') or - subroutine.name.endswith('_init') or - subroutine.name.startswith('Agrif') or - subroutine.name.startswith('dia_') or - subroutine.name == 'dom_msk' or - subroutine.name == 'dom_zgr' or - subroutine.name == 'dom_ngb'): - continue - - normalise_loops( - subroutine, - hoist_local_arrays=False, - convert_array_notation=True, - # See issue #3022 - loopify_array_intrinsics=psyir.name != "getincom.f90", - convert_range_loops=True, - increase_array_ranks=not NEMOV4, - hoist_expressions=True - ) - # Perform module-inlining of called routines. 
- if INLINING_ENABLED: - inline_calls(subroutine) - - # These are functions that are called from inside parallel regions, - # annotate them with 'omp declare target' - if ( - subroutine.name.lower().startswith("sign_") - or subroutine.name.lower() == "solfrac" - or (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop)) - ): - try: - OMPDeclareTargetTrans().apply(subroutine) - print(f"Marked {subroutine.name} as GPU-enabled") - except TransformationError as err: - print(err) - # We continue parallelising inside the routine, but this could - # change if the parallelisation directives added below are not - # nestable, in that case we could add a 'continue' here - disable_profiling_for.append(subroutine.name) - - if NEMOV4: - # For nemo4 always offload but without privatisation - print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_target_trans, - loop_directive_trans=omp_gpu_loop_trans, - collapse=True, - privatise_arrays=False, - asynchronous_parallelism=enable_async, - uniform_intrinsics_only=REPRODUCIBLE, - enable_reductions=not REPRODUCIBLE - ) - elif psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES: - print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_target_trans, - loop_directive_trans=omp_gpu_loop_trans, - collapse=True, - asynchronous_parallelism=enable_async, - privatise_arrays=True, - uniform_intrinsics_only=REPRODUCIBLE, - enable_reductions=not REPRODUCIBLE - ) - elif psyir.name not in PARALLELISATION_ISSUES: - # This have issues offloading, but we can still do OpenMP threading - print(f"Adding OpenMP threading to subroutine: {subroutine.name}") - # If asynchronous parallelism is enabled, these subroutines in - # sbcclo.f90 fail if they're parallelised on the CPU. 
- if (ASYNC_PARALLEL and subroutine.name in - ("get_cssrcsurf", "get_cstrgsurf")): - continue - insert_explicit_loop_parallelism( - subroutine, - loop_directive_trans=omp_cpu_loop_trans, - asynchronous_parallelism=enable_async, - privatise_arrays=True, - ) - - # Iterate again and add profiling hooks when needed - for subroutine in psyir.walk(Routine): - if psyir.name in SKIP_FOR_PERFORMANCE: - continue - if PROFILING_ENABLED and subroutine.name not in disable_profiling_for: - print(f"Adding profiling hooks to subroutine: {subroutine.name}") - add_profiling(subroutine.children) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 8bdb359eee..7fe3b06b46 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -232,11 +232,13 @@ def normalise_loops( except TransformationError as err: print(err.value) - # TODO #2951: fldread has a bug in ArrayAssignment2LoopsTrans - if convert_range_loops and filename != "fldread.f90": + if convert_range_loops: # Convert all array implicit loops to explicit loops explicit_loops = ArrayAssignment2LoopsTrans() for assignment in schedule.walk(Assignment): + if filename == "fldread.f90": + # TODO #2951: This file has issues converting SturctureRefs + continue try: explicit_loops.apply( assignment, options={'verbose': True}) @@ -273,6 +275,10 @@ def normalise_loops( except TransformationError: pass + # TODO #1928: In order to perform better on the GPU, nested loops with two + # sibling inner loops need to be fused or apply loop fission to the + # top level. This would allow the collapse clause to be applied. 
+ def increase_rank_and_reorder_nemov5_loops(routine: Routine): ''' This method increases the rank of temporary arrays used inside selected From 2c1f46842b75666a92a4fdf89e9a84ec8b02ce49 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 2 Dec 2025 15:33:38 +0000 Subject: [PATCH 24/33] Uncomment integration test for NEMO ORCA2 async --- .github/workflows/nemo_v5_tests.yml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 8b54bbfa88..8308e206b7 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -436,19 +436,18 @@ jobs: # Clean up and compile rm -rf cfgs/${TEST_DIR} - # ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ - # -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ - # add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 + ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ + add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test - # cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 - # ln -sf /archive/psyclone-tests/nemo-inputs/UKMO-eORCA2/* . + cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 + ln -sf /archive/psyclone-tests/nemo-inputs/UKMO-eORCA2/* . 
# Uses both, threading and offloading - # export CUDA_VISIBLE_DEVICES=1 - # OMP_NUM_THREADS=4 ./nemo - # diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca2.nvhpc.10steps run.stat - # export VAR_TIME=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) - export VAR_TIME="0.0" + export CUDA_VISIBLE_DEVICES=1 + OMP_NUM_THREADS=4 ./nemo + diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca2.nvhpc.10steps run.stat + export VAR_TIME=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" - name: NEMO 5.0 nvidia Async OpenMP for GPUs (UKMO ORCA1 - managed memory) From 3ae2c17c81cf9827389b957aca9b59c9fe30eb95 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 2 Dec 2025 17:16:38 +0000 Subject: [PATCH 25/33] Update NEMOv4 with mpi include flag --- .github/workflows/nemo_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 859209f536..95b3762f9e 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -224,8 +224,8 @@ jobs: export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc export PARALLEL_DIRECTIVES="omp_threading" - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" - export FCFLAGS="-i4 -r8 -O2 -heap-arrays -fp-model=precise -g -qopenmp -I ${MPI_HOME}/include" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -I ${MPI_HOME}/include" + export FCFLAGS="-i4 -r8 -O2 -heap-arrays -fp-model=precise -g -qopenmp" export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script # Compile From 882c1469ee9c246e4caafbf93d3e0bfd3a911e95 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 3 Dec 2025 10:13:08 +0000 Subject: [PATCH 26/33] Try removing exclusions from NEMO tests --- examples/nemo/scripts/insert_loop_parallelism.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 5bef70f3da..a6d00da7a2 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -90,7 +90,7 @@ ] NEMOV4_EXCLUSIONS = [ - "dynspg_ts.f90", + # "dynspg_ts.f90", ] if NEMOV4: @@ -103,7 +103,7 @@ "iom_nf90.f90", "iom_def.f90", "timing.f90", - "lbclnk.f90", + # "lbclnk.f90", "histcom.f90", ] From dfb6b0a32237513b9fe9e0daa4d3d08097590e23 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 3 Dec 2025 14:24:24 +0000 Subject: [PATCH 27/33] Make NEMOv4 dynspg_ts exclusion exclusive to the ArrayAssignment2LoopsTrans --- .../nemo/scripts/insert_loop_parallelism.py | 50 +++++++++---------- examples/nemo/scripts/utils.py | 26 ++++++---- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index a6d00da7a2..94494ab0df 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -84,43 +84,39 @@ # List of all files that psyclone will skip processing FILES_TO_SKIP = [] -NEMOV5_EXCLUSIONS = [ - # Fail in nvfortran when enabling seaice - "icefrm.f90", # Has unsupported implicit symbol declaration -] - -NEMOV4_EXCLUSIONS = [ - # "dynspg_ts.f90", -] - -if NEMOV4: - FILES_TO_SKIP.extend(NEMOV4_EXCLUSIONS) -else: - FILES_TO_SKIP.extend(NEMOV5_EXCLUSIONS) - +# There files are skipped because transforming them degrade the performance SKIP_FOR_PERFORMANCE = [ "iom.f90", "iom_nf90.f90", "iom_def.f90", "timing.f90", - # "lbclnk.f90", "histcom.f90", ] # These files change the results from the baseline when psyclone adds # parallelisation dirctives PARALLELISATION_ISSUES = [] + +# These files change the results from the baseline when psyclone adds +# offloading dirctives +OFFLOADING_ISSUES = [] + if not 
NEMOV4: + FILES_TO_SKIP.extend([ + # Fail in nvfortran when enabling seaice + "icefrm.f90", # Has unsupported implicit symbol declaration + ]) + + SKIP_FOR_PERFORMANCE.extend([ + "lbclnk.f90", + ]) + PARALLELISATION_ISSUES.extend([ "ldfc1d_c2d.f90", "tramle.f90", "traqsr.f90", ]) -# These files change the results from the baseline when psyclone adds -# offloading dirctives -OFFLOADING_ISSUES = [] -if not NEMOV4: OFFLOADING_ISSUES.extend([ # Produces different output results "zdftke.f90", @@ -141,14 +137,14 @@ "stp2d.f90", ]) -if not NEMOV4 and "acc_offloading" in PARALLEL_DIRECTIVES: - OFFLOADING_ISSUES.extend([ - # Fail in OpenACC ORCA2_ICE_PISCES - "dynzdf.f90", - "trabbl.f90", - "trazdf.f90", - "zdfsh2.f90", - ]) + if "acc_offloading" in PARALLEL_DIRECTIVES: + OFFLOADING_ISSUES.extend([ + # Fail in OpenACC ORCA2_ICE_PISCES + "dynzdf.f90", + "trabbl.f90", + "trazdf.f90", + "zdfsh2.f90", + ]) ASYNC_ISSUES = [ # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 7fe3b06b46..d3010dbe29 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -202,6 +202,7 @@ def normalise_loops( statements out of the loop nest. 
''' filename = schedule.root.name + nemo_v4 = os.environ.get('NEMOV4', False) if hoist_local_arrays and schedule.name not in CONTAINS_STMT_FUNCTIONS: # Apply the HoistLocalArraysTrans when possible, it cannot be applied # to files with statement functions because it will attempt to put the @@ -233,17 +234,20 @@ def normalise_loops( print(err.value) if convert_range_loops: - # Convert all array implicit loops to explicit loops - explicit_loops = ArrayAssignment2LoopsTrans() - for assignment in schedule.walk(Assignment): - if filename == "fldread.f90": - # TODO #2951: This file has issues converting SturctureRefs - continue - try: - explicit_loops.apply( - assignment, options={'verbose': True}) - except TransformationError: - pass + if filename == "fldread.f90": + # TODO #2951: This file has issues converting SturctureRefs + pass + elif nemo_v4 and filename == "dynspg_ts.f90": + pass + else: + # Convert all array implicit loops to explicit loops + explicit_loops = ArrayAssignment2LoopsTrans() + for assignment in schedule.walk(Assignment): + try: + explicit_loops.apply( + assignment, options={'verbose': True}) + except TransformationError: + pass if scalarise_loops: # Apply scalarisation to every loop. 
Execute this in reverse order From 4f926045615b728e0382c97d34f33c95aab38c50 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 9 Dec 2025 17:06:57 +0000 Subject: [PATCH 28/33] #3244 Fix typos and add TODO --- examples/nemo/eg1/Makefile | 4 ++-- examples/nemo/eg2/Makefile | 4 ++-- examples/nemo/scripts/README.md | 24 ++++++++++++------------ examples/nemo/scripts/utils.py | 4 +++- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/examples/nemo/eg1/Makefile b/examples/nemo/eg1/Makefile index 93ccb37292..4ef7eb6cad 100644 --- a/examples/nemo/eg1/Makefile +++ b/examples/nemo/eg1/Makefile @@ -40,8 +40,8 @@ include ../../common.mk transform: ${PSYCLONE} -s ./openmp_cpu_levels_trans.py ../code/tra_adv.F90 ${PSYCLONE} -s ./openmp_gpu_levels_trans.py ../code/tra_adv.F90 - PARALLEL_DIRECTICVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90 - PARALLEL_DIRECTICVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90 + PARALLEL_DIRECTIVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90 + PARALLEL_DIRECTIVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90 compile: transform @echo "No compilation supported for nemo/eg1" diff --git a/examples/nemo/eg2/Makefile b/examples/nemo/eg2/Makefile index b21e66b48d..cee5a268c1 100644 --- a/examples/nemo/eg2/Makefile +++ b/examples/nemo/eg2/Makefile @@ -43,8 +43,8 @@ transform: omp_levels omp_levels: ${PSYCLONE} -s ./omp_levels_trans.py ../code/traldf_iso.F90 - PARALLEL_DIRECTICVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90 - PARALLEL_DIRECTICVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90 + PARALLEL_DIRECTIVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90 + PARALLEL_DIRECTIVES="omp_offloading" ${PSYCLONE} -s
../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90 compile: transform diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md index 84f19b2abc..6eb7945a5f 100644 --- a/examples/nemo/scripts/README.md +++ b/examples/nemo/scripts/README.md @@ -41,15 +41,15 @@ This directory contains various examples showing how to apply PSyclone to transform the source code of the NEMO ocean model. > [!Important] -> The NEMO build system, `makenemo`, has the ability to apply psyclone +> The NEMO build system, `makenemo`, has the ability to apply PSyclone > scripts that come with the NEMO repository with the `-p` flag (see > [the NEMO user guide](https://sites.nemo-ocean.io/user-guide/psyclone.html)), > but these are pinned to a particular release of PSyclone and have constraints > defined in `mk/sct_psyclone.sh` script. By contrast, the process presented in > this README uses the experimental `psyclonefc` compiler wrapper command which -> bypases the `makenemo -p` and instead intercepts any compilation command and -> wraps it with a psyclone call followed by a compiler call. -> This is the recommended way to apply upstream psyclone transformations, as it +> bypasses the `makenemo -p` and instead intercepts any compilation command and +> wraps it with a PSyclone call followed by a compiler call. +> This is the recommended way to apply upstream PSyclone transformations, as it > is not constrained by the file-exclusions and backward compatibility guarantees > of the scripts inside the NEMO repository. @@ -78,7 +78,7 @@ to be adjusted depending on your desired optimisation target. First of all, the arch file has a `MPIF90` to choose the compiler, this needs to be set to `psyclonefc`. 
This is a compiler wrapper utility that -substitutes its calls with: an invocation to psyclone to process the given +substitutes its calls with: an invocation to PSyclone to process the given source file (using the options provided in `PSYCLONE_OPTS`) followed by an invocation to a compiler (provided by `PSYCLONE_COMPILER`). @@ -148,7 +148,7 @@ the desired NEMO configuration and keys. For example: ./makenemo -r ORCA2_ICE_PISCES -m arch-linux_spack -n ORCA2_psycloned ... ``` -If everything worked you will see psyclone generated files in the +If everything worked you will see PSyclone generated files in the `/BLD/tmp` directory and the final binary in the `/EXP00` directory. @@ -177,13 +177,13 @@ This means that the transformation will succeed and the generated code will compile, but the results will diverge. This gets more complicated with parallel programming because certain operations like reductions or atomics are not always reproducible. Therefore, to understand what causes the results divergence -it is usefulk to apply the transformations step-by-step while checking if the +it is useful to apply the transformations step-by-step while checking if the `run.stat` values change. Some useful steps are: - Starting building NEMO *without* `psyclonefc` and conservative optimisation flags and run it serially (O2, no vectorisation, no-fma). Then store the generated `run.stat`. - Then switch to using `psyclonefc` with the `PSYCLONE_OPTS="-s passthrough.py"`, - this will make psyclone process all files but without applying any + this will make PSyclone process all files but without applying any transformations. Check if the results still match. - Then build it with `PSYCLONE_OPTS="-s insert_loop_parallelism.py"` but keeping the `PARALLEL_DIRECTIVES=""` empty. This will apply serial transformations but @@ -192,7 +192,7 @@ it is usefulk to apply the transformations step-by-step while checking if the and see if the results still match. 
- Finally, run it with `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_offloading" PSYCLONE_OPTS="-s insert_loop_parallelism.py"` -Orthogonally to finding which step is causing the divergence we may want to find +Alongside finding which step is causing the divergence we may want to find which file/s are causing it. This folder also contains a `do_file_by_file.sh` script that build NEMO many times, each with only one file being transformed, and compares the results with the stores `run.stat` @@ -209,15 +209,15 @@ that allow to point to a directory with patched source files: ./makenemo -e ... ``` -In addition to the source, you can also modify the recipe that psyclone uses to +In addition to the source, you can also modify the recipe that PSyclone uses to transform the code. In this example you can do so by changing any detail of the `insert_loop_parallelism.py` transformation script, but the `FILES_TO_SKIP` -global variable is particularly relevant as it allows psyclone skip processing +global variable is particularly relevant as it allows PSyclone to skip processing the listed files. If modifying a particular file is known to cause problems or performance regressions, include it in this list. You can also do both. For example if you want to provide a modified file that already includes directives, you need to reference it with the `-e ` -and in the FILES_TO_SKIP (otherwise Psyclone would ignore the given directives +and in the FILES_TO_SKIP (otherwise PSyclone would ignore the given directives and try to insert its own). This is currently the optimal approach for `seaice` and `lbclnk.f90` GPU offloading. 
diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index d3010dbe29..fd992a76eb 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -235,9 +235,11 @@ def normalise_loops( if convert_range_loops: if filename == "fldread.f90": - # TODO #2951: This file has issues converting SturctureRefs + # TODO #2951: This file has issues converting StructureRefs pass elif nemo_v4 and filename == "dynspg_ts.f90": + # TODO #3256: Is there an issue with the L/UBOUND intrinsics + # that this transformation adds? pass else: # Convert all array implicit loops to explicit loops From 1a177d25381a82c89459153ef579a40a0aa2babf Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Mon, 15 Dec 2025 09:34:08 +0000 Subject: [PATCH 29/33] #2144 Try to improve hybrid performance --- .../nemo/scripts/insert_loop_parallelism.py | 24 ++++++++++++------- examples/nemo/scripts/utils.py | 8 +------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 94494ab0df..fd72f531fb 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -44,7 +44,7 @@ normalise_loops, NEMO_MODULES_TO_IMPORT) from psyclone.psyir.nodes import Routine, Loop from psyclone.psyir.transformations import ( - OMPTargetTrans, OMPDeclareTargetTrans) + OMPTargetTrans, OMPDeclareTargetTrans, OMPMinimiseSyncTrans) from psyclone.transformations import ( OMPLoopTrans, TransformationError) from psyclone.transformations import ( @@ -304,13 +304,16 @@ def trans(psyir): print(f"Marked {subroutine.name} as GPU-enabled") except TransformationError as err: print(err) - # We continue parallelising inside the routine, but this could - # change if the parallelisation directives added below are not - # nestable, in that case we could add a 'continue' here disable_profiling_for.append(subroutine.name) + # We won't continue 
parallelising inside the routine, but this + # could change if the parallelisation directives added below are + # nestable, in that case remove the 'continue' + continue - elif (psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES - and gpu_loop_trans): + if ( + psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES + and gpu_loop_trans + ): print( f"Adding offload directives to subroutine: {subroutine.name}") insert_explicit_loop_parallelism( @@ -323,17 +326,22 @@ def trans(psyir): uniform_intrinsics_only=REPRODUCIBLE, asynchronous_parallelism=enable_async, ) - elif psyir.name not in PARALLELISATION_ISSUES and cpu_loop_trans: - # These have issues offloading, but we can still do threading + if psyir.name not in PARALLELISATION_ISSUES and cpu_loop_trans: print(f"Adding OpenMP threading to subroutine: {subroutine.name}") insert_explicit_loop_parallelism( subroutine, loop_directive_trans=cpu_loop_trans, + collapse=False, privatise_arrays=not NEMOV4, enable_reductions=not REPRODUCIBLE, asynchronous_parallelism=enable_async, ) + # If we are adding asynchronous parallelism then we now try to minimise + # the number of barriers. 
+ if enable_async: + OMPMinimiseSyncTrans().apply(subroutine) + # Iterate again and add profiling hooks when needed for subroutine in psyir.walk(Routine): if PROFILING_ENABLED and subroutine.name not in disable_profiling_for: diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index fd992a76eb..92abf64f49 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -46,7 +46,7 @@ from psyclone.psyir.transformations import ( ArrayAssignment2LoopsTrans, HoistLoopBoundExprTrans, HoistLocalArraysTrans, HoistTrans, InlineTrans, Maxval2LoopTrans, ProfileTrans, - OMPMinimiseSyncTrans, Reference2ArrayRangeTrans, + Reference2ArrayRangeTrans, ScalarisationTrans, IncreaseRankLoopArraysTrans) from psyclone.transformations import TransformationError @@ -471,12 +471,6 @@ def insert_explicit_loop_parallelism( # associted to the loop in the generated output. continue - # If we are adding asynchronous parallelism then we now try to minimise - # the number of barriers. 
- if enable_nowaits: - minsync_trans = OMPMinimiseSyncTrans() - minsync_trans.apply(schedule) - def add_profiling(children: Union[List[Node], Schedule]): ''' From 32057f1cab6b9be6006928017eaa591ae32d5957 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Mon, 15 Dec 2025 11:40:21 +0000 Subject: [PATCH 30/33] #3244 Revert last changes and don't collapse CPU loops --- .../nemo/scripts/insert_loop_parallelism.py | 23 +++++++------------ examples/nemo/scripts/utils.py | 8 ++++++- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index fd72f531fb..542f0a1615 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -44,7 +44,7 @@ normalise_loops, NEMO_MODULES_TO_IMPORT) from psyclone.psyir.nodes import Routine, Loop from psyclone.psyir.transformations import ( - OMPTargetTrans, OMPDeclareTargetTrans, OMPMinimiseSyncTrans) + OMPTargetTrans, OMPDeclareTargetTrans) from psyclone.transformations import ( OMPLoopTrans, TransformationError) from psyclone.transformations import ( @@ -304,16 +304,13 @@ def trans(psyir): print(f"Marked {subroutine.name} as GPU-enabled") except TransformationError as err: print(err) + # We continue parallelising inside the routine, but this could + # change if the parallelisation directives added below are not + # nestable, in that case we could add a 'continue' here disable_profiling_for.append(subroutine.name) - # We won't continue parallelising inside the routine, but this - # could change if the parallelisation directives added below are - # nestable, in that case remove the 'continue' - continue - if ( - psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES - and gpu_loop_trans - ): + elif (psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES + and gpu_loop_trans): print( f"Adding offload directives to subroutine: {subroutine.name}") 
insert_explicit_loop_parallelism( @@ -326,7 +323,8 @@ def trans(psyir): uniform_intrinsics_only=REPRODUCIBLE, asynchronous_parallelism=enable_async, ) - if psyir.name not in PARALLELISATION_ISSUES and cpu_loop_trans: + elif psyir.name not in PARALLELISATION_ISSUES and cpu_loop_trans: + # These have issues offloading, but we can still do threading print(f"Adding OpenMP threading to subroutine: {subroutine.name}") insert_explicit_loop_parallelism( subroutine, @@ -337,11 +335,6 @@ def trans(psyir): asynchronous_parallelism=enable_async, ) - # If we are adding asynchronous parallelism then we now try to minimise - # the number of barriers. - if enable_async: - OMPMinimiseSyncTrans().apply(subroutine) - # Iterate again and add profiling hooks when needed for subroutine in psyir.walk(Routine): if PROFILING_ENABLED and subroutine.name not in disable_profiling_for: diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 92abf64f49..fd992a76eb 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -46,7 +46,7 @@ from psyclone.psyir.transformations import ( ArrayAssignment2LoopsTrans, HoistLoopBoundExprTrans, HoistLocalArraysTrans, HoistTrans, InlineTrans, Maxval2LoopTrans, ProfileTrans, - Reference2ArrayRangeTrans, + OMPMinimiseSyncTrans, Reference2ArrayRangeTrans, ScalarisationTrans, IncreaseRankLoopArraysTrans) from psyclone.transformations import TransformationError @@ -471,6 +471,12 @@ def insert_explicit_loop_parallelism( # associted to the loop in the generated output. continue + # If we are adding asynchronous parallelism then we now try to minimise + # the number of barriers. 
+ if enable_nowaits: + minsync_trans = OMPMinimiseSyncTrans() + minsync_trans.apply(schedule) + def add_profiling(children: Union[List[Node], Schedule]): ''' From 55d44fc25b216ebfc0fb6d2c06c327beea4326ed Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 20 Jan 2026 17:37:43 +0000 Subject: [PATCH 31/33] Remove old statements brought by the last merge --- examples/nemo/scripts/insert_loop_parallelism.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 8c54152188..7a30ae973b 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -268,10 +268,6 @@ def trans(psyir): # insert profiling hooks later on. if psyir.name in SKIP_FOR_PERFORMANCE: continue - if NEMOV4 and psyir.name in NEMOV4_EXCLUSIONS: - continue - if not NEMOV4 and psyir.name in NEMOV5_EXCLUSIONS: - continue # ICE routines do not perform well on GPU, so we skip them if psyir.name.startswith("ice"): continue From 355a80149cb035b1fc8762569bdc7719a15abb1d Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 20 Jan 2026 18:16:19 +0000 Subject: [PATCH 32/33] Clean up previous merge conflicts --- .../nemo/scripts/insert_loop_parallelism.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 7a30ae973b..5611164b10 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -226,11 +226,15 @@ def filter_files_by_name(name: str) -> bool: if all_but_file and name == all_but_file: return True + # These work but are skiped to improve performance, they could be in the + # FILES_TO_SKIP global parameter, but in this script, for testing purposes, + # we exclude them here so the PSyclone frontend and backend are still + # tested and it also allows to 
insert profiling hooks later on. if name in SKIP_FOR_PERFORMANCE: return True - # Parallelising this file currently causes a noticeable slowdown - # if name.startswith("icethd"): + # Parallelising ICE or ICB currently causes a noticeable slowdown + # On nemo_main it can be just: if name.startswith("icethd"): if not NEMOV4 and name.startswith("ice"): return True if name.startswith("icb"): @@ -240,6 +244,10 @@ def filter_files_by_name(name: str) -> bool: if not NEMOV4 and name == "icedyn_rhg_evp.f90": return True + # Many of the obs_ files have problems with OpenACC + if name.startswith("obs_") and "acc" in PARALLEL_DIRECTIVES: + return True + return False @@ -262,15 +270,6 @@ def trans(psyir): for subroutine in psyir.walk(Routine): - # The exclusion below could be in the FILES_TO_SKIP global parameter, - # but in this script, for testing purposes, we exclude them here so the - # PSyclone frontend and backend are still tested and it also allows to - # insert profiling hooks later on. - if psyir.name in SKIP_FOR_PERFORMANCE: - continue - # ICE routines do not perform well on GPU, so we skip them - if psyir.name.startswith("ice"): - continue # Skip initialisation and diagnostic subroutines if (subroutine.name.endswith('_alloc') or subroutine.name.endswith('_init') or From 69ad36fc9cadd992616ba7428d4c8af1738be6c6 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Thu, 22 Jan 2026 09:14:00 +0000 Subject: [PATCH 33/33] Revert some OpenACC changes --- .github/workflows/nemo_v5_tests.yml | 16 +++---- .../nemo/scripts/insert_loop_parallelism.py | 44 +++++++++---------- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index b1abf736b1..5077a3fbaa 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -56,7 +56,7 @@ jobs: bench_gfortran_omp_cpu: ${{ steps.bench_gfortran_omp_cpu.outputs.time }} bench_nvfortran_omp_offload: ${{ 
steps.bench_nvfortran_omp_offload.outputs.time }} bench_nvfortran_omp_offload_build: ${{ steps.bench_nvfortran_omp_offload.outputs.build_time }} - orca1_nvfortran_acc_offload: ${{ steps.orca1_nvfortran_acc_offload.outputs.time }} + orca1_nvfortran_omp_offload: ${{ steps.orca1_nvfortran_omp_offload.outputs.time }} orca2_nvfortran_omp_offload: ${{ steps.orca2_nvfortran_omp_offload.outputs.time }} bench_nvfortran_omp_offload_async: ${{ steps.bench_nvfortran_omp_offload_async.outputs.time }} orca2_nvfortran_omp_offload_async: ${{ steps.orca2_nvfortran_omp_offload_async.outputs.time }} @@ -282,7 +282,7 @@ jobs: echo "build_time=${BUILD_ELAPSED}" >> "${GITHUB_OUTPUT}" - name: NEMO 5.0 nvidia OpenMP for GPUs (UKMO ORCA1 - managed memory) - id: orca1_nvfortran_acc_offload + id: orca1_nvfortran_omp_offload run: | # Set up environment source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh @@ -291,14 +291,14 @@ jobs: export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv export NEMO_DIR=/archive/psyclone-tests/latest-run/UKMO-NEMOv5 - export TEST_DIR=ORCA1_ACC_OFFLOAD_NVHPC + export TEST_DIR=ORCA1_OMP_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc -mp=gpu -gpu=mem:managed,math_uniform" - export PARALLEL_DIRECTIVES="acc_offloading" + export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 # Clean up and compile @@ -312,7 +312,7 @@ jobs: # Make sure mpi has been built with cuda support ompi_info --parsable --all | grep mpi_built_with_cuda_support:value # Run with round robin allocations of GPUs to MPI ranks - mpirun -n 2 sh -c 'CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK ./nemo' + OMP_NUM_THREADS=4 mpirun -n 2 sh -c 'CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK ./nemo' diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca1.nvhpc.10steps run.stat export VAR_TIME=$(grep "local MPI proces" timing.output | head -n 1 | awk '{print $5}' | tr -d s) echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" @@ -531,10 +531,10 @@ jobs: '"$COMMON_FIELDS"' }, { - ci_test: "NEMOv5 OpenACC for GPU (ORCA1)", + ci_test: "NEMOv5 OpenMP for GPU (ORCA1)", nemo_version: "NEMO 5.0-RC MO patch", compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", - elapsed_time: '"${{needs.run_if_on_mirror.outputs.orca1_nvfortran_acc_offload}}"', + elapsed_time: '"${{needs.run_if_on_mirror.outputs.orca1_nvfortran_omp_offload}}"', '"$COMMON_FIELDS"' }, { diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py index 5611164b10..035e6e0714 100755 --- a/examples/nemo/scripts/insert_loop_parallelism.py +++ b/examples/nemo/scripts/insert_loop_parallelism.py @@ -61,7 +61,7 @@ PARALLEL_DIRECTIVES = os.environ.get('PARALLEL_DIRECTIVES', '') # By default, allow optimisations that may change the results, e.g. reductions, -# offloading instrinsics without math_uniform, ... +# offloading intrinsics without math_uniform, ... 
REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) # This environment variable informs if profiling hooks have to be inserted. @@ -94,11 +94,11 @@ ] # These files change the results from the baseline when psyclone adds -# parallelisation dirctives +# parallelisation directives PARALLELISATION_ISSUES = [] # These files change the results from the baseline when psyclone adds -# offloading dirctives +# offloading directives OFFLOADING_ISSUES = [] if not NEMOV4: @@ -138,14 +138,14 @@ "stp2d.f90", ]) - if "acc_offloading" in PARALLEL_DIRECTIVES: - OFFLOADING_ISSUES.extend([ - # Fail in OpenACC ORCA2_ICE_PISCES - "dynzdf.f90", - "trabbl.f90", - "trazdf.f90", - "zdfsh2.f90", - ]) + # if "acc_offloading" in PARALLEL_DIRECTIVES: + # OFFLOADING_ISSUES.extend([ + # # Fail in OpenACC ORCA2_ICE_PISCES + # "dynzdf.f90", + # "trabbl.f90", + # "trazdf.f90", + # "zdfsh2.f90", + # ]) ASYNC_ISSUES = [ # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed @@ -199,7 +199,7 @@ def select_transformations(): process_directives = process_directives.replace('+', '') if process_directives != '': - sys.exit(f"Unkown PARALLEL_DIRECTIVES: {process_directives}") + sys.exit(f"Unknown PARALLEL_DIRECTIVES: {process_directives}") return (offload_region_trans, mark_for_gpu_trans, gpu_loop_trans, cpu_loop_trans) @@ -211,22 +211,23 @@ def filter_files_by_name(name: str) -> bool: FILES_TO_SKIP, this will still run the files through psyclone. ''' # The two options below are useful for file-by-file exhaustive tests. - # If the environemnt has ONLY_FILE defined, only process that one file and + # If the environment has ONLY_FILE defined, only process that one file and # known-good files that need a "declare target" inside. 
only_file = os.environ.get('ONLY_FILE', False) if only_file: files_to_do = [only_file] if "offloading" in PARALLEL_DIRECTIVES: - files_to_do.extend(["lib_fortran.f90", "solfrac_mod.f90"]) - if name in files_to_do: + files_to_do.extend( + ["lib_fortran.f90", "solfrac_mod.f90", "sbc_phy.f90"]) + if name not in files_to_do: return True - # If the environemnt has ALL_BUT_FILE defined, process all files but + # If the environment has ALL_BUT_FILE defined, process all files but # the one named file. all_but_file = os.environ.get('ALL_BUT_FILE', False) if all_but_file and name == all_but_file: return True - # These work but are skiped to improve performance, they could be in the + # These work but are skipped to improve performance, they could be in the # FILES_TO_SKIP global parameter, but in this script, for testing purposes, # we exclude them here so the PSyclone frontend and backend are still # tested and it also allows to insert profiling hooks later on. @@ -244,10 +245,6 @@ def filter_files_by_name(name: str) -> bool: if not NEMOV4 and name == "icedyn_rhg_evp.f90": return True - # Many of the obs_ files have problems with OpenACC - if name.startswith("obs_") and "acc" in PARALLEL_DIRECTIVES: - return True - return False @@ -267,6 +264,7 @@ def trans(psyir): disable_profiling_for = [] enable_async = ASYNC_PARALLEL and psyir.name not in ASYNC_ISSUES + privatise_arrays = not (NEMOV4 or "acc" in PARALLEL_DIRECTIVES) for subroutine in psyir.walk(Routine): @@ -323,7 +321,7 @@ def trans(psyir): region_directive_trans=offload_region_trans, loop_directive_trans=gpu_loop_trans, collapse=True, - privatise_arrays=not NEMOV4, + privatise_arrays=privatise_arrays, enable_reductions=not REPRODUCIBLE, uniform_intrinsics_only=REPRODUCIBLE, asynchronous_parallelism=enable_async, @@ -335,7 +333,7 @@ def trans(psyir): subroutine, loop_directive_trans=cpu_loop_trans, collapse=False, - privatise_arrays=not NEMOV4, + privatise_arrays=privatise_arrays, enable_reductions=not REPRODUCIBLE, 
asynchronous_parallelism=enable_async, )