From 8e623c1597a43e9b071fac2349d636235852cbc3 Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 11 Sep 2025 15:20:08 +0100 Subject: [PATCH 01/27] Acc kernel and utils file --- examples/nemo/scripts/acc_kernels_trans.py | 231 +++++++++++++-------- examples/nemo/scripts/utils.py | 194 ++++------------- 2 files changed, 188 insertions(+), 237 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index 2466733e29..9803e68810 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -33,7 +33,7 @@ # ----------------------------------------------------------------------------- # Authors: R. W. Ford, A. R. Porter, N. Nobre and S. Siso, STFC Daresbury Lab -'''A transformation script that seeks to apply OpenACC KERNELS and optionally, +"""A transformation script that seeks to apply OpenACC KERNELS and optionally, OpenACC DATA directives to NEMO style code. In order to use it you must first install PSyclone. See README.md in the top-level directory. @@ -54,28 +54,49 @@ the process of attempting to create the largest possible Kernel region. Tested with the NVIDIA HPC SDK version 23.7. -''' +""" import logging -from utils import (add_profiling, enhance_tree_information, inline_calls, - NOT_PERFORMANT, NEMO_MODULES_TO_IMPORT) +from utils import ( + add_profiling, + enhance_tree_information, + NOT_PERFORMANT, + NEMO_MODULES_TO_IMPORT, +) from psyclone.errors import InternalError from psyclone.psyGen import TransInfo from psyclone.psyir.nodes import ( - IfBlock, ArrayReference, Assignment, BinaryOperation, Loop, Routine, - Literal, ACCLoopDirective) -from psyclone.psyir.transformations import (ACCKernelsTrans, ACCUpdateTrans, - TransformationError, ProfileTrans) + IfBlock, + ArrayReference, + Assignment, + BinaryOperation, + Loop, + Routine, + Literal, + ACCLoopDirective, + IntrinsicCall, +) +from psyclone.psyir.transformations import ( + ACCKernelsTrans, + ACCUpdateTrans, + TransformationError, + ProfileTrans, + DebugChecksumTrans, +) from psyclone.transformations import ACCEnterDataTrans +# from psyclone.psyir.symbols import DataSymbol, DataTypeSymbol, ArrayType + # Set up some loop_type inference rules in order to reference useful domain # loop constructs by name -Loop.set_loop_type_inference_rules({ +Loop.set_loop_type_inference_rules( + { "lon": {"variable": "ji"}, "lat": {"variable": "jj"}, "levels": {"variable": "jk"}, - "tracers": {"variable": "jt"} -}) + "tracers": {"variable": "jt"}, + } +) # Whether to chase the imported modules to improve symbol information (it can # also be a list of module filenames to limit the chasing to only specific @@ -86,15 +107,16 @@ # Get the PSyclone transformations we will use ACC_KERN_TRANS = ACCKernelsTrans() -ACC_LOOP_TRANS = TransInfo().get_trans_name('ACCLoopTrans') -ACC_ROUTINE_TRANS = TransInfo().get_trans_name('ACCRoutineTrans') +ACC_LOOP_TRANS = TransInfo().get_trans_name("ACCLoopTrans") +ACC_ROUTINE_TRANS = TransInfo().get_trans_name("ACCRoutineTrans") ACC_EDATA_TRANS = ACCEnterDataTrans() ACC_UPDATE_TRANS = ACCUpdateTrans() PROFILE_TRANS = ProfileTrans() +CHECKSUM_TRANS = DebugChecksumTrans() # Whether or not to add profiling calls around unaccelerated regions # N.B. this can inhibit PSyclone's ability to inline! -PROFILE_NONACC = False +PROFILE_NONACC = True # Whether or not to add OpenACC enter data and update directives to explicitly # move data between host and device memory @@ -105,25 +127,39 @@ # Routines we do not attempt to add any OpenACC to (because it breaks with # the Nvidia compiler or because it just isn't worth it) -ACC_IGNORE = ["day_mth", # Just calendar operations - "obs_surf_alloc", "oce_alloc", - # Compiler fails w/ "Unsupported local variable" - # Zero performance impact since outside execution path - "copy_obfbdata", "merge_obfbdata", - "turb_ncar", # Transforming hurts performance - "iom_open", "iom_get_123d", "iom_nf90_rp0123d", - "trc_bc_ini", "p2z_ini", "p4z_ini", "sto_par_init", - "bdytide_init", "bdy_init", "bdy_segs", "sbc_cpl_init", - "asm_inc_init", "dia_obs_init"] # Str handling, init routine - - -class ExcludeSettings(): - ''' +ACC_IGNORE = [ + "day_mth", # Just calendar operations + "obs_surf_alloc", + "oce_alloc", + # Compiler fails w/ "Unsupported local variable" + # Zero performance impact since outside execution path + "copy_obfbdata", + "merge_obfbdata", + "turb_ncar", # Transforming hurts performance + "iom_open", + "iom_get_123d", + "iom_nf90_rp0123d", + "trc_bc_ini", + "p2z_ini", + "p4z_ini", + "sto_par_init", + "bdytide_init", + "bdy_init", + "bdy_segs", + "sbc_cpl_init", + "asm_inc_init", + "dia_obs_init", +] # Str handling, init routine + + +class ExcludeSettings: + """ Class to hold settings on what to exclude from OpenACC KERNELS regions. :param Optional[dict[str, bool]] settings: map of settings to override. - ''' + """ + def __init__(self, settings=None): if settings is None: settings = {} @@ -133,16 +169,18 @@ def __init__(self, settings=None): # Routines which are exceptions to the OpenACC Kernels regions exclusion rules. -EXCLUDING = {"default": ExcludeSettings(), - # Exclude for better GPU performance (requires further analysis). - "dyn_spg_ts": ExcludeSettings({"ifs_scalars": True}), - "tra_zdf_imp": ExcludeSettings({"ifs_scalars": True}), - # Exclude due to compiler bug preventing CPU multicore executions. - "dom_vvl_init": ExcludeSettings({"ifs_scalars": True})} +EXCLUDING = { + "default": ExcludeSettings(), + # Exclude for better GPU performance (requires further analysis). + "dyn_spg_ts": ExcludeSettings({"ifs_scalars": True}), + "tra_zdf_imp": ExcludeSettings({"ifs_scalars": True}), + # Exclude due to compiler bug preventing CPU multicore executions. + "dom_vvl_init": ExcludeSettings({"ifs_scalars": True}), +} def log_msg(name, msg, node): - ''' + """ Log a message indicating why a transformation could not be performed. :param str name: the name of the routine. @@ -150,7 +188,7 @@ def log_msg(name, msg, node): :param node: the PSyIR node that prevented the transformation. :type node: :py:class:`psyclone.psyir.nodes.Node` - ''' + """ # Create a str representation of the position of the problematic node # in the PSyIR tree. node_strings = [] @@ -165,7 +203,7 @@ def log_msg(name, msg, node): def valid_acc_kernel(node): - ''' + """ Whether the sub-tree that has `node` at its root is eligible to be enclosed within an OpenACC KERNELS directive. @@ -175,19 +213,20 @@ def valid_acc_kernel(node): :returns: True if the sub-tree can be enclosed in a KERNELS region. :rtype: bool - ''' + """ # The Fortran routine which our parent represents routine_name = node.ancestor(Routine).name try: # Since we do this check on a node-by-node basis, we disable the # check that the 'region' contains a loop. - ACC_KERN_TRANS.validate(node, options={"disable_loop_check": - True}) + ACC_KERN_TRANS.validate(node, options={"disable_loop_check": True}) except TransformationError as err: - log_msg(routine_name, - f"Node rejected by ACCKernelTrans.validate: " - f"{err.value}", node) + log_msg( + routine_name, + f"Node rejected by ACCKernelTrans.validate: " f"{err.value}", + node, + ) return False # Allow for per-routine setting of what to exclude from within KERNELS @@ -197,24 +236,33 @@ def valid_acc_kernel(node): # Rather than walk the tree multiple times, look for both excluded node # types and possibly problematic operations - excluded_types = (IfBlock, Loop) + excluded_types = (IfBlock, Loop, ArrayReference, IntrinsicCall) excluded_nodes = node.walk(excluded_types) for enode in excluded_nodes: + if isinstance(enode, IntrinsicCall): + if "dim" in enode.argument_names: + return False if isinstance(enode, IfBlock): # We permit IF blocks originating from WHERE constructs and # single-statement IF blocks containing a Loop in KERNELS regions - if "was_where" in enode.annotations or \ - "was_single_stmt" in enode.annotations and enode.walk(Loop): + if ( + "was_where" in enode.annotations + or "was_single_stmt" in enode.annotations + and enode.walk(Loop) + ): continue arrays = enode.condition.walk(ArrayReference) # We exclude if statements where the condition expression does # not refer to arrays at all as this may cause compiler issues # (get "Missing branch target block") or produce faster code. - if not arrays and excluding.ifs_scalars and \ - not isinstance(enode.condition, BinaryOperation): + if ( + not arrays + and excluding.ifs_scalars + and not isinstance(enode.condition, BinaryOperation) + ): log_msg(routine_name, "IF references scalars", enode) return False # When using CUDA Unified Memory, only allocated arrays reside in @@ -224,8 +272,10 @@ def valid_acc_kernel(node): # arrays are often static in NEMO. Hence, we disallow IFs where the # logical expression involves the latter. if any(len(array.children) == 1 for array in arrays): - log_msg(routine_name, - "IF references 1D arrays that may be static", enode) + log_msg( + routine_name, + "IF references 1D arrays that may be static", enode + ) return False elif isinstance(enode, Loop): @@ -236,13 +286,15 @@ def valid_acc_kernel(node): # In general, this heuristic will depend upon how many levels the # model configuration will contain. child = enode.loop_body[0] if enode.loop_body.children else None - if isinstance(child, Loop) and child.loop_type == "levels": - # We have a loop around a loop over levels - log_msg(routine_name, "Loop is around a loop over levels", - enode) - return False - if enode.loop_type == "levels" and \ - len(enode.loop_body.children) > 1: + # if isinstance(child, Loop) and child.loop_type == "levels": + # We have a loop around a loop over levels + # log_msg(routine_name, "Loop is around a loop over levels", + # enode) + # return False + if ( + enode.loop_type == "levels" + and len(enode.loop_body.children) > 1 + ): # The body of the loop contains more than one statement. # How many distinct loop nests are there? loop_count = 0 @@ -250,16 +302,19 @@ def valid_acc_kernel(node): if child.walk(Loop): loop_count += 1 if loop_count > 1: - log_msg(routine_name, - "Loop over levels contains several " - "other loops", enode) + log_msg( + routine_name, + "Loop over levels contains several " + "other loops", + enode, + ) return False return True def add_kernels(children): - ''' + """ Walks through the PSyIR inserting OpenACC KERNELS directives at as high a level as possible. @@ -270,7 +325,7 @@ def add_kernels(children): :returns: True if any KERNELS regions are successfully added. :rtype: bool - ''' + """ added_kernels = False if not children: return added_kernels @@ -305,7 +360,7 @@ def add_kernels(children): def try_kernels_trans(nodes): - ''' + """ Attempt to enclose the supplied list of nodes within a kernels region. If the transformation fails then the error message is reported but execution continues. @@ -316,7 +371,7 @@ def try_kernels_trans(nodes): :returns: True if the transformation was successful, False otherwise. :rtype: bool - ''' + """ # We only enclose the proposed region if it contains a loop. have_loop = False for node in nodes: @@ -330,7 +385,6 @@ def try_kernels_trans(nodes): break if not have_loop: return False - try: ACC_KERN_TRANS.apply(nodes, {"default_present": False}) @@ -344,15 +398,17 @@ def try_kernels_trans(nodes): # We put a COLLAPSE(2) clause on any perfectly-nested lat-lon # loops that have a Literal value for their step. The latter # condition is necessary to avoid compiler errors. - if (loop.variable.name == "jj" and - isinstance(loop.step_expr, Literal) and - isinstance(loop.loop_body[0], Loop) and - loop.loop_body[0].variable.name == "ji" and - isinstance(loop.loop_body[0].step_expr, Literal) and - len(loop.loop_body.children) == 1): + if ( + loop.variable.name == "jj" + and isinstance(loop.step_expr, Literal) + and isinstance(loop.loop_body[0], Loop) + and loop.loop_body[0].variable.name == "ji" + and isinstance(loop.loop_body[0].step_expr, Literal) + and len(loop.loop_body.children) == 1 + ): try: ACC_LOOP_TRANS.apply(loop, {"collapse": 2}) - except (TransformationError) as err: + except TransformationError as err: print(f"Failed to collapse lat-lon loop: {loop}") print(f"Error was: {err}") @@ -364,14 +420,16 @@ def try_kernels_trans(nodes): def trans(psyir): - '''Applies OpenACC 'kernels' directives to NEMO code. Data movement can be + """Applies OpenACC 'kernels' directives to NEMO code. Data movement can be handled manually or through CUDA's managed-memory functionality. :param psyir: the PSyIR of the provided file. :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - ''' - logging.basicConfig(filename='psyclone.log', filemode='w', - level=logging.INFO) + """ + logging.basicConfig( + filename="psyclone.log", + filemode="w", level=logging.INFO + ) for subroutine in psyir.walk(Routine): print(f"Transforming subroutine: {subroutine.name}") @@ -387,14 +445,13 @@ def trans(psyir): if subroutine.name.lower() not in ACC_IGNORE: print(f"Transforming {subroutine.name} with acc kernels") enhance_tree_information(subroutine) - inline_calls(subroutine) - have_kernels = add_kernels(subroutine.children) - if have_kernels and ACC_EXPLICIT_MEM_MANAGEMENT: - print(f"Transforming {subroutine.name} with acc enter data") - ACC_EDATA_TRANS.apply(subroutine) + # inline_calls(subroutine) + add_kernels(subroutine.children) else: - print(f"Addition of OpenACC to routine {subroutine.name} " - f"disabled!") + print( + f"Addition of OpenACC to routine {subroutine.name} " + f"disabled!" + ) # Add required OpenACC update directives to every routine, including to # those with no device code and that execute exclusively on the host @@ -404,6 +461,8 @@ def trans(psyir): # Add profiling instrumentation if PROFILE_NONACC: - print(f"Adding profiling to non-OpenACC regions in " - f"{subroutine.name}") + print( + f"Adding profiling to non-OpenACC regions in " + f"{subroutine.name}" + ) add_profiling(subroutine.children) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 70f7b4e7a1..0fb7647807 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -35,21 +35,20 @@ ''' Utilities file to parallelise Nemo code. ''' -import os from typing import List, Union from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyir.nodes import ( + Assignment, Loop, Directive, Node, Reference, CodeBlock, ArrayReference, - Call, Return, IfBlock, Routine, Schedule, IntrinsicCall, - StructureReference) + Call, Return, IfBlock, Routine, Schedule, IntrinsicCall ) + from psyclone.psyir.symbols import ( DataSymbol, INTEGER_TYPE, ScalarType, RoutineSymbol) from psyclone.psyir.transformations import ( ArrayAssignment2LoopsTrans, HoistLoopBoundExprTrans, HoistLocalArraysTrans, HoistTrans, InlineTrans, Maxval2LoopTrans, ProfileTrans, - OMPMinimiseSyncTrans, Reference2ArrayRangeTrans, - ScalarisationTrans, IncreaseRankLoopArraysTrans) + Reference2ArrayRangeTrans, ScalarisationTrans) from psyclone.transformations import TransformationError # USE statements to chase to gather additional symbol information. @@ -64,7 +63,8 @@ "icbdia.f90", "icbini.f90", "icbstp.f90", "iom.f90", "iom_nf90.f90", "obs_grid.f90", "obs_averg_h2d.f90", "obs_profiles_def.f90", "obs_types.f90", "obs_read_prof.f90", "obs_write.f90", "tide_mod.f90", - "zdfosm.f90", "obs_read_surf.f90", + "zdfosm.f90", "obs_read_surf.f90","dynldf_lev.f90", "ldftra.f90", "tramle.f90", + "trcsink.f90", "p4zpoc.f90" ] # If routine names contain these substrings then we do not profile them @@ -80,6 +80,11 @@ # function calls if the symbol is imported from some other module. # We therefore work-around this by keeping a list of known NEMO functions # from v4 and v5. +DEBUGCHECKSUM_IGNORE = [ ] +#DEBUGCHECKSUM_IGNORE = ['ldf_slp_init', 'dyn_spg_ts_init', +# 'lbc_lnk_pt2pt_dp', 'dyn_vor_init', 'tke_tke', +# 'p4z_fechem', 'p4z_micro', 'p4z_meso', 'tra_mle_trp_MLF', +# 'tra_adv', "p4z_lys", "tra_adv_fct"] NEMO_FUNCTIONS = [ # Internal funtions can be obtained with: # $ grep -rhi "end function" src/ | awk '{print $3}' | uniq | sort @@ -146,26 +151,7 @@ 'visc_air', 'visc_air_sclr', 'visc_air_vctr', 'w1', 'w2', 'z0_from_Cd', 'z0tq_LKB', 'zdf_gls_alloc', 'zdf_iwm_alloc', 'zdf_mfc_alloc', 'zdf_mxl_alloc', 'zdf_oce_alloc', 'zdf_osm_alloc', 'zdf_phy_alloc', - 'zdf_tke_alloc', 'zdf_tmx_alloc', 'itau2date', - # grep -rh "INTERFACE" src | grep -v "END" | awk '{print $2}' | uniq | sort - 'alpha_sw', 'bulk_formula', 'cp_air', 'debug', 'DECAL_FEEDBACK', - 'DECAL_FEEDBACK_2D', 'depth_to_e3', 'de_sat_dt_ice', 'dia_ar5_hst', - 'dia_ptr_hst', 'div_hor', 'dom_tile_copyin', 'dom_tile_copyout', - 'dq_sat_dt_ice', 'dyn_vor', 'e3_to_depth', 'eos', 'eos_fzp', - 'eos_rab', 'e_sat', 'e_sat_ice', 'f_h_louis', 'f_m_louis', - 'gamma_moist', 'glob_2Dmax', 'glob_2Dmin', 'glob_2Dsum', 'glob_3Dmax', - 'glob_3Dmin', 'glob_3Dsum', 'halo_mng_resize', 'icb_utl_bilin_h', - 'ice_var_itd', 'ice_var_snwblow', 'ice_var_snwfra', 'iom_get', - 'iom_getatt', 'iom_nf90_get', 'iom_put', 'iom_putatt', - 'iom_rstput', 'lbc_lnk', 'lbc_lnk_neicoll', 'lbc_lnk_pt2pt', - 'lbc_nfd', 'lbnd_ij', 'ldf_eiv_trp', 'local_2Dmax', 'local_2Dmin', - 'local_2Dsum', 'local_3Dmax', 'local_3Dmin', 'local_3Dsum', - 'L_vap', 'mpp_max', 'mpp_maxloc', 'mpp_min', 'mpp_minloc', - 'mpp_nfd', 'mpp_sum', 'pres_temp', 'prt_ctl_sum', 'ptr_mpp_sum', - 'ptr_sj', 'ptr_sum', 'qlw_net', 'q_sat', 'rho_air', 'Ri_bulk', - 'SIGN', 'sum3x3', 'theta_exner', 'tra_mle_trp', 'trd_vor_zint', - 'virt_temp', 'visc_air', 'wAimp', 'wzv', 'zdf_osm_iomput', - 'zdf_osm_velocity_rotation', + 'zdf_tke_alloc', 'zdf_tmx_alloc','dynldf_lev_lap', 'ldf_eiv_trp_t', ] # Currently fparser has no way of distinguishing array accesses from statement @@ -177,7 +163,7 @@ PARALLELISATION_ISSUES = [ "ldfc1d_c2d.f90", "tramle.f90", - "traqsr.f90", + "dynspg_ts.f90", ] PRIVATISATION_ISSUES = [ @@ -317,9 +303,8 @@ def normalise_loops( convert_array_notation: bool = True, loopify_array_intrinsics: bool = True, convert_range_loops: bool = True, - scalarise_loops: bool = False, - increase_array_ranks: bool = False, hoist_expressions: bool = True, + scalarise_loops: bool = False, ): ''' Normalise all loops in the given schedule so that they are in an appropriate form for the Parallelisation transformations to analyse @@ -334,12 +319,10 @@ def normalise_loops( operate on arrays to explicit loops (currently only maxval). :param bool convert_range_loops: whether to convert ranges to explicit loops. + :param bool hoist_expressions: whether to hoist bounds and loop invariant + statements out of the loop nest. :param scalarise_loops: whether to attempt to convert arrays to scalars where possible, default is False. - :param increase_array_ranks: whether to increase the rank of selected - arrays. - :param hoist_expressions: whether to hoist bounds and loop invariant - statements out of the loop nest. ''' if hoist_local_arrays and schedule.name not in CONTAINS_STMT_FUNCTIONS: # Apply the HoistLocalArraysTrans when possible, it cannot be applied @@ -375,8 +358,6 @@ def normalise_loops( # Convert all array implicit loops to explicit loops explicit_loops = ArrayAssignment2LoopsTrans() for assignment in schedule.walk(Assignment): - if assignment.walk(StructureReference): - continue # TODO #2951 Fix issues with structure_refs try: explicit_loops.apply(assignment) except TransformationError: @@ -392,9 +373,6 @@ def normalise_loops( for loop in loops: scalartrans.apply(loop) - if increase_array_ranks: - increase_rank_and_reorder_nemov5_loops(schedule) - if hoist_expressions: # First hoist all possible expressions for loop in schedule.walk(Loop): @@ -417,69 +395,12 @@ def normalise_loops( # top level. This would allow the collapse clause to be applied. -def increase_rank_and_reorder_nemov5_loops(routine: Routine): - ''' This method increases the rank of temporary arrays used inside selected - loops (in order to parallelise the outer loop without overlapping them) - and then rearranges the outer loop next to the inner ones (in order to - collapse them), so that more parallelism can be leverage. This is useful - in GPU contexts, but it increases the memory footprint and may not be - beneficial for caching-architectures. - - :param routine: the target routine. - - ''' - irlatrans = IncreaseRankLoopArraysTrans() - - # Map of routines and arrays - selection = { - "dyn_zdf": ['zwd', 'zwi', 'zws'], - "tra_zdf_imp": ['zwd', 'zwi', 'zws', 'zwt'] - } - - if routine.name not in selection: - return - - for outer_loop in routine.walk(Loop, stop_type=Loop): - if outer_loop.variable.name == "jj": - # Increase the rank of the temporary arrays in this loop - irlatrans.apply(outer_loop, arrays=selection[routine.name]) - # Now reorder the code - for child in outer_loop.loop_body[:]: - # Move the contents of the jj loop outside it - outer_loop.parent.addchild(child.detach(), - index=outer_loop.position) - # Add a new jj loop around each inner loop that is not 'jn' - target_loop = [] - for inner_loop in child.walk(Loop, stop_type=Loop): - if inner_loop.variable.name != "jn": - target_loop.append(inner_loop) - else: - for next_loop in inner_loop.loop_body.walk( - Loop, stop_type=Loop): - target_loop.append(next_loop) - for inner_loop in target_loop: - if isinstance(inner_loop.loop_body[0], Loop): - inner_loop = inner_loop.loop_body[0] - inner_loop.replace_with( - Loop.create( - outer_loop.variable, - outer_loop.start_expr.copy(), - outer_loop.stop_expr.copy(), - outer_loop.step_expr.copy(), - children=[inner_loop.copy()] - ) - ) - # Remove the now empty jj loop - outer_loop.detach() - - def insert_explicit_loop_parallelism( schedule, region_directive_trans=None, loop_directive_trans=None, collapse: bool = True, privatise_arrays: bool = False, - asynchronous_parallelism: bool = False, uniform_intrinsics_only: bool = False, ): ''' For each loop in the schedule that doesn't already have a Directive @@ -499,22 +420,17 @@ def insert_explicit_loop_parallelism( many nested loops as possible. :param privatise_arrays: whether to attempt to privatise arrays that cause write-write race conditions. - :param asynchronous_parallelism: whether to attempt to add asynchronocity - to the parallel sections. :param uniform_intrinsics_only: if True it prevent offloading loops with non-reproducible device intrinsics. ''' - nemo_v4 = os.environ.get('NEMOV4', False) - if schedule.name == "ts_wgt": - return # TODO #2937 WaW dependency incorrectly considered private # Add the parallel directives in each loop for loop in schedule.walk(Loop): if loop.ancestor(Directive): continue # Skip if an outer loop is already parallelised opts = {"collapse": collapse, "privatise_arrays": privatise_arrays, - "verbose": True, "nowait": asynchronous_parallelism} + "verbose": True, "nowait": False} if uniform_intrinsics_only: opts["device_string"] = "nvfortran-uniform" @@ -527,51 +443,33 @@ def insert_explicit_loop_parallelism( "and is not the inner loop") continue - if nemo_v4: - # Skip if it is an array operation loop on an ice routine if along - # the third dim or higher or if the loop nests a loop over ice - # points (npti) or if the loop and array dims do not match. - # In addition, they often nest ice linearised loops (npti) - # which we'd rather parallelise - if ('ice' in routine_name - and isinstance(loop.stop_expr, IntrinsicCall) - and (loop.stop_expr.intrinsic in ( - IntrinsicCall.Intrinsic.UBOUND, - IntrinsicCall.Intrinsic.SIZE)) - and (len(loop.walk(Loop)) > 2 - or any(ref.symbol.name in ('npti',) - for lp in loop.loop_body.walk(Loop) - for ref in lp.stop_expr.walk(Reference)) - or (str(len(loop.walk(Loop))) != - loop.stop_expr.arguments[1].value))): - loop.append_preceding_comment( - "PSyclone: ICE Loop not parallelised for performance" - "reasons") - continue - - # Skip if looping over ice categories, ice or snow layers as these - # have small trip counts if they are not collapsed - if not collapse and any( - ref.symbol.name in ('jpl', 'nlay_i', 'nlay_s') - for ref in loop.stop_expr.walk(Reference) - ): - loop.append_preceding_comment( - "PSyclone: Loop not parallelised because stops at 'jpl'," - " 'nlay_i' or 'nlay_s' and is not collapsed.") - continue + # Skip if it is an array operation loop on an ice routine if along the + # third dim or higher or if the loop nests a loop over ice points + # (npti) or if the loop and array dims do not match. + # In addition, they often nest ice linearised loops (npti) + # which we'd rather parallelise + if ('ice' in routine_name + and isinstance(loop.stop_expr, IntrinsicCall) + and (loop.stop_expr.intrinsic in (IntrinsicCall.Intrinsic.UBOUND, + IntrinsicCall.Intrinsic.SIZE)) + and (len(loop.walk(Loop)) > 2 + or any(ref.symbol.name in ('npti',) + for lp in loop.loop_body.walk(Loop) + for ref in lp.stop_expr.walk(Reference)) + or (str(len(loop.walk(Loop))) != + loop.stop_expr.arguments[1].value))): + loop.append_preceding_comment( + "PSyclone: ICE Loop not parallelised for performance reasons") + continue - else: - # In NEMOv5 add the necessary explicit private symbols in icethd - # in order to parallelise the outer loop - if routine_name == "ice_thd_zdf_BL99": - if isinstance(loop.stop_expr, Reference): - if loop.stop_expr.symbol.name == "npti": - for variable in ['zdiagbis', 'zindtbis', 'zindterm', - 'ztib', 'ztrid', 'ztsb']: - st = loop.scope.symbol_table - sym = st.lookup(variable, otherwise=None) - if sym is not None: - loop.explicitly_private_symbols.add(sym) + # Skip if looping over ice categories, ice or snow layers + # as these have only 5, 4, and 1 iterations, respectively + if (any(ref.symbol.name in ('jpl', 'nlay_i', 'nlay_s') + for ref in loop.stop_expr.walk(Reference))): + loop.append_preceding_comment( + "PSyclone: Loop not parallelised because stops at 'jpl'," + " 'nlay_i' or 'nlay_s'.") + continue try: # First check that the region_directive is feasible for this region @@ -590,12 +488,6 @@ def insert_explicit_loop_parallelism( # associted to the loop in the generated output. continue - # If we are adding asynchronous parallelism then we now try to minimise - # the number of barriers. - if asynchronous_parallelism: - minsync_trans = OMPMinimiseSyncTrans() - minsync_trans.apply(schedule) - def add_profiling(children: Union[List[Node], Schedule]): ''' From 7d651cd1f97d1997e86a1d3b91ba3084122f39a3 Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 11 Sep 2025 15:52:41 +0100 Subject: [PATCH 02/27] Adding workflow for NEMOv5 --- .github/workflows/nemo_v5_tests.yml | 43 +++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index a132271974..7a56824712 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -468,6 +468,42 @@ jobs: diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca1.nvhpc.10steps run.stat export TIME_sec=$(grep "local MPI proces" timing.output | head -n 1 | awk '{print $5}' | tr -d s) + + + - name: NEMO 5.0 nvidia OpenACC for GPUs (BENCH - managed memory) + run: | + # Set up environment + source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh + spack unload && spack load nemo-build-environment%nvhpc@${NVFORTRAN_VERSION} + source .runner_venv/bin/activate + export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts + export PSYCLONE_HOME=${PWD}/.runner_venv + export NEMO_DIR=${HOME}/${NEMODIR_NAME} + export TEST_DIR=BENCH_ACC_OFFLOAD_NVHPC + + # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS + # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. + cd $NEMO_DIR + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc -gpu=mem:managed,math_uniform" + + # Clean up and compile + # Without key_mpi_off it fails to compile (even without psyclone) + rm -rf tests/${TEST_DIR} + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernel_trans.py \ + add_key "key_mpi_off key_nosignedzero" -j 4 -v 1 + + # Run test + cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 + cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg + ./nemo + # tail run.stat + diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.10steps run.stat + export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) + + + + upload_if_on_mirror: if: ${{ github.repository == 'stfc/PSyclone-mirror' }} runs-on: ubuntu-latest @@ -531,4 +567,11 @@ jobs: compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", elapsed_time: '"${{needs.run_if_on_mirror.outputs.orca2_nvfortran_omp_offload_async}}"', '"$COMMON_FIELDS"' + }, + { + ci_test: "NEMOv5 OpenACC for GPU (BENCH)", + nemo_version: "NEMOv5", + compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", + date: new Date(), + elapsed_time: '"${TIME_sec}"'' }])' From 796cf5baaecb63584494e7a406a73005dfe0d238 Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 11 Sep 2025 16:22:44 +0100 Subject: [PATCH 03/27] flake8 compatibility --- .github/workflows/nemo_v5_tests.yml | 12 ++++++------ examples/nemo/scripts/utils.py | 14 +++++--------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 7a56824712..269cbf73f5 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -568,10 +568,10 @@ jobs: elapsed_time: '"${{needs.run_if_on_mirror.outputs.orca2_nvfortran_omp_offload_async}}"', '"$COMMON_FIELDS"' }, - { - ci_test: "NEMOv5 OpenACC for GPU (BENCH)", - nemo_version: "NEMOv5", - compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", - date: new Date(), - elapsed_time: '"${TIME_sec}"'' + { + ci_test: "NEMOv5 OpenACC for GPU (BENCH)", + nemo_version: "NEMOv5", system: "GlaDos", + compiler:"nvhpc-24.5" , + date: new Date(), + elapsed_time: '"${TIME_sec}"' }])' diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 0fb7647807..cfe608b5bf 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -41,7 +41,7 @@ from psyclone.psyir.nodes import ( Assignment, Loop, Directive, Node, Reference, CodeBlock, ArrayReference, - Call, Return, IfBlock, Routine, Schedule, IntrinsicCall ) + Call, Return, IfBlock, Routine, Schedule, IntrinsicCall) from psyclone.psyir.symbols import ( DataSymbol, INTEGER_TYPE, ScalarType, RoutineSymbol) @@ -63,8 +63,8 @@ "icbdia.f90", "icbini.f90", "icbstp.f90", "iom.f90", "iom_nf90.f90", "obs_grid.f90", "obs_averg_h2d.f90", "obs_profiles_def.f90", "obs_types.f90", "obs_read_prof.f90", "obs_write.f90", "tide_mod.f90", - "zdfosm.f90", "obs_read_surf.f90","dynldf_lev.f90", "ldftra.f90", "tramle.f90", - "trcsink.f90", "p4zpoc.f90" + "zdfosm.f90", "obs_read_surf.f90", "dynldf_lev.f90", "ldftra.f90", + "tramle.f90", "trcsink.f90", "p4zpoc.f90" ] # If routine names contain these substrings then we do not profile them @@ -80,11 +80,7 @@ # function calls if the symbol is imported from some other module. # We therefore work-around this by keeping a list of known NEMO functions # from v4 and v5. -DEBUGCHECKSUM_IGNORE = [ ] -#DEBUGCHECKSUM_IGNORE = ['ldf_slp_init', 'dyn_spg_ts_init', -# 'lbc_lnk_pt2pt_dp', 'dyn_vor_init', 'tke_tke', -# 'p4z_fechem', 'p4z_micro', 'p4z_meso', 'tra_mle_trp_MLF', -# 'tra_adv', "p4z_lys", "tra_adv_fct"] +DEBUGCHECKSUM_IGNORE = [] NEMO_FUNCTIONS = [ # Internal funtions can be obtained with: # $ grep -rhi "end function" src/ | awk '{print $3}' | uniq | sort @@ -151,7 +147,7 @@ 'visc_air', 'visc_air_sclr', 'visc_air_vctr', 'w1', 'w2', 'z0_from_Cd', 'z0tq_LKB', 'zdf_gls_alloc', 'zdf_iwm_alloc', 'zdf_mfc_alloc', 'zdf_mxl_alloc', 'zdf_oce_alloc', 'zdf_osm_alloc', 'zdf_phy_alloc', - 'zdf_tke_alloc', 'zdf_tmx_alloc','dynldf_lev_lap', 'ldf_eiv_trp_t', + 'zdf_tke_alloc', 'zdf_tmx_alloc', 'dynldf_lev_lap', 'ldf_eiv_trp_t', ] # Currently fparser has no way of distinguishing array accesses from statement From 9f05a8aacf89c3b77078bbced3a6428779b9e723 Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 11 Sep 2025 20:07:05 +0100 Subject: [PATCH 04/27] flake8 compatibility --- examples/nemo/scripts/acc_kernels_trans.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index 9803e68810..e03685bf79 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -302,12 +302,10 @@ def valid_acc_kernel(node): if child.walk(Loop): loop_count += 1 if loop_count > 1: - log_msg( - routine_name, - "Loop over levels contains several " - "other loops", - enode, - ) + msg = "Loop over levels contains several \ + other loops" + if msg not in enode.preceding_comment: + enode.append_preceding_comment(msg) return False return True From b2ffdbded016e00c6553b515a79e0d3959e296da Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 11 Sep 2025 15:20:08 +0100 Subject: [PATCH 05/27] Acc kernel and utils file --- examples/nemo/scripts/acc_kernels_trans.py | 231 +++++++++++++-------- examples/nemo/scripts/utils.py | 194 ++++------------- 2 files changed, 188 insertions(+), 237 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index 2466733e29..9803e68810 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -33,7 +33,7 @@ # ----------------------------------------------------------------------------- # Authors: R. W. Ford, A. R. Porter, N. Nobre and S. Siso, STFC Daresbury Lab -'''A transformation script that seeks to apply OpenACC KERNELS and optionally, +"""A transformation script that seeks to apply OpenACC KERNELS and optionally, OpenACC DATA directives to NEMO style code. In order to use it you must first install PSyclone. See README.md in the top-level directory. @@ -54,28 +54,49 @@ the process of attempting to create the largest possible Kernel region. Tested with the NVIDIA HPC SDK version 23.7. -''' +""" import logging -from utils import (add_profiling, enhance_tree_information, inline_calls, - NOT_PERFORMANT, NEMO_MODULES_TO_IMPORT) +from utils import ( + add_profiling, + enhance_tree_information, + NOT_PERFORMANT, + NEMO_MODULES_TO_IMPORT, +) from psyclone.errors import InternalError from psyclone.psyGen import TransInfo from psyclone.psyir.nodes import ( - IfBlock, ArrayReference, Assignment, BinaryOperation, Loop, Routine, - Literal, ACCLoopDirective) -from psyclone.psyir.transformations import (ACCKernelsTrans, ACCUpdateTrans, - TransformationError, ProfileTrans) + IfBlock, + ArrayReference, + Assignment, + BinaryOperation, + Loop, + Routine, + Literal, + ACCLoopDirective, + IntrinsicCall, +) +from psyclone.psyir.transformations import ( + ACCKernelsTrans, + ACCUpdateTrans, + TransformationError, + ProfileTrans, + DebugChecksumTrans, +) from psyclone.transformations import ACCEnterDataTrans +# from psyclone.psyir.symbols import DataSymbol, DataTypeSymbol, ArrayType + # Set up some loop_type inference rules in order to reference useful domain # loop constructs by name -Loop.set_loop_type_inference_rules({ +Loop.set_loop_type_inference_rules( + { "lon": {"variable": "ji"}, "lat": {"variable": "jj"}, "levels": {"variable": "jk"}, - "tracers": {"variable": "jt"} -}) + "tracers": {"variable": "jt"}, + } +) # Whether to chase the imported modules to improve symbol information (it can # also be a list of module filenames to limit the chasing to only specific @@ -86,15 +107,16 @@ # Get the PSyclone transformations we will use ACC_KERN_TRANS = ACCKernelsTrans() -ACC_LOOP_TRANS = TransInfo().get_trans_name('ACCLoopTrans') -ACC_ROUTINE_TRANS = TransInfo().get_trans_name('ACCRoutineTrans') +ACC_LOOP_TRANS = TransInfo().get_trans_name("ACCLoopTrans") +ACC_ROUTINE_TRANS = TransInfo().get_trans_name("ACCRoutineTrans") ACC_EDATA_TRANS = ACCEnterDataTrans() ACC_UPDATE_TRANS = ACCUpdateTrans() PROFILE_TRANS = ProfileTrans() +CHECKSUM_TRANS = DebugChecksumTrans() # Whether or not to add profiling calls around unaccelerated regions # N.B. this can inhibit PSyclone's ability to inline! -PROFILE_NONACC = False +PROFILE_NONACC = True # Whether or not to add OpenACC enter data and update directives to explicitly # move data between host and device memory @@ -105,25 +127,39 @@ # Routines we do not attempt to add any OpenACC to (because it breaks with # the Nvidia compiler or because it just isn't worth it) -ACC_IGNORE = ["day_mth", # Just calendar operations - "obs_surf_alloc", "oce_alloc", - # Compiler fails w/ "Unsupported local variable" - # Zero performance impact since outside execution path - "copy_obfbdata", "merge_obfbdata", - "turb_ncar", # Transforming hurts performance - "iom_open", "iom_get_123d", "iom_nf90_rp0123d", - "trc_bc_ini", "p2z_ini", "p4z_ini", "sto_par_init", - "bdytide_init", "bdy_init", "bdy_segs", "sbc_cpl_init", - "asm_inc_init", "dia_obs_init"] # Str handling, init routine - - -class ExcludeSettings(): - ''' +ACC_IGNORE = [ + "day_mth", # Just calendar operations + "obs_surf_alloc", + "oce_alloc", + # Compiler fails w/ "Unsupported local variable" + # Zero performance impact since outside execution path + "copy_obfbdata", + "merge_obfbdata", + "turb_ncar", # Transforming hurts performance + "iom_open", + "iom_get_123d", + "iom_nf90_rp0123d", + "trc_bc_ini", + "p2z_ini", + "p4z_ini", + "sto_par_init", + "bdytide_init", + "bdy_init", + "bdy_segs", + "sbc_cpl_init", + "asm_inc_init", + "dia_obs_init", +] # Str handling, init routine + + +class ExcludeSettings: + """ Class to hold settings on what to exclude from OpenACC KERNELS regions. :param Optional[dict[str, bool]] settings: map of settings to override. - ''' + """ + def __init__(self, settings=None): if settings is None: settings = {} @@ -133,16 +169,18 @@ def __init__(self, settings=None): # Routines which are exceptions to the OpenACC Kernels regions exclusion rules. -EXCLUDING = {"default": ExcludeSettings(), - # Exclude for better GPU performance (requires further analysis). - "dyn_spg_ts": ExcludeSettings({"ifs_scalars": True}), - "tra_zdf_imp": ExcludeSettings({"ifs_scalars": True}), - # Exclude due to compiler bug preventing CPU multicore executions. - "dom_vvl_init": ExcludeSettings({"ifs_scalars": True})} +EXCLUDING = { + "default": ExcludeSettings(), + # Exclude for better GPU performance (requires further analysis). + "dyn_spg_ts": ExcludeSettings({"ifs_scalars": True}), + "tra_zdf_imp": ExcludeSettings({"ifs_scalars": True}), + # Exclude due to compiler bug preventing CPU multicore executions. + "dom_vvl_init": ExcludeSettings({"ifs_scalars": True}), +} def log_msg(name, msg, node): - ''' + """ Log a message indicating why a transformation could not be performed. :param str name: the name of the routine. @@ -150,7 +188,7 @@ def log_msg(name, msg, node): :param node: the PSyIR node that prevented the transformation. :type node: :py:class:`psyclone.psyir.nodes.Node` - ''' + """ # Create a str representation of the position of the problematic node # in the PSyIR tree. node_strings = [] @@ -165,7 +203,7 @@ def log_msg(name, msg, node): def valid_acc_kernel(node): - ''' + """ Whether the sub-tree that has `node` at its root is eligible to be enclosed within an OpenACC KERNELS directive. @@ -175,19 +213,20 @@ def valid_acc_kernel(node): :returns: True if the sub-tree can be enclosed in a KERNELS region. :rtype: bool - ''' + """ # The Fortran routine which our parent represents routine_name = node.ancestor(Routine).name try: # Since we do this check on a node-by-node basis, we disable the # check that the 'region' contains a loop. - ACC_KERN_TRANS.validate(node, options={"disable_loop_check": - True}) + ACC_KERN_TRANS.validate(node, options={"disable_loop_check": True}) except TransformationError as err: - log_msg(routine_name, - f"Node rejected by ACCKernelTrans.validate: " - f"{err.value}", node) + log_msg( + routine_name, + f"Node rejected by ACCKernelTrans.validate: " f"{err.value}", + node, + ) return False # Allow for per-routine setting of what to exclude from within KERNELS @@ -197,24 +236,33 @@ def valid_acc_kernel(node): # Rather than walk the tree multiple times, look for both excluded node # types and possibly problematic operations - excluded_types = (IfBlock, Loop) + excluded_types = (IfBlock, Loop, ArrayReference, IntrinsicCall) excluded_nodes = node.walk(excluded_types) for enode in excluded_nodes: + if isinstance(enode, IntrinsicCall): + if "dim" in enode.argument_names: + return False if isinstance(enode, IfBlock): # We permit IF blocks originating from WHERE constructs and # single-statement IF blocks containing a Loop in KERNELS regions - if "was_where" in enode.annotations or \ - "was_single_stmt" in enode.annotations and enode.walk(Loop): + if ( + "was_where" in enode.annotations + or "was_single_stmt" in enode.annotations + and enode.walk(Loop) + ): continue arrays = enode.condition.walk(ArrayReference) # We exclude if statements where the condition expression does # not refer to arrays at all as this may cause compiler issues # (get "Missing branch target block") or produce faster code. - if not arrays and excluding.ifs_scalars and \ - not isinstance(enode.condition, BinaryOperation): + if ( + not arrays + and excluding.ifs_scalars + and not isinstance(enode.condition, BinaryOperation) + ): log_msg(routine_name, "IF references scalars", enode) return False # When using CUDA Unified Memory, only allocated arrays reside in @@ -224,8 +272,10 @@ def valid_acc_kernel(node): # arrays are often static in NEMO. Hence, we disallow IFs where the # logical expression involves the latter. if any(len(array.children) == 1 for array in arrays): - log_msg(routine_name, - "IF references 1D arrays that may be static", enode) + log_msg( + routine_name, + "IF references 1D arrays that may be static", enode + ) return False elif isinstance(enode, Loop): @@ -236,13 +286,15 @@ def valid_acc_kernel(node): # In general, this heuristic will depend upon how many levels the # model configuration will contain. child = enode.loop_body[0] if enode.loop_body.children else None - if isinstance(child, Loop) and child.loop_type == "levels": - # We have a loop around a loop over levels - log_msg(routine_name, "Loop is around a loop over levels", - enode) - return False - if enode.loop_type == "levels" and \ - len(enode.loop_body.children) > 1: + # if isinstance(child, Loop) and child.loop_type == "levels": + # We have a loop around a loop over levels + # log_msg(routine_name, "Loop is around a loop over levels", + # enode) + # return False + if ( + enode.loop_type == "levels" + and len(enode.loop_body.children) > 1 + ): # The body of the loop contains more than one statement. # How many distinct loop nests are there? loop_count = 0 @@ -250,16 +302,19 @@ def valid_acc_kernel(node): if child.walk(Loop): loop_count += 1 if loop_count > 1: - log_msg(routine_name, - "Loop over levels contains several " - "other loops", enode) + log_msg( + routine_name, + "Loop over levels contains several " + "other loops", + enode, + ) return False return True def add_kernels(children): - ''' + """ Walks through the PSyIR inserting OpenACC KERNELS directives at as high a level as possible. @@ -270,7 +325,7 @@ def add_kernels(children): :returns: True if any KERNELS regions are successfully added. :rtype: bool - ''' + """ added_kernels = False if not children: return added_kernels @@ -305,7 +360,7 @@ def add_kernels(children): def try_kernels_trans(nodes): - ''' + """ Attempt to enclose the supplied list of nodes within a kernels region. If the transformation fails then the error message is reported but execution continues. @@ -316,7 +371,7 @@ def try_kernels_trans(nodes): :returns: True if the transformation was successful, False otherwise. :rtype: bool - ''' + """ # We only enclose the proposed region if it contains a loop. have_loop = False for node in nodes: @@ -330,7 +385,6 @@ def try_kernels_trans(nodes): break if not have_loop: return False - try: ACC_KERN_TRANS.apply(nodes, {"default_present": False}) @@ -344,15 +398,17 @@ def try_kernels_trans(nodes): # We put a COLLAPSE(2) clause on any perfectly-nested lat-lon # loops that have a Literal value for their step. The latter # condition is necessary to avoid compiler errors. - if (loop.variable.name == "jj" and - isinstance(loop.step_expr, Literal) and - isinstance(loop.loop_body[0], Loop) and - loop.loop_body[0].variable.name == "ji" and - isinstance(loop.loop_body[0].step_expr, Literal) and - len(loop.loop_body.children) == 1): + if ( + loop.variable.name == "jj" + and isinstance(loop.step_expr, Literal) + and isinstance(loop.loop_body[0], Loop) + and loop.loop_body[0].variable.name == "ji" + and isinstance(loop.loop_body[0].step_expr, Literal) + and len(loop.loop_body.children) == 1 + ): try: ACC_LOOP_TRANS.apply(loop, {"collapse": 2}) - except (TransformationError) as err: + except TransformationError as err: print(f"Failed to collapse lat-lon loop: {loop}") print(f"Error was: {err}") @@ -364,14 +420,16 @@ def try_kernels_trans(nodes): def trans(psyir): - '''Applies OpenACC 'kernels' directives to NEMO code. Data movement can be + """Applies OpenACC 'kernels' directives to NEMO code. Data movement can be handled manually or through CUDA's managed-memory functionality. :param psyir: the PSyIR of the provided file. :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - ''' - logging.basicConfig(filename='psyclone.log', filemode='w', - level=logging.INFO) + """ + logging.basicConfig( + filename="psyclone.log", + filemode="w", level=logging.INFO + ) for subroutine in psyir.walk(Routine): print(f"Transforming subroutine: {subroutine.name}") @@ -387,14 +445,13 @@ def trans(psyir): if subroutine.name.lower() not in ACC_IGNORE: print(f"Transforming {subroutine.name} with acc kernels") enhance_tree_information(subroutine) - inline_calls(subroutine) - have_kernels = add_kernels(subroutine.children) - if have_kernels and ACC_EXPLICIT_MEM_MANAGEMENT: - print(f"Transforming {subroutine.name} with acc enter data") - ACC_EDATA_TRANS.apply(subroutine) + # inline_calls(subroutine) + add_kernels(subroutine.children) else: - print(f"Addition of OpenACC to routine {subroutine.name} " - f"disabled!") + print( + f"Addition of OpenACC to routine {subroutine.name} " + f"disabled!" + ) # Add required OpenACC update directives to every routine, including to # those with no device code and that execute exclusively on the host @@ -404,6 +461,8 @@ def trans(psyir): # Add profiling instrumentation if PROFILE_NONACC: - print(f"Adding profiling to non-OpenACC regions in " - f"{subroutine.name}") + print( + f"Adding profiling to non-OpenACC regions in " + f"{subroutine.name}" + ) add_profiling(subroutine.children) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 70f7b4e7a1..0fb7647807 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -35,21 +35,20 @@ ''' Utilities file to parallelise Nemo code. ''' -import os from typing import List, Union from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyir.nodes import ( + Assignment, Loop, Directive, Node, Reference, CodeBlock, ArrayReference, - Call, Return, IfBlock, Routine, Schedule, IntrinsicCall, - StructureReference) + Call, Return, IfBlock, Routine, Schedule, IntrinsicCall ) + from psyclone.psyir.symbols import ( DataSymbol, INTEGER_TYPE, ScalarType, RoutineSymbol) from psyclone.psyir.transformations import ( ArrayAssignment2LoopsTrans, HoistLoopBoundExprTrans, HoistLocalArraysTrans, HoistTrans, InlineTrans, Maxval2LoopTrans, ProfileTrans, - OMPMinimiseSyncTrans, Reference2ArrayRangeTrans, - ScalarisationTrans, IncreaseRankLoopArraysTrans) + Reference2ArrayRangeTrans, ScalarisationTrans) from psyclone.transformations import TransformationError # USE statements to chase to gather additional symbol information. @@ -64,7 +63,8 @@ "icbdia.f90", "icbini.f90", "icbstp.f90", "iom.f90", "iom_nf90.f90", "obs_grid.f90", "obs_averg_h2d.f90", "obs_profiles_def.f90", "obs_types.f90", "obs_read_prof.f90", "obs_write.f90", "tide_mod.f90", - "zdfosm.f90", "obs_read_surf.f90", + "zdfosm.f90", "obs_read_surf.f90","dynldf_lev.f90", "ldftra.f90", "tramle.f90", + "trcsink.f90", "p4zpoc.f90" ] # If routine names contain these substrings then we do not profile them @@ -80,6 +80,11 @@ # function calls if the symbol is imported from some other module. # We therefore work-around this by keeping a list of known NEMO functions # from v4 and v5. +DEBUGCHECKSUM_IGNORE = [ ] +#DEBUGCHECKSUM_IGNORE = ['ldf_slp_init', 'dyn_spg_ts_init', +# 'lbc_lnk_pt2pt_dp', 'dyn_vor_init', 'tke_tke', +# 'p4z_fechem', 'p4z_micro', 'p4z_meso', 'tra_mle_trp_MLF', +# 'tra_adv', "p4z_lys", "tra_adv_fct"] NEMO_FUNCTIONS = [ # Internal funtions can be obtained with: # $ grep -rhi "end function" src/ | awk '{print $3}' | uniq | sort @@ -146,26 +151,7 @@ 'visc_air', 'visc_air_sclr', 'visc_air_vctr', 'w1', 'w2', 'z0_from_Cd', 'z0tq_LKB', 'zdf_gls_alloc', 'zdf_iwm_alloc', 'zdf_mfc_alloc', 'zdf_mxl_alloc', 'zdf_oce_alloc', 'zdf_osm_alloc', 'zdf_phy_alloc', - 'zdf_tke_alloc', 'zdf_tmx_alloc', 'itau2date', - # grep -rh "INTERFACE" src | grep -v "END" | awk '{print $2}' | uniq | sort - 'alpha_sw', 'bulk_formula', 'cp_air', 'debug', 'DECAL_FEEDBACK', - 'DECAL_FEEDBACK_2D', 'depth_to_e3', 'de_sat_dt_ice', 'dia_ar5_hst', - 'dia_ptr_hst', 'div_hor', 'dom_tile_copyin', 'dom_tile_copyout', - 'dq_sat_dt_ice', 'dyn_vor', 'e3_to_depth', 'eos', 'eos_fzp', - 'eos_rab', 'e_sat', 'e_sat_ice', 'f_h_louis', 'f_m_louis', - 'gamma_moist', 'glob_2Dmax', 'glob_2Dmin', 'glob_2Dsum', 'glob_3Dmax', - 'glob_3Dmin', 'glob_3Dsum', 'halo_mng_resize', 'icb_utl_bilin_h', - 'ice_var_itd', 'ice_var_snwblow', 'ice_var_snwfra', 'iom_get', - 'iom_getatt', 'iom_nf90_get', 'iom_put', 'iom_putatt', - 'iom_rstput', 'lbc_lnk', 'lbc_lnk_neicoll', 'lbc_lnk_pt2pt', - 'lbc_nfd', 'lbnd_ij', 'ldf_eiv_trp', 'local_2Dmax', 'local_2Dmin', - 'local_2Dsum', 'local_3Dmax', 'local_3Dmin', 'local_3Dsum', - 'L_vap', 'mpp_max', 'mpp_maxloc', 'mpp_min', 'mpp_minloc', - 'mpp_nfd', 'mpp_sum', 'pres_temp', 'prt_ctl_sum', 'ptr_mpp_sum', - 'ptr_sj', 'ptr_sum', 'qlw_net', 'q_sat', 'rho_air', 'Ri_bulk', - 'SIGN', 'sum3x3', 'theta_exner', 'tra_mle_trp', 'trd_vor_zint', - 'virt_temp', 'visc_air', 'wAimp', 'wzv', 'zdf_osm_iomput', - 'zdf_osm_velocity_rotation', + 'zdf_tke_alloc', 'zdf_tmx_alloc','dynldf_lev_lap', 'ldf_eiv_trp_t', ] # Currently fparser has no way of distinguishing array accesses from statement @@ -177,7 +163,7 @@ PARALLELISATION_ISSUES = [ "ldfc1d_c2d.f90", "tramle.f90", - "traqsr.f90", + "dynspg_ts.f90", ] PRIVATISATION_ISSUES = [ @@ -317,9 +303,8 @@ def normalise_loops( convert_array_notation: bool = True, loopify_array_intrinsics: bool = True, convert_range_loops: bool = True, - scalarise_loops: bool = False, - increase_array_ranks: bool = False, hoist_expressions: bool = True, + scalarise_loops: bool = False, ): ''' Normalise all loops in the given schedule so that they are in an appropriate form for the Parallelisation transformations to analyse @@ -334,12 +319,10 @@ def normalise_loops( operate on arrays to explicit loops (currently only maxval). :param bool convert_range_loops: whether to convert ranges to explicit loops. + :param bool hoist_expressions: whether to hoist bounds and loop invariant + statements out of the loop nest. :param scalarise_loops: whether to attempt to convert arrays to scalars where possible, default is False. - :param increase_array_ranks: whether to increase the rank of selected - arrays. - :param hoist_expressions: whether to hoist bounds and loop invariant - statements out of the loop nest. ''' if hoist_local_arrays and schedule.name not in CONTAINS_STMT_FUNCTIONS: # Apply the HoistLocalArraysTrans when possible, it cannot be applied @@ -375,8 +358,6 @@ def normalise_loops( # Convert all array implicit loops to explicit loops explicit_loops = ArrayAssignment2LoopsTrans() for assignment in schedule.walk(Assignment): - if assignment.walk(StructureReference): - continue # TODO #2951 Fix issues with structure_refs try: explicit_loops.apply(assignment) except TransformationError: @@ -392,9 +373,6 @@ def normalise_loops( for loop in loops: scalartrans.apply(loop) - if increase_array_ranks: - increase_rank_and_reorder_nemov5_loops(schedule) - if hoist_expressions: # First hoist all possible expressions for loop in schedule.walk(Loop): @@ -417,69 +395,12 @@ def normalise_loops( # top level. This would allow the collapse clause to be applied. -def increase_rank_and_reorder_nemov5_loops(routine: Routine): - ''' This method increases the rank of temporary arrays used inside selected - loops (in order to parallelise the outer loop without overlapping them) - and then rearranges the outer loop next to the inner ones (in order to - collapse them), so that more parallelism can be leverage. This is useful - in GPU contexts, but it increases the memory footprint and may not be - beneficial for caching-architectures. - - :param routine: the target routine. - - ''' - irlatrans = IncreaseRankLoopArraysTrans() - - # Map of routines and arrays - selection = { - "dyn_zdf": ['zwd', 'zwi', 'zws'], - "tra_zdf_imp": ['zwd', 'zwi', 'zws', 'zwt'] - } - - if routine.name not in selection: - return - - for outer_loop in routine.walk(Loop, stop_type=Loop): - if outer_loop.variable.name == "jj": - # Increase the rank of the temporary arrays in this loop - irlatrans.apply(outer_loop, arrays=selection[routine.name]) - # Now reorder the code - for child in outer_loop.loop_body[:]: - # Move the contents of the jj loop outside it - outer_loop.parent.addchild(child.detach(), - index=outer_loop.position) - # Add a new jj loop around each inner loop that is not 'jn' - target_loop = [] - for inner_loop in child.walk(Loop, stop_type=Loop): - if inner_loop.variable.name != "jn": - target_loop.append(inner_loop) - else: - for next_loop in inner_loop.loop_body.walk( - Loop, stop_type=Loop): - target_loop.append(next_loop) - for inner_loop in target_loop: - if isinstance(inner_loop.loop_body[0], Loop): - inner_loop = inner_loop.loop_body[0] - inner_loop.replace_with( - Loop.create( - outer_loop.variable, - outer_loop.start_expr.copy(), - outer_loop.stop_expr.copy(), - outer_loop.step_expr.copy(), - children=[inner_loop.copy()] - ) - ) - # Remove the now empty jj loop - outer_loop.detach() - - def insert_explicit_loop_parallelism( schedule, region_directive_trans=None, loop_directive_trans=None, collapse: bool = True, privatise_arrays: bool = False, - asynchronous_parallelism: bool = False, uniform_intrinsics_only: bool = False, ): ''' For each loop in the schedule that doesn't already have a Directive @@ -499,22 +420,17 @@ def insert_explicit_loop_parallelism( many nested loops as possible. :param privatise_arrays: whether to attempt to privatise arrays that cause write-write race conditions. - :param asynchronous_parallelism: whether to attempt to add asynchronocity - to the parallel sections. :param uniform_intrinsics_only: if True it prevent offloading loops with non-reproducible device intrinsics. ''' - nemo_v4 = os.environ.get('NEMOV4', False) - if schedule.name == "ts_wgt": - return # TODO #2937 WaW dependency incorrectly considered private # Add the parallel directives in each loop for loop in schedule.walk(Loop): if loop.ancestor(Directive): continue # Skip if an outer loop is already parallelised opts = {"collapse": collapse, "privatise_arrays": privatise_arrays, - "verbose": True, "nowait": asynchronous_parallelism} + "verbose": True, "nowait": False} if uniform_intrinsics_only: opts["device_string"] = "nvfortran-uniform" @@ -527,51 +443,33 @@ def insert_explicit_loop_parallelism( "and is not the inner loop") continue - if nemo_v4: - # Skip if it is an array operation loop on an ice routine if along - # the third dim or higher or if the loop nests a loop over ice - # points (npti) or if the loop and array dims do not match. - # In addition, they often nest ice linearised loops (npti) - # which we'd rather parallelise - if ('ice' in routine_name - and isinstance(loop.stop_expr, IntrinsicCall) - and (loop.stop_expr.intrinsic in ( - IntrinsicCall.Intrinsic.UBOUND, - IntrinsicCall.Intrinsic.SIZE)) - and (len(loop.walk(Loop)) > 2 - or any(ref.symbol.name in ('npti',) - for lp in loop.loop_body.walk(Loop) - for ref in lp.stop_expr.walk(Reference)) - or (str(len(loop.walk(Loop))) != - loop.stop_expr.arguments[1].value))): - loop.append_preceding_comment( - "PSyclone: ICE Loop not parallelised for performance" - "reasons") - continue - - # Skip if looping over ice categories, ice or snow layers as these - # have small trip counts if they are not collapsed - if not collapse and any( - ref.symbol.name in ('jpl', 'nlay_i', 'nlay_s') - for ref in loop.stop_expr.walk(Reference) - ): - loop.append_preceding_comment( - "PSyclone: Loop not parallelised because stops at 'jpl'," - " 'nlay_i' or 'nlay_s' and is not collapsed.") - continue + # Skip if it is an array operation loop on an ice routine if along the + # third dim or higher or if the loop nests a loop over ice points + # (npti) or if the loop and array dims do not match. + # In addition, they often nest ice linearised loops (npti) + # which we'd rather parallelise + if ('ice' in routine_name + and isinstance(loop.stop_expr, IntrinsicCall) + and (loop.stop_expr.intrinsic in (IntrinsicCall.Intrinsic.UBOUND, + IntrinsicCall.Intrinsic.SIZE)) + and (len(loop.walk(Loop)) > 2 + or any(ref.symbol.name in ('npti',) + for lp in loop.loop_body.walk(Loop) + for ref in lp.stop_expr.walk(Reference)) + or (str(len(loop.walk(Loop))) != + loop.stop_expr.arguments[1].value))): + loop.append_preceding_comment( + "PSyclone: ICE Loop not parallelised for performance reasons") + continue - else: - # In NEMOv5 add the necessary explicit private symbols in icethd - # in order to parallelise the outer loop - if routine_name == "ice_thd_zdf_BL99": - if isinstance(loop.stop_expr, Reference): - if loop.stop_expr.symbol.name == "npti": - for variable in ['zdiagbis', 'zindtbis', 'zindterm', - 'ztib', 'ztrid', 'ztsb']: - st = loop.scope.symbol_table - sym = st.lookup(variable, otherwise=None) - if sym is not None: - loop.explicitly_private_symbols.add(sym) + # Skip if looping over ice categories, ice or snow layers + # as these have only 5, 4, and 1 iterations, respectively + if (any(ref.symbol.name in ('jpl', 'nlay_i', 'nlay_s') + for ref in loop.stop_expr.walk(Reference))): + loop.append_preceding_comment( + "PSyclone: Loop not parallelised because stops at 'jpl'," + " 'nlay_i' or 'nlay_s'.") + continue try: # First check that the region_directive is feasible for this region @@ -590,12 +488,6 @@ def insert_explicit_loop_parallelism( # associted to the loop in the generated output. continue - # If we are adding asynchronous parallelism then we now try to minimise - # the number of barriers. - if asynchronous_parallelism: - minsync_trans = OMPMinimiseSyncTrans() - minsync_trans.apply(schedule) - def add_profiling(children: Union[List[Node], Schedule]): ''' From 2eacd9da92d066fc1d16fb8cccb84f391f275604 Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 11 Sep 2025 15:52:41 +0100 Subject: [PATCH 06/27] Adding workflow for NEMOv5 --- .github/workflows/nemo_v5_tests.yml | 43 +++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index a132271974..7a56824712 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -468,6 +468,42 @@ jobs: diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca1.nvhpc.10steps run.stat export TIME_sec=$(grep "local MPI proces" timing.output | head -n 1 | awk '{print $5}' | tr -d s) + + + - name: NEMO 5.0 nvidia OpenACC for GPUs (BENCH - managed memory) + run: | + # Set up environment + source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh + spack unload && spack load nemo-build-environment%nvhpc@${NVFORTRAN_VERSION} + source .runner_venv/bin/activate + export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts + export PSYCLONE_HOME=${PWD}/.runner_venv + export NEMO_DIR=${HOME}/${NEMODIR_NAME} + export TEST_DIR=BENCH_ACC_OFFLOAD_NVHPC + + # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS + # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. + cd $NEMO_DIR + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc -gpu=mem:managed,math_uniform" + + # Clean up and compile + # Without key_mpi_off it fails to compile (even without psyclone) + rm -rf tests/${TEST_DIR} + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernel_trans.py \ + add_key "key_mpi_off key_nosignedzero" -j 4 -v 1 + + # Run test + cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 + cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg + ./nemo + # tail run.stat + diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.10steps run.stat + export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) + + + + upload_if_on_mirror: if: ${{ github.repository == 'stfc/PSyclone-mirror' }} runs-on: ubuntu-latest @@ -531,4 +567,11 @@ jobs: compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", elapsed_time: '"${{needs.run_if_on_mirror.outputs.orca2_nvfortran_omp_offload_async}}"', '"$COMMON_FIELDS"' + }, + { + ci_test: "NEMOv5 OpenACC for GPU (BENCH)", + nemo_version: "NEMOv5", + compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", + date: new Date(), + elapsed_time: '"${TIME_sec}"'' }])' From cc8e919ccb7ff3a2249b092045b71caf39e0ca6e Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 11 Sep 2025 16:22:44 +0100 Subject: [PATCH 07/27] flake8 compatibility --- .github/workflows/nemo_v5_tests.yml | 12 ++++++------ examples/nemo/scripts/utils.py | 14 +++++--------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 7a56824712..269cbf73f5 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -568,10 +568,10 @@ jobs: elapsed_time: '"${{needs.run_if_on_mirror.outputs.orca2_nvfortran_omp_offload_async}}"', '"$COMMON_FIELDS"' }, - { - ci_test: "NEMOv5 OpenACC for GPU (BENCH)", - nemo_version: "NEMOv5", - compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", - date: new Date(), - elapsed_time: '"${TIME_sec}"'' + { + ci_test: "NEMOv5 OpenACC for GPU (BENCH)", + nemo_version: "NEMOv5", system: "GlaDos", + compiler:"nvhpc-24.5" , + date: new Date(), + elapsed_time: '"${TIME_sec}"' }])' diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 0fb7647807..cfe608b5bf 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -41,7 +41,7 @@ from psyclone.psyir.nodes import ( Assignment, Loop, Directive, Node, Reference, CodeBlock, ArrayReference, - Call, Return, IfBlock, Routine, Schedule, IntrinsicCall ) + Call, Return, IfBlock, Routine, Schedule, IntrinsicCall) from psyclone.psyir.symbols import ( DataSymbol, INTEGER_TYPE, ScalarType, RoutineSymbol) @@ -63,8 +63,8 @@ "icbdia.f90", "icbini.f90", "icbstp.f90", "iom.f90", "iom_nf90.f90", "obs_grid.f90", "obs_averg_h2d.f90", "obs_profiles_def.f90", "obs_types.f90", "obs_read_prof.f90", "obs_write.f90", "tide_mod.f90", - "zdfosm.f90", "obs_read_surf.f90","dynldf_lev.f90", "ldftra.f90", "tramle.f90", - "trcsink.f90", "p4zpoc.f90" + "zdfosm.f90", "obs_read_surf.f90", "dynldf_lev.f90", "ldftra.f90", + "tramle.f90", "trcsink.f90", "p4zpoc.f90" ] # If routine names contain these substrings then we do not profile them @@ -80,11 +80,7 @@ # function calls if the symbol is imported from some other module. # We therefore work-around this by keeping a list of known NEMO functions # from v4 and v5. -DEBUGCHECKSUM_IGNORE = [ ] -#DEBUGCHECKSUM_IGNORE = ['ldf_slp_init', 'dyn_spg_ts_init', -# 'lbc_lnk_pt2pt_dp', 'dyn_vor_init', 'tke_tke', -# 'p4z_fechem', 'p4z_micro', 'p4z_meso', 'tra_mle_trp_MLF', -# 'tra_adv', "p4z_lys", "tra_adv_fct"] +DEBUGCHECKSUM_IGNORE = [] NEMO_FUNCTIONS = [ # Internal funtions can be obtained with: # $ grep -rhi "end function" src/ | awk '{print $3}' | uniq | sort @@ -151,7 +147,7 @@ 'visc_air', 'visc_air_sclr', 'visc_air_vctr', 'w1', 'w2', 'z0_from_Cd', 'z0tq_LKB', 'zdf_gls_alloc', 'zdf_iwm_alloc', 'zdf_mfc_alloc', 'zdf_mxl_alloc', 'zdf_oce_alloc', 'zdf_osm_alloc', 'zdf_phy_alloc', - 'zdf_tke_alloc', 'zdf_tmx_alloc','dynldf_lev_lap', 'ldf_eiv_trp_t', + 'zdf_tke_alloc', 'zdf_tmx_alloc', 'dynldf_lev_lap', 'ldf_eiv_trp_t', ] # Currently fparser has no way of distinguishing array accesses from statement From 0e1c47731369f2c5d43906ad5aadfedf40f355aa Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 11 Sep 2025 20:07:05 +0100 Subject: [PATCH 08/27] flake8 compatibility --- examples/nemo/scripts/acc_kernels_trans.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index 9803e68810..e03685bf79 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -302,12 +302,10 @@ def valid_acc_kernel(node): if child.walk(Loop): loop_count += 1 if loop_count > 1: - log_msg( - routine_name, - "Loop over levels contains several " - "other loops", - enode, - ) + msg = "Loop over levels contains several \ + other loops" + if msg not in enode.preceding_comment: + enode.append_preceding_comment(msg) return False return True From 526a3f6c4c0aaa9c899a5c34f27df7d3d9f54f2f Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Wed, 24 Sep 2025 11:11:58 +0100 Subject: [PATCH 09/27] Adding logs as comment --- examples/nemo/scripts/acc_kernels_trans.py | 32 ++-- examples/nemo/scripts/utils.py | 190 ++++++++++++++++----- external/dl_esm_inf | 2 +- 3 files changed, 170 insertions(+), 54 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index e03685bf79..f581db334c 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -75,6 +75,7 @@ Literal, ACCLoopDirective, IntrinsicCall, + Statement, ) from psyclone.psyir.transformations import ( ACCKernelsTrans, @@ -222,11 +223,10 @@ def valid_acc_kernel(node): # check that the 'region' contains a loop. ACC_KERN_TRANS.validate(node, options={"disable_loop_check": True}) except TransformationError as err: - log_msg( - routine_name, - f"Node rejected by ACCKernelTrans.validate: " f"{err.value}", - node, - ) + msg = f"Node rejected by ACCKernelTrans.validate: {err.value}" + parent_stmt = node.ancestor(Statement, include_self=True) + if msg not in parent_stmt.preceding_comment: + parent_stmt.append_preceding_comment(msg) return False # Allow for per-routine setting of what to exclude from within KERNELS @@ -263,7 +263,9 @@ def valid_acc_kernel(node): and excluding.ifs_scalars and not isinstance(enode.condition, BinaryOperation) ): - log_msg(routine_name, "IF references scalars", enode) + msg = "IF references scalars" + if msg not in enode.preceding_comment: + enode.append_preceding_comment(msg) return False # When using CUDA Unified Memory, only allocated arrays reside in # shared memory (including those that are created by compiler- @@ -272,10 +274,9 @@ def valid_acc_kernel(node): # arrays are often static in NEMO. Hence, we disallow IFs where the # logical expression involves the latter. if any(len(array.children) == 1 for array in arrays): - log_msg( - routine_name, - "IF references 1D arrays that may be static", enode - ) + msg = "IF references 1D arrays that may be static" + if msg not in enode.preceding_comment: + enode.append_preceding_comment(msg) return False elif isinstance(enode, Loop): @@ -407,13 +408,16 @@ def try_kernels_trans(nodes): try: ACC_LOOP_TRANS.apply(loop, {"collapse": 2}) except TransformationError as err: - print(f"Failed to collapse lat-lon loop: {loop}") - print(f"Error was: {err}") + msg = f"Failed to collapse acc lat-lon \ + loop \n Error was: {err}" + if msg not in loop.preceding_comment: + loop.append_preceding_comment(msg) return True except (TransformationError, InternalError) as err: - print(f"Failed to insert acc kernels around nodes: {nodes}") - print(f"Error was: {err}") + msg = f"Failed to insert acc kernels around nodes \n Error was: {err}" + if msg not in node.preceding_comment: + node.append_preceding_comment(msg) return False diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index cfe608b5bf..70f7b4e7a1 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -35,20 +35,21 @@ ''' Utilities file to parallelise Nemo code. ''' +import os from typing import List, Union from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyir.nodes import ( - Assignment, Loop, Directive, Node, Reference, CodeBlock, ArrayReference, - Call, Return, IfBlock, Routine, Schedule, IntrinsicCall) - + Call, Return, IfBlock, Routine, Schedule, IntrinsicCall, + StructureReference) from psyclone.psyir.symbols import ( DataSymbol, INTEGER_TYPE, ScalarType, RoutineSymbol) from psyclone.psyir.transformations import ( ArrayAssignment2LoopsTrans, HoistLoopBoundExprTrans, HoistLocalArraysTrans, HoistTrans, InlineTrans, Maxval2LoopTrans, ProfileTrans, - Reference2ArrayRangeTrans, ScalarisationTrans) + OMPMinimiseSyncTrans, Reference2ArrayRangeTrans, + ScalarisationTrans, IncreaseRankLoopArraysTrans) from psyclone.transformations import TransformationError # USE statements to chase to gather additional symbol information. @@ -63,8 +64,7 @@ "icbdia.f90", "icbini.f90", "icbstp.f90", "iom.f90", "iom_nf90.f90", "obs_grid.f90", "obs_averg_h2d.f90", "obs_profiles_def.f90", "obs_types.f90", "obs_read_prof.f90", "obs_write.f90", "tide_mod.f90", - "zdfosm.f90", "obs_read_surf.f90", "dynldf_lev.f90", "ldftra.f90", - "tramle.f90", "trcsink.f90", "p4zpoc.f90" + "zdfosm.f90", "obs_read_surf.f90", ] # If routine names contain these substrings then we do not profile them @@ -80,7 +80,6 @@ # function calls if the symbol is imported from some other module. # We therefore work-around this by keeping a list of known NEMO functions # from v4 and v5. -DEBUGCHECKSUM_IGNORE = [] NEMO_FUNCTIONS = [ # Internal funtions can be obtained with: # $ grep -rhi "end function" src/ | awk '{print $3}' | uniq | sort @@ -147,7 +146,26 @@ 'visc_air', 'visc_air_sclr', 'visc_air_vctr', 'w1', 'w2', 'z0_from_Cd', 'z0tq_LKB', 'zdf_gls_alloc', 'zdf_iwm_alloc', 'zdf_mfc_alloc', 'zdf_mxl_alloc', 'zdf_oce_alloc', 'zdf_osm_alloc', 'zdf_phy_alloc', - 'zdf_tke_alloc', 'zdf_tmx_alloc', 'dynldf_lev_lap', 'ldf_eiv_trp_t', + 'zdf_tke_alloc', 'zdf_tmx_alloc', 'itau2date', + # grep -rh "INTERFACE" src | grep -v "END" | awk '{print $2}' | uniq | sort + 'alpha_sw', 'bulk_formula', 'cp_air', 'debug', 'DECAL_FEEDBACK', + 'DECAL_FEEDBACK_2D', 'depth_to_e3', 'de_sat_dt_ice', 'dia_ar5_hst', + 'dia_ptr_hst', 'div_hor', 'dom_tile_copyin', 'dom_tile_copyout', + 'dq_sat_dt_ice', 'dyn_vor', 'e3_to_depth', 'eos', 'eos_fzp', + 'eos_rab', 'e_sat', 'e_sat_ice', 'f_h_louis', 'f_m_louis', + 'gamma_moist', 'glob_2Dmax', 'glob_2Dmin', 'glob_2Dsum', 'glob_3Dmax', + 'glob_3Dmin', 'glob_3Dsum', 'halo_mng_resize', 'icb_utl_bilin_h', + 'ice_var_itd', 'ice_var_snwblow', 'ice_var_snwfra', 'iom_get', + 'iom_getatt', 'iom_nf90_get', 'iom_put', 'iom_putatt', + 'iom_rstput', 'lbc_lnk', 'lbc_lnk_neicoll', 'lbc_lnk_pt2pt', + 'lbc_nfd', 'lbnd_ij', 'ldf_eiv_trp', 'local_2Dmax', 'local_2Dmin', + 'local_2Dsum', 'local_3Dmax', 'local_3Dmin', 'local_3Dsum', + 'L_vap', 'mpp_max', 'mpp_maxloc', 'mpp_min', 'mpp_minloc', + 'mpp_nfd', 'mpp_sum', 'pres_temp', 'prt_ctl_sum', 'ptr_mpp_sum', + 'ptr_sj', 'ptr_sum', 'qlw_net', 'q_sat', 'rho_air', 'Ri_bulk', + 'SIGN', 'sum3x3', 'theta_exner', 'tra_mle_trp', 'trd_vor_zint', + 'virt_temp', 'visc_air', 'wAimp', 'wzv', 'zdf_osm_iomput', + 'zdf_osm_velocity_rotation', ] # Currently fparser has no way of distinguishing array accesses from statement @@ -159,7 +177,7 @@ PARALLELISATION_ISSUES = [ "ldfc1d_c2d.f90", "tramle.f90", - "dynspg_ts.f90", + "traqsr.f90", ] PRIVATISATION_ISSUES = [ @@ -299,8 +317,9 @@ def normalise_loops( convert_array_notation: bool = True, loopify_array_intrinsics: bool = True, convert_range_loops: bool = True, - hoist_expressions: bool = True, scalarise_loops: bool = False, + increase_array_ranks: bool = False, + hoist_expressions: bool = True, ): ''' Normalise all loops in the given schedule so that they are in an appropriate form for the Parallelisation transformations to analyse @@ -315,10 +334,12 @@ def normalise_loops( operate on arrays to explicit loops (currently only maxval). :param bool convert_range_loops: whether to convert ranges to explicit loops. - :param bool hoist_expressions: whether to hoist bounds and loop invariant - statements out of the loop nest. :param scalarise_loops: whether to attempt to convert arrays to scalars where possible, default is False. + :param increase_array_ranks: whether to increase the rank of selected + arrays. + :param hoist_expressions: whether to hoist bounds and loop invariant + statements out of the loop nest. ''' if hoist_local_arrays and schedule.name not in CONTAINS_STMT_FUNCTIONS: # Apply the HoistLocalArraysTrans when possible, it cannot be applied @@ -354,6 +375,8 @@ def normalise_loops( # Convert all array implicit loops to explicit loops explicit_loops = ArrayAssignment2LoopsTrans() for assignment in schedule.walk(Assignment): + if assignment.walk(StructureReference): + continue # TODO #2951 Fix issues with structure_refs try: explicit_loops.apply(assignment) except TransformationError: @@ -369,6 +392,9 @@ def normalise_loops( for loop in loops: scalartrans.apply(loop) + if increase_array_ranks: + increase_rank_and_reorder_nemov5_loops(schedule) + if hoist_expressions: # First hoist all possible expressions for loop in schedule.walk(Loop): @@ -391,12 +417,69 @@ def normalise_loops( # top level. This would allow the collapse clause to be applied. +def increase_rank_and_reorder_nemov5_loops(routine: Routine): + ''' This method increases the rank of temporary arrays used inside selected + loops (in order to parallelise the outer loop without overlapping them) + and then rearranges the outer loop next to the inner ones (in order to + collapse them), so that more parallelism can be leverage. This is useful + in GPU contexts, but it increases the memory footprint and may not be + beneficial for caching-architectures. + + :param routine: the target routine. + + ''' + irlatrans = IncreaseRankLoopArraysTrans() + + # Map of routines and arrays + selection = { + "dyn_zdf": ['zwd', 'zwi', 'zws'], + "tra_zdf_imp": ['zwd', 'zwi', 'zws', 'zwt'] + } + + if routine.name not in selection: + return + + for outer_loop in routine.walk(Loop, stop_type=Loop): + if outer_loop.variable.name == "jj": + # Increase the rank of the temporary arrays in this loop + irlatrans.apply(outer_loop, arrays=selection[routine.name]) + # Now reorder the code + for child in outer_loop.loop_body[:]: + # Move the contents of the jj loop outside it + outer_loop.parent.addchild(child.detach(), + index=outer_loop.position) + # Add a new jj loop around each inner loop that is not 'jn' + target_loop = [] + for inner_loop in child.walk(Loop, stop_type=Loop): + if inner_loop.variable.name != "jn": + target_loop.append(inner_loop) + else: + for next_loop in inner_loop.loop_body.walk( + Loop, stop_type=Loop): + target_loop.append(next_loop) + for inner_loop in target_loop: + if isinstance(inner_loop.loop_body[0], Loop): + inner_loop = inner_loop.loop_body[0] + inner_loop.replace_with( + Loop.create( + outer_loop.variable, + outer_loop.start_expr.copy(), + outer_loop.stop_expr.copy(), + outer_loop.step_expr.copy(), + children=[inner_loop.copy()] + ) + ) + # Remove the now empty jj loop + outer_loop.detach() + + def insert_explicit_loop_parallelism( schedule, region_directive_trans=None, loop_directive_trans=None, collapse: bool = True, privatise_arrays: bool = False, + asynchronous_parallelism: bool = False, uniform_intrinsics_only: bool = False, ): ''' For each loop in the schedule that doesn't already have a Directive @@ -416,17 +499,22 @@ def insert_explicit_loop_parallelism( many nested loops as possible. :param privatise_arrays: whether to attempt to privatise arrays that cause write-write race conditions. + :param asynchronous_parallelism: whether to attempt to add asynchronocity + to the parallel sections. :param uniform_intrinsics_only: if True it prevent offloading loops with non-reproducible device intrinsics. ''' + nemo_v4 = os.environ.get('NEMOV4', False) + if schedule.name == "ts_wgt": + return # TODO #2937 WaW dependency incorrectly considered private # Add the parallel directives in each loop for loop in schedule.walk(Loop): if loop.ancestor(Directive): continue # Skip if an outer loop is already parallelised opts = {"collapse": collapse, "privatise_arrays": privatise_arrays, - "verbose": True, "nowait": False} + "verbose": True, "nowait": asynchronous_parallelism} if uniform_intrinsics_only: opts["device_string"] = "nvfortran-uniform" @@ -439,33 +527,51 @@ def insert_explicit_loop_parallelism( "and is not the inner loop") continue - # Skip if it is an array operation loop on an ice routine if along the - # third dim or higher or if the loop nests a loop over ice points - # (npti) or if the loop and array dims do not match. - # In addition, they often nest ice linearised loops (npti) - # which we'd rather parallelise - if ('ice' in routine_name - and isinstance(loop.stop_expr, IntrinsicCall) - and (loop.stop_expr.intrinsic in (IntrinsicCall.Intrinsic.UBOUND, - IntrinsicCall.Intrinsic.SIZE)) - and (len(loop.walk(Loop)) > 2 - or any(ref.symbol.name in ('npti',) - for lp in loop.loop_body.walk(Loop) - for ref in lp.stop_expr.walk(Reference)) - or (str(len(loop.walk(Loop))) != - loop.stop_expr.arguments[1].value))): - loop.append_preceding_comment( - "PSyclone: ICE Loop not parallelised for performance reasons") - continue + if nemo_v4: + # Skip if it is an array operation loop on an ice routine if along + # the third dim or higher or if the loop nests a loop over ice + # points (npti) or if the loop and array dims do not match. + # In addition, they often nest ice linearised loops (npti) + # which we'd rather parallelise + if ('ice' in routine_name + and isinstance(loop.stop_expr, IntrinsicCall) + and (loop.stop_expr.intrinsic in ( + IntrinsicCall.Intrinsic.UBOUND, + IntrinsicCall.Intrinsic.SIZE)) + and (len(loop.walk(Loop)) > 2 + or any(ref.symbol.name in ('npti',) + for lp in loop.loop_body.walk(Loop) + for ref in lp.stop_expr.walk(Reference)) + or (str(len(loop.walk(Loop))) != + loop.stop_expr.arguments[1].value))): + loop.append_preceding_comment( + "PSyclone: ICE Loop not parallelised for performance" + "reasons") + continue - # Skip if looping over ice categories, ice or snow layers - # as these have only 5, 4, and 1 iterations, respectively - if (any(ref.symbol.name in ('jpl', 'nlay_i', 'nlay_s') - for ref in loop.stop_expr.walk(Reference))): - loop.append_preceding_comment( - "PSyclone: Loop not parallelised because stops at 'jpl'," - " 'nlay_i' or 'nlay_s'.") - continue + # Skip if looping over ice categories, ice or snow layers as these + # have small trip counts if they are not collapsed + if not collapse and any( + ref.symbol.name in ('jpl', 'nlay_i', 'nlay_s') + for ref in loop.stop_expr.walk(Reference) + ): + loop.append_preceding_comment( + "PSyclone: Loop not parallelised because stops at 'jpl'," + " 'nlay_i' or 'nlay_s' and is not collapsed.") + continue + + else: + # In NEMOv5 add the necessary explicit private symbols in icethd + # in order to parallelise the outer loop + if routine_name == "ice_thd_zdf_BL99": + if isinstance(loop.stop_expr, Reference): + if loop.stop_expr.symbol.name == "npti": + for variable in ['zdiagbis', 'zindtbis', 'zindterm', + 'ztib', 'ztrid', 'ztsb']: + st = loop.scope.symbol_table + sym = st.lookup(variable, otherwise=None) + if sym is not None: + loop.explicitly_private_symbols.add(sym) try: # First check that the region_directive is feasible for this region @@ -484,6 +590,12 @@ def insert_explicit_loop_parallelism( # associted to the loop in the generated output. continue + # If we are adding asynchronous parallelism then we now try to minimise + # the number of barriers. + if asynchronous_parallelism: + minsync_trans = OMPMinimiseSyncTrans() + minsync_trans.apply(schedule) + def add_profiling(children: Union[List[Node], Schedule]): ''' diff --git a/external/dl_esm_inf b/external/dl_esm_inf index 358402ecc4..ad209e9d25 160000 --- a/external/dl_esm_inf +++ b/external/dl_esm_inf @@ -1 +1 @@ -Subproject commit 358402ecc4d88e93a62a3ca13dc9d20d2eb27f90 +Subproject commit ad209e9d252995bd83127de4c481232ca14ed655 From 762bed0dbdbb7556921812e5514d5f67dccfd09c Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 2 Oct 2025 14:19:45 +0100 Subject: [PATCH 10/27] Comments from PR review --- .github/workflows/nemo_v5_tests.yml | 19 +++++----- examples/nemo/scripts/acc_kernels_trans.py | 41 +++++++--------------- 2 files changed, 22 insertions(+), 38 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 269cbf73f5..ad0eaaeb36 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -470,7 +470,7 @@ jobs: - - name: NEMO 5.0 nvidia OpenACC for GPUs (BENCH - managed memory) + - name: NEMO 5.0 nvidia OpenACC Kernels for GPUs (BENCH - managed memory) run: | # Set up environment source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh @@ -479,16 +479,14 @@ jobs: export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv export NEMO_DIR=${HOME}/${NEMODIR_NAME} - export TEST_DIR=BENCH_ACC_OFFLOAD_NVHPC + export TEST_DIR=BENCH_ACC_KERNELS_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc -gpu=mem:managed,math_uniform" - - # Clean up and compile - # Without key_mpi_off it fails to compile (even without psyclone) + rm -rf tests/${TEST_DIR} ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernel_trans.py \ add_key "key_mpi_off key_nosignedzero" -j 4 -v 1 @@ -569,9 +567,10 @@ jobs: '"$COMMON_FIELDS"' }, { - ci_test: "NEMOv5 OpenACC for GPU (BENCH)", - nemo_version: "NEMOv5", system: "GlaDos", - compiler:"nvhpc-24.5" , - date: new Date(), - elapsed_time: '"${TIME_sec}"' + ci_test: "NEMOv5 OpenACC Kernels for GPU (BENCH)", + nemo_version: "NEMO 5.0-RC MO patch", + compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", + elapsed_time: '"${{needs.run_if_on_mirror.outputs.bench_nvfortran_acc_kernels__offload_async}}"', + + '"$COMMON_FIELDS"' }])' diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index f581db334c..93755de300 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -53,7 +53,6 @@ routine) then the script moves a level down the tree and then repeats the process of attempting to create the largest possible Kernel region. -Tested with the NVIDIA HPC SDK version 23.7. """ import logging @@ -64,7 +63,6 @@ NEMO_MODULES_TO_IMPORT, ) from psyclone.errors import InternalError -from psyclone.psyGen import TransInfo from psyclone.psyir.nodes import ( IfBlock, ArrayReference, @@ -73,8 +71,7 @@ Loop, Routine, Literal, - ACCLoopDirective, - IntrinsicCall, + ACCLoopDirective, Statement, ) from psyclone.psyir.transformations import ( @@ -82,11 +79,13 @@ ACCUpdateTrans, TransformationError, ProfileTrans, - DebugChecksumTrans, ) -from psyclone.transformations import ACCEnterDataTrans +from psyclone.transformations import ( + ACCEnterDataTrans, + ACCLoopTrans, + ACCRoutineTrans, +) -# from psyclone.psyir.symbols import DataSymbol, DataTypeSymbol, ArrayType # Set up some loop_type inference rules in order to reference useful domain # loop constructs by name @@ -108,12 +107,11 @@ # Get the PSyclone transformations we will use ACC_KERN_TRANS = ACCKernelsTrans() -ACC_LOOP_TRANS = TransInfo().get_trans_name("ACCLoopTrans") -ACC_ROUTINE_TRANS = TransInfo().get_trans_name("ACCRoutineTrans") +ACC_LOOP_TRANS = ACCLoopTrans() +ACC_ROUTINE_TRANS = ACCRoutineTrans() ACC_EDATA_TRANS = ACCEnterDataTrans() ACC_UPDATE_TRANS = ACCUpdateTrans() PROFILE_TRANS = ProfileTrans() -CHECKSUM_TRANS = DebugChecksumTrans() # Whether or not to add profiling calls around unaccelerated regions # N.B. this can inhibit PSyclone's ability to inline! @@ -148,9 +146,10 @@ "bdy_init", "bdy_segs", "sbc_cpl_init", + # Str handling, init routine "asm_inc_init", "dia_obs_init", -] # Str handling, init routine +] class ExcludeSettings: @@ -236,14 +235,9 @@ def valid_acc_kernel(node): # Rather than walk the tree multiple times, look for both excluded node # types and possibly problematic operations - excluded_types = (IfBlock, Loop, ArrayReference, IntrinsicCall) + excluded_types = (IfBlock, Loop, ArrayReference) excluded_nodes = node.walk(excluded_types) - for enode in excluded_nodes: - if isinstance(enode, IntrinsicCall): - if "dim" in enode.argument_names: - return False - if isinstance(enode, IfBlock): # We permit IF blocks originating from WHERE constructs and # single-statement IF blocks containing a Loop in KERNELS regions @@ -287,11 +281,6 @@ def valid_acc_kernel(node): # In general, this heuristic will depend upon how many levels the # model configuration will contain. child = enode.loop_body[0] if enode.loop_body.children else None - # if isinstance(child, Loop) and child.loop_type == "levels": - # We have a loop around a loop over levels - # log_msg(routine_name, "Loop is around a loop over levels", - # enode) - # return False if ( enode.loop_type == "levels" and len(enode.loop_body.children) > 1 @@ -428,11 +417,6 @@ def trans(psyir): :param psyir: the PSyIR of the provided file. :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` """ - logging.basicConfig( - filename="psyclone.log", - filemode="w", level=logging.INFO - ) - for subroutine in psyir.walk(Routine): print(f"Transforming subroutine: {subroutine.name}") @@ -447,7 +431,8 @@ def trans(psyir): if subroutine.name.lower() not in ACC_IGNORE: print(f"Transforming {subroutine.name} with acc kernels") enhance_tree_information(subroutine) - # inline_calls(subroutine) + # inline_calls(subroutine) # Inlining isn't robust enough for use + have_kernels = add_kernels(subroutine.children) add_kernels(subroutine.children) else: print( From 47db5254dbfd8deef2dde6d9f3b378be2469c3b7 Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 2 Oct 2025 15:29:41 +0100 Subject: [PATCH 11/27] Tidying up PR --- examples/nemo/scripts/acc_kernels_trans.py | 15 +++++++-------- examples/nemo/scripts/utils.py | 5 +++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index 93755de300..00e3d6d580 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -71,7 +71,7 @@ Loop, Routine, Literal, - ACCLoopDirective, + ACCLoopDirective, Statement, ) from psyclone.psyir.transformations import ( @@ -433,19 +433,18 @@ def trans(psyir): enhance_tree_information(subroutine) # inline_calls(subroutine) # Inlining isn't robust enough for use have_kernels = add_kernels(subroutine.children) - add_kernels(subroutine.children) + # Add required OpenACC update directives to every routine, + # including to those with no device code and that execute + # exclusively on the host + if ACC_EXPLICIT_MEM_MANAGEMENT and have_kernels: + print(f"Transforming {subroutine.name} with acc update") + ACC_UPDATE_TRANS.apply(subroutine) else: print( f"Addition of OpenACC to routine {subroutine.name} " f"disabled!" ) - # Add required OpenACC update directives to every routine, including to - # those with no device code and that execute exclusively on the host - if ACC_EXPLICIT_MEM_MANAGEMENT: - print(f"Transforming {subroutine.name} with acc update") - ACC_UPDATE_TRANS.apply(subroutine) - # Add profiling instrumentation if PROFILE_NONACC: print( diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 70f7b4e7a1..a3814d3f43 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -64,8 +64,9 @@ "icbdia.f90", "icbini.f90", "icbstp.f90", "iom.f90", "iom_nf90.f90", "obs_grid.f90", "obs_averg_h2d.f90", "obs_profiles_def.f90", "obs_types.f90", "obs_read_prof.f90", "obs_write.f90", "tide_mod.f90", - "zdfosm.f90", "obs_read_surf.f90", -] + "zdfosm.f90", "obs_read_surf.f90", "obs_surf_def.f90", + "sbcblk_algo_andreas.f90", "isfcpl.f90", "trcsms_cfc.f90" + ] # If routine names contain these substrings then we do not profile them PROFILING_IGNORE = ["flo_dom", "macho", "mpp_", "nemo_gcm", "dyn_ldf" From 2240deed171b8ea381b1cc25a0478e52ab96c877 Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Thu, 2 Oct 2025 19:54:45 +0100 Subject: [PATCH 12/27] Updating NEMOv5 path --- .github/workflows/nemo_v5_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index ad0eaaeb36..8453b977e9 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -478,7 +478,7 @@ jobs: source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv - export NEMO_DIR=${HOME}/${NEMODIR_NAME} + export NEMO_DIR=/archive/mmi43-nxn02/NEMOv5 export TEST_DIR=BENCH_ACC_KERNELS_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS From 96a43d02025b77d76cf5923bc3e6c1d6189d4d9a Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Fri, 3 Oct 2025 10:18:21 +0100 Subject: [PATCH 13/27] NEMOv5 path --- .github/workflows/nemo_v5_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 8453b977e9..a9d8a7566f 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -478,7 +478,7 @@ jobs: source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv - export NEMO_DIR=/archive/mmi43-nxn02/NEMOv5 + export NEMO_DIR=/archive/psyclone-tests/latest-run/UKMO-NEMOv5 export TEST_DIR=BENCH_ACC_KERNELS_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS From 5f63ac39c38f26b8ff0e954ed58462bb5508912e Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Fri, 3 Oct 2025 11:13:54 +0100 Subject: [PATCH 14/27] Path and flags --- .github/workflows/nemo_v5_tests.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index a9d8a7566f..319379a632 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -482,14 +482,12 @@ jobs: export TEST_DIR=BENCH_ACC_KERNELS_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS - # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc -gpu=mem:managed,math_uniform" - + export FCFLAGS="-i4 -Mr8 -cpp -Minline -Minfo=accel -Mnofma -O1 -gopt -traceback -acc -gpu=mem:managed,math_uniform" rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernel_trans.py \ - add_key "key_mpi_off key_nosignedzero" -j 4 -v 1 + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} del_key "key_si3" add_key \ + "key_mpi_off key_nosignedzero" -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py -j ${NUM_PARALLEL} -v 1 # Run test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 From 88e95df0bd7f3c064660cf886acbf1c116b4262b Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Fri, 3 Oct 2025 17:17:40 +0100 Subject: [PATCH 15/27] Tidying for integration test --- .github/workflows/nemo_v5_tests.yml | 3 +-- examples/nemo/scripts/acc_kernels_trans.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 319379a632..3af0a3c38e 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -486,8 +486,7 @@ jobs: cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -cpp -Minline -Minfo=accel -Mnofma -O1 -gopt -traceback -acc -gpu=mem:managed,math_uniform" rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} del_key "key_si3" add_key \ - "key_mpi_off key_nosignedzero" -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py -j ${NUM_PARALLEL} -v 1 + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py del_key "key_si3" add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index 00e3d6d580..a54c235d25 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -115,7 +115,7 @@ # Whether or not to add profiling calls around unaccelerated regions # N.B. this can inhibit PSyclone's ability to inline! -PROFILE_NONACC = True +PROFILE_NONACC = False # Whether or not to add OpenACC enter data and update directives to explicitly # move data between host and device memory From 3063cae5ac16c1c83cfc697a3d57667545b1ab65 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Thu, 27 Nov 2025 16:24:36 +0000 Subject: [PATCH 16/27] Remove divering keys form acc_kernels makenemo --- .github/workflows/nemo_v5_tests.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 962024b24a..00f5671b67 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -494,9 +494,10 @@ jobs: # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-i4 -Mr8 -cpp -Minline -Minfo=accel -Mnofma -O1 -gopt -traceback -acc -gpu=mem:managed,math_uniform" + export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export REPRODUCIBLE=1 rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py del_key "key_si3" add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py deadd_key "key_nosignedzero" -j ${NUM_PARALLEL} # Run test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 From 5c981f3a8d4e32a39f9b1cb2b9026c1d75dab67f Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Thu, 27 Nov 2025 16:31:34 +0000 Subject: [PATCH 17/27] Fix NEMOv5 kernels action --- .github/workflows/nemo_v5_tests.yml | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 00f5671b67..aaad5886a9 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -60,6 +60,7 @@ jobs: orca2_nvfortran_omp_offload: ${{ steps.orca2_nvfortran_omp_offload.outputs.time }} bench_nvfortran_omp_offload_async: ${{ steps.bench_nvfortran_omp_offload_async.outputs.time }} orca2_nvfortran_omp_offload_async: ${{ steps.orca2_nvfortran_omp_offload_async.outputs.time }} + bench_nvfortran_acc_kernels: ${{ steps.bench_nvfortran_acc_kernels.outputs.time }} steps: - uses: actions/checkout@v3 @@ -478,9 +479,8 @@ jobs: diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca1.nvhpc.10steps run.stat export TIME_sec=$(grep "local MPI proces" timing.output | head -n 1 | awk '{print $5}' | tr -d s) - - - name: NEMO 5.0 nvidia OpenACC Kernels for GPUs (BENCH - managed memory) + id: bench_nvfortran_acc_kernels run: | # Set up environment source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh @@ -495,9 +495,8 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" - export REPRODUCIBLE=1 rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py deadd_key "key_nosignedzero" -j ${NUM_PARALLEL} + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py add_key "key_nosignedzero" -j ${NUM_PARALLEL} # Run test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 @@ -505,10 +504,8 @@ jobs: ./nemo # tail run.stat diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.10steps run.stat - export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) - - - + export VAR_TIME=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) + echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" upload_if_on_mirror: if: ${{ github.repository == 'stfc/PSyclone-mirror' }} @@ -585,7 +582,6 @@ jobs: ci_test: "NEMOv5 OpenACC Kernels for GPU (BENCH)", nemo_version: "NEMO 5.0-RC MO patch", compiler:"nvhpc-'"$NVFORTRAN_VERSION"'", - elapsed_time: '"${{needs.run_if_on_mirror.outputs.bench_nvfortran_acc_kernels__offload_async}}"', - + elapsed_time: '"${{needs.run_if_on_mirror.outputs.bench_nvfortran_acc_kernels}}"', '"$COMMON_FIELDS"' }])' From 1fa445daf1a86e9852cd3daccad9b4de34481269 Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Tue, 10 Feb 2026 11:08:33 +0000 Subject: [PATCH 18/27] Removing ArrayReference from excluded_types --- examples/nemo/scripts/acc_kernels_trans.py | 27 +--------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index e0f327b278..735a097ad3 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -231,7 +231,7 @@ def valid_acc_kernel(node): # Rather than walk the tree multiple times, look for both excluded node # types and possibly problematic operations - excluded_types = (IfBlock, Loop, ArrayReference) + excluded_types = (IfBlock, Loop) excluded_nodes = node.walk(excluded_types) for enode in excluded_nodes: if isinstance(enode, IfBlock): @@ -244,31 +244,6 @@ def valid_acc_kernel(node): ): continue - arrays = enode.condition.walk(ArrayReference) - # We exclude if statements where the condition expression does - # not refer to arrays at all as this may cause compiler issues - # (get "Missing branch target block") or produce faster code. - if ( - not arrays - and excluding.ifs_scalars - and not isinstance(enode.condition, BinaryOperation) - ): - msg = "IF references scalars" - if msg not in enode.preceding_comment: - enode.append_preceding_comment(msg) - return False - # When using CUDA Unified Memory, only allocated arrays reside in - # shared memory (including those that are created by compiler- - # -generated allocs, e.g. for automatic arrays). We assume that all - # arrays of rank 2 or greater are dynamically allocated, whereas 1D - # arrays are often static in NEMO. Hence, we disallow IFs where the - # logical expression involves the latter. - if any(len(array.children) == 1 for array in arrays): - msg = "IF references 1D arrays that may be static" - if msg not in enode.preceding_comment: - enode.append_preceding_comment(msg) - return False - elif isinstance(enode, Loop): # Heuristic: # We don't want to put loops around 3D loops into KERNELS regions From d77e9c015a30c4bf45cb31c79ef72bace157c11f Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Tue, 10 Feb 2026 11:31:35 +0000 Subject: [PATCH 19/27] flake8 compatibility --- examples/nemo/scripts/acc_kernels_trans.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index 735a097ad3..1d97e9379c 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -61,9 +61,7 @@ from psyclone.errors import InternalError from psyclone.psyir.nodes import ( IfBlock, - ArrayReference, Assignment, - BinaryOperation, Loop, Routine, Literal, @@ -211,8 +209,6 @@ def valid_acc_kernel(node): """ # The Fortran routine which our parent represents - routine_name = node.ancestor(Routine).name - try: # Since we do this check on a node-by-node basis, we disable the # check that the 'region' contains a loop. @@ -227,8 +223,6 @@ def valid_acc_kernel(node): # Allow for per-routine setting of what to exclude from within KERNELS # regions. This is because sometimes things work in one context but not # in another (with the Nvidia compiler). - excluding = EXCLUDING.get(routine_name, EXCLUDING["default"]) - # Rather than walk the tree multiple times, look for both excluded node # types and possibly problematic operations excluded_types = (IfBlock, Loop) From 4286c90f1aebc24df224cf5112b6b6892e5efe7e Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 12 Feb 2026 11:44:36 +0000 Subject: [PATCH 20/27] #3130 revert dl_esm_inf and put back checks for IF blocks within loops --- examples/nemo/scripts/acc_kernels_trans.py | 27 +++++++++++++++++++++- external/dl_esm_inf | 2 +- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index 1d97e9379c..c151c0bd83 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -60,8 +60,10 @@ NOT_PERFORMANT, NEMO_MODULES_TO_IMPORT) from psyclone.errors import InternalError from psyclone.psyir.nodes import ( - IfBlock, + ArrayReference, Assignment, + BinaryOperation, + IfBlock, Loop, Routine, Literal, @@ -209,6 +211,8 @@ def valid_acc_kernel(node): """ # The Fortran routine which our parent represents + routine_name = node.ancestor(Routine).name + try: # Since we do this check on a node-by-node basis, we disable the # check that the 'region' contains a loop. @@ -225,6 +229,8 @@ def valid_acc_kernel(node): # in another (with the Nvidia compiler). # Rather than walk the tree multiple times, look for both excluded node # types and possibly problematic operations + excluding = EXCLUDING.get(routine_name, EXCLUDING["default"]) + excluded_types = (IfBlock, Loop) excluded_nodes = node.walk(excluded_types) for enode in excluded_nodes: @@ -238,6 +244,25 @@ def valid_acc_kernel(node): ): continue + arrays = enode.condition.walk(ArrayReference) + # We exclude if statements where the condition expression does + # not refer to arrays at all as this may cause compiler issues + # (get "Missing branch target block") or produce faster code. + if not arrays and excluding.ifs_scalars and \ + not isinstance(enode.condition, BinaryOperation): + log_msg(routine_name, "IF references scalars", enode) + return False + # When using CUDA Unified Memory, only allocated arrays reside in + # shared memory (including those that are created by compiler- + # -generated allocs, e.g. for automatic arrays). We assume that all + # arrays of rank 2 or greater are dynamically allocated, whereas 1D + # arrays are often static in NEMO. Hence, we disallow IFs where the + # logical expression involves the latter. + if any(len(array.children) == 1 for array in arrays): + log_msg(routine_name, + "IF references 1D arrays that may be static", enode) + return False + elif isinstance(enode, Loop): # Heuristic: # We don't want to put loops around 3D loops into KERNELS regions diff --git a/external/dl_esm_inf b/external/dl_esm_inf index ad209e9d25..358402ecc4 160000 --- a/external/dl_esm_inf +++ b/external/dl_esm_inf @@ -1 +1 @@ -Subproject commit ad209e9d252995bd83127de4c481232ca14ed655 +Subproject commit 358402ecc4d88e93a62a3ca13dc9d20d2eb27f90 From cc0f0604ad09d29a04b579a5dae9e27bb347107d Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 12 Feb 2026 11:45:03 +0000 Subject: [PATCH 21/27] #3130 add -acc flag to compilation in workflow --- .github/workflows/nemo_v5_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 68ff1e07ee..e3762b99c3 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -496,7 +496,7 @@ jobs: # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc -mp=gpu -gpu=mem:managed,math_uniform" rm -rf tests/${TEST_DIR} ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py add_key "key_nosignedzero" -j ${NUM_PARALLEL} From 06c58852bad864dd92931a807dede100b76c7c5f Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 12 Feb 2026 11:54:01 +0000 Subject: [PATCH 22/27] #3130 put back explicit memory-movement functionality --- examples/nemo/scripts/acc_kernels_trans.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index c151c0bd83..a52b52e676 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -426,14 +426,20 @@ def trans(psyir): # including to those with no device code and that execute # exclusively on the host if ACC_EXPLICIT_MEM_MANAGEMENT and have_kernels: - print(f"Transforming {subroutine.name} with acc update") - ACC_UPDATE_TRANS.apply(subroutine) + print(f"Transforming {subroutine.name} with acc enter data") + ACC_EDATA_TRANS.apply(subroutine) else: print( f"Addition of OpenACC to routine {subroutine.name} " f"disabled!" ) + # Add required OpenACC update directives to every routine, including to + # those with no device code and that execute exclusively on the host + if ACC_EXPLICIT_MEM_MANAGEMENT: + print(f"Transforming {subroutine.name} with acc update") + ACC_UPDATE_TRANS.apply(subroutine) + # Add profiling instrumentation if PROFILE_NONACC: print( From 0aa650de6b175bbc83734f3573ac009ef072e375 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 12 Feb 2026 13:21:14 +0000 Subject: [PATCH 23/27] #3130 tidy script --- examples/nemo/scripts/acc_kernels_trans.py | 49 +++++++++------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index a52b52e676..b15b1a1b54 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -55,10 +55,10 @@ """ -import logging from utils import (add_profiling, inline_calls, NOT_PERFORMANT, NEMO_MODULES_TO_IMPORT) from psyclone.errors import InternalError +from psyclone.psyir.nodes.node import Node from psyclone.psyir.nodes import ( ArrayReference, Assignment, @@ -175,27 +175,18 @@ def __init__(self, settings=None): } -def log_msg(name, msg, node): +def log_msg(name: str, msg: str, node: Node) -> None: """ - Log a message indicating why a transformation could not be performed. + Adds a comment indicating why a transformation could not be performed. - :param str name: the name of the routine. - :param str msg: the message to log. + :param name: the name of the routine - currently unused. + :param msg: the message to log. :param node: the PSyIR node that prevented the transformation. - :type node: :py:class:`psyclone.psyir.nodes.Node` """ - # Create a str representation of the position of the problematic node - # in the PSyIR tree. - node_strings = [] - parent = node - while parent: - node_strings.append(parent.node_str(colour=False)) - parent = parent.parent - node_strings.reverse() - location = "->".join(node_strings) - # Log the message - logging.info("%s: %s: %s", name, msg, location) + parent_stmt = node.ancestor(Statement, include_self=True) + if msg not in parent_stmt.preceding_comment: + parent_stmt.append_preceding_comment(msg) def valid_acc_kernel(node): @@ -218,10 +209,9 @@ def valid_acc_kernel(node): # check that the 'region' contains a loop. ACC_KERN_TRANS.validate(node, options={"disable_loop_check": True}) except TransformationError as err: - msg = f"Node rejected by ACCKernelTrans.validate: {err.value}" - parent_stmt = node.ancestor(Statement, include_self=True) - if msg not in parent_stmt.preceding_comment: - parent_stmt.append_preceding_comment(msg) + log_msg(routine_name, + f"Node rejected by ACCKernelTrans.validate: {err.value}", + node) return False # Allow for per-routine setting of what to exclude from within KERNELS @@ -232,8 +222,8 @@ def valid_acc_kernel(node): excluding = EXCLUDING.get(routine_name, EXCLUDING["default"]) excluded_types = (IfBlock, Loop) - excluded_nodes = node.walk(excluded_types) - for enode in excluded_nodes: + + for enode in node.walk(excluded_types): if isinstance(enode, IfBlock): # We permit IF blocks originating from WHERE constructs and # single-statement IF blocks containing a Loop in KERNELS regions @@ -248,9 +238,9 @@ def valid_acc_kernel(node): # We exclude if statements where the condition expression does # not refer to arrays at all as this may cause compiler issues # (get "Missing branch target block") or produce faster code. - if not arrays and excluding.ifs_scalars and \ - not isinstance(enode.condition, BinaryOperation): - log_msg(routine_name, "IF references scalars", enode) + if (not arrays and excluding.ifs_scalars and + not isinstance(enode.condition, BinaryOperation)): + log_msg(routine_name, enode, "IF references scalars") return False # When using CUDA Unified Memory, only allocated arrays reside in # shared memory (including those that are created by compiler- @@ -282,10 +272,9 @@ def valid_acc_kernel(node): if child.walk(Loop): loop_count += 1 if loop_count > 1: - msg = "Loop over levels contains several \ - other loops" - if msg not in enode.preceding_comment: - enode.append_preceding_comment(msg) + log_msg(routine_name, + ("Loop over levels contains several other " + "loops"), enode) return False return True From 31b58c39f4f1a769f1f9f8d6ecef30e0f6159aff Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 12 Feb 2026 13:21:59 +0000 Subject: [PATCH 24/27] #3130 fix name of spack package for kernels workflow --- .github/workflows/nemo_v5_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index e3762b99c3..76641113d3 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -486,7 +486,7 @@ jobs: run: | # Set up environment source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh - spack unload && spack load nemo-build-environment%nvhpc@${NVFORTRAN_VERSION} + spack unload && spack load nemo-build-environment %${NVHPC_TOOLCHAIN} source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_HOME=${PWD}/.runner_venv From 0468e3c19daa65ffdff42a9df5238af0b45ad897 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 12 Feb 2026 15:12:18 +0000 Subject: [PATCH 25/27] #3130 fix mistake in script --- examples/nemo/scripts/acc_kernels_trans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nemo/scripts/acc_kernels_trans.py b/examples/nemo/scripts/acc_kernels_trans.py index b15b1a1b54..a8df3d28f4 100755 --- a/examples/nemo/scripts/acc_kernels_trans.py +++ b/examples/nemo/scripts/acc_kernels_trans.py @@ -240,7 +240,7 @@ def valid_acc_kernel(node): # (get "Missing branch target block") or produce faster code. if (not arrays and excluding.ifs_scalars and not isinstance(enode.condition, BinaryOperation)): - log_msg(routine_name, enode, "IF references scalars") + log_msg(routine_name, "IF references scalars", enode) return False # When using CUDA Unified Memory, only allocated arrays reside in # shared memory (including those that are created by compiler- From d272c318a631868535eb8db95c5571daa5a1fec8 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 12 Feb 2026 18:07:32 +0000 Subject: [PATCH 26/27] #3130 fix version of spack installation used for acc kernels --- .github/workflows/nemo_v5_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 76641113d3..db0911a624 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -485,7 +485,7 @@ jobs: id: bench_nvfortran_acc_kernels run: | # Set up environment - source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh + source /archive/psyclone-spack/psyclone-spack-Dec25/spack-repo/share/spack/setup-env.sh spack unload && spack load nemo-build-environment %${NVHPC_TOOLCHAIN} source .runner_venv/bin/activate export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts From 9469afbd28fdca32e02371b0079b400533262ada Mon Sep 17 00:00:00 2001 From: Mohammad Imaran Date: Mon, 13 Apr 2026 10:21:09 +0100 Subject: [PATCH 27/27] Compiling without ice --- .github/workflows/nemo_v5_tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index db0911a624..272b445db5 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -498,8 +498,7 @@ jobs: cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc -mp=gpu -gpu=mem:managed,math_uniform" rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py add_key "key_nosignedzero" -j ${NUM_PARALLEL} - + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/acc_kernels_trans.py del_key "key_si3" add_key "key_nosignedzero" -j ${NUM_PARALLEL} # Run test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg