diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index bfd34580eb..f0884cb7d1 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit bfd34580eb59c2a027a502c89995e682a70a95b9 +Subproject commit f0884cb7d1ecb12393ddae54622b2c384bb8e2a8 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( 
useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the 
channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 583f3df0c9..b3c0ae1009 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -89,7 +89,7 @@ endif # Create file with the resolved backend in case user chooses 'cppauto' BACKEND_LOG ?= .resolved-backend ifneq ($(BACKEND_ORIG),$(BACKEND)) - $(file >$(BACKEND_LOG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f index fb942500a5..4ff41257c3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! 
not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 0665bfb93b..cd73d52ed3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -591,38 +591,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -648,7 +660,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -665,6 +677,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 7de8886b1d..de07450c31 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -86,6 +86,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = 
no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -110,6 +111,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 4372edde52..620d317f16 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -70,7 +70,7 @@ gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -94,7 +94,7 @@ // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -206,60 +206,61 @@ } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = 
allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -267,32 +268,52 @@ break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%%d icol=%%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f index 1151dc5a6c..858052727f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f @@ -18,7 +18,7 @@ IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. c ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486) @@ -38,7 +38,7 @@ CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. 
MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -46,7 +46,7 @@ STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index f8930a863f..07988bc235 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. 
Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0051648616790771484  +DEBUG: model prefixing takes 0.003152132034301758  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -147,7 +148,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.002 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -158,10 +159,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -173,22 +174,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: 
diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -Wrote files for 8 helas calls in 0.285 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +Wrote files for 8 helas calls in 0.058 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.122 s +ALOHA: aloha creates 3 routines in 0.118 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.152 s +ALOHA: aloha creates 7 routines in 0.189 s FFV1 FFV1 FFV2 @@ -197,32 +198,32 @@ ALOHA: aloha creates 7 routines in 0.152 s FFV4 FFV2_4 FFV2_4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m4.542s -user 0m1.246s -sys 0m0.587s -Code generation completed in 5 seconds +real 0m2.243s +user 0m1.791s +sys 0m0.326s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -243,10 +244,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -273,10 +274,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index 7aed5df7db..b3ab00b31d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 1c6406a546..fb09c252b7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -926,38 +926,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -983,7 +995,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1000,6 +1012,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1118,7 +1131,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, 
gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1142,7 +1155,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1254,60 +1267,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1315,32 +1329,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 1469ba9333..b590074a0a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f index 78c4e66a95..1e083ecd15 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C 
Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f index 03db576967..cf12adfab5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f @@ -349,6 +349,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = 
no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -359,6 +362,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -461,7 +465,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -534,19 +538,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -616,7 +622,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -640,7 +646,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -648,7 +654,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f index 15e4d1a8a2..fa57230d40 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f @@ -304,7 +304,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -342,8 +342,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.inc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern 
"C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: 
+ raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for 
color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname 
-p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index bdea67b952..ff0409b8f3 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005957365036010742  +DEBUG: model prefixing takes 0.004814624786376953  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -147,13 +148,13 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.002 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -162,17 +163,17 @@ INFO: Processing color information for process: e+ e- > mu+ mu- @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 
'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -Generated helas calls for 1 subprocesses (2 diagrams) in 0.002 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.171 s +ALOHA: aloha creates 4 routines in 0.180 s FFV1 FFV1 FFV2 @@ -181,17 +182,17 @@ ALOHA: aloha creates 4 routines in 0.171 s FFV4 FFV2_4 FFV2_4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m1.151s -user 0m0.372s -sys 0m0.155s +real 0m0.814s +user 0m0.613s +sys 0m0.104s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) 
+ const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 22cb8c2604..064d2aacd5 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -924,38 +924,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -981,7 +993,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -998,6 +1010,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1116,7 +1129,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, 
gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1140,7 +1153,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1252,60 +1265,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1313,32 +1327,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 1469ba9333..b590074a0a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! 
per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the 
input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef 
unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index dbae24afe0..7ef50d8b31 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005540609359741211  +DEBUG: model prefixing takes 0.0038509368896484375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +149,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +160,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_ INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -174,49 +175,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.266 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.066 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.089 s +ALOHA: aloha creates 2 routines in 0.118 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.093 s +ALOHA: aloha creates 4 routines in 0.160 s VVV1 FFV1 FFV1 FFV1 -FileWriter for 
/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.687s -user 0m1.163s -sys 0m0.619s -Code generation completed in 5 seconds +real 0m2.279s +user 0m1.921s +sys 0m0.327s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -237,10 +238,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -267,10 +268,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 8b331b055f..38c1f98839 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 
4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + 
const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 091fecf10e..877693ab4e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -939,38 +939,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -996,7 +1008,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1013,6 +1025,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1131,7 +1144,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, 
gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1155,7 +1168,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1267,60 +1280,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1328,32 +1342,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3c5f6fe31f..b3c3d0ffb4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index 7f809ad0ff..6c6b37db2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C 
**************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index a68aa6e4c0..7cf597b197 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + 
COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index b47f79aa45..a6ff6ae67f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const 
FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise 
RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double 
elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname 
-m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 20cc72fd46..ddc402e0ef 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00434565544128418  +DEBUG: model prefixing takes 0.0030803680419921875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +149,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -163,30 +164,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 
'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
+Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.091 s +ALOHA: aloha creates 2 routines in 0.096 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.992s -user 0m0.334s -sys 0m0.123s -Code generation completed in 1 seconds +real 0m0.485s +user 0m0.432s +sys 0m0.048s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + const 
int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 61e6f0c54c..0a7fdcf80b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -936,38 +936,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -993,7 +1005,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1010,6 +1022,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1128,7 +1141,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, 
gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1152,7 +1165,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1264,60 +1277,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1325,32 +1339,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 3c5f6fe31f..b3c3d0ffb4 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case the user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( 
momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman 
diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int 
uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 332a0806f1..05e779eedd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005877494812011719  +DEBUG: model prefixing takes 0.0031867027282714844  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +149,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -156,7 +157,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.010 s +1 processes with 16 diagrams generated in 0.016 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -167,10 +168,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -184,9 +185,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 
1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -195,25 +196,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.023 s -Wrote files for 46 helas calls in 0.502 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.039 s +Wrote files for 46 helas calls in 0.166 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 
0.190 s +ALOHA: aloha creates 5 routines in 0.229 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.187 s +ALOHA: aloha creates 10 routines in 0.167 s VVV1 VVV1 FFV1 @@ -223,32 +224,32 @@ ALOHA: aloha creates 10 routines in 0.187 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.233s -user 0m1.496s -sys 0m0.718s -Code generation completed in 5 seconds +real 0m2.606s +user 0m2.225s +sys 0m0.354s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -269,10 +270,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -299,10 +300,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index 30bd3794c3..de51114026 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 091fecf10e..877693ab4e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -939,38 +939,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -996,7 +1008,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1013,6 +1025,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1131,7 +1144,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, 
gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1155,7 +1168,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1267,60 +1280,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1328,32 +1342,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3c5f6fe31f..b3c3d0ffb4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index 7f809ad0ff..6c6b37db2c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C 
**************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index a68aa6e4c0..7cf597b197 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + 
COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f index b47f79aa45..a6ff6ae67f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index ce41e289c6..a6b6ef610c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -1156,38 +1156,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1213,7 +1225,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1230,6 +1242,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1348,7 +1361,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1372,7 +1385,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1484,60 +1497,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1545,32 +1559,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 44f2636937..d248effd6c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // 
input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index ca0da2991e..4282896667 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index a43968abf6..b29c6aeca2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f index 3ed3e82f91..41a39a13ca 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f @@ -302,7 +302,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -340,8 +340,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed)
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ 
extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except 
FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // 
Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell 
uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index b836987bc5..68dda708e1 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00551295280456543  +DEBUG: model prefixing takes 0.003030061721801758  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +149,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.012 s +1 processes with 16 diagrams generated in 0.016 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +160,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -174,25 +175,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 
5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.046 s -Wrote files for 36 helas calls in 0.368 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s +Wrote files for 36 helas calls in 0.087 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.190 s +ALOHA: aloha creates 5 routines in 0.247 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.194 s +ALOHA: aloha creates 10 routines in 0.169 s VVV1 VVV1 FFV1 @@ -202,32 +203,32 @@ ALOHA: aloha creates 10 routines in 0.194 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. 
+Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m4.945s -user 0m1.513s -sys 0m0.678s -Code generation completed in 5 seconds +real 0m2.378s +user 0m2.056s +sys 0m0.297s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -248,10 +249,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -278,10 +279,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 0fe3df08d4..444f1253f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h 
index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) 
+ const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 0726e0a6ea..85622f37c6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1156,38 +1156,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1213,7 +1225,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1230,6 +1242,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1348,7 +1361,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1372,7 +1385,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1484,60 +1497,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1545,32 +1559,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 5c057176f6..6ad3c7dd1e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index ebf5273614..6dfa640d9e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C 
**************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index c32cb4d43c..77820f0e51 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + 
COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 6724cffa4b..7388a4bf7e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -302,7 +302,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -340,8 +340,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed)
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=<comma-separated list>) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const 
FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + 
raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra 
and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := 
$(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ba99f30bdf..234a77535f 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005433082580566406  +DEBUG: model prefixing takes 0.0030896663665771484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +149,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.012 s +1 processes with 16 diagrams generated in 0.018 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -163,18 +164,18 @@ INFO: Processing color information for process: g g > t t~ g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.045 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.030 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.206 s +ALOHA: aloha creates 5 routines in 0.184 s VVV1 VVV1 FFV1 @@ -184,17 +185,17 @@ ALOHA: aloha creates 5 routines in 0.206 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m1.176s -user 0m0.468s -sys 0m0.131s -Code generation completed in 1 seconds +real 0m0.672s +user 0m0.601s +sys 0m0.061s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + 
const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 372ced5d87..887adbc468 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -1150,38 +1150,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1207,7 +1219,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1224,6 +1236,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1342,7 +1355,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1366,7 +1379,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1478,60 +1491,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1539,32 +1553,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 5c057176f6..6ad3c7dd1e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input 
array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned 
int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index ea9db152a3..d4f521efbc 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004921674728393555  +DEBUG: model prefixing takes 0.003898143768310547  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +149,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.080 s +1 processes with 123 diagrams generated in 0.127 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +160,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -174,25 +175,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 
33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.223 s -Wrote files for 222 helas calls in 0.654 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 
33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.299 s +Wrote files for 222 helas calls in 0.469 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.219 s +ALOHA: aloha creates 5 routines in 0.184 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.197 s +ALOHA: aloha creates 10 routines in 0.237 s VVV1 VVV1 FFV1 @@ -205,32 +206,32 @@ ALOHA: aloha creates 10 routines in 0.197 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m5.675s -user 0m2.118s -sys 0m0.681s -Code generation completed in 6 seconds +real 0m3.393s +user 0m3.029s +sys 0m0.328s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -251,10 +252,10 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -281,10 +282,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 5fe0cb01be..5ffde659c4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 0f6ddcae67..93c8bce4aa 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -3084,38 +3084,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -3141,7 +3153,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -3158,6 +3170,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -3276,7 +3289,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3300,7 +3313,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -3412,60 +3425,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -3473,32 +3487,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 96f4a4724c..6ef3863ae3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index 6a61beea31..60ddbdbedd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed 
C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index 0f7fcaa25f..3a363b9a2b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) 
+ COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index b173f22bfc..19b83eff7d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -372,8 +372,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern 
"C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: 
+ raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for 
color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname 
-p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7ff994126b..a4e84691b5 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003983020782470703  +DEBUG: model prefixing takes 0.003270387649536133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +149,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.081 s +1 processes with 123 diagrams generated in 0.120 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -163,18 +164,18 @@ INFO: Processing color information for process: g g > t t~ g g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.216 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.290 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.204 s +ALOHA: aloha creates 5 routines in 0.177 s VVV1 VVV1 FFV1 @@ -187,17 +188,17 @@ ALOHA: aloha creates 5 routines in 0.204 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.544s -user 0m0.774s -sys 0m0.144s -Code generation completed in 2 seconds +real 0m1.173s +user 0m1.082s +sys 0m0.081s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) 
+ const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 08a537c1f2..dfa0ab3dc3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -3141,38 +3141,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -3198,7 +3210,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -3215,6 +3227,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -3333,7 +3346,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3357,7 +3370,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -3469,60 +3482,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -3530,32 +3544,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 96f4a4724c..6ef3863ae3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the 
input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef 
unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index ebb525b6f1..d6ebcb3812 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0036034584045410156  +DEBUG: model prefixing takes 0.0031838417053222656  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +149,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.963 s +1 processes with 1240 diagrams generated in 1.458 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,16 +160,16 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 3s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 5s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h @@ -176,25 +177,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 
110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 
295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 
495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 
695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 
1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 
153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 
353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 
553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 
753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 
1121: 944, 1122: 945} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 3.355 s -Wrote files for 2281 helas calls in 9.598 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 
214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 
458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 
693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 
930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 
53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 
290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 
525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 
759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 
850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.918 s +Wrote files for 2281 helas calls in 11.612 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.194 s +ALOHA: aloha creates 5 routines in 0.257 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.231 s +ALOHA: aloha creates 10 routines in 0.168 s VVV1 VVV1 FFV1 @@ -207,32 +208,32 @@ ALOHA: aloha creates 10 routines in 0.231 s VVVV3 VVVV4 VVVV4 -FileWriter for 
/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m20.546s -user 0m16.458s -sys 0m0.884s -Code generation completed in 20 seconds +real 0m23.511s +user 0m22.940s +sys 0m0.432s +Code generation completed in 24 seconds ************************************************************ * * * W E L C O M E to * @@ -253,10 +254,10 @@ Code generation completed in 20 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -283,10 +284,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 08a07273bc..e1b2f3835b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 148ad48435..14d3f28923 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -30655,38 +30655,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -30712,7 +30724,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -30729,6 +30741,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -30847,7 +30860,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -30871,7 +30884,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -30983,60 +30996,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -31044,32 +31058,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 75c52ba31a..fbe1065f6b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 95f2b50e68..7a436cbd5c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude 
squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 14d6ca8aa6..0c9e5f7080 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + 
INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=128) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f index 870c890410..5ae4792dfa 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index ff1a367151..e11cef7ff9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -398,7 +398,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -436,8 +436,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed)
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ 
extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except 
FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // 
Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell 
uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 66cd67a19b..bbd14da044 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +56,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0029516220092773438  +DEBUG: model prefixing takes 0.0028781890869140625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +149,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.953 s +1 processes with 1240 diagrams generated in 1.565 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 @@ -163,18 +164,18 @@ INFO: Processing color information for process: g g > t t~ g g g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 
'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 3.379 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
+Generated helas calls for 1 subprocesses (1240 diagrams) in 4.749 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.214 s +ALOHA: aloha creates 5 routines in 0.233 s VVV1 VVV1 FFV1 @@ -187,17 +188,17 @@ ALOHA: aloha creates 5 routines in 0.214 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m7.419s -user 0m6.626s -sys 0m0.185s -Code generation completed in 7 seconds +real 0m9.477s +user 0m9.321s +sys 0m0.113s +Code generation completed in 10 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for 
helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index 6b89d18559..692ef2ca37 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -32545,38 +32545,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -32602,7 +32614,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -32619,6 +32631,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -32737,7 +32750,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32761,7 +32774,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -32873,60 +32886,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -32934,32 +32948,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 75c52ba31a..fbe1065f6b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f index 870c890410..5ae4792dfa 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! 
per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - 
CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef 
unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 248fa16d65..37c4a61024 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +55,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0052111148834228516  +DEBUG: model prefixing takes 0.0028390884399414062  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +164,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.041 s +8 processes with 40 diagrams generated in 0.054 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -174,10 +175,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -197,9 +198,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -208,50 +209,50 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.017 s -Wrote files for 32 helas calls in 0.625 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s +Wrote files for 32 helas calls in 0.115 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.094 s +ALOHA: aloha creates 2 routines in 0.080 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.080 s +ALOHA: aloha creates 4 routines in 0.065 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m5.076s -user 0m1.391s -sys 0m0.672s -Code generation completed in 5 seconds +real 0m1.991s +user 0m1.660s +sys 0m0.309s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -272,10 +273,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -302,10 +303,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index aba2f10b06..7ba8666046 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h 
index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity 
filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) 
+ const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 787b72a15b..47e6c1de98 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1186,7 +1199,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1223,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1335,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1397,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index ebc491b00d..ab9d7dde82 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 4595d5a38e..37932e73a3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C 
**************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 0f523f574b..f252c024f6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + 
COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 90ac031008..14fd0f0017 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -361,8 +361,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index e2c28c73eb..8655477d5e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1186,7 +1199,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1223,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1335,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1397,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 2c3a739550..55c42cb947 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // 
input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e239a05794..748758b702 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 7240e416ab..bb34349714 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f index aa0f9bedff..784c7b3ebc 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -361,8 +361,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=<comma-separated list>) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const 
FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 
%(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + 
raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra 
and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := 
$(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index e76b814911..e81cd2427b 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). 
MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +55,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005757570266723633  +DEBUG: model prefixing takes 0.0030303001403808594  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,13 +164,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.040 s +8 processes with 40 diagrams generated in 0.057 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -186,40 +187,40 @@ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 
'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  DEBUG: type(subproc_group)= [output.py at line 223]  DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=1 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.016 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.090 s +ALOHA: aloha creates 2 routines in 0.082 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m1.337s -user 0m0.375s -sys 0m0.160s -Code generation completed in 2 seconds +real 0m0.560s +user 0m0.488s +sys 0m0.060s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, 
int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before computeMatrixElements) + 
const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index eea3950214..e3a48ec6dc 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -989,38 +989,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1046,7 +1058,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1063,6 +1075,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1181,7 +1194,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1205,7 +1218,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1317,60 +1330,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1378,32 +1392,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index ebc491b00d..ab9d7dde82 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! 
not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index bb8b2f2773..56eba53054 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -989,38 +989,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1046,7 +1058,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1063,6 +1075,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1181,7 +1194,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1205,7 +1218,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1317,60 +1330,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1378,32 +1392,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 2c3a739550..55c42cb947 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix 
#899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + 
pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input 
array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned 
int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index f374f8f313..2fdbc1fd4e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,22 +38,23 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  @@ -120,7 +121,7 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.003 s +1 processes with 4 diagrams generated in 0.004 s Total: 1 processes with 4 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -131,10 +132,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -146,55 +147,55 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s -Wrote files for 12 helas calls in 0.268 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s +Wrote files for 12 helas calls in 0.067 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.136 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha 
creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.152 s +ALOHA: aloha creates 8 routines in 0.119 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m4.654s -user 0m1.223s -sys 0m0.605s -Code generation completed in 5 seconds +real 0m1.949s +user 0m1.650s +sys 0m0.283s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -215,10 +216,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -245,10 +246,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat index 84c16b4cf4..5c112346ee 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc index c32c974cc1..2876d942ce 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc @@ -953,38 +953,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1010,7 +1022,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1027,6 +1039,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1145,7 +1158,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,7 +1182,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1281,60 +1294,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1342,32 +1356,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h index 543e74fad7..8e08d92d87 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f index 785453cfcf..3130417167 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and 
summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f index fc8effb6b2..6346c8cc25 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER 
IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f index 66966ada1a..6fb79f6e5d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.inc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture 
flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but
probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=<comma-separated list of SM numbers, e.g. 70,80>) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc +++ 
b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + 
except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( 
ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index e04a2da479..232a901b2a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ 
b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,71 +38,24 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: download model from https://madgraph.mi.infn.it/Downloads/models/heft.tgz to the following directory: /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models  ---2026-03-10 10:38:21-- https://madgraph.mi.infn.it/Downloads/models/heft.tgz -Resolving madgraph.mi.infn.it (madgraph.mi.infn.it)... 192.135.21.75 -Connecting to madgraph.mi.infn.it (madgraph.mi.infn.it)|192.135.21.75|:443... connected. -HTTP request sent, awaiting response... 200 OK -Length: 50876 (50K) [application/x-gzip] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... ......... 
100% 2.92M=0.02s - -2026-03-10 10:38:22 (2.92 MB/s) - ‘tmp.tgz’ saved [50876/50876] - -heft/ -heft/write_param_card.py -heft/restrict_ckm.dat -heft/couplings.py -heft/HEFT_UFO.log -heft/lorentz.py -heft/__init__.py -heft/__pycache__/ -heft/particles.py -heft/object_library.py -heft/restrict_default.dat -heft/restrict_zeromass_ckm.dat -heft/restrict_no_b_mass.dat -heft/function_library.py -heft/parameters.py -heft/py3_model.pkl -heft/coupling_orders.py -heft/restrict_no_tau_mass.dat -heft/vertices.py -heft/restrict_no_masses.dat -heft/__pycache__/write_param_card.cpython-311.pyc -heft/__pycache__/parameters.cpython-311.pyc -heft/__pycache__/function_library.cpython-311.pyc -heft/__pycache__/coupling_orders.cpython-311.pyc -heft/__pycache__/object_library.cpython-311.pyc -heft/__pycache__/couplings.cpython-311.pyc -heft/__pycache__/particles.cpython-311.pyc -heft/__pycache__/vertices.cpython-311.pyc -heft/__pycache__/lorentz.cpython-311.pyc -heft/__pycache__/__init__.cpython-311.pyc -INFO: reload from .py file -INFO: load particles -INFO: load vertices -WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.007684946060180664  INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -168,13 +121,13 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.003 s +1 processes with 4 diagrams generated in 0.004 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -183,34 +136,34 @@ INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 
'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. -Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. 
+Generated helas calls for 1 subprocesses (4 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.143 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
quit -real 0m1.669s -user 0m0.522s -sys 0m0.180s -Code generation completed in 2 seconds +real 0m0.486s +user 0m0.429s +sys 0m0.050s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel 
for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc index 7a1f85c7cc..8f5fdbdfe1 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc @@ -949,38 +949,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1006,7 +1018,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1023,6 +1035,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1141,7 +1154,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1165,7 +1178,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1277,60 +1290,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1338,32 +1352,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index 543e74fad7..8e08d92d87 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! 
per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, 
*pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number 
Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + 
typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 5067c06ff1..dd7b11482d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. 
Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +55,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003014802932739258  +DEBUG: model prefixing takes 0.003126382827758789  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +179,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.056 s +4 processes with 8 diagrams generated in 0.080 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. 
@@ -220,7 +221,7 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.331 s +12 processes with 144 diagrams generated in 0.478 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -231,10 +232,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --v INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -268,9 
+269,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -279,9 +280,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -290,9 +291,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -301,9 +302,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -312,9 +313,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -323,9 +324,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -334,9 +335,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -345,11 +346,11 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.104 s -Wrote files for 212 helas calls in 2.138 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.174 s +Wrote files for 212 helas calls in 0.602 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines @@ -359,7 +360,7 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.122 s +ALOHA: aloha creates 6 routines in 0.154 s FFV1 FFV1 FFV1 @@ -367,32 +368,32 @@ ALOHA: aloha creates 6 routines in 0.122 s 
FFV2 FFV2 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m8.122s -user 0m2.522s -sys 0m1.075s -Code generation completed in 8 seconds +real 0m4.432s +user 0m3.751s +sys 0m0.613s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -413,10 +414,10 @@ Code generation completed in 8 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -443,10 +444,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat index 3f652ded8d..981120a965 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/nobm_pp_ttW.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 9d43997b76..1db6dc2843 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -966,38 +966,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1023,7 +1035,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1040,6 +1052,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1158,7 +1171,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1182,7 +1195,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1294,60 +1307,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1355,32 +1369,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h index 53f417c646..56d598b7a9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f index 16d9b1bce8..1ed73392bb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C 
Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f index 983025466d..959af9abb8 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f @@ -344,6 +344,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: 
igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -354,6 +357,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -460,7 +464,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -533,19 +537,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=48) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -615,7 +621,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -639,7 +645,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -647,7 +653,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f index 97ed635786..8ea7cbd981 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -373,8 +373,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 83d25c8021..3734c48717 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -966,38 +966,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement 
(fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1023,7 +1035,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1040,6 +1052,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1158,7 +1171,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1182,7 +1195,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1294,60 +1307,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1355,32 +1369,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h index 3ac92dd2c9..913c0dfeed 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const 
int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f index 37f83693d3..f4a45af3d4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f index 2224f52ad1..d2d45ddbaa 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f @@ -344,6 +344,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -354,6 +357,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -460,7 +464,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -533,19 +537,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=48) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -615,7 +621,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -639,7 +645,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -647,7 +653,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f index 1496eebe35..72232f43be 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -373,8 +373,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc index 152beb1322..b516bc3dd1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) 
unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1354,7 +1367,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1391,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1503,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1565,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h index 20f8a6d2b4..1335e38061 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f index af77031e76..c9599a9732 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f index a566870b6b..5681d8535f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f @@ -344,6 +344,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -354,6 +357,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -460,7 +464,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -533,19 +537,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -615,7 +621,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -639,7 +645,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -647,7 +653,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f index 0f5afbd521..bf4575a44d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc index 8f0bfc615c..3bb4f68f77 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) 
unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1354,7 +1367,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1391,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1503,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1565,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h index e94d034748..27f4d1c5c2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const 
int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f index 633c2bda2a..461b2a1f4c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f index 7fda166f5a..f72ed6255e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f @@ -343,6 +343,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -353,6 +356,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -457,7 +461,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -530,19 +534,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -612,7 +618,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -636,7 +642,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -644,7 +650,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f index 8d05da36d4..60736d40f4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc index 209e073d74..f585a5805c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) 
unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1354,7 +1367,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1391,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1503,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1565,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h index a83896951d..b11b67d795 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f index df3b5e689b..345754eb7e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f index 5a48f895c3..0caf0301e3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f @@ -343,6 +343,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -353,6 +356,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -457,7 +461,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -530,19 +534,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -612,7 +618,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -636,7 +642,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -644,7 +650,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f index cb4090e743..8befd86e93 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc index f63f49b5fd..a1b0193992 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) 
unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1354,7 +1367,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1391,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1503,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1565,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h index eadff47f18..96aee249a6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const 
int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f index 8a448d0444..ab11a90fc1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f index e2759d19f6..b5b15b7c5c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f @@ -343,6 +343,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -353,6 +356,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -457,7 +461,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -530,19 +534,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -612,7 +618,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -636,7 +642,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -644,7 +650,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f index bf1d47c73c..1d55f3f5b6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc index b97e46ece1..0ab1f3a681 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) 
unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1354,7 +1367,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1391,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1503,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1565,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index 1642721bee..c3531e18ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f index a0091febb6..57090d058f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f index 92e84c1147..b2737ec3fa 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f @@ -343,6 +343,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -353,6 +356,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -457,7 +461,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -530,19 +534,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -612,7 +618,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -636,7 +642,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -644,7 +650,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f index e194b5f639..d475dc7829 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index b6bdeb9a02..99663e5339 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -1162,38 +1162,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event 
(CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1219,7 +1231,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1236,6 +1248,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1354,7 +1367,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1378,7 +1391,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1490,60 +1503,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1551,32 +1565,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h index 3e7ccff73e..8cfc26cf49 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -188,6 +189,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f index 369bf6cdf6..3b09bc2ba0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f @@ -792,8 +792,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -894,9 +893,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1206,7 +1204,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1221,10 +1219,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1236,7 +1237,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f index 75c9ced543..41ca9266c1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f @@ -344,6 +344,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -354,6 +357,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -460,7 +464,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -533,19 +537,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=96) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -615,7 +621,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -639,7 +645,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -647,7 +653,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fcheck_sa.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fcheck_sa.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f index 164ddfda7d..795d4cc364 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f @@ -382,7 +382,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -421,8 +421,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA 
architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA
selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc 
bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.cc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' 
,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = 
resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), 
aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir 
$(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index a8e3a6d67a..7a4d32d322 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. 
Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +55,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004999399185180664  +DEBUG: model prefixing takes 0.0036203861236572266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,7 +166,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.015 s +5 processes with 7 diagrams generated in 0.034 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
@@ -205,7 +206,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.070 s +13 processes with 76 diagrams generated in 0.141 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -371,7 +372,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 0.941 s +65 processes with 1119 diagrams generated in 1.456 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -382,10 +383,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vec INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -496,9 +497,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 
50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 
65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -507,9 +508,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 
23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -518,9 +519,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -529,9 +530,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -540,9 +541,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -551,9 +552,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -562,9 +563,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -573,9 +574,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -584,9 +585,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -595,9 +596,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -606,9 +607,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -617,9 +618,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -628,9 +629,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -639,9 +640,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -650,9 +651,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -661,9 +662,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -672,9 +673,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -683,25 +684,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1748]  -Generated helas calls for 18 subprocesses (372 diagrams) in 0.671 s -Wrote files for 810 helas calls in 5.590 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1749]  +Generated helas calls for 18 subprocesses (372 diagrams) in 0.958 s +Wrote files for 810 helas calls in 2.060 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.216 s +ALOHA: aloha creates 5 routines in 0.223 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates 
VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.194 s +ALOHA: aloha creates 10 routines in 0.209 s VVV1 VVV1 FFV1 @@ -714,32 +715,32 @@ ALOHA: aloha creates 10 routines in 0.194 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m15.089s -user 0m5.988s -sys 0m1.827s -Code generation completed in 16 seconds +real 0m9.575s +user 0m8.442s +sys 0m1.003s +Code generation completed in 10 seconds ************************************************************ * * * W E L C O M E to * @@ -760,10 +761,10 @@ Code generation completed in 16 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -790,10 +791,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index fa1bcf88f4..86d647aa4d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel 
for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 8b330d85d5..eb6bb3cfbf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -939,38 +939,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -996,7 +1008,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1013,6 +1025,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1131,7 +1144,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, 
gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1155,7 +1168,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1267,60 +1280,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1328,32 +1342,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 1aaf72997b..f67a329ee7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f index e5f47166fb..690f872c32 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed 
C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f index 0d129ab296..a582ad2fd4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) 
+ COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! 
quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f index 2d0cc3a394..f01adf2ab1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index bd9ec082ce..2781752348 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -916,38 +916,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == 
ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -973,7 +985,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -990,6 +1002,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1108,7 +1121,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1132,7 +1145,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1244,60 +1257,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1305,32 +1319,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index a96df4e864..e822c4f778 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, 
// input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f index ae9439cf9e..b34048e29d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f index c155307e43..0600c671ce 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f index ccb869545a..1e6c927bfb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f @@ -289,7 +289,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -330,8 +330,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 0726e0a6ea..85622f37c6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1156,38 +1156,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( 
allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1213,7 +1225,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1230,6 +1242,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1348,7 +1361,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1372,7 +1385,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1484,60 +1497,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1545,32 +1559,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 5c057176f6..6ad3c7dd1e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, 
// input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index ebf5273614..6dfa640d9e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index c32cb4d43c..77820f0e51 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 6724cffa4b..7388a4bf7e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -302,7 +302,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -340,8 +340,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 92c74d5c62..42a1ee15b1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( 
allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1186,7 +1199,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1223,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1335,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1397,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index ebc491b00d..ab9d7dde82 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, 
// input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 4595d5a38e..37932e73a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 0f523f574b..f252c024f6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index a06e72a3c3..613dbb7f66 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -305,7 +305,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -346,8 +346,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 77d9edb7b2..b8f320369d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = 
gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1186,7 +1199,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1223,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1335,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1397,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 2c3a739550..55c42cb947 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e239a05794..748758b702 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 7240e416ab..bb34349714 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index a162af362e..22a6b8c5b9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -305,7 +305,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -346,8 +346,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 888768ef3b..eddfb835fb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -994,38 +994,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = 
gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1051,7 +1063,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1068,6 +1080,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1186,7 +1199,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1210,7 +1223,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1322,60 +1335,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1383,32 +1397,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index 01180e3e92..f776ee3de7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index b15c35131c..01735be0d3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index 95e3e81bc6..632d791617 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=32) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f index cb7efdfbcf..70c3d08b67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 16e908ba11..0ec6e93020 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -305,7 +305,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -346,8 +346,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index f1617232e3..0c082e4dff 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -3084,38 +3084,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = 
gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -3141,7 +3153,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -3158,6 +3170,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -3276,7 +3289,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3300,7 +3313,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -3412,60 +3425,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -3473,32 +3487,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index 363ab0b79d..f51b7656c4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index 1108637c49..e7f590a087 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 02c9412706..4705c638be 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index 51476eb7fa..b4b9172028 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -372,8 +372,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 7e011c2c62..b6939136f8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -1491,38 +1491,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = 
gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1548,7 +1560,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1565,6 +1577,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1683,7 +1696,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1707,7 +1720,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1819,60 +1832,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1880,32 +1894,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index eb46a03db6..1f9c0ec433 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index 0f260565e3..5456c9a1d1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index acc21004ae..263e3be1b2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -352,6 +352,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -362,6 +365,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -476,7 +480,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -549,19 +553,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -631,7 +637,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -655,7 +661,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -663,7 +669,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index d46d392b1f..3816770328 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 20e3623198..49f71ba3c3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -1491,38 +1491,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = 
gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1548,7 +1560,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1565,6 +1577,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1683,7 +1696,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1707,7 +1720,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1819,60 +1832,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1880,32 +1894,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 516900ab3b..916fafcf3e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index 0ae010df69..3edd289da8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 2ed82fafaa..16d795c6a6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index ea575a9bc3..c10cd1e6e8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index 1ba94ad37f..c8ba603c5f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -1491,38 +1491,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId 
= gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1548,7 +1560,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1565,6 +1577,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1683,7 +1696,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1707,7 +1720,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1819,60 +1832,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1880,32 +1894,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index bcc9e9d736..067e81bad8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index 236f6d16a9..c858b2c684 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index dcf20fe396..1a6277d156 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -356,6 +356,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -367,6 +370,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -487,7 +491,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -560,19 +564,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -642,7 +648,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -666,7 +672,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -674,7 +680,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index a780b1f4fa..8fdfbc4513 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index 7665fa9af8..b084c58dc7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -1072,38 +1072,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId 
= gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1129,7 +1141,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1146,6 +1158,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1264,7 +1277,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1288,7 +1301,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1400,60 +1413,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1461,32 +1475,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 553048dc11..650bd18517 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -168,6 +168,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -192,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index 956dc07485..37ff46da63 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -796,8 +796,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -898,9 +897,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1210,7 +1208,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1225,10 +1223,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1240,7 +1241,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index 9bc73e492f..284cc76158 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -368,6 +368,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -380,6 +383,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -514,7 +518,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -587,19 +591,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -669,7 +675,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -693,7 +699,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -701,7 +707,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f index 559059580c..572c6ced56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f @@ -339,7 +339,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -382,8 +382,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index a7fde33970..f0464ec557 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -1078,38 +1078,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId 
= gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1135,7 +1147,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1152,6 +1164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1270,7 +1283,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1294,7 +1307,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1406,60 +1419,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1467,32 +1481,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index b187f2ebf3..0d2dd21169 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -198,6 +199,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index 9c2c20435d..bfc32bbd25 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -802,8 +802,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -904,9 +903,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1216,7 +1214,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1231,10 +1229,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1246,7 +1247,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index bef5d7dd9f..33cf19f705 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -400,6 +400,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -412,6 +415,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -586,7 +590,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -659,19 +663,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -741,7 +747,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -765,7 +771,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -773,7 +779,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 56a2755163..65520b0758 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -345,7 +345,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -394,8 +394,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index a299144ca6..f0b0a896cb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -1182,38 +1182,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId 
= gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1239,7 +1251,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1256,6 +1268,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1374,7 +1387,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1398,7 +1411,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1510,60 +1523,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1571,32 +1585,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 98e755a489..70826b49e7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index bed31f9d2f..39a81a621a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index 9c2eb40089..d76f34423b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' !
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8d7c00bfcd..0218e57040 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index edaf7372cc..83783088e4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -1078,38 +1078,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId 
= gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1135,7 +1147,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1152,6 +1164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1270,7 +1283,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1294,7 +1307,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1406,60 +1419,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1467,32 +1481,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 0c551f2f4d..3c73ffcdae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -198,6 +199,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index 48de6ee6aa..fb7d3f331a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -802,8 +802,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -904,9 +903,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1216,7 +1214,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1231,10 +1229,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1246,7 +1247,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 018c1a985b..19450cafaf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -400,6 +400,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -412,6 +415,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -586,7 +590,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -659,19 +663,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' !
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -741,7 +747,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -765,7 +771,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -773,7 +779,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index 440f838b87..2c2555366f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -345,7 +345,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -394,8 +394,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 57a20afa9c..e8ccccc22d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -1491,38 +1491,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int 
channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1548,7 +1560,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1565,6 +1577,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1683,7 +1696,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1707,7 +1720,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1819,60 +1832,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1880,32 +1894,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 3290858ea0..977c1f0143 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 1b37ae6930..0441af0818 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index e72dc0ca8c..cdb3a6377b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f index bc51e47c27..ae3e89ba33 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 4a0583759f..a4dd4e74fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -1182,38 +1182,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int 
channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1239,7 +1251,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1256,6 +1268,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1374,7 +1387,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1398,7 +1411,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1510,60 +1523,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1571,32 +1585,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 880e2dace8..49758d2918 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* 
allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index d51e86247a..016741f374 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index c8106d783a..0a2a87d5d5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index ae0a828447..3dcb0ae4ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 8e34c58b00..9e3d0186a4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -1072,38 +1072,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) 
unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1129,7 +1141,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1146,6 +1158,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1264,7 +1277,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1288,7 +1301,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1400,60 +1413,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1461,32 +1475,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 314d5b2955..6e7d0b1d10 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -168,6 +168,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -192,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index 8991a26bd9..f93d884900 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -796,8 +796,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -898,9 +897,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1210,7 +1208,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1225,10 +1223,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1240,7 +1241,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index 5e6645a738..b507ecd05b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -368,6 +368,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -380,6 +383,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -514,7 +518,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -587,19 +591,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -669,7 +675,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -693,7 +699,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -701,7 +707,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index ef2d0fcb85..0eba207bd8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -339,7 +339,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -382,8 +382,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index b6b3dab286..70e782fbbb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -1182,38 +1182,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) 
unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1239,7 +1251,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1256,6 +1268,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1374,7 +1387,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1398,7 +1411,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, 
allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1510,60 +1523,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < 
( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1571,32 +1585,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( 
targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 89c57825a9..6e4939c539 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -190,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + 
const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index 8d5a646679..e28b5f2e76 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -794,8 +794,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -896,9 +895,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1208,7 +1206,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1223,10 +1221,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1238,7 +1239,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index 7d08f78919..f01a8215a1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -360,6 +360,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -372,6 +375,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -498,7 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -571,19 +575,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -653,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -677,7 +683,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -685,7 +691,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f index 85463860ad..c8a51154a0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f @@ -337,7 +337,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -378,8 +378,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc @@ -91,6 
+91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + 
except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // 
RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection 
UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 9a1af87664..a11d2de469 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,22 +38,23 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. 
Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -70,7 +71,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 
converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.06830000877380371  +DEBUG: model prefixing takes 0.04707622528076172  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -85,7 +86,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.021 s +1 processes with 72 diagrams generated in 2.609 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -96,10 +97,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False - INFO: initialize a new 
directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -111,25 +112,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 
64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.097 s -Wrote files for 119 helas calls in 0.474 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 
63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.124 s +Wrote files for 119 helas calls in 0.265 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.204 s +ALOHA: aloha creates 5 routines in 0.209 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.193 s +ALOHA: aloha creates 10 routines in 0.190 s VVV5 VVV5 FFV1 @@ -139,32 +140,32 @@ ALOHA: aloha creates 10 routines in 0.193 s VVVV1 VVVV9 VVVV10 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. 
+Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m7.520s -user 0m3.917s -sys 0m0.620s -Code generation completed in 8 seconds +real 0m5.754s +user 0m5.380s +sys 0m0.307s +Code generation completed in 6 seconds ************************************************************ * * * W E L C O M E to * @@ -185,10 +186,10 @@ Code generation completed in 8 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards run @@ -215,10 +216,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat index 5e08560167..9f01c208e8 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git 
a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph 
before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc index 0d7fe2e5ae..8d80903b40 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc @@ -2017,38 +2017,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -2074,7 +2086,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -2091,6 +2103,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -2209,7 +2222,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2233,7 +2246,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -2345,60 +2358,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -2406,32 +2420,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h index 87d1743da6..aa52499cf0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f index e9f856aa23..b68b2dd12c 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 
reweight, 3 finalize C Output: C Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! 
matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f index 7f0900eb3e..731770fcdf 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! 
defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=64) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! 
do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! 
do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f index 0f8b03e464..841be0ffef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f @@ -334,7 +334,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -372,8 +372,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.inc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # 
NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD 
BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, 
channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array 
for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' 
,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + 
cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined 
MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index c1a6a8c137..d4f6b71098 100644 --- 
a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,57 +38,23 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models  ---2026-03-10 10:39:42-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz -Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 -Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. -HTTP request sent, awaiting response... 200 Ok -Length: 80562 (79K) [application/x-tar] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... .......... 63% 832K 0s - 50K .......... .......... ........ 
100% 70.5M=0.06s - -2026-03-10 10:39:43 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] - -SMEFTsim_topU3l_MwScheme_UFO/ -SMEFTsim_topU3l_MwScheme_UFO/__init__.py -SMEFTsim_topU3l_MwScheme_UFO/param_card_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/CT_couplings.py -SMEFTsim_topU3l_MwScheme_UFO/particles.py -SMEFTsim_topU3l_MwScheme_UFO/write_param_card.py -SMEFTsim_topU3l_MwScheme_UFO/decays.py -SMEFTsim_topU3l_MwScheme_UFO/parameters.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/object_library.py -SMEFTsim_topU3l_MwScheme_UFO/coupling_orders.py -SMEFTsim_topU3l_MwScheme_UFO/version.info -SMEFTsim_topU3l_MwScheme_UFO/function_library.py -SMEFTsim_topU3l_MwScheme_UFO/couplings.py -SMEFTsim_topU3l_MwScheme_UFO/propagators.py -SMEFTsim_topU3l_MwScheme_UFO/lorentz.py -SMEFTsim_topU3l_MwScheme_UFO/vertices.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat -fail to load model but auto_convert_model is on True. Trying to convert the model -convert model /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO -retry the load of the model +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -105,7 +71,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.06466126441955566  +DEBUG: 
model prefixing takes 0.05528879165649414  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -114,22 +80,19 @@ Defined multiparticle l- = e- mu- Defined multiparticle vl = ve vm vt Defined multiparticle vl~ = ve~ vm~ vt~ Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ -INFO: Change particles name to pass to MG5 convention -Kept definitions of multiparticles p / j / l+ / l- / vl / vl~ unchanged -Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ generate g g > t t~ t t~ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ 
WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.072 s +1 processes with 72 diagrams generated in 2.631 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -138,18 +101,18 @@ INFO: Processing color information for process: g g > t t~ t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 
'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.094 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
+Generated helas calls for 1 subprocesses (72 diagrams) in 0.125 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.194 s +ALOHA: aloha creates 5 routines in 0.224 s VVV5 VVV5 FFV1 @@ -159,17 +122,17 @@ ALOHA: aloha creates 5 routines in 0.194 s VVVV1 VVVV9 VVVV10 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m4.177s -user 0m2.874s -sys 0m0.228s +real 0m3.755s +user 0m3.653s +sys 0m0.071s Code generation completed in 4 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, 
int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph 
before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc index dc1d2ecd53..b7fd59ff8d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -1965,38 +1965,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -2022,7 +2034,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -2039,6 +2051,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: 
allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -2157,7 +2170,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2181,7 +2194,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -2293,60 +2306,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -2354,32 +2368,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h index 87d1743da6..aa52499cf0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f index 6a66bac979..b60ff6b550 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, 
mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity 
selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) 
) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index e0e58acbf4..55b4cd592c 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. 
Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,7 +548,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.055 s +1 processes with 6 diagrams generated in 0.090 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -558,10 +559,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False -- INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -573,52 +574,52 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s -Wrote files for 16 helas calls in 0.279 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s +Wrote files for 16 helas calls in 0.060 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.114 s +ALOHA: aloha creates 3 routines in 0.089 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.120 s +ALOHA: aloha creates 6 routines in 0.092 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m5.502s -user 0m1.722s -sys 0m0.643s -Code generation completed in 6 seconds +real 0m2.497s +user 0m2.197s +sys 0m0.282s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -639,10 +640,10 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -669,10 +670,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat index ee7d1277ff..f07e5631fd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git 
a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph 
before computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc index 7aef93970a..a239d19d92 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc @@ -962,38 +962,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1019,7 +1031,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1036,6 +1048,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1154,7 +1167,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, 
colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1178,7 +1191,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1290,60 +1303,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1351,32 +1365,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h index 293c26a2e9..f5d3042d1a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f index c8bb469792..fe7a4274ea 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C 
Output: C Amplitude squared and summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f index bdf00312dc..5e894db7e8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: 
igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=4) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f index c5dcf87c06..2c3622336c 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f @@ -274,7 +274,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -312,8 +312,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.inc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA 
architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA
selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc 
bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc +++ 
b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' 
,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = 
resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int 
bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 0ee162c616..245a1c8d30 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ 
b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,13 +548,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.055 s +1 processes with 6 diagrams generated in 0.087 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -562,32 +563,32 @@ INFO: Processing color information for process: g g > t1 t1~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 
'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
-Generated helas calls for 1 subprocesses (6 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.113 s +ALOHA: aloha creates 3 routines in 0.099 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.441s -user 0m0.724s -sys 0m0.134s +real 0m1.068s +user 0m0.985s +sys 0m0.076s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, 
int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc index c5cac709d7..e6ede120a7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc @@ -963,38 +963,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -1020,7 +1032,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1037,6 +1049,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: 
allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1155,7 +1168,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1179,7 +1192,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1291,60 +1304,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1352,32 +1366,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index 293c26a2e9..f5d3042d1a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! 
not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, 
selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - 
RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) 
); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 88e01c7e57..6154e42325 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. 
Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,15 +38,16 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,7 +548,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.052 s +1 processes with 3 diagrams generated in 0.084 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -558,10 +559,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -573,49 +574,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.273 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.055 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.079 s +ALOHA: aloha creates 4 routines in 0.067 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +Output to directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m5.086s -user 0m1.635s -sys 0m0.704s -Code generation completed in 5 seconds +real 0m2.530s +user 0m2.212s +sys 0m0.292s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -636,10 +637,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt treatcards run @@ -666,10 +667,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/.mg5/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 712b1897aa..db7e3616c4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat index 3a6928f635..3c7c799a87 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-8-gf0884cb7d HEAD * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f index e986b059a9..47699fa614 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f @@ -2022,21 +2022,6 @@ subroutine orxxxx(p,rmass,nhel,nsr , ro) end - complex*16 function THETA_FUNCTIONR(cond, out_true, out_false) - - double precision cond - double precision out_true, out_false - - if (cond.ge.0d0) then - THETA_FUNCTIONR = out_true - else - THETA_FUNCTIONR = out_false - endif - - return - - - end complex*16 function THETA_FUNCTION(cond, out_true, out_false) double precision cond diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h 
b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable 
multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b575475690..353c6e5f48 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -939,38 +939,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? 
could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
+ iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -996,7 +1008,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1013,6 +1025,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1131,7 +1144,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, 
gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1155,7 +1168,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1267,60 +1280,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1328,32 +1342,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 732f9919c9..58e1bfe668 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index 7f809ad0ff..6c6b37db2c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -791,8 +791,7 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C Input: C pp 4 momentum of external particles C wgt weight from Monte Carlo -C imode 0 run, 1 init, 2 reweight, 3 finalize, 4: PDF only, 5: ME -C only +C imode 0 run, 1 init, 2 reweight, 3 finalize C Output: C Amplitude squared and 
summed C **************************************************** @@ -893,9 +892,8 @@ FUNCTION DSIGPROC(PP,ICONF,IPROC,IMIRROR,SYMCONF,CONFSUB,WGT C endif C set the running scale -C and update the couplings accordingly (but deactivate for -C discrete sampler(imode=5) and - IF (VECSIZE_MEMMAX.LE.1.AND.IMODE.NE.5) THEN ! no-vector (NB not VECSIZE_USED!) +C and update the couplings accordingly + IF (VECSIZE_MEMMAX.LE.1) THEN ! no-vector (NB not VECSIZE_USED!) CALL UPDATE_SCALE_COUPLING(PP, WGT) ENDIF @@ -1205,7 +1203,7 @@ INTEGER FUNCTION GET_NHEL(HEL,PARTID) END - SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) + SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL, IVEC) IMPLICIT NONE INCLUDE 'nexternal.inc' INCLUDE 'maxamps.inc' ! for the definition of maxflow @@ -1220,10 +1218,13 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION JAMP2(0:MAXFLOW) INTEGER ICONFIG ! amplitude selected INTEGER IPROC ! matrix element selected + INTEGER IVEC C C argument OUT C INTEGER ICOL + INTEGER IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C local C @@ -1235,7 +1236,15 @@ SUBROUTINE SELECT_COLOR(RCOL, JAMP2, ICONFIG, IPROC, ICOL) DOUBLE PRECISION XTARGET IF (ICKKW.GT.0) THEN - ICONFIG = IGRAPHS(1) + IF (IVEC.EQ.0) THEN + ICONFIG = IGRAPHS(1) + ELSE + ICONFIG = VEC_IGRAPH(IVEC) + IF(ICONFIG.EQ.0)THEN + ICOL =0 + RETURN + ENDIF + ENDIF ENDIF diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index a68aa6e4c0..7cf597b197 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -337,6 +337,9 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, DOUBLE PRECISION P1(0:3, NEXTERNAL) INTEGER IVEC, CURR_WARP, IWARP, NB_WARP_USED INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER 
IGRAPH(VECSIZE_MEMMAX) + COMMON/VEC_IGRAPH/IGRAPH C C DATA C @@ -347,6 +350,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C ---------- SELECTED_HEL(:) = 0 SELECTED_COL(:) = 0 + IGRAPH(:) = 0 IF(IMODE.EQ.1)THEN NFACT = DSIG1(ALL_PP(0,1,1), ALL_WGT(1), IMODE) @@ -443,7 +447,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, ENDDO ! end loop on IWARP/IVEC ENDDO ! end loop on the CURR_WARP CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) DO CURR_WARP=1, NB_WARP_USED @@ -516,19 +520,21 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ IGRAPH, OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) IMPLICIT NONE INCLUDE 'nexternal.inc' - INCLUDE '../../Source/vector.inc' ! defines VECSIZE_MEMMAX INCLUDE 'maxamps.inc' + INCLUDE 'cluster.inc' ! for IGRAPHS common block (MLM per-event color selection); also defines VECSIZE_MEMMAX via vector.inc INTEGER NCOMB PARAMETER ( NCOMB=16) DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_MEMMAX) DOUBLE PRECISION HEL_RAND(VECSIZE_MEMMAX) DOUBLE PRECISION COL_RAND(VECSIZE_MEMMAX) INTEGER CHANNELS(VECSIZE_MEMMAX) +C Per-event MLM graph: igraphs(1) from REWGT (0 = no MLM) + INTEGER IGRAPH(VECSIZE_MEMMAX) DOUBLE PRECISION OUT(VECSIZE_MEMMAX) INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) @@ -598,7 +604,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461) CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! 
multi channel disabled for helicity filtering - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities FIRST = .FALSE. C ! This is a workaround for @@ -622,7 +628,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 IF ( .NOT. MULTI_CHANNEL ) THEN CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled - & P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2, + & P_MULTI, ALL_G, HEL_RAND, COL_RAND, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ELSE IF( SDE_STRAT.NE.1 ) THEN @@ -630,7 +636,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNELS, STOP ENDIF CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled - & HEL_RAND, COL_RAND, CHANNELS, OUT2, + & HEL_RAND, COL_RAND, CHANNELS, IGRAPH, OUT2, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index ef4145fa88..52a516cda9 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -286,7 +286,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF ANS=ANS/DBLE(IDEN) - CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL) + CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1, ICOL, IVEC) END @@ -324,8 +324,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) LOGICAL CHOSEN_SO_CONFIGS(NSQAMPSO) DATA CHOSEN_SO_CONFIGS/.TRUE./ SAVE CHOSEN_SO_CONFIGS - DOUBLE PRECISION BWCUTOFF - COMMON/TO_BWCUTOFF/ BWCUTOFF C C ARGUMENTS C diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f index d6cded9a2d..593c620d9b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f @@ -111,7 +111,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, if (btest(mlevel,3)) then write(*,*)'unwgt.f: write out diagram ',igraphs(1) endif - lconfig = vec_igraph1(ivec) + lconfig = vec_igraph(ivec) endif is_LC=.true. 
maxcolor=0 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.inc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.inc index 8ddf5bee13..940c25eac0 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.inc @@ -43,5 +43,5 @@ c parameters for sudakovs integer iipdg,iimode common/gamma_args/Q1,iipdg,iimode - integer vec_igraph1(VECSIZE_MEMMAX) - common/vec_igraph/vec_igraph1 + integer vec_igraph(VECSIZE_MEMMAX) + common/vec_igraph/vec_igraph diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture 
flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but
probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE='comma-separated list of compute capabilities') + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc +++ 
b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 
1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk index 6cb56d0409..348c283be7 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk @@ -58,10 +58,7 @@ $(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp -libcollier.$(dylibext): - ln -s $(LIBDIR)/collier_lib/libcollier.$(dylibext) || echo 'already done' - -gensym: $(SYMMETRY) configs.inc $(LIBS) libcollier.$(dylibext) +gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f index bd02dfe2b4..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f @@ -139,7 +139,7 @@ logical function cut_bw(p) $ gForceBW(i,iconfig).eq.1)) if(onshell)then c Remove on-shell forbidden s-channels (gForceBW=2) (JA 2/10/11) - if(gForceBW(i,iconfig).eq.2.and.sde_strat.eq.1) then + if(gForceBW(i,iconfig).eq.2) then cut_bw = .true. 
return endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f index 353e025d71..8e4672a421 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f @@ -1416,6 +1416,7 @@ double precision function rewgt(p, ivec) rewgt=1.0d0 clustered=.false. + vec_igraph(ivec) = 0 ! default: no MLM graph selected for this event if(ickkw.le.0.and..not.use_syst) return @@ -1467,6 +1468,7 @@ double precision function rewgt(p, ivec) rewgt = 0d0 return endif + vec_igraph(ivec) = igraphs(1) ! save MLM-matched graph for this event c Store pdf information for systematics studies (initial) @@ -1592,10 +1594,6 @@ double precision function rewgt(p, ivec) c alpha_s weight if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then - if (q2now.le.4)then - rewgt=0d0 - return - endif rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1907,7 +1905,7 @@ subroutine update_scale_coupling_vec(all_p, all_wgt,all_q2fact, VECSIZE_USED) else all_q2fact(1,i) = q2fact(1) all_q2fact(2,i) = q2fact(2) - vec_igraph1(i) = igraphs(1) + vec_igraph(i) = igraphs(1) endif c call save_cl_val_to(i) c endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py index 74f6b04b68..c248436e7f 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py @@ -1004,8 +1004,6 @@ def __init__(self, finput=None, **opt): self.comments = {} # comment associated to parameters. can be display via help message # store the valid options for a given parameter. 
self.allowed_value = {} - # allow nickname for some parameter to avoid integer mapping for some var - self.shortcut_values = {} self.default_setup() @@ -1134,11 +1132,6 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): scan_targettype = self.scan_set[lower_name] del self.scan_set[lower_name] - # check if the user used a shortcut value (which are always str) - if lower_name in self.shortcut_values: - if isinstance(value,str) and value.strip().lower() in self.shortcut_values[lower_name]: - value = self.shortcut_values[lower_name][value.strip().lower()] - # 2. Find the type of the attribute that we want if lower_name in self.list_parameter: targettype = self.list_parameter[lower_name] @@ -1317,8 +1310,7 @@ def __setitem__(self, name, value, change_userdefine=False,raiseerror=False): def add_param(self, name, value, system=False, comment=False, typelist=None, - allowed=[], - shortcut={}): + allowed=[]): """add a default parameter to the class""" lower_name = name.lower() @@ -1353,11 +1345,6 @@ def add_param(self, name, value, system=False, comment=False, typelist=None, assert val in allowed or '*' in allowed else: assert value in allowed or '*' in allowed - if shortcut: - if allowed and shortcut and '*' not in allowed: - assert all([val in allowed for val in shortcut.values()]), "Some shortcut value are not in the allowed list" - assert all([isinstance(v, str) for v in shortcut.keys()]), "All shortcut values should be str" - self.shortcut_values[lower_name] = shortcut #elif isinstance(value, bool) and allowed != ['*']: # self.allowed_value[name] = [True, False] @@ -4186,10 +4173,8 @@ def default_setup(self): allowed=['partonshower'], comment="list of check that can be bypassed manually.") self.add_param("python_seed", -2, include=False, hidden=True, comment="controlling python seed [handling in particular the final unweighting].\n -1 means use default from random module.\n -2 means set to same value as iseed") self.add_param("lpp1", 1, 
fortran_name="lpp(1)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='first beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("lpp2", 1, fortran_name="lpp(2)", allowed=[-1,1,0,2,3,9,-2,-3,4,-4], - shortcut={'p':1,"p~":-1,'e-':3,'e+':-3,'mu-':4,'mu+':-4, 'no':0}, comment='second beam energy distribution:\n 0: fixed energy\n 1: PDF of proton\n -1: PDF of antiproton\n 2:elastic photon from proton, +/-3:PDF of electron/positron, +/-4:PDF of muon/antimuon, 9: PLUGIN MODE') self.add_param("ebeam1", 6500.0, fortran_name="ebeam(1)") self.add_param("ebeam2", 6500.0, fortran_name="ebeam(2)") @@ -4198,24 +4183,18 @@ def default_setup(self): self.add_param("polbeam2", 0.0, fortran_name="pb2", hidden=True, comment="Beam polarization from -100 (left-handed) to 100 (right-handed) --use lpp=0 for this parameter--") self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_proton2', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(2)", - shortcut={'lead':82}, comment='For heavy ion physics nb of proton in the ion (used for beam 2 if group_subprocess was False)') self.add_param('nb_neutron1', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(1)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (for both beam but if group_subprocess was False)') self.add_param('nb_neutron2', 0, hidden=True, allowed=[1,0, 126 , '*'],fortran_name="nb_neutron(2)", - shortcut={'lead':126}, comment='For heavy ion physics nb of neutron in the ion (of beam 2 if group_subprocess was False )') self.add_param('mass_ion1', -1.0, hidden=True, 
fortran_name="mass_ion(1)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 1)') self.add_param('mass_ion2', -1.0, hidden=True, fortran_name="mass_ion(2)", allowed=[-1,0, 0.938, 207.9766521*0.938, 0.000511, 0.105, '*'], - shortcut={'proton':0.938,'lead':207.9766521*0.938,'electron':0.000511,'muon':0.105}, comment='For heavy ion physics mass in GeV of the ion (of beam 2)') valid_pdf = ['lhapdf', 'cteq6_m','cteq6_l', 'cteq6l1','nn23lo', 'nn23lo1', 'nn23nlo','iww','eva','edff','chff','none','mixed']+\ sum(self.allowed_lep_densities.values(),[]) @@ -4228,14 +4207,12 @@ def default_setup(self): self.add_param("fixed_fac_scale1", False, hidden=True) self.add_param("fixed_fac_scale2", False, hidden=True) self.add_param("fixed_extra_scale", False, hidden=True) - self.add_param("scale", 91.1880, shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) - self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2", shortcut={'mz':91.1880, 'mh':125.0, 'mt':173.0, 'mtau':1.77686}) + self.add_param("scale", 91.1880) + self.add_param("dsqrt_q2fact1", 91.1880, fortran_name="sf1") + self.add_param("dsqrt_q2fact2", 91.1880, fortran_name="sf2") self.add_param("mue_ref_fixed", 91.1880, hidden=True) self.add_param("dynamical_scale_choice", -1, comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2\n '4' is the center of mass energy\n'0' allows to use the user_hook definition (need to be defined via custom_fct entry) ", - allowed=[-1,0,1,2,3,4,10], - shortcut={'ckkw':-1,'ht':2,'ht/2':3,'et':1,'shat':4}, - ) + allowed=[-1,0,1,2,3,4,10]) self.add_param("mue_over_ref", 1.0, 
hidden=True, comment='ratio mu_other/mu for dynamical scale') self.add_param("ievo_eva",0,hidden=True, allowed=[0,1],fortran_name="ievo_eva", comment='eva: 0 for EW pdf muf evolution by q^2; 1 for evo by pT^2') @@ -5598,10 +5575,8 @@ def default_setup(self): self.add_param('niters_fo', 6, include=False) #seed and collider self.add_param('iseed', 0) - self.add_param('lpp1', 1, fortran_name='lpp(1)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) - self.add_param('lpp2', 1, fortran_name='lpp(2)', - shortcut={'p':1, 'p~':-1, 'e-': 3, 'e+':-3, 'mu-':4, 'mu+':-4}) + self.add_param('lpp1', 1, fortran_name='lpp(1)') + self.add_param('lpp2', 1, fortran_name='lpp(2)') self.add_param('ebeam1', 6500.0, fortran_name='ebeam(1)') self.add_param('ebeam2', 6500.0, fortran_name='ebeam(2)') self.add_param('nb_proton1', 1, hidden=True, allowed=[1,0, 82 , '*'],fortran_name="nb_proton(1)", @@ -5644,15 +5619,13 @@ def default_setup(self): self.add_param('fixed_ren_scale', False) self.add_param('fixed_fac_scale', False) self.add_param('fixed_extra_scale', True, hidden=True, system=True) # set system since running from Ellis-Sexton scale not implemented - self.add_param('mur_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mur_ref_fixed', 91.118) self.add_param('muf1_ref_fixed', -1.0, hidden=True) - self.add_param('muf_ref_fixed', 91.118, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('muf_ref_fixed', 91.118) self.add_param('muf2_ref_fixed', -1.0, hidden=True) - self.add_param('mue_ref_fixed', 91.118, hidden=True, shortcut={'mz':91.118, 'mw':80.419, 'mt':172.5, 'mh':125.0}) + self.add_param('mue_ref_fixed', 91.118, hidden=True) self.add_param("dynamical_scale_choice", [-1],fortran_name='dyn_scale', - allowed = [-2,-1,0,1,2,3,10], - shortcut={ 'ht/2':3,'ht':2,'et':1}, - comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n 
'2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") + allowed = [-2,-1,0,1,2,3,10], comment="\'-1\' is based on CKKW back clustering (following feynman diagram).\n \'1\' is the sum of transverse energy.\n '2' is HT (sum of the transverse mass)\n '3' is HT/2, '0' allows to use the user_hook definition (need to be defined via custom_fct entry) ") self.add_param('fixed_qes_scale', False, hidden=True) self.add_param('qes_ref_fixed', -1.0, hidden=True) self.add_param('mur_over_ref', 1.0) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py index 6f82393c3f..3c5601e27d 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py @@ -5205,12 +5205,12 @@ def init_run(self, cards): if self.run_set: self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), - 'lpp': ([str],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), + 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), - 'fixed_scale': ([str],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), + 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card 
dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), 'pbp':([],['run_card lpp1 1', 'run_card lpp2 1','run_card nb_proton1 82', 'run_card nb_neutron1 126', 'run_card mass_ion1 195.0820996698','run_card nb_proton2 1', 'run_card nb_neutron2 0', 'run_card mass_ion1 -1']), @@ -5795,8 +5795,6 @@ def complete_set(self, text, line, begidx, endidx, formatting=True): allowed_for_run.remove('*') elif isinstance(self.run_card[args[-1]], bool): allowed_for_run = ['True', 'False'] - if args[-1].lower() in self.run_card.shortcut_values: - allowed_for_run += self.run_card.shortcut_values[args[-1].lower()] opts += [str(i) for i in allowed_for_run] diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + 
except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( 
ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 7142d5e27a..fd66e883f7 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ 
b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-8-gf0884cb7d HEAD * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -29,6 +28,7 @@ It has been validated for the last time with version: 3.6.5 * Type 'tutorial MadLoop' to learn how MadLoop works * * * ************************************************************ +load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt load MG5 configuration from input/mg5_configuration.txt fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). @@ -38,23 +38,21 @@ eMELA-config does not seem to correspond to a valid eMELA-config executable. Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). MG5_aMC> set eMELA /PATH/TO/eMELA-config +set ninja to /home/dmass/Apps/HEPTools/lib +set collier to /home/dmass/Apps/HEPTools/lib lhapdf-config does not seem to correspond to a valid lhapdf-config executable. Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 -INFO: load particles -INFO: load vertices -DEBUG: model prefixing takes 0.4310164451599121  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -550,13 +548,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.054 s +1 processes with 3 diagrams generated in 0.076 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -565,30 +563,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224] 
 DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
+INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.082 s +ALOHA: aloha creates 2 routines in 0.071 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m2.103s -user 0m1.223s -sys 0m0.178s -Code generation completed in 2 seconds +real 0m0.923s +user 0m0.829s +sys 0m0.081s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index 4e3f17e0dd..9cdf2f90d1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -125,7 +125,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? 
*/ - void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -143,7 +143,7 @@ namespace mg5amcCpu * @param selcol the pointer to the output selected colors * @param goodHelOnly quit after computing good helicities? */ - void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif // Return the number of good helicities (-1 initially when they have not yet @@ -343,6 +343,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -394,6 +395,7 @@ paramCard; #endif "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); #ifdef MGONGPUCPP_VERBOSE @@ -423,6 +425,7 @@ paramCard; #endif const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, 
int* selhel, int* selcol, @@ -454,6 +457,7 @@ paramCard; #endif "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; + m_pmek->setigraph( igraph ); m_pmek->computeMatrixElements( useChannelIds ); #ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc index 62e2c3af96..2d46db185e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -80,7 +80,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -90,7 +90,7 @@ namespace mg5amcCpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- @@ -139,7 +139,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = true; constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); return m_bridge.nGoodHel(); } @@ -149,7 +149,7 @@ namespace mg5amcGpu { constexpr bool goodHelOnly = false; const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..1e7dcf38fe 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -504,7 +504,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_igraph, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..9382732d9f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -46,6 +46,9 @@ namespace mg5amcCpu // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) virtual int computeGoodHelicities() = 0; + // Set the per-event MLM graph array (nullptr = no MLM matching; must be called before computeMatrixElements if needed) + void setigraph( const int* igraph ) { m_igraph = igraph; } + // Compute matrix elements virtual void computeMatrixElements( const bool useChannelIds ) = 0; @@ -84,6 +87,9 @@ namespace mg5amcCpu // The buffer for the channel ids for single-diagram enhancement const BufferChannelIds& m_channelIds; + // The per-event MLM graph array (nullptr = no MLM; set via setigraph before 
computeMatrixElements) + const int* m_igraph = nullptr; + // The buffer for the output matrix elements BufferMatrixElements& m_matrixElements; diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index e0e3bfd321..2fb5bea34b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -936,38 +936,50 @@ namespace mg5amcCpu select_col( int* allselcol, // output: color selection[nevt] const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); + // Per-event MLM graph (0 = no MLM) + const int igraph = ( allIgraph != nullptr ) ? allIgraph[ievt] : 0; // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( channelId != 0 || igraph != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) fptype_sv jamp2_sv[ncolor] = { 0 }; assert( allJamp2s != nullptr ); // sanity check using J2_ACCESS = DeviceAccessJamp2; for( int icolC = 0; icolC < ncolor; icolC++ ) jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph != 0 ) { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 + iconfig = igraph; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) + else { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } } fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] @@ -993,7 +1005,7 @@ namespace mg5amcCpu } else { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) } return; } @@ -1010,6 +1022,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: 
allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1128,7 +1141,7 @@ namespace mg5amcCpu gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, allIgraph, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1152,7 +1165,7 @@ namespace mg5amcCpu // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, allIgraph, mgOnGpu::icolamp, mgOnGpu::channel2iconfig #else #define _OMPLIST1 #endif @@ -1264,60 +1277,61 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // NB: with MLM, different events in a SIMD page may have different igraph values, so iconfig must be per-event + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int ievt = ievt00 + ieppV; + // Use per-event MLM graph if provided, otherwise use channel2iconfig + const int igraph1_ievt = ( allIgraph != nullptr ) ? 
allIgraph[ievt] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; + // Determine iconfig: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig; + if( igraph1_ievt != 0 ) + { + iconfig = igraph1_ievt; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + } + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + // NB: targetamp is a scalar fptype (not fptype_sv) - iconfig is per-event so we extract the scalar lane from jamp2_sv + fptype targetamp[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = fptype{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) #if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC][ieppV]; #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; #endif + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); if( okcol ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] @@ -1325,32 +1339,52 @@ namespace mg5amcCpu break; } } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if both channelId and igraph are 0 (see #931) + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; + const int ievt2 = ievt00 + ieppV + neppV; + const int igraph1_ievt2 = ( allIgraph != nullptr ) ? 
allIgraph[ievt2] : 0; // per-event igraph (may differ across SIMD page with MLM) + if( channelId != 0 || igraph1_ievt2 != 0 ) // no event-by-event choice of color if both channelId and igraph are 0 (fix FPE #783) + { + // Determine iconfig2: use per-event MLM graph if provided, otherwise use channel2iconfig + int iconfig2; + if( igraph1_ievt2 != 0 ) + { + iconfig2 = igraph1_ievt2; // use MLM-matched graph directly as iconfig (F-indexed, 1-based) + } + else + { + iconfig2 = mgOnGpu::channel2iconfig[channelId - 1]; // same channelId as for ievt (sanity checks already done above) + } + fptype targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig2 - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC][ieppV]; + } //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt2] < ( targetamp2[icolC] / targetamp2[ncolor - 1] ) ) { allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); break; } } -#endif } - } - else - { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + else { - const int ievt = ievt00 + ieppV; - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) -#endif } +#endif } #endif // multichannel enabled (random color choice) } diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 732f9919c9..58e1bfe668 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -187,6 +188,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const int* allIgraph, // input: per-event MLM graph (0 = no MLM); nullptr if no MLM #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f index f0220047d7..61be922c33 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f @@ -20,6 +20,7 @@ PROGRAM FCHECK_SA DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! 
not yet used DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 IGRAPH(NEVTMAX) ! per-event MLM graph (0 = no MLM) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision @@ -62,8 +63,9 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO + IGRAPH(:) = 0 ! no MLM graph matching in standalone check CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..4cf05f5642 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(shell echo '$(BACKEND)' >> $(BACKEND_LOG)) endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=<comma-separated list>) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc index 8b3f302975..fea35823f5 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc @@ -91,6 +91,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -102,11 +103,11 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, mes, selhel, selcol, 
*pgoodHelOnly ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, channelIds, igraph, mes, selhel, selcol, *pgoodHelOnly ); #endif } @@ -129,13 +130,14 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool* pgoodHelOnly ) { //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); - fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, igraph, mes, selhel, selcol, pgoodHelOnly ); } /** diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h index 7d5014a138..b3667b03fe 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h @@ -29,6 +29,7 @@ extern "C" const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -39,6 +40,7 @@ extern "C" const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, + const int* igraph, FORTRANFPTYPE* mes, int* selhel, int* selcol, @@ -46,4 +48,4 @@ extern "C" void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); } -#endif // _FBRIDGE_H_ \ No newline at end of file +#endif // _FBRIDGE_H_ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc index 5708dca15c..590063408a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc @@ -37,6 +37,7 @@ C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number 
Fortran array for color selection C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -44,13 +45,15 @@ C - HELONLY: input flag, quit after computing good helicities? C INTERFACE SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, CHANID, IGRAPH, MES, SELHEL, SELCOL, + & HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) INTEGER*4 CHANID(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) @@ -65,6 +68,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection +C - IGRAPH: the input per-event MLM graph array (0 = no MLM graph) C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -72,12 +76,13 @@ C - HELONLY: input flag, quit after computing good helicities? 
C INTERFACE SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + & RNDHEL, RNDCOL, IGRAPH, MES, SELHEL, SELCOL, HELONLY) INTEGER*8 PBRIDGE DOUBLE PRECISION MOMENTA(*) DOUBLE PRECISION GS(*) DOUBLE PRECISION RNDHEL(*) DOUBLE PRECISION RNDCOL(*) + INTEGER*4 IGRAPH(*) DOUBLE PRECISION MES(*) INTEGER*4 SELHEL(*) INTEGER*4 SELCOL(*) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + 
typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma"