diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 21b91955244..b3b30621d1a 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -61,8 +61,11 @@ // kernel dispatch speed optimization flags ///////////////////////////////////////////////// -// size of default kernarg buffer in the kernarg pool in HSAContext -#define KERNARG_BUFFER_SIZE (512) +// Size of default kernarg buffer in the kernarg pool in HSAContext, in bytes. +// Increased from 512 to 4k to match CUDA default. See +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#function-parameters +// When this size is exceeded, on-demand allocation of the kernarg buffer is slow. +#define KERNARG_BUFFER_SIZE (4096) // number of pre-allocated kernarg buffers in HSAContext // (some kernels don't allocate signals but nearly all need kernargs) @@ -75,8 +78,10 @@ // MUST be a power of 2. #define MAX_INFLIGHT_COMMANDS_PER_QUEUE (2*8192) -// threshold to clean up finished kernel in HSAQueue.asyncOps -int HCC_ASYNCOPS_SIZE = (2*8192); +// Threshold to clean up finished kernels in HSAQueue.asyncOps. +// Reduced from 16k to 1k at the same time as KERNARG_BUFFER_SIZE +// was increased, in order to offset the increase in memory pressure. +int HCC_ASYNCOPS_SIZE = (1024); //---