diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 21b91955244..b3b30621d1a 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -61,8 +61,11 @@
 // kernel dispatch speed optimization flags
 /////////////////////////////////////////////////
 
-// size of default kernarg buffer in the kernarg pool in HSAContext
-#define KERNARG_BUFFER_SIZE (512)
+// Size of default kernarg buffer in the kernarg pool in HSAContext, in bytes.
+// Increased from 512 to 4k to match CUDA default. See
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#function-parameters
+// When this size is exceeded, on-demand allocation of the kernarg buffer is slow.
+#define KERNARG_BUFFER_SIZE (4096)
 
 // number of pre-allocated kernarg buffers in HSAContext
 // (some kernels don't allocate signals but nearly all need kernargs)
@@ -75,8 +78,10 @@
 // MUST be a power of 2.
 #define MAX_INFLIGHT_COMMANDS_PER_QUEUE  (2*8192)
 
-// threshold to clean up finished kernel in HSAQueue.asyncOps
-int HCC_ASYNCOPS_SIZE = (2*8192);
+// Threshold to clean up finished kernels in HSAQueue.asyncOps.
+// Reduced from 16k to 1k at the same time that KERNARG_BUFFER_SIZE
+// was increased, in order to offset the increase in memory pressure.
+int HCC_ASYNCOPS_SIZE = (1024);
 
 
 //---