From e3c70a7e56d2f7422874bdd914cbf4dd1178c3a0 Mon Sep 17 00:00:00 2001
From: crafcat7 <crafcat7@outlook.com>
Date: Tue, 7 Apr 2026 23:34:28 +0800
Subject: [PATCH 1/4] [feature] arm: speed up fp16 exp_ps floor step on aarch64

Summary:
  Use vcvtm_s16_f16/vcvtmq_s16_f16 for the floor computation in exp_ps_f16 on aarch64, while keeping the legacy fallback path for non-aarch64 targets. This reduces the exp_ps hot-path cost on ARM without changing approximation behavior.

  Also reuse the floor result for the 2^n construction to avoid a redundant vcvt instruction.
---
 src/layer/arm/neon_mathfun_fp16s.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
index 2ad7e214470d..eaf8b3c09606 100644
--- a/src/layer/arm/neon_mathfun_fp16s.h
+++ b/src/layer/arm/neon_mathfun_fp16s.h
@@ -184,6 +184,7 @@ static inline float16x8_t log_ps_f16(float16x8_t x)
 static inline float16x4_t exp_ps_f16(float16x4_t x)
 {
     float16x4_t tmp, fx;
+    int16x4_t mm;
 
     float16x4_t one = vdup_n_f16(1);
     x = vmin_f16(x, vdup_n_f16(c_exp_hi_f16));
@@ -197,6 +198,10 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 #endif
 
     /* perform a floorf */
+#if defined(__aarch64__)
+    mm = vcvtm_s16_f16(fx);
+    fx = vcvt_f16_s16(mm);
+#else
     tmp = vcvt_f16_s16(vcvt_s16_f16(fx));
 
     /* if greater, substract 1 */
@@ -204,6 +209,8 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
     mask = vand_u16(mask, (uint16x4_t)(one));
 
     fx = vsub_f16(tmp, (float16x4_t)(mask));
+    mm = vcvt_s16_f16(fx);
+#endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
     tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
@@ -228,8 +235,6 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
     y = vadd_f16(y, one);
 
     /* build 2^n */
-    int16x4_t mm;
-    mm = vcvt_s16_f16(fx);
     mm = vadd_s16(mm, vdup_n_s16(0xf));
     mm = vshl_n_s16(mm, 10);
     float16x4_t pow2n = vreinterpret_f16_s16(mm);
@@ -241,6 +246,7 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 static inline float16x8_t exp_ps_f16(float16x8_t x)
 {
     float16x8_t tmp, fx;
+    int16x8_t mm;
 
     float16x8_t one = vdupq_n_f16(1);
     x = vminq_f16(x, vdupq_n_f16(c_exp_hi_f16));
@@ -255,6 +261,10 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
 #endif
 
     /* perform a floorf */
+#if defined(__aarch64__)
+    mm = vcvtmq_s16_f16(fx);
+    fx = vcvtq_f16_s16(mm);
+#else
     tmp = vcvtq_f16_s16(vcvtq_s16_f16(fx));
 
     /* if greater, substract 1 */
@@ -262,6 +272,8 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
     mask = vandq_u16(mask, vreinterpretq_u16_f16(one));
 
     fx = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
+    mm = vcvtq_s16_f16(fx);
+#endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));
@@ -288,8 +300,6 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
     y = vaddq_f16(y, one);
 
     /* build 2^n */
-    int16x8_t mm;
-    mm = vcvtq_s16_f16(fx);
     mm = vaddq_s16(mm, vdupq_n_s16(0xf));
     mm = vshlq_n_s16(mm, 10);
     float16x8_t pow2n = vreinterpretq_f16_s16(mm);

From b6f2fdab64e14117f10cbc33df97507800864e55 Mon Sep 17 00:00:00 2001
From: crafcat7 <crafcat7@outlook.com>
Date: Wed, 8 Apr 2026 13:26:46 +0800
Subject: [PATCH 2/4] [bugfix] Remove unused macro in neon_mathfun_fp16s.h

Summary:
  __aarch64__ is always defined for all armv8.2+ targets, so the macro condition is unnecessary.
---
 src/layer/arm/neon_mathfun_fp16s.h | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
index eaf8b3c09606..ba26e133aa30 100644
--- a/src/layer/arm/neon_mathfun_fp16s.h
+++ b/src/layer/arm/neon_mathfun_fp16s.h
@@ -198,19 +198,8 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 #endif
 
     /* perform a floorf */
-#if defined(__aarch64__)
     mm = vcvtm_s16_f16(fx);
     fx = vcvt_f16_s16(mm);
-#else
-    tmp = vcvt_f16_s16(vcvt_s16_f16(fx));
-
-    /* if greater, substract 1 */
-    uint16x4_t mask = vcgt_f16(tmp, fx);
-    mask = vand_u16(mask, (uint16x4_t)(one));
-
-    fx = vsub_f16(tmp, (float16x4_t)(mask));
-    mm = vcvt_s16_f16(fx);
-#endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
     tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
@@ -261,19 +250,8 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
 #endif
 
     /* perform a floorf */
-#if defined(__aarch64__)
     mm = vcvtmq_s16_f16(fx);
     fx = vcvtq_f16_s16(mm);
-#else
-    tmp = vcvtq_f16_s16(vcvtq_s16_f16(fx));
-
-    /* if greater, substract 1 */
-    uint16x8_t mask = vcgtq_f16(tmp, fx);
-    mask = vandq_u16(mask, vreinterpretq_u16_f16(one));
-
-    fx = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
-    mm = vcvtq_s16_f16(fx);
-#endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));

From 4db0394c38cf50c9740303503a6e7f5404174e1c Mon Sep 17 00:00:00 2001
From: crafcat7 <crafcat7@outlook.com>
Date: Wed, 8 Apr 2026 16:26:58 +0800
Subject: [PATCH 3/4] [bugfix] Switch fp16 exp floor path to vrndm+vcvt

Summary:
  Use vrndm/vrndmq plus vcvt for exp_ps_f16 floor conversion on AArch64 while preserving output accuracy on device tests.
---
 src/layer/arm/neon_mathfun_fp16s.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
index ba26e133aa30..e79a1e98af2d 100644
--- a/src/layer/arm/neon_mathfun_fp16s.h
+++ b/src/layer/arm/neon_mathfun_fp16s.h
@@ -198,8 +198,8 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 #endif
 
     /* perform a floorf */
-    mm = vcvtm_s16_f16(fx);
-    fx = vcvt_f16_s16(mm);
+    fx = vrndm_f16(fx);
+    mm = vcvt_s16_f16(fx);
 
 #if defined(_MSC_VER) && !defined(__clang__)
     tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
@@ -250,8 +250,8 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
 #endif
 
     /* perform a floorf */
-    mm = vcvtmq_s16_f16(fx);
-    fx = vcvtq_f16_s16(mm);
+    fx = vrndmq_f16(fx);
+    mm = vcvtq_s16_f16(fx);
 
 #if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));

From c38d8659beef37c7afdf648b637ffda49048369d Mon Sep 17 00:00:00 2001
From: crafcat7 <crafcat7@outlook.com>
Date: Wed, 8 Apr 2026 17:13:12 +0800
Subject: [PATCH 4/4] [bugfix] Move fp16 exp mm conversion to pow2n stage

Summary:
  Keep floor value in fx via vrndm/vrndmq and perform s16 conversion only when building pow2n.
---
 src/layer/arm/neon_mathfun_fp16s.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
index e79a1e98af2d..4738052180cc 100644
--- a/src/layer/arm/neon_mathfun_fp16s.h
+++ b/src/layer/arm/neon_mathfun_fp16s.h
@@ -184,7 +184,6 @@ static inline float16x8_t log_ps_f16(float16x8_t x)
 static inline float16x4_t exp_ps_f16(float16x4_t x)
 {
     float16x4_t tmp, fx;
-    int16x4_t mm;
 
     float16x4_t one = vdup_n_f16(1);
     x = vmin_f16(x, vdup_n_f16(c_exp_hi_f16));
@@ -199,7 +198,6 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 
     /* perform a floorf */
     fx = vrndm_f16(fx);
-    mm = vcvt_s16_f16(fx);
 
 #if defined(_MSC_VER) && !defined(__clang__)
     tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
@@ -224,6 +222,8 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
     y = vadd_f16(y, one);
 
     /* build 2^n */
+    int16x4_t mm;
+    mm = vcvt_s16_f16(fx);
     mm = vadd_s16(mm, vdup_n_s16(0xf));
     mm = vshl_n_s16(mm, 10);
     float16x4_t pow2n = vreinterpret_f16_s16(mm);
@@ -235,7 +235,6 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 static inline float16x8_t exp_ps_f16(float16x8_t x)
 {
     float16x8_t tmp, fx;
-    int16x8_t mm;
 
     float16x8_t one = vdupq_n_f16(1);
     x = vminq_f16(x, vdupq_n_f16(c_exp_hi_f16));
@@ -251,7 +250,6 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
 
     /* perform a floorf */
     fx = vrndmq_f16(fx);
-    mm = vcvtq_s16_f16(fx);
 
 #if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));
@@ -278,6 +276,8 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
     y = vaddq_f16(y, one);
 
     /* build 2^n */
+    int16x8_t mm;
+    mm = vcvtq_s16_f16(fx);
     mm = vaddq_s16(mm, vdupq_n_s16(0xf));
     mm = vshlq_n_s16(mm, 10);
     float16x8_t pow2n = vreinterpretq_f16_s16(mm);