Tencent · nihui · Apr 8, 2026 · Apr 7, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
@@ -184,6 +184,7 @@ static inline float16x8_t log_ps_f16(float16x8_t x)
 static inline float16x4_t exp_ps_f16(float16x4_t x)
 {
     float16x4_t tmp, fx;
+    int16x4_t mm;
 
     float16x4_t one = vdup_n_f16(1);
     x = vmin_f16(x, vdup_n_f16(c_exp_hi_f16));
@@ -197,13 +198,19 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 #endif
 
     /* perform a floorf */
+#if defined(__aarch64__)
+    mm = vcvtm_s16_f16(fx);
+    fx = vcvt_f16_s16(mm);
+#else
     tmp = vcvt_f16_s16(vcvt_s16_f16(fx));
 
     /* if greater, substract 1 */
     uint16x4_t mask = vcgt_f16(tmp, fx);
     mask = vand_u16(mask, (uint16x4_t)(one));
 
     fx = vsub_f16(tmp, (float16x4_t)(mask));
+    mm = vcvt_s16_f16(fx);
+#endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
     tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
@@ -228,8 +235,6 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
     y = vadd_f16(y, one);
 
     /* build 2^n */
-    int16x4_t mm;
-    mm = vcvt_s16_f16(fx);
     mm = vadd_s16(mm, vdup_n_s16(0xf));
     mm = vshl_n_s16(mm, 10);
     float16x4_t pow2n = vreinterpret_f16_s16(mm);
@@ -241,6 +246,7 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 static inline float16x8_t exp_ps_f16(float16x8_t x)
 {
     float16x8_t tmp, fx;
+    int16x8_t mm;
 
     float16x8_t one = vdupq_n_f16(1);
     x = vminq_f16(x, vdupq_n_f16(c_exp_hi_f16));
@@ -255,13 +261,19 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
 #endif
 
     /* perform a floorf */
+#if defined(__aarch64__)
+    mm = vcvtmq_s16_f16(fx);
+    fx = vcvtq_f16_s16(mm);
+#else
     tmp = vcvtq_f16_s16(vcvtq_s16_f16(fx));
 
     /* if greater, substract 1 */
     uint16x8_t mask = vcgtq_f16(tmp, fx);
     mask = vandq_u16(mask, vreinterpretq_u16_f16(one));
 
     fx = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
+    mm = vcvtq_s16_f16(fx);
+#endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));
@@ -288,8 +300,6 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
     y = vaddq_f16(y, one);
 
     /* build 2^n */
-    int16x8_t mm;
-    mm = vcvtq_s16_f16(fx);
     mm = vaddq_s16(mm, vdupq_n_s16(0xf));
     mm = vshlq_n_s16(mm, 10);
     float16x8_t pow2n = vreinterpretq_f16_s16(mm);