diff --git a/src/command.cpp b/src/command.cpp index 1b9978bfa496..7ea927ed2360 100644 --- a/src/command.cpp +++ b/src/command.cpp @@ -359,10 +359,12 @@ void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_upload buffer"); + const int B = src.n; + + // cpu cast to fp16 (discrete gpu) Mat src_fp16; if (src.elemsize == src.elempack * 4u) { - // cpu cast to fp16 (discrete gpu) if (vkdev->info.type() == 0 && (opt.use_bf16_storage || opt.use_bf16_packed)) { ncnn::cast_float32_to_bfloat16(src, src_fp16, opt); @@ -389,26 +391,32 @@ void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt) src_fp16 = src_fp16_pack4; } - // upload + // upload staging buffer VkMat dst_staging; - dst_staging.create_like(src_fp16, opt.staging_vkallocator); + if (B > 1) + dst_staging.create_like_batch(src_fp16.batch(0), B, opt.staging_vkallocator); + else + dst_staging.create_like(src_fp16, opt.staging_vkallocator); if (dst_staging.empty()) return; // stash staging d->upload_staging_buffers.push_back(dst_staging); - // NCNN_LOGE("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); - // memcpy src to device - memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize); + for (int b = 0; b < B; b++) + { + const Mat src_b = src_fp16.batch(b); + VkMat staging_b = dst_staging.batch(b); + memcpy(staging_b.mapped_ptr(), src_b.data, src_b.total() * src_b.elemsize); + } dst_staging.allocator->flush(dst_staging.data); // mark device host-write @ null dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; - // resolve dst_elempack + // resolve dst_elempack (from single sample dimensions) int dims = src_fp16.dims; int elemcount = 0; if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w; @@ -435,6 +443,8 @@ void VkCompute::record_download(const VkMat& src, Mat& 
dst, const Option& opt) { // NCNN_LOGE("record_download buffer"); + const int B = src.n; + // resolve dst_elempack int dims = src.dims; int elemcount = 0; @@ -480,7 +490,7 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = dst_staging.buffer(); barriers[0].offset = dst_staging.buffer_offset(); - barriers[0].size = dst_staging.buffer_capacity(); + barriers[0].size = B > 1 ? dst_staging.nstep * B * dst_staging.elemsize : dst_staging.buffer_capacity(); VkPipelineStageFlags src_stage = dst_staging.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; @@ -509,7 +519,10 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) // create dst Mat dst_fp16; - dst_fp16.create_like(dst_staging, opt.blob_allocator); + if (B > 1) + dst_fp16.create_like_batch(dst_staging.batch(0), B, opt.blob_allocator); + else + dst_fp16.create_like(dst_staging, opt.blob_allocator); if (dst_fp16.empty()) return; @@ -530,48 +543,42 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) // cast to fp32 (discrete gpu) if (dst_fp16.elemsize == dst_fp16.elempack * 2u) { + int post_cast_type = 0; // 0=none, 1=bf16, 2=fp16 if (vkdev->info.type() == 0 && (opt.use_bf16_storage || opt.use_bf16_packed)) - { - int dims = dst_fp16.dims; - if (dims == 1) - dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 2) - dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 3) - dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 4) - dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - - d->download_post_mats.push_back(dst); - - 
VkComputePrivate::record r; - r.type = VkComputePrivate::record::TYPE_post_cast_bfloat16_to_float32; - r.command_buffer = 0; - r.post_cast_bfloat16_to_float32.download_post_mat_bf16_offset = d->download_post_mats_fp16.size() - 1; - r.post_cast_bfloat16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; - r.post_cast_bfloat16_to_float32.num_threads = opt.num_threads; - d->delayed_records.push_back(r); - } + post_cast_type = 1; else if (vkdev->info.type() == 0 && (opt.use_fp16_storage || opt.use_fp16_packed)) + post_cast_type = 2; + + if (post_cast_type > 0) { - int dims = dst_fp16.dims; - if (dims == 1) - dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 2) - dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 3) - dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 4) - dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + size_t fp32_elemsize = (size_t)(dst_fp16.elempack * 4u); + if (dst_fp16.dims == 1) + dst.create_batch(dst_fp16.w, B, fp32_elemsize, dst_fp16.elempack, opt.blob_allocator); + else if (dst_fp16.dims == 2) + dst.create_batch(dst_fp16.w, dst_fp16.h, B, fp32_elemsize, dst_fp16.elempack, opt.blob_allocator); + else if (dst_fp16.dims == 3) + dst.create_batch(dst_fp16.w, dst_fp16.h, dst_fp16.c, B, fp32_elemsize, dst_fp16.elempack, opt.blob_allocator); + else if (dst_fp16.dims == 4) + dst.create_batch(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, B, fp32_elemsize, dst_fp16.elempack, opt.blob_allocator); d->download_post_mats.push_back(dst); VkComputePrivate::record r; - r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32; r.command_buffer = 0; - r.post_cast_float16_to_float32.download_post_mat_fp16_offset = 
d->download_post_mats_fp16.size() - 1; - r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; - r.post_cast_float16_to_float32.num_threads = opt.num_threads; + if (post_cast_type == 1) + { + r.type = VkComputePrivate::record::TYPE_post_cast_bfloat16_to_float32; + r.post_cast_bfloat16_to_float32.download_post_mat_bf16_offset = d->download_post_mats_fp16.size() - 1; + r.post_cast_bfloat16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; + r.post_cast_bfloat16_to_float32.num_threads = opt.num_threads; + } + else + { + r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32; + r.post_cast_float16_to_float32.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1; + r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; + r.post_cast_float16_to_float32.num_threads = opt.num_threads; + } d->delayed_records.push_back(r); } else @@ -589,14 +596,24 @@ void VkCompute::record_clone(const Mat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_clone host to buffer"); + const int B = src.n; + // host to staging VkMat dst_staging; - dst_staging.create_like(src, opt.staging_vkallocator); + if (B > 1) + dst_staging.create_like_batch(src.batch(0), B, opt.staging_vkallocator); + else + dst_staging.create_like(src, opt.staging_vkallocator); if (dst_staging.empty()) return; // memcpy src to device - memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize); + for (int b = 0; b < B; b++) + { + const Mat src_b = src.batch(b); + VkMat staging_b = dst_staging.batch(b); + memcpy(staging_b.mapped_ptr(), src_b.data, src_b.total() * src_b.elemsize); + } dst_staging.allocator->flush(dst_staging.data); // mark device host-write @ null @@ -631,6 +648,8 @@ void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt) { // NCNN_LOGE("record_clone buffer to host"); + const int B = src.n; + if (!src.allocator->mappable) { // device to 
staging @@ -646,7 +665,10 @@ void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt) } // create dst - dst.create_like(src, opt.blob_allocator); + if (B > 1) + dst.create_like_batch(src.batch(0), B, opt.blob_allocator); + else + dst.create_like(src, opt.blob_allocator); if (dst.empty()) return; @@ -662,7 +684,7 @@ void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt) barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = src.buffer(); barriers[0].offset = src.buffer_offset(); - barriers[0].size = src.buffer_capacity(); + barriers[0].size = B > 1 ? src.nstep * B * src.elemsize : src.buffer_capacity(); VkPipelineStageFlags src_stage = src.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; @@ -722,8 +744,13 @@ void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_clone buffer to buffer"); + const int B = src.n; + // create dst - dst.create_like(src, opt.blob_vkallocator); + if (B > 1) + dst.create_like_batch(src.batch(0), B, opt.blob_vkallocator); + else + dst.create_like(src, opt.blob_vkallocator); if (dst.empty()) return; @@ -739,7 +766,7 @@ void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = src.buffer(); barriers[0].offset = src.buffer_offset(); - barriers[0].size = src.buffer_capacity(); + barriers[0].size = B > 1 ? src.nstep * B * src.elemsize : src.buffer_capacity(); VkPipelineStageFlags src_stage = src.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; @@ -774,12 +801,14 @@ void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; } - // record device to staging + // record copy { + VkDeviceSize copy_size = B > 1 ? 
src.nstep * B * src.elemsize : std::min(src.buffer_capacity(), dst.buffer_capacity()); + VkBufferCopy* regions = new VkBufferCopy[1]; regions[0].srcOffset = src.buffer_offset(); regions[0].dstOffset = dst.buffer_offset(); - regions[0].size = std::min(src.buffer_capacity(), dst.buffer_capacity()); + regions[0].size = copy_size; if (vkdev->info.support_VK_KHR_push_descriptor()) { @@ -1985,7 +2014,12 @@ int VkCompute::submit_and_wait() // NCNN_LOGE("post_download %p +%d ~%d -> %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data); src.allocator->invalidate(src.data); - memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize); + for (int b = 0; b < dst.n; b++) + { + Mat dst_b = dst.batch(b); + size_t src_batch_offset = src.nstep * b * src.elemsize; + memcpy(dst_b.data, (const unsigned char*)src.mapped_ptr() + src_batch_offset, dst_b.total() * dst_b.elemsize); + } break; } case VkComputePrivate::record::TYPE_post_cast_float16_to_float32: diff --git a/src/layer.cpp b/src/layer.cpp index a00c937c5643..a12da2537b9d 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -27,6 +27,8 @@ Layer::Layer() support_any_packing = false; support_vulkan_any_packing = false; + support_batch = false; + featmask = 0; #if NCNN_VULKAN @@ -240,6 +242,7 @@ class Layer_final : public Layer support_fp16_storage = layer_cpu->support_fp16_storage; support_int8_storage = layer_cpu->support_int8_storage; support_any_packing = layer_cpu->support_any_packing; + support_batch = layer_cpu->support_batch; support_vulkan = false; support_tensor_storage = false; diff --git a/src/layer.h b/src/layer.h index 9fa45d7c47a3..ac11176bfe44 100644 --- a/src/layer.h +++ b/src/layer.h @@ -75,7 +75,8 @@ class NCNN_EXPORT Layer // vulkan accept input blob with any elempack bool support_vulkan_any_packing; - bool support_reserved_1; + // support batched input (n > 1), replaces support_reserved_1 + bool support_batch; bool support_reserved_2; bool support_reserved_3; bool support_reserved_4; diff 
--git a/src/layer/arm/cast_arm.cpp b/src/layer/arm/cast_arm.cpp index f028ad7e10ac..966517f85bb2 100644 --- a/src/layer/arm/cast_arm.cpp +++ b/src/layer/arm/cast_arm.cpp @@ -40,13 +40,14 @@ int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) int dims = bottom_blob.dims; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; + int batch = bottom_blob.n; size_t out_elemsize = elemsize; if (type_to == 1) { if (type_from == 3) { - Cast::forward(bottom_blob, top_blob, opt); + return Cast::forward(bottom_blob, top_blob, opt); } // float32 @@ -69,21 +70,13 @@ int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -101,11 +94,14 @@ int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* 
outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { diff --git a/src/layer/arm/cast_bf16.h b/src/layer/arm/cast_bf16.h index 453ba7e8c182..30ec4fe2c1f9 100644 --- a/src/layer/arm/cast_bf16.h +++ b/src/layer/arm/cast_bf16.h @@ -21,17 +21,21 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); #if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC - __bf16* outptr = top_blob.channel(q); + __bf16* outptr = top_blob.batch(b).channel(q); #else - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); #endif int i = 0; @@ -185,18 +189,22 @@ static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { + int b = bc / channels; + int q = bc % channels; #if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC - const __bf16* ptr = bottom_blob.channel(q); + const __bf16* ptr = bottom_blob.batch(b).channel(q); #else - const unsigned short* ptr = bottom_blob.channel(q); + const unsigned short* ptr = bottom_blob.batch(b).channel(q); #endif - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __ARM_NEON 
diff --git a/src/layer/arm/cast_fp16.h b/src/layer/arm/cast_fp16.h index 929d4b58f7a9..3b9e67965bb3 100644 --- a/src/layer/arm/cast_fp16.h +++ b/src/layer/arm/cast_fp16.h @@ -21,14 +21,18 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if (__ARM_FP & 2) @@ -179,14 +183,18 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if (__ARM_FP & 2) diff --git a/src/layer/arm/packing_arm.cpp b/src/layer/arm/packing_arm.cpp index 0b325a94176b..f4963afc76b5 100644 --- a/src/layer/arm/packing_arm.cpp +++ b/src/layer/arm/packing_arm.cpp @@ -69,6 +69,7 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op int d = bottom_blob.d; int channels = bottom_blob.c; int 
dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -97,6 +98,7 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -105,21 +107,24 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __ARM_NEON @@ -152,15 +157,18 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* 
outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __ARM_NEON @@ -201,23 +209,26 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __ARM_NEON @@ -250,15 +261,18 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 
0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __ARM_NEON @@ -328,6 +342,7 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -356,6 +371,7 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -364,21 +380,24 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 4); - const unsigned short* r1 = bottom_blob.row(i * 4 + 1); - const unsigned short* r2 = bottom_blob.row(i * 4 + 2); - const unsigned short* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; 
+ int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 4); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 4 + 3); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; #if __ARM_NEON @@ -411,15 +430,18 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 4); - unsigned short* outptr1 = top_blob.row(i * 4 + 1); - unsigned short* outptr2 = top_blob.row(i * 4 + 2); - unsigned short* outptr3 = top_blob.row(i * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).row(i * 4); + unsigned short* outptr1 = top_blob.batch(b).row(i * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __ARM_NEON @@ -451,19 +473,22 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 8); - const unsigned short* r1 = bottom_blob.row(i * 8 + 1); - const unsigned short* r2 = bottom_blob.row(i * 8 + 2); - const unsigned short* r3 = bottom_blob.row(i * 8 + 3); - const unsigned short* r4 = bottom_blob.row(i * 8 + 4); - const unsigned short* r5 = bottom_blob.row(i * 8 + 5); - const unsigned short* r6 = bottom_blob.row(i * 8 + 6); - const 
unsigned short* r7 = bottom_blob.row(i * 8 + 7); - - unsigned short* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 8); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; #if __ARM_NEON @@ -615,19 +640,22 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); - - unsigned short* outptr0 = top_blob.row(i * 8); - unsigned short* outptr1 = top_blob.row(i * 8 + 1); - unsigned short* outptr2 = top_blob.row(i * 8 + 2); - unsigned short* outptr3 = top_blob.row(i * 8 + 3); - unsigned short* outptr4 = top_blob.row(i * 8 + 4); - unsigned short* outptr5 = top_blob.row(i * 8 + 5); - unsigned short* outptr6 = top_blob.row(i * 8 + 6); - unsigned short* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); + + unsigned short* outptr0 = top_blob.batch(b).row(i * 8); + unsigned short* outptr1 = top_blob.batch(b).row(i * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).row(i * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).row(i * 8 + 5); + unsigned short* outptr6 = 
top_blob.batch(b).row(i * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; #if __ARM_NEON @@ -769,13 +797,16 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 2); - const unsigned short* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 2); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 2 + 1); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; #if NCNN_GNU_INLINE_ASM @@ -836,13 +867,16 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 2); - unsigned short* outptr1 = top_blob.row(i * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).row(i * 2); + unsigned short* outptr1 = top_blob.batch(b).row(i * 2 + 1); int j = 0; #if NCNN_GNU_INLINE_ASM @@ -912,23 +946,26 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, 
outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 4); - const unsigned short* r1 = bottom_blob.channel(q * 4 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 4 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 4); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __ARM_NEON @@ -961,15 +998,18 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 4); - unsigned short* outptr1 = top_blob.channel(q * 4 + 1); - unsigned short* outptr2 = top_blob.channel(q * 4 + 2); - unsigned short* outptr3 = top_blob.channel(q * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 4); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __ARM_NEON @@ -1001,19 +1041,22 @@ int 
Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 8); - const unsigned short* r1 = bottom_blob.channel(q * 8 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 8 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 8 + 3); - const unsigned short* r4 = bottom_blob.channel(q * 8 + 4); - const unsigned short* r5 = bottom_blob.channel(q * 8 + 5); - const unsigned short* r6 = bottom_blob.channel(q * 8 + 6); - const unsigned short* r7 = bottom_blob.channel(q * 8 + 7); - - unsigned short* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 8); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __ARM_NEON @@ -1165,19 +1208,22 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); - - unsigned short* outptr0 = top_blob.channel(q * 8); - unsigned short* outptr1 = top_blob.channel(q * 8 + 1); - unsigned short* outptr2 = top_blob.channel(q * 8 + 2); - 
unsigned short* outptr3 = top_blob.channel(q * 8 + 3); - unsigned short* outptr4 = top_blob.channel(q * 8 + 4); - unsigned short* outptr5 = top_blob.channel(q * 8 + 5); - unsigned short* outptr6 = top_blob.channel(q * 8 + 6); - unsigned short* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); + + unsigned short* outptr0 = top_blob.batch(b).channel(q * 8); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; #if __ARM_NEON @@ -1319,13 +1365,16 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 2); - const unsigned short* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 2); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if NCNN_GNU_INLINE_ASM @@ -1386,13 +1435,16 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = 
bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 2); - unsigned short* outptr1 = top_blob.channel(q * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 2); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 2 + 1); int i = 0; #if NCNN_GNU_INLINE_ASM @@ -1487,6 +1539,7 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -1515,6 +1568,7 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -1523,25 +1577,28 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed 
char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -1561,19 +1618,22 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -1602,27 +1662,30 @@ 
int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q * 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -1642,19 +1705,22 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, 
Mat& top_blob, const Optio } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git a/src/layer/cast.cpp b/src/layer/cast.cpp index 3dcff38f3cac..15a34ffcc764 100644 --- a/src/layer/cast.cpp +++ b/src/layer/cast.cpp @@ -50,6 +50,7 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; @@ -76,33 +77,29 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, 
out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; int size = w * h * d * elempack; + int total_bc = batch * channels; + if (type_from == 1 && type_to == 2) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -114,10 +111,12 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (type_from == 2 && type_to == 1) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -129,10 +128,12 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (type_from == 3 && type_to == 1) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 
0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -144,10 +145,12 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (type_from == 1 && type_to == 4) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -159,10 +162,12 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (type_from == 4 && type_to == 1) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { diff --git a/src/layer/loongarch/cast_loongarch.cpp b/src/layer/loongarch/cast_loongarch.cpp index b2a0bb8ac58c..bd1b0acdbf6f 100644 --- a/src/layer/loongarch/cast_loongarch.cpp +++ b/src/layer/loongarch/cast_loongarch.cpp @@ -29,13 +29,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int dims = bottom_blob.dims; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; + int batch = bottom_blob.n; size_t out_elemsize = elemsize; if (type_to == 1) { if (type_from == 3) { - Cast::forward(bottom_blob, 
top_blob, opt); + return Cast::forward(bottom_blob, top_blob, opt); } // float32 @@ -58,21 +59,13 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -80,11 +73,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 1 && type_to == 2) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __loongarch_sx @@ -111,11 +107,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 2 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; 
+ int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __loongarch_sx @@ -143,11 +142,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -158,11 +160,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 4 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -176,11 +181,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 1 && type_to == 4) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) diff --git 
a/src/layer/loongarch/packing_loongarch.cpp b/src/layer/loongarch/packing_loongarch.cpp index 6225dd49f23b..9f6af1d015d4 100644 --- a/src/layer/loongarch/packing_loongarch.cpp +++ b/src/layer/loongarch/packing_loongarch.cpp @@ -54,6 +54,7 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -82,6 +83,7 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -90,21 +92,24 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __loongarch_sx @@ -150,15 +155,18 @@ int Packing_loongarch::forward(const Mat& 
bottom_blob, Mat& top_blob, const Opti } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __loongarch_sx @@ -213,23 +221,26 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = 
bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __loongarch_sx @@ -275,15 +286,18 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __loongarch_sx @@ -363,6 +377,7 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -391,6 +406,7 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -399,25 +415,28 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return 
-100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -437,19 +456,22 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 
8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -478,27 +500,30 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q * 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed 
char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -518,19 +543,22 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git 
a/src/layer/mips/cast_mips.cpp b/src/layer/mips/cast_mips.cpp index deb74834ea18..667292e59977 100644 --- a/src/layer/mips/cast_mips.cpp +++ b/src/layer/mips/cast_mips.cpp @@ -29,13 +29,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) int dims = bottom_blob.dims; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; + int batch = bottom_blob.n; size_t out_elemsize = elemsize; if (type_to == 1) { if (type_from == 3) { - Cast::forward(bottom_blob, top_blob, opt); + return Cast::forward(bottom_blob, top_blob, opt); } // float32 @@ -58,21 +59,13 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -80,11 +73,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 1 && type_to == 2) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + 
unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __mips_msa @@ -111,11 +107,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 2 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __mips_msa @@ -143,11 +142,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -158,11 +160,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 4 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -176,11 +181,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 1 && 
type_to == 4) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) diff --git a/src/layer/mips/packing_mips.cpp b/src/layer/mips/packing_mips.cpp index a4cea20e1c6c..9e435227f901 100644 --- a/src/layer/mips/packing_mips.cpp +++ b/src/layer/mips/packing_mips.cpp @@ -54,6 +54,7 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -82,6 +83,7 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -90,21 +92,24 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + 
int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __mips_msa @@ -150,15 +155,18 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __mips_msa @@ -213,23 +221,26 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 
4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __mips_msa @@ -275,15 +286,18 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __mips_msa @@ -363,6 +377,7 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -391,6 +406,7 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ 
-399,25 +415,28 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -437,19 +456,22 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = 
bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -478,27 +500,30 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q 
* 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -518,19 +543,22 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = 
top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git a/src/layer/packing.cpp b/src/layer/packing.cpp index 1ec3a332f84a..3bb7a8da729d 100644 --- a/src/layer/packing.cpp +++ b/src/layer/packing.cpp @@ -37,6 +37,7 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; size_t elemsize = bottom_blob.elemsize; if (!use_padding) @@ -68,17 +69,22 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c top_blob.cstep = bottom_blob.cstep * elempack; top_blob.elemsize = elemsize / elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack; return 0; } int outw = (w * elempack + out_elempack - 1) / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(outw, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - memcpy(top_blob.data, bottom_blob.data, w * elemsize); + #pragma omp parallel for num_threads(opt.num_threads) + for (int b = 0; b < batch; b++) + { + memcpy((unsigned char*)top_blob.batch(b).data, (unsigned char*)bottom_blob.batch(b).data, w * elemsize); + } return 0; } @@ -89,14 +95,19 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c size_t out_elemsize = elemsize / elempack * out_elempack; size_t lane_size = out_elemsize / out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, 
batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; + int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - unsigned char* outptr = (unsigned char*)top_blob + (size_t)i * w * out_elemsize; + int b = bi / outh; + int i = bi % outh; + + const unsigned char* bottom_ptr = (const unsigned char*)bottom_blob.batch(b).data; + unsigned char* outptr = (unsigned char*)top_blob.batch(b) + (size_t)i * w * out_elemsize; for (int j = 0; j < w; j++) { @@ -110,7 +121,7 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c int srck = (i * out_elempack + k) % elempack; - const unsigned char* ptr = (const unsigned char*)bottom_blob + (size_t)srcy * w * elemsize; + const unsigned char* ptr = bottom_ptr + (size_t)srcy * w * elemsize; const unsigned char* elem_ptr = ptr + j * elemsize; memcpy(out_elem_ptr + k * lane_size, elem_ptr + srck * lane_size, lane_size); @@ -127,14 +138,20 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c size_t out_elemsize = elemsize / elempack * out_elempack; size_t lane_size = out_elemsize / out_elempack; - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; + int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - Mat out = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + + Mat out = top_blob.batch(b).channel(q); + + const Mat bottom_batch = bottom_blob.batch(b); for (int i = 0; i < h; i++) { @@ -152,7 +169,7 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c int srck = (q * out_elempack + k) % elempack; - const Mat m = bottom_blob.channel(srcq); + const 
Mat m = bottom_batch.channel(srcq); const unsigned char* ptr = (const unsigned char*)m + (size_t)i * w * elemsize; const unsigned char* elem_ptr = ptr + j * elemsize; @@ -171,14 +188,20 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c size_t out_elemsize = elemsize / elempack * out_elempack; size_t lane_size = out_elemsize / out_elempack; - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; + int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - Mat out = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + + Mat out = top_blob.batch(b).channel(q); + + const Mat bottom_batch = bottom_blob.batch(b); for (int z = 0; z < d; z++) { @@ -198,7 +221,7 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c int srck = (q * out_elempack + k) % elempack; - const Mat m = bottom_blob.channel(srcq); + const Mat m = bottom_batch.channel(srcq); const unsigned char* ptr = (const unsigned char*)m + (size_t)(z * h + i) * w * elemsize; const unsigned char* elem_ptr = ptr + j * elemsize; diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index da4e74f242b8..4809cae0a709 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -33,6 +33,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int dims = bottom_blob.dims; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; + int batch = bottom_blob.n; size_t out_elemsize = elemsize; if (type_to == 1) @@ -57,21 +58,13 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + 
top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -107,11 +100,14 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { diff --git a/src/layer/riscv/cast_riscv_zfh.cpp b/src/layer/riscv/cast_riscv_zfh.cpp index 103e8bd48c2a..dfd9c53c3722 100644 --- a/src/layer/riscv/cast_riscv_zfh.cpp +++ b/src/layer/riscv/cast_riscv_zfh.cpp @@ -15,14 +15,18 @@ void Cast_riscv::cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const const int size = w * h * d * elempack; + const int batch = bottom_blob.n; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); #if 
__riscv_zfh - __fp16* outptr = top_blob.channel(q); + __fp16* outptr = top_blob.batch(b).channel(q); #else - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); #endif #if __riscv_zvfh @@ -62,15 +66,19 @@ void Cast_riscv::cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const const int size = w * h * d * elempack; + const int batch = bottom_blob.n; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { + int b = bc / channels; + int q = bc % channels; #if __riscv_zfh - const __fp16* ptr = bottom_blob.channel(q); + const __fp16* ptr = bottom_blob.batch(b).channel(q); #else - const unsigned short* ptr = bottom_blob.channel(q); + const unsigned short* ptr = bottom_blob.batch(b).channel(q); #endif - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); #if __riscv_zvfh int n = size; diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index d1e51d504ebe..9d3ebe569020 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -78,6 +78,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -106,6 +107,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -114,21 +116,24 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, 
out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -164,15 +169,18 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); #if __riscv_vector int n = w; @@ -209,19 +217,22 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; 
bi++) { - const float* r0 = bottom_blob.row(i * 8); - const float* r1 = bottom_blob.row(i * 8 + 1); - const float* r2 = bottom_blob.row(i * 8 + 2); - const float* r3 = bottom_blob.row(i * 8 + 3); - const float* r4 = bottom_blob.row(i * 8 + 4); - const float* r5 = bottom_blob.row(i * 8 + 5); - const float* r6 = bottom_blob.row(i * 8 + 6); - const float* r7 = bottom_blob.row(i * 8 + 7); - - float* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 8); + const float* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const float* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const float* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const float* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const float* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + float* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -269,19 +280,22 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); - - float* outptr0 = top_blob.row(i * 8); - float* outptr1 = top_blob.row(i * 8 + 1); - float* outptr2 = top_blob.row(i * 8 + 2); - float* outptr3 = top_blob.row(i * 8 + 3); - float* outptr4 = top_blob.row(i * 8 + 4); - float* outptr5 = top_blob.row(i * 8 + 5); - float* outptr6 = top_blob.row(i * 8 + 6); - float* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); + + float* outptr0 = top_blob.batch(b).row(i * 8); + float* outptr1 = top_blob.batch(b).row(i * 8 + 1); + float* outptr2 = top_blob.batch(b).row(i * 8 + 2); + float* outptr3 = top_blob.batch(b).row(i * 8 + 3); + float* outptr4 = 
top_blob.batch(b).row(i * 8 + 4); + float* outptr5 = top_blob.batch(b).row(i * 8 + 5); + float* outptr6 = top_blob.batch(b).row(i * 8 + 6); + float* outptr7 = top_blob.batch(b).row(i * 8 + 7); #if __riscv_vector int n = w; @@ -329,13 +343,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 2); - const float* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 2); + const float* r1 = bottom_blob.batch(b).row(i * 2 + 1); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -383,13 +400,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 2); - float* outptr1 = top_blob.row(i * 2 + 1); + float* outptr0 = top_blob.batch(b).row(i * 2); + float* outptr1 = top_blob.batch(b).row(i * 2 + 1); #if __riscv_vector int n = w; @@ -446,23 +466,26 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, 
out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -499,15 +522,18 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); #if __riscv_vector int n = size; @@ -543,19 +569,22 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int 
bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 8); - const float* r1 = bottom_blob.channel(q * 8 + 1); - const float* r2 = bottom_blob.channel(q * 8 + 2); - const float* r3 = bottom_blob.channel(q * 8 + 3); - const float* r4 = bottom_blob.channel(q * 8 + 4); - const float* r5 = bottom_blob.channel(q * 8 + 5); - const float* r6 = bottom_blob.channel(q * 8 + 6); - const float* r7 = bottom_blob.channel(q * 8 + 7); - - float* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 8); + const float* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const float* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const float* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const float* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const float* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + float* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -603,19 +632,22 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); - - float* outptr0 = top_blob.channel(q * 8); - float* outptr1 = top_blob.channel(q * 8 + 1); - float* outptr2 = top_blob.channel(q * 8 + 2); - float* outptr3 = top_blob.channel(q * 8 + 3); - float* outptr4 = top_blob.channel(q * 8 + 4); - float* outptr5 = top_blob.channel(q * 8 + 5); - float* outptr6 = top_blob.channel(q * 8 + 6); - float* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); + + float* outptr0 = top_blob.batch(b).channel(q * 8); + float* outptr1 = 
top_blob.batch(b).channel(q * 8 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + float* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + float* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + float* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + float* outptr7 = top_blob.batch(b).channel(q * 8 + 7); #if __riscv_vector int n = size; @@ -663,13 +695,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 2); - const float* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 2); + const float* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -717,13 +752,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 2); - float* outptr1 = top_blob.channel(q * 2 + 1); + float* outptr0 = top_blob.batch(b).channel(q * 2); + float* outptr1 = top_blob.batch(b).channel(q * 2 + 1); #if __riscv_vector int n = size; @@ -809,6 +847,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -837,6 
+876,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -845,21 +885,24 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 4); - const unsigned short* r1 = bottom_blob.row(i * 4 + 1); - const unsigned short* r2 = bottom_blob.row(i * 4 + 2); - const unsigned short* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 4); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 4 + 3); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -895,15 +938,18 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* 
outptr0 = top_blob.row(i * 4); - unsigned short* outptr1 = top_blob.row(i * 4 + 1); - unsigned short* outptr2 = top_blob.row(i * 4 + 2); - unsigned short* outptr3 = top_blob.row(i * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).row(i * 4); + unsigned short* outptr1 = top_blob.batch(b).row(i * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 4 + 3); #if __riscv_vector int n = w; @@ -939,19 +985,22 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 8); - const unsigned short* r1 = bottom_blob.row(i * 8 + 1); - const unsigned short* r2 = bottom_blob.row(i * 8 + 2); - const unsigned short* r3 = bottom_blob.row(i * 8 + 3); - const unsigned short* r4 = bottom_blob.row(i * 8 + 4); - const unsigned short* r5 = bottom_blob.row(i * 8 + 5); - const unsigned short* r6 = bottom_blob.row(i * 8 + 6); - const unsigned short* r7 = bottom_blob.row(i * 8 + 7); - - unsigned short* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 8); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -999,19 +1048,22 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& 
top_blob, co } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); - - unsigned short* outptr0 = top_blob.row(i * 8); - unsigned short* outptr1 = top_blob.row(i * 8 + 1); - unsigned short* outptr2 = top_blob.row(i * 8 + 2); - unsigned short* outptr3 = top_blob.row(i * 8 + 3); - unsigned short* outptr4 = top_blob.row(i * 8 + 4); - unsigned short* outptr5 = top_blob.row(i * 8 + 5); - unsigned short* outptr6 = top_blob.row(i * 8 + 6); - unsigned short* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); + + unsigned short* outptr0 = top_blob.batch(b).row(i * 8); + unsigned short* outptr1 = top_blob.batch(b).row(i * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).row(i * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).row(i * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).row(i * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).row(i * 8 + 7); #if __riscv_vector int n = w; @@ -1059,13 +1111,16 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 2); - const unsigned short* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 2); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 2 + 1); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -1113,13 +1168,16 @@ 
int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 2); - unsigned short* outptr1 = top_blob.row(i * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).row(i * 2); + unsigned short* outptr1 = top_blob.batch(b).row(i * 2 + 1); #if __riscv_vector int n = w; @@ -1176,23 +1234,26 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 4); - const unsigned short* r1 = bottom_blob.channel(q * 4 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 4 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 4); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - unsigned short* 
outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -1228,15 +1289,18 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 4); - unsigned short* outptr1 = top_blob.channel(q * 4 + 1); - unsigned short* outptr2 = top_blob.channel(q * 4 + 2); - unsigned short* outptr3 = top_blob.channel(q * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 4); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 4 + 3); #if __riscv_vector int n = size; @@ -1272,19 +1336,22 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 8); - const unsigned short* r1 = bottom_blob.channel(q * 8 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 8 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 8 + 3); - const unsigned short* r4 = bottom_blob.channel(q * 8 + 4); - const unsigned short* r5 = bottom_blob.channel(q * 8 + 5); - const unsigned short* r6 = bottom_blob.channel(q * 8 + 6); - const unsigned short* r7 = bottom_blob.channel(q * 8 + 7); - - unsigned short* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = 
bottom_blob.batch(b).channel(q * 8); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -1332,19 +1399,22 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); - - unsigned short* outptr0 = top_blob.channel(q * 8); - unsigned short* outptr1 = top_blob.channel(q * 8 + 1); - unsigned short* outptr2 = top_blob.channel(q * 8 + 2); - unsigned short* outptr3 = top_blob.channel(q * 8 + 3); - unsigned short* outptr4 = top_blob.channel(q * 8 + 4); - unsigned short* outptr5 = top_blob.channel(q * 8 + 5); - unsigned short* outptr6 = top_blob.channel(q * 8 + 6); - unsigned short* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); + + unsigned short* outptr0 = top_blob.batch(b).channel(q * 8); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + 
unsigned short* outptr7 = top_blob.batch(b).channel(q * 8 + 7); #if __riscv_vector int n = size; @@ -1392,13 +1462,16 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 2); - const unsigned short* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 2); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -1445,13 +1518,16 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 2); - unsigned short* outptr1 = top_blob.channel(q * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 2); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 2 + 1); #if __riscv_vector int n = size; @@ -1533,6 +1609,7 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -1561,6 +1638,7 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * 
out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -1569,25 +1647,28 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -1607,19 +1688,22 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for 
num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -1648,27 +1732,30 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = 
bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q * 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -1688,19 +1775,22 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* 
outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git a/src/layer/vulkan/packing_vulkan.cpp b/src/layer/vulkan/packing_vulkan.cpp index ee93534b9b80..84d48ed74727 100644 --- a/src/layer/vulkan/packing_vulkan.cpp +++ b/src/layer/vulkan/packing_vulkan.cpp @@ -190,7 +190,8 @@ int Packing_vulkan::destroy_pipeline(const Option& /*opt*/) int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { const int elempack = bottom_blob.elempack; - // NCNN_LOGE("Packing_vulkan b2b %d %d %d %d", elempack, out_elempack, cast_type_from, cast_type_to); + const int B = bottom_blob.n; + // NCNN_LOGE("Packing_vulkan b2b %d %d %d %d n=%d", elempack, out_elempack, cast_type_from, cast_type_to, B); if (elempack == out_elempack && cast_type_from == cast_type_to && bottom_blob.allocator == opt.blob_vkallocator) { @@ -258,12 +259,18 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute top_blob.cstep = bottom_blob.cstep * elempack; top_blob.elemsize = bottom_blob.elemsize / elempack; top_blob.elempack = out_elempack; + // preserve byte stride per batch when element size changes + if (B > 1) + top_blob.nstep = bottom_blob.nstep * bottom_blob.elemsize / top_blob.elemsize; return 0; } int outw = (w * elempack + out_elempack - 1) / out_elempack; - top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); + if (B > 1) + top_blob.create_batch(outw, B, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(outw, 
out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -272,7 +279,10 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { int outh = (h * elempack + out_elempack - 1) / out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); + if (B > 1) + top_blob.create_batch(w, outh, B, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -281,7 +291,10 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { int outc = (channels * elempack + out_elempack - 1) / out_elempack; - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); + if (B > 1) + top_blob.create_batch(w, h, outc, B, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -290,145 +303,151 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { int outc = (channels * elempack + out_elempack - 1) / out_elempack; - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_vkallocator); + if (B > 1) + top_blob.create_batch(w, h, d, outc, B, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } - std::vector buffer_bindings(4); - buffer_bindings[0] = bottom_blob; - buffer_bindings[1] = bottom_blob; - buffer_bindings[2] = top_blob; - buffer_bindings[3] = top_blob; - - if (elempack == out_elempack) + // dispatch per batch, writing directly to batch sub-views + for (int b = 0; b < B; b++) { - size_t n = 0; - size_t c = 0; - size_t stride = 0; - if (cast_type_from == 1) + const VkMat bottom_b = B > 1 ? 
bottom_blob.batch(b) : bottom_blob; + const VkMat top_b = B > 1 ? top_blob.batch(b) : top_blob; + + std::vector buffer_bindings(4); + buffer_bindings[0] = bottom_b; + buffer_bindings[1] = bottom_b; + buffer_bindings[2] = top_b; + buffer_bindings[3] = top_b; + + if (elempack == out_elempack) { - if (dims == 1 || dims == 2) + size_t n = 0; + size_t c = 0; + size_t stride = 0; + if (cast_type_from == 1) { - n = bottom_blob.cstep * elempack; - c = 1; - stride = top_blob.cstep * out_elempack; + if (dims == 1 || dims == 2) + { + n = bottom_b.cstep * elempack; + c = 1; + stride = top_b.cstep * out_elempack; + } + if (dims == 3 || dims == 4) + { + n = bottom_b.cstep * elempack; + c = bottom_b.c; + stride = top_b.cstep * out_elempack; + } } - if (dims == 3 || dims == 4) + else // if (cast_type_to == 1) { - n = bottom_blob.cstep * elempack; - c = bottom_blob.c; - stride = top_blob.cstep * out_elempack; + if (dims == 1 || dims == 2) + { + n = top_b.cstep * out_elempack; + c = 1; + stride = bottom_b.cstep * elempack; + } + if (dims == 3 || dims == 4) + { + n = top_b.cstep * out_elempack; + c = top_b.c; + stride = bottom_b.cstep * elempack; + } } + + std::vector constants(3); + constants[0].u32 = n / 4; + constants[1].u32 = c; + constants[2].u32 = stride / 4; + + VkMat dispatcher; + dispatcher.w = n / 4; + dispatcher.h = c; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_packing, buffer_bindings, constants, dispatcher); } - else // if (cast_type_to == 1) + if (elempack < out_elempack) { - if (dims == 1 || dims == 2) + size_t n = 0; + size_t c = 0; + size_t stride = 0; + if (dims == 1) { - n = top_blob.cstep * out_elempack; - c = 1; - stride = bottom_blob.cstep * elempack; + n = 1; + c = top_b.w; + stride = 1; + } + if (dims == 2) + { + n = top_b.w; + c = top_b.h; + stride = bottom_b.w; } if (dims == 3 || dims == 4) { - n = top_blob.cstep * out_elempack; - c = top_blob.c; - stride = bottom_blob.cstep * elempack; + n = top_b.cstep; + c = top_b.c; + stride = bottom_b.cstep; } 
- } - - std::vector constants(3); - constants[0].u32 = n / 4; - constants[1].u32 = c; - constants[2].u32 = stride / 4; - - VkMat dispatcher; - dispatcher.w = n / 4; - dispatcher.h = c; - dispatcher.c = 1; - - cmd.record_pipeline(pipeline_packing, buffer_bindings, constants, dispatcher); - } - if (elempack < out_elempack) - { - size_t n = 0; - size_t c = 0; - size_t stride = 0; - if (dims == 1) - { - n = 1; - c = top_blob.w; - stride = 1; - } - if (dims == 2) - { - n = top_blob.w; - c = top_blob.h; - stride = bottom_blob.w; - } - if (dims == 3 || dims == 4) - { - n = top_blob.cstep; - c = top_blob.c; - stride = bottom_blob.cstep; - } - - std::vector constants(3); - constants[0].u32 = n; - constants[1].u32 = c; - constants[2].u32 = stride; - // NCNN_LOGE("n = %u c = %u stride = %u", n, c, stride); + std::vector constants(3); + constants[0].u32 = n; + constants[1].u32 = c; + constants[2].u32 = stride; - VkMat dispatcher; - dispatcher.w = n; - dispatcher.h = c; - dispatcher.c = 1; + VkMat dispatcher; + dispatcher.w = n; + dispatcher.h = c; + dispatcher.c = 1; - if (elempack == 1 && out_elempack == 4) - { - cmd.record_pipeline(pipeline_packing_pack1to4, buffer_bindings, constants, dispatcher); - } - } - if (elempack > out_elempack) - { - size_t n = 0; - size_t c = 0; - size_t stride = 0; - if (dims == 1) - { - n = 1; - c = bottom_blob.w; - stride = 1; + if (elempack == 1 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_packing_pack1to4, buffer_bindings, constants, dispatcher); + } } - if (dims == 2) + if (elempack > out_elempack) { - n = bottom_blob.w; - c = bottom_blob.h; - stride = top_blob.w; - } - if (dims == 3 || dims == 4) - { - n = bottom_blob.cstep; - c = bottom_blob.c; - stride = top_blob.cstep; - } - - std::vector constants(3); - constants[0].u32 = n; - constants[1].u32 = c; - constants[2].u32 = stride; + size_t n = 0; + size_t c = 0; + size_t stride = 0; + if (dims == 1) + { + n = 1; + c = bottom_b.w; + stride = 1; + } + if (dims == 2) + { + n = 
bottom_b.w; + c = bottom_b.h; + stride = top_b.w; + } + if (dims == 3 || dims == 4) + { + n = bottom_b.cstep; + c = bottom_b.c; + stride = top_b.cstep; + } - // NCNN_LOGE("n = %u c = %u stride = %u", n, c, stride); + std::vector constants(3); + constants[0].u32 = n; + constants[1].u32 = c; + constants[2].u32 = stride; - VkMat dispatcher; - dispatcher.w = n; - dispatcher.h = c; - dispatcher.c = 1; + VkMat dispatcher; + dispatcher.w = n; + dispatcher.h = c; + dispatcher.c = 1; - if (elempack == 4 && out_elempack == 1) - { - cmd.record_pipeline(pipeline_packing_pack4to1, buffer_bindings, constants, dispatcher); + if (elempack == 4 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_packing_pack4to1, buffer_bindings, constants, dispatcher); + } } } diff --git a/src/layer/x86/cast_bf16.h b/src/layer/x86/cast_bf16.h index fbf6d8693f74..872a3ad9172f 100644 --- a/src/layer/x86/cast_bf16.h +++ b/src/layer/x86/cast_bf16.h @@ -35,13 +35,17 @@ static void cast_fp32_to_bf16_sse(const Mat& bottom_blob, Mat& top_blob, const O const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __SSE2__ @@ -107,13 +111,17 @@ static void cast_bf16_to_fp32_sse(const Mat& bottom_blob, Mat& top_blob, const O const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int 
q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __SSE2__ diff --git a/src/layer/x86/cast_fp16.h b/src/layer/x86/cast_fp16.h index 6739700ed2b6..85deabc6704d 100644 --- a/src/layer/x86/cast_fp16.h +++ b/src/layer/x86/cast_fp16.h @@ -22,13 +22,17 @@ static void cast_fp32_to_fp16_sse(const Mat& bottom_blob, Mat& top_blob, const O const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __F16C__ @@ -82,13 +86,17 @@ static void cast_fp16_to_fp32_sse(const Mat& bottom_blob, Mat& top_blob, const O const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __F16C__ diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp 
index 1d252acf4a54..86b2c17f0d81 100644 --- a/src/layer/x86/cast_x86.cpp +++ b/src/layer/x86/cast_x86.cpp @@ -36,6 +36,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; @@ -44,7 +45,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) { if (type_from == 3) { - Cast::forward(bottom_blob, top_blob, opt); + return Cast::forward(bottom_blob, top_blob, opt); } // float32 @@ -67,21 +68,13 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -99,11 +92,14 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = 
bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { diff --git a/src/layer/x86/packing_x86.cpp b/src/layer/x86/packing_x86.cpp index b6211419d84c..0ce08d11934c 100644 --- a/src/layer/x86/packing_x86.cpp +++ b/src/layer/x86/packing_x86.cpp @@ -70,6 +70,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -98,6 +99,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -106,21 +108,24 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __SSE2__ 
@@ -159,15 +164,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __SSE2__ @@ -206,19 +214,22 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 8); - const float* r1 = bottom_blob.row(i * 8 + 1); - const float* r2 = bottom_blob.row(i * 8 + 2); - const float* r3 = bottom_blob.row(i * 8 + 3); - const float* r4 = bottom_blob.row(i * 8 + 4); - const float* r5 = bottom_blob.row(i * 8 + 5); - const float* r6 = bottom_blob.row(i * 8 + 6); - const float* r7 = bottom_blob.row(i * 8 + 7); - - float* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 8); + const float* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const float* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const float* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const float* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const float* r7 = bottom_blob.batch(b).row(i * 
8 + 7); + + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __AVX__ @@ -269,19 +280,22 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); - - float* outptr0 = top_blob.row(i * 8); - float* outptr1 = top_blob.row(i * 8 + 1); - float* outptr2 = top_blob.row(i * 8 + 2); - float* outptr3 = top_blob.row(i * 8 + 3); - float* outptr4 = top_blob.row(i * 8 + 4); - float* outptr5 = top_blob.row(i * 8 + 5); - float* outptr6 = top_blob.row(i * 8 + 6); - float* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); + + float* outptr0 = top_blob.batch(b).row(i * 8); + float* outptr1 = top_blob.batch(b).row(i * 8 + 1); + float* outptr2 = top_blob.batch(b).row(i * 8 + 2); + float* outptr3 = top_blob.batch(b).row(i * 8 + 3); + float* outptr4 = top_blob.batch(b).row(i * 8 + 4); + float* outptr5 = top_blob.batch(b).row(i * 8 + 5); + float* outptr6 = top_blob.batch(b).row(i * 8 + 6); + float* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; #if __AVX__ @@ -333,13 +347,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 2); - const float* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 2); + const float* r1 = bottom_blob.batch(b).row(i * 2 + 1); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -360,13 +377,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& 
top_blob, const Option& op } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 2); - float* outptr1 = top_blob.row(i * 2 + 1); + float* outptr0 = top_blob.batch(b).row(i * 2); + float* outptr1 = top_blob.batch(b).row(i * 2 + 1); for (int j = 0; j < w; j++) { @@ -387,27 +407,30 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack1to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 16); - const float* r1 = bottom_blob.row(i * 16 + 1); - const float* r2 = bottom_blob.row(i * 16 + 2); - const float* r3 = bottom_blob.row(i * 16 + 3); - const float* r4 = bottom_blob.row(i * 16 + 4); - const float* r5 = bottom_blob.row(i * 16 + 5); - const float* r6 = bottom_blob.row(i * 16 + 6); - const float* r7 = bottom_blob.row(i * 16 + 7); - const float* r8 = bottom_blob.row(i * 16 + 8); - const float* r9 = bottom_blob.row(i * 16 + 9); - const float* ra = bottom_blob.row(i * 16 + 10); - const float* rb = bottom_blob.row(i * 16 + 11); - const float* rc = bottom_blob.row(i * 16 + 12); - const float* rd = bottom_blob.row(i * 16 + 13); - const float* re = bottom_blob.row(i * 16 + 14); - const float* rf = bottom_blob.row(i * 16 + 15); - - float* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 16); + const float* r1 = bottom_blob.batch(b).row(i * 16 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 16 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 16 + 3); + const float* r4 = bottom_blob.batch(b).row(i * 16 + 4); + const float* 
r5 = bottom_blob.batch(b).row(i * 16 + 5); + const float* r6 = bottom_blob.batch(b).row(i * 16 + 6); + const float* r7 = bottom_blob.batch(b).row(i * 16 + 7); + const float* r8 = bottom_blob.batch(b).row(i * 16 + 8); + const float* r9 = bottom_blob.batch(b).row(i * 16 + 9); + const float* ra = bottom_blob.batch(b).row(i * 16 + 10); + const float* rb = bottom_blob.batch(b).row(i * 16 + 11); + const float* rc = bottom_blob.batch(b).row(i * 16 + 12); + const float* rd = bottom_blob.batch(b).row(i * 16 + 13); + const float* re = bottom_blob.batch(b).row(i * 16 + 14); + const float* rf = bottom_blob.batch(b).row(i * 16 + 15); + + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __AVX512F__ @@ -490,27 +513,30 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); - - float* outptr0 = top_blob.row(i * 16); - float* outptr1 = top_blob.row(i * 16 + 1); - float* outptr2 = top_blob.row(i * 16 + 2); - float* outptr3 = top_blob.row(i * 16 + 3); - float* outptr4 = top_blob.row(i * 16 + 4); - float* outptr5 = top_blob.row(i * 16 + 5); - float* outptr6 = top_blob.row(i * 16 + 6); - float* outptr7 = top_blob.row(i * 16 + 7); - float* outptr8 = top_blob.row(i * 16 + 8); - float* outptr9 = top_blob.row(i * 16 + 9); - float* outptra = top_blob.row(i * 16 + 10); - float* outptrb = top_blob.row(i * 16 + 11); - float* outptrc = top_blob.row(i * 16 + 12); - float* outptrd = top_blob.row(i * 16 + 13); - float* outptre = top_blob.row(i * 16 + 14); - float* outptrf = top_blob.row(i * 16 + 15); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); + + float* outptr0 = top_blob.batch(b).row(i * 16); + float* outptr1 = top_blob.batch(b).row(i * 16 + 1); + float* outptr2 = top_blob.batch(b).row(i * 16 + 2); + 
float* outptr3 = top_blob.batch(b).row(i * 16 + 3); + float* outptr4 = top_blob.batch(b).row(i * 16 + 4); + float* outptr5 = top_blob.batch(b).row(i * 16 + 5); + float* outptr6 = top_blob.batch(b).row(i * 16 + 6); + float* outptr7 = top_blob.batch(b).row(i * 16 + 7); + float* outptr8 = top_blob.batch(b).row(i * 16 + 8); + float* outptr9 = top_blob.batch(b).row(i * 16 + 9); + float* outptra = top_blob.batch(b).row(i * 16 + 10); + float* outptrb = top_blob.batch(b).row(i * 16 + 11); + float* outptrc = top_blob.batch(b).row(i * 16 + 12); + float* outptrd = top_blob.batch(b).row(i * 16 + 13); + float* outptre = top_blob.batch(b).row(i * 16 + 14); + float* outptrf = top_blob.batch(b).row(i * 16 + 15); int j = 0; #if __AVX512F__ @@ -594,15 +620,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -633,15 +662,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* 
outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); for (int j = 0; j < w; j++) { @@ -672,13 +704,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 2); - const float* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 2); + const float* r1 = bottom_blob.batch(b).row(i * 2 + 1); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -707,13 +742,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to8) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 2); - float* outptr1 = top_blob.row(i * 2 + 1); + float* outptr0 = top_blob.batch(b).row(i * 2); + float* outptr1 = top_blob.batch(b).row(i * 2 + 1); for (int j = 0; j < w; j++) { @@ -751,23 +789,26 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, 
opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __SSE2__ @@ -806,15 +847,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __SSE2__ @@ -853,19 +897,22 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, 
const Option& op } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 8); - const float* r1 = bottom_blob.channel(q * 8 + 1); - const float* r2 = bottom_blob.channel(q * 8 + 2); - const float* r3 = bottom_blob.channel(q * 8 + 3); - const float* r4 = bottom_blob.channel(q * 8 + 4); - const float* r5 = bottom_blob.channel(q * 8 + 5); - const float* r6 = bottom_blob.channel(q * 8 + 6); - const float* r7 = bottom_blob.channel(q * 8 + 7); - - float* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 8); + const float* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const float* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const float* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const float* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const float* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __AVX__ @@ -916,19 +963,22 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); - - float* outptr0 = top_blob.channel(q * 8); - float* outptr1 = top_blob.channel(q * 8 + 1); - float* outptr2 = top_blob.channel(q * 8 + 2); - float* outptr3 = top_blob.channel(q * 8 + 3); - float* outptr4 = top_blob.channel(q * 8 + 4); - float* outptr5 = top_blob.channel(q * 8 + 5); - float* outptr6 = top_blob.channel(q * 8 + 6); - float* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; 
+ int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); + + float* outptr0 = top_blob.batch(b).channel(q * 8); + float* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + float* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + float* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + float* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + float* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; #if __AVX__ @@ -980,13 +1030,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 2); - const float* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 2); + const float* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -1007,13 +1060,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 2); - float* outptr1 = top_blob.channel(q * 2 + 1); + float* outptr0 = top_blob.batch(b).channel(q * 2); + float* outptr1 = top_blob.batch(b).channel(q * 2 + 1); for (int i = 0; i < size; i++) { @@ -1034,27 +1090,30 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const 
Option& op } if (pack1to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 16); - const float* r1 = bottom_blob.channel(q * 16 + 1); - const float* r2 = bottom_blob.channel(q * 16 + 2); - const float* r3 = bottom_blob.channel(q * 16 + 3); - const float* r4 = bottom_blob.channel(q * 16 + 4); - const float* r5 = bottom_blob.channel(q * 16 + 5); - const float* r6 = bottom_blob.channel(q * 16 + 6); - const float* r7 = bottom_blob.channel(q * 16 + 7); - const float* r8 = bottom_blob.channel(q * 16 + 8); - const float* r9 = bottom_blob.channel(q * 16 + 9); - const float* ra = bottom_blob.channel(q * 16 + 10); - const float* rb = bottom_blob.channel(q * 16 + 11); - const float* rc = bottom_blob.channel(q * 16 + 12); - const float* rd = bottom_blob.channel(q * 16 + 13); - const float* re = bottom_blob.channel(q * 16 + 14); - const float* rf = bottom_blob.channel(q * 16 + 15); - - float* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 16); + const float* r1 = bottom_blob.batch(b).channel(q * 16 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 16 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 16 + 3); + const float* r4 = bottom_blob.batch(b).channel(q * 16 + 4); + const float* r5 = bottom_blob.batch(b).channel(q * 16 + 5); + const float* r6 = bottom_blob.batch(b).channel(q * 16 + 6); + const float* r7 = bottom_blob.batch(b).channel(q * 16 + 7); + const float* r8 = bottom_blob.batch(b).channel(q * 16 + 8); + const float* r9 = bottom_blob.batch(b).channel(q * 16 + 9); + const float* ra = bottom_blob.batch(b).channel(q * 16 + 10); + const float* rb = bottom_blob.batch(b).channel(q * 16 + 11); + const float* rc = bottom_blob.batch(b).channel(q * 16 + 12); + const float* rd = bottom_blob.batch(b).channel(q * 16 + 13); + 
const float* re = bottom_blob.batch(b).channel(q * 16 + 14); + const float* rf = bottom_blob.batch(b).channel(q * 16 + 15); + + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __AVX512F__ @@ -1137,27 +1196,30 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); - - float* outptr0 = top_blob.channel(q * 16); - float* outptr1 = top_blob.channel(q * 16 + 1); - float* outptr2 = top_blob.channel(q * 16 + 2); - float* outptr3 = top_blob.channel(q * 16 + 3); - float* outptr4 = top_blob.channel(q * 16 + 4); - float* outptr5 = top_blob.channel(q * 16 + 5); - float* outptr6 = top_blob.channel(q * 16 + 6); - float* outptr7 = top_blob.channel(q * 16 + 7); - float* outptr8 = top_blob.channel(q * 16 + 8); - float* outptr9 = top_blob.channel(q * 16 + 9); - float* outptra = top_blob.channel(q * 16 + 10); - float* outptrb = top_blob.channel(q * 16 + 11); - float* outptrc = top_blob.channel(q * 16 + 12); - float* outptrd = top_blob.channel(q * 16 + 13); - float* outptre = top_blob.channel(q * 16 + 14); - float* outptrf = top_blob.channel(q * 16 + 15); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); + + float* outptr0 = top_blob.batch(b).channel(q * 16); + float* outptr1 = top_blob.batch(b).channel(q * 16 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 16 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 16 + 3); + float* outptr4 = top_blob.batch(b).channel(q * 16 + 4); + float* outptr5 = top_blob.batch(b).channel(q * 16 + 5); + float* outptr6 = top_blob.batch(b).channel(q * 16 + 6); + float* outptr7 = top_blob.batch(b).channel(q * 16 + 7); + float* outptr8 = top_blob.batch(b).channel(q * 16 + 8); + float* outptr9 = 
top_blob.batch(b).channel(q * 16 + 9); + float* outptra = top_blob.batch(b).channel(q * 16 + 10); + float* outptrb = top_blob.batch(b).channel(q * 16 + 11); + float* outptrc = top_blob.batch(b).channel(q * 16 + 12); + float* outptrd = top_blob.batch(b).channel(q * 16 + 13); + float* outptre = top_blob.batch(b).channel(q * 16 + 14); + float* outptrf = top_blob.batch(b).channel(q * 16 + 15); int i = 0; #if __AVX512F__ @@ -1241,15 +1303,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -1280,15 +1345,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 
4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); for (int i = 0; i < size; i++) { @@ -1319,13 +1387,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 2); - const float* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 2); + const float* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -1354,13 +1425,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to8) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 2); - float* outptr1 = top_blob.channel(q * 2 + 1); + float* outptr0 = top_blob.batch(b).channel(q * 2); + float* outptr1 = top_blob.batch(b).channel(q * 2 + 1); for (int i = 0; i < size; i++) { @@ -1433,6 +1507,7 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -1461,6 +1536,7 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * 
out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -1469,21 +1545,24 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 4); - const unsigned short* r1 = bottom_blob.row(i * 4 + 1); - const unsigned short* r2 = bottom_blob.row(i * 4 + 2); - const unsigned short* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 4); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 4 + 3); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; #if __SSE2__ @@ -1524,15 +1603,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 4); - unsigned short* outptr1 = top_blob.row(i * 4 + 1); - unsigned short* outptr2 = top_blob.row(i * 4 + 2); - unsigned short* outptr3 = top_blob.row(i * 4 + 3); + 
unsigned short* outptr0 = top_blob.batch(b).row(i * 4); + unsigned short* outptr1 = top_blob.batch(b).row(i * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __SSE2__ @@ -1573,19 +1655,22 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 8); - const unsigned short* r1 = bottom_blob.row(i * 8 + 1); - const unsigned short* r2 = bottom_blob.row(i * 8 + 2); - const unsigned short* r3 = bottom_blob.row(i * 8 + 3); - const unsigned short* r4 = bottom_blob.row(i * 8 + 4); - const unsigned short* r5 = bottom_blob.row(i * 8 + 5); - const unsigned short* r6 = bottom_blob.row(i * 8 + 6); - const unsigned short* r7 = bottom_blob.row(i * 8 + 7); - - unsigned short* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 8); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -1605,19 +1690,22 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const 
unsigned short* r0 = bottom_blob.row(i); - - unsigned short* outptr0 = top_blob.row(i * 8); - unsigned short* outptr1 = top_blob.row(i * 8 + 1); - unsigned short* outptr2 = top_blob.row(i * 8 + 2); - unsigned short* outptr3 = top_blob.row(i * 8 + 3); - unsigned short* outptr4 = top_blob.row(i * 8 + 4); - unsigned short* outptr5 = top_blob.row(i * 8 + 5); - unsigned short* outptr6 = top_blob.row(i * 8 + 6); - unsigned short* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); + + unsigned short* outptr0 = top_blob.batch(b).row(i * 8); + unsigned short* outptr1 = top_blob.batch(b).row(i * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).row(i * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).row(i * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).row(i * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -1637,13 +1725,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 2); - const unsigned short* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 2); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 2 + 1); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -1664,13 +1755,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int 
i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 2); - unsigned short* outptr1 = top_blob.row(i * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).row(i * 2); + unsigned short* outptr1 = top_blob.batch(b).row(i * 2 + 1); for (int j = 0; j < w; j++) { @@ -1691,27 +1785,30 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 16); - const unsigned short* r1 = bottom_blob.row(i * 16 + 1); - const unsigned short* r2 = bottom_blob.row(i * 16 + 2); - const unsigned short* r3 = bottom_blob.row(i * 16 + 3); - const unsigned short* r4 = bottom_blob.row(i * 16 + 4); - const unsigned short* r5 = bottom_blob.row(i * 16 + 5); - const unsigned short* r6 = bottom_blob.row(i * 16 + 6); - const unsigned short* r7 = bottom_blob.row(i * 16 + 7); - const unsigned short* r8 = bottom_blob.row(i * 16 + 8); - const unsigned short* r9 = bottom_blob.row(i * 16 + 9); - const unsigned short* ra = bottom_blob.row(i * 16 + 10); - const unsigned short* rb = bottom_blob.row(i * 16 + 11); - const unsigned short* rc = bottom_blob.row(i * 16 + 12); - const unsigned short* rd = bottom_blob.row(i * 16 + 13); - const unsigned short* re = bottom_blob.row(i * 16 + 14); - const unsigned short* rf = bottom_blob.row(i * 16 + 15); - - unsigned short* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 16); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 16 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 16 + 2); + const unsigned short* r3 = 
bottom_blob.batch(b).row(i * 16 + 3); + const unsigned short* r4 = bottom_blob.batch(b).row(i * 16 + 4); + const unsigned short* r5 = bottom_blob.batch(b).row(i * 16 + 5); + const unsigned short* r6 = bottom_blob.batch(b).row(i * 16 + 6); + const unsigned short* r7 = bottom_blob.batch(b).row(i * 16 + 7); + const unsigned short* r8 = bottom_blob.batch(b).row(i * 16 + 8); + const unsigned short* r9 = bottom_blob.batch(b).row(i * 16 + 9); + const unsigned short* ra = bottom_blob.batch(b).row(i * 16 + 10); + const unsigned short* rb = bottom_blob.batch(b).row(i * 16 + 11); + const unsigned short* rc = bottom_blob.batch(b).row(i * 16 + 12); + const unsigned short* rd = bottom_blob.batch(b).row(i * 16 + 13); + const unsigned short* re = bottom_blob.batch(b).row(i * 16 + 14); + const unsigned short* rf = bottom_blob.batch(b).row(i * 16 + 15); + + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -1739,27 +1836,30 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); - - unsigned short* outptr0 = top_blob.row(i * 16); - unsigned short* outptr1 = top_blob.row(i * 16 + 1); - unsigned short* outptr2 = top_blob.row(i * 16 + 2); - unsigned short* outptr3 = top_blob.row(i * 16 + 3); - unsigned short* outptr4 = top_blob.row(i * 16 + 4); - unsigned short* outptr5 = top_blob.row(i * 16 + 5); - unsigned short* outptr6 = top_blob.row(i * 16 + 6); - unsigned short* outptr7 = top_blob.row(i * 16 + 7); - unsigned short* outptr8 = top_blob.row(i * 16 + 8); - unsigned short* outptr9 = top_blob.row(i * 16 + 9); - unsigned short* outptra = top_blob.row(i * 16 + 10); - unsigned short* outptrb = top_blob.row(i * 16 + 11); - unsigned short* outptrc = top_blob.row(i * 16 + 12); - unsigned short* outptrd = 
top_blob.row(i * 16 + 13); - unsigned short* outptre = top_blob.row(i * 16 + 14); - unsigned short* outptrf = top_blob.row(i * 16 + 15); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); + + unsigned short* outptr0 = top_blob.batch(b).row(i * 16); + unsigned short* outptr1 = top_blob.batch(b).row(i * 16 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 16 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 16 + 3); + unsigned short* outptr4 = top_blob.batch(b).row(i * 16 + 4); + unsigned short* outptr5 = top_blob.batch(b).row(i * 16 + 5); + unsigned short* outptr6 = top_blob.batch(b).row(i * 16 + 6); + unsigned short* outptr7 = top_blob.batch(b).row(i * 16 + 7); + unsigned short* outptr8 = top_blob.batch(b).row(i * 16 + 8); + unsigned short* outptr9 = top_blob.batch(b).row(i * 16 + 9); + unsigned short* outptra = top_blob.batch(b).row(i * 16 + 10); + unsigned short* outptrb = top_blob.batch(b).row(i * 16 + 11); + unsigned short* outptrc = top_blob.batch(b).row(i * 16 + 12); + unsigned short* outptrd = top_blob.batch(b).row(i * 16 + 13); + unsigned short* outptre = top_blob.batch(b).row(i * 16 + 14); + unsigned short* outptrf = top_blob.batch(b).row(i * 16 + 15); int j = 0; for (; j < w; j++) @@ -1787,15 +1887,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 4); - const unsigned short* r1 = bottom_blob.row(i * 4 + 1); - const unsigned short* r2 = bottom_blob.row(i * 4 + 2); - const unsigned short* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 4); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const unsigned short* r2 = 
bottom_blob.batch(b).row(i * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 4 + 3); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -1826,15 +1929,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 4); - unsigned short* outptr1 = top_blob.row(i * 4 + 1); - unsigned short* outptr2 = top_blob.row(i * 4 + 2); - unsigned short* outptr3 = top_blob.row(i * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).row(i * 4); + unsigned short* outptr1 = top_blob.batch(b).row(i * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 4 + 3); for (int j = 0; j < w; j++) { @@ -1865,13 +1971,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 2); - const unsigned short* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 2); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 2 + 1); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -1900,13 +2009,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to8) { + const int total_bi = batch * h; 
#pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 2); - unsigned short* outptr1 = top_blob.row(i * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).row(i * 2); + unsigned short* outptr1 = top_blob.batch(b).row(i * 2 + 1); for (int j = 0; j < w; j++) { @@ -1944,23 +2056,26 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 4); - const unsigned short* r1 = bottom_blob.channel(q * 4 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 4 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 4); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __SSE2__ @@ -2000,15 +2115,18 @@ 
int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 4); - unsigned short* outptr1 = top_blob.channel(q * 4 + 1); - unsigned short* outptr2 = top_blob.channel(q * 4 + 2); - unsigned short* outptr3 = top_blob.channel(q * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 4); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __SSE2__ @@ -2048,19 +2166,22 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 8); - const unsigned short* r1 = bottom_blob.channel(q * 8 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 8 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 8 + 3); - const unsigned short* r4 = bottom_blob.channel(q * 8 + 4); - const unsigned short* r5 = bottom_blob.channel(q * 8 + 5); - const unsigned short* r6 = bottom_blob.channel(q * 8 + 6); - const unsigned short* r7 = bottom_blob.channel(q * 8 + 7); - - unsigned short* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 8); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const unsigned short* r2 = 
bottom_blob.batch(b).channel(q * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -2080,19 +2201,22 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); - - unsigned short* outptr0 = top_blob.channel(q * 8); - unsigned short* outptr1 = top_blob.channel(q * 8 + 1); - unsigned short* outptr2 = top_blob.channel(q * 8 + 2); - unsigned short* outptr3 = top_blob.channel(q * 8 + 3); - unsigned short* outptr4 = top_blob.channel(q * 8 + 4); - unsigned short* outptr5 = top_blob.channel(q * 8 + 5); - unsigned short* outptr6 = top_blob.channel(q * 8 + 6); - unsigned short* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); + + unsigned short* outptr0 = top_blob.batch(b).channel(q * 8); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) @@ -2112,13 +2236,16 @@ int 
Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 2); - const unsigned short* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 2); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -2139,13 +2266,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 2); - unsigned short* outptr1 = top_blob.channel(q * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 2); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 2 + 1); for (int i = 0; i < size; i++) { @@ -2166,27 +2296,30 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 16); - const unsigned short* r1 = bottom_blob.channel(q * 16 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 16 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 16 + 3); - const unsigned short* r4 = 
bottom_blob.channel(q * 16 + 4); - const unsigned short* r5 = bottom_blob.channel(q * 16 + 5); - const unsigned short* r6 = bottom_blob.channel(q * 16 + 6); - const unsigned short* r7 = bottom_blob.channel(q * 16 + 7); - const unsigned short* r8 = bottom_blob.channel(q * 16 + 8); - const unsigned short* r9 = bottom_blob.channel(q * 16 + 9); - const unsigned short* ra = bottom_blob.channel(q * 16 + 10); - const unsigned short* rb = bottom_blob.channel(q * 16 + 11); - const unsigned short* rc = bottom_blob.channel(q * 16 + 12); - const unsigned short* rd = bottom_blob.channel(q * 16 + 13); - const unsigned short* re = bottom_blob.channel(q * 16 + 14); - const unsigned short* rf = bottom_blob.channel(q * 16 + 15); - - unsigned short* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 16); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 16 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 16 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 16 + 3); + const unsigned short* r4 = bottom_blob.batch(b).channel(q * 16 + 4); + const unsigned short* r5 = bottom_blob.batch(b).channel(q * 16 + 5); + const unsigned short* r6 = bottom_blob.batch(b).channel(q * 16 + 6); + const unsigned short* r7 = bottom_blob.batch(b).channel(q * 16 + 7); + const unsigned short* r8 = bottom_blob.batch(b).channel(q * 16 + 8); + const unsigned short* r9 = bottom_blob.batch(b).channel(q * 16 + 9); + const unsigned short* ra = bottom_blob.batch(b).channel(q * 16 + 10); + const unsigned short* rb = bottom_blob.batch(b).channel(q * 16 + 11); + const unsigned short* rc = bottom_blob.batch(b).channel(q * 16 + 12); + const unsigned short* rd = bottom_blob.batch(b).channel(q * 16 + 13); + const unsigned short* re = bottom_blob.batch(b).channel(q * 16 + 14); + const unsigned short* rf = bottom_blob.batch(b).channel(q * 16 + 15); + + unsigned short* outptr = 
top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -2214,27 +2347,30 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); - - unsigned short* outptr0 = top_blob.channel(q * 16); - unsigned short* outptr1 = top_blob.channel(q * 16 + 1); - unsigned short* outptr2 = top_blob.channel(q * 16 + 2); - unsigned short* outptr3 = top_blob.channel(q * 16 + 3); - unsigned short* outptr4 = top_blob.channel(q * 16 + 4); - unsigned short* outptr5 = top_blob.channel(q * 16 + 5); - unsigned short* outptr6 = top_blob.channel(q * 16 + 6); - unsigned short* outptr7 = top_blob.channel(q * 16 + 7); - unsigned short* outptr8 = top_blob.channel(q * 16 + 8); - unsigned short* outptr9 = top_blob.channel(q * 16 + 9); - unsigned short* outptra = top_blob.channel(q * 16 + 10); - unsigned short* outptrb = top_blob.channel(q * 16 + 11); - unsigned short* outptrc = top_blob.channel(q * 16 + 12); - unsigned short* outptrd = top_blob.channel(q * 16 + 13); - unsigned short* outptre = top_blob.channel(q * 16 + 14); - unsigned short* outptrf = top_blob.channel(q * 16 + 15); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); + + unsigned short* outptr0 = top_blob.batch(b).channel(q * 16); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 16 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 16 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 16 + 3); + unsigned short* outptr4 = top_blob.batch(b).channel(q * 16 + 4); + unsigned short* outptr5 = top_blob.batch(b).channel(q * 16 + 5); + unsigned short* outptr6 = top_blob.batch(b).channel(q * 16 + 6); + unsigned short* outptr7 = top_blob.batch(b).channel(q * 16 + 7); + 
unsigned short* outptr8 = top_blob.batch(b).channel(q * 16 + 8); + unsigned short* outptr9 = top_blob.batch(b).channel(q * 16 + 9); + unsigned short* outptra = top_blob.batch(b).channel(q * 16 + 10); + unsigned short* outptrb = top_blob.batch(b).channel(q * 16 + 11); + unsigned short* outptrc = top_blob.batch(b).channel(q * 16 + 12); + unsigned short* outptrd = top_blob.batch(b).channel(q * 16 + 13); + unsigned short* outptre = top_blob.batch(b).channel(q * 16 + 14); + unsigned short* outptrf = top_blob.batch(b).channel(q * 16 + 15); int i = 0; for (; i < size; i++) @@ -2262,15 +2398,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 4); - const unsigned short* r1 = bottom_blob.channel(q * 4 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 4 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 4); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -2301,15 +2440,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = 
bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 4); - unsigned short* outptr1 = top_blob.channel(q * 4 + 1); - unsigned short* outptr2 = top_blob.channel(q * 4 + 2); - unsigned short* outptr3 = top_blob.channel(q * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 4); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 4 + 3); for (int i = 0; i < size; i++) { @@ -2340,13 +2482,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 2); - const unsigned short* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 2); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -2375,13 +2520,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to8) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 2); - unsigned short* outptr1 = top_blob.channel(q * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 2); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 2 + 1); for (int i = 0; i < 
size; i++) { @@ -2444,6 +2592,7 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -2472,6 +2621,7 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -2480,25 +2630,28 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const 
signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -2518,19 +2671,22 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -2559,27 +2715,30 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, 
out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q * 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -2599,19 +2758,22 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = 
top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git a/src/mat.cpp b/src/mat.cpp index f066a15c0417..052bdbbd1156 100644 --- a/src/mat.cpp +++ b/src/mat.cpp @@ -22,6 +22,25 @@ Mat Mat::clone(Allocator* _allocator) const return Mat(); Mat m; + if (n > 1) + { + m.create_like_batch(*this, n, _allocator); + + if (m.empty()) + return m; + + // copy batch by batch (nstep may include 4K padding) + size_t single_batch_size = cstep * c * elemsize; + for (int b = 0; b < n; b++) + { + const void* src = (const unsigned char*)data + nstep * b * elemsize; + void* dst = (unsigned char*)m.data + m.nstep * b * elemsize; + memcpy(dst, src, single_batch_size); + } + + return m; + } + if (dims == 1) m.create(w, elemsize, elempack, _allocator); else if (dims == 2) @@ -501,6 +520,12 @@ void Mat::create(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack void Mat::create_like(const Mat& m, Allocator* _allocator) { + if (m.n > 1) + { + create_like_batch(m, m.n, _allocator); + return; + } + int _dims = m.dims; if (_dims == 1) create(m.w, m.elemsize, m.elempack, _allocator); @@ -512,6 +537,178 
@@ void Mat::create_like(const Mat& m, Allocator* _allocator) create(m.w, m.h, m.d, m.c, m.elemsize, m.elempack, _allocator); } +void Mat::create_like_batch(const Mat& m, int _batch, Allocator* _allocator) +{ + if (m.dims == 1) + create_batch(m.w, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 2) + create_batch(m.w, m.h, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 3) + create_batch(m.w, m.h, m.c, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 4) + create_batch(m.w, m.h, m.d, m.c, _batch, m.elemsize, m.elempack, _allocator); +} + +void Mat::create_batch(int _w, int _batch, size_t _elemsize, int _elempack, Allocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + d = 1; + c = 1; + n = _batch; + + cstep = alignSize((size_t)w * elemsize, 16) / elemsize; + nstep = alignSize(cstep * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + } + + if (data) + { + refcount = (int*)(((unsigned char*)data) + totalsize); + *refcount = 1; + } +} + +void Mat::create_batch(int _w, int _h, int _batch, size_t _elemsize, int _elempack, Allocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + d = 1; + c = 1; + n = _batch; + + cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = alignSize(cstep * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + if (allocator) + data = 
allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + } + + if (data) + { + refcount = (int*)(((unsigned char*)data) + totalsize); + *refcount = 1; + } +} + +void Mat::create_batch(int _w, int _h, int _c, int _batch, size_t _elemsize, int _elempack, Allocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _c, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + d = 1; + c = _c; + n = _batch; + + cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = alignSize(cstep * c * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + } + + if (data) + { + refcount = (int*)(((unsigned char*)data) + totalsize); + *refcount = 1; + } +} + +void Mat::create_batch(int _w, int _h, int _d, int _c, int _batch, size_t _elemsize, int _elempack, Allocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 4; + w = _w; + h = _h; + d = _d; + c = _c; + n = _batch; + + cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; + nstep = alignSize(cstep * c * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + } + + if (data) + { + refcount = (int*)(((unsigned char*)data) + totalsize); + *refcount = 1; + } +} + #if NCNN_VULKAN void Mat::create_like(const VkMat& m, Allocator* _allocator) { @@ -526,6 
+723,18 @@ void Mat::create_like(const VkMat& m, Allocator* _allocator) create(m.w, m.h, m.d, m.c, m.elemsize, m.elempack, _allocator); } +void Mat::create_like_batch(const VkMat& m, int _batch, Allocator* _allocator) +{ + if (m.dims == 1) + create_batch(m.w, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 2) + create_batch(m.w, m.h, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 3) + create_batch(m.w, m.h, m.c, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 4) + create_batch(m.w, m.h, m.d, m.c, _batch, m.elemsize, m.elempack, _allocator); +} + void Mat::create_like(const VkImageMat& im, Allocator* _allocator) { int _dims = im.dims; @@ -571,6 +780,7 @@ void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -604,6 +814,7 @@ void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -637,6 +848,7 @@ void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _alloc { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -670,6 +882,7 @@ void VkMat::create(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -703,6 +916,7 @@ void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _alloca { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -736,6 +950,7 @@ void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* { refcount = (int*)((unsigned 
char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -769,6 +984,7 @@ void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAl { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -802,11 +1018,18 @@ void VkMat::create(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempa { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } void VkMat::create_like(const Mat& m, VkAllocator* _allocator) { + if (m.n > 1) + { + create_like_batch(m, m.n, _allocator); + return; + } + int _dims = m.dims; if (_dims == 1) create(m.w, m.elemsize, m.elempack, _allocator); @@ -820,6 +1043,12 @@ void VkMat::create_like(const Mat& m, VkAllocator* _allocator) void VkMat::create_like(const VkMat& m, VkAllocator* _allocator) { + if (m.n > 1) + { + create_like_batch(m, m.n, _allocator); + return; + } + int _dims = m.dims; if (_dims == 1) create(m.w, m.elemsize, m.elempack, _allocator); @@ -831,6 +1060,30 @@ void VkMat::create_like(const VkMat& m, VkAllocator* _allocator) create(m.w, m.h, m.d, m.c, m.elemsize, m.elempack, _allocator); } +void VkMat::create_like_batch(const Mat& m, int _batch, VkAllocator* _allocator) +{ + if (m.dims == 1) + create_batch(m.w, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 2) + create_batch(m.w, m.h, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 3) + create_batch(m.w, m.h, m.c, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 4) + create_batch(m.w, m.h, m.d, m.c, _batch, m.elemsize, m.elempack, _allocator); +} + +void VkMat::create_like_batch(const VkMat& m, int _batch, VkAllocator* _allocator) +{ + if (m.dims == 1) + create_batch(m.w, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 2) + create_batch(m.w, m.h, _batch, m.elemsize, m.elempack, 
_allocator); + else if (m.dims == 3) + create_batch(m.w, m.h, m.c, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 4) + create_batch(m.w, m.h, m.d, m.c, _batch, m.elemsize, m.elempack, _allocator); +} + void VkMat::create_like(const VkImageMat& im, VkAllocator* _allocator) { int _dims = im.dims; @@ -844,6 +1097,154 @@ void VkMat::create_like(const VkImageMat& im, VkAllocator* _allocator) create(im.w, im.h, im.d, im.c, im.elemsize, im.elempack, _allocator); } +void VkMat::create_batch(int _w, int _batch, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + d = 1; + c = 1; + n = _batch; + + cstep = alignSize((size_t)w * elemsize, 16) / elemsize; + nstep = alignSize(cstep * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + data = allocator->fastMalloc(totalsize); + } + + if (data) + { + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +void VkMat::create_batch(int _w, int _h, int _batch, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + d = 1; + c = 1; + n = _batch; + + cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = alignSize(cstep * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + data = allocator->fastMalloc(totalsize); + } + + if (data) + { + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +void VkMat::create_batch(int _w, int _h, int _c, int _batch, 
size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _c, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + d = 1; + c = _c; + n = _batch; + + cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = alignSize(cstep * c * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + data = allocator->fastMalloc(totalsize); + } + + if (data) + { + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +void VkMat::create_batch(int _w, int _h, int _d, int _c, int _batch, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 4; + w = _w; + h = _h; + d = _d; + c = _c; + n = _batch; + + cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; + nstep = alignSize(cstep * c * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + data = allocator->fastMalloc(totalsize); + } + + if (data) + { + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + void VkImageMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) { if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator) diff --git a/src/mat.h b/src/mat.h index 9e353aa61d4e..1fc24af5f5e3 100644 --- a/src/mat.h +++ b/src/mat.h @@ -163,9 +163,21 @@ class NCNN_EXPORT Mat void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0); // allocate like void create_like(const Mat& m, Allocator* allocator = 0); + // allocate like with batch count, 
copying shape from m + void create_like_batch(const Mat& m, int batch, Allocator* allocator = 0); + // allocate batch vec + void create_batch(int w, int batch, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate batch image + void create_batch(int w, int h, int batch, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate batch dim + void create_batch(int w, int h, int c, int batch, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate batch packed cube + void create_batch(int w, int h, int d, int c, int batch, size_t elemsize, int elempack, Allocator* allocator = 0); #if NCNN_VULKAN // allocate like void create_like(const VkMat& m, Allocator* allocator = 0); + // allocate like with batch count, copying shape from VkMat + void create_like_batch(const VkMat& m, int batch, Allocator* allocator = 0); // allocate like void create_like(const VkImageMat& im, Allocator* allocator = 0); #endif // NCNN_VULKAN @@ -205,6 +217,12 @@ class NCNN_EXPORT Mat Mat range(int x, int n); const Mat range(int x, int n) const; + // batch reference + Mat batch(int b); + const Mat batch(int b) const; + Mat batch_range(int b, int batches); + const Mat batch_range(int b, int batches) const; + // access raw data template operator T*(); @@ -333,6 +351,11 @@ class NCNN_EXPORT Mat int c; size_t cstep; + + // batch count, default 1 + int n; + // element step from one batch to the next (4K-byte aligned) + size_t nstep; }; #if NCNN_VULKAN @@ -401,8 +424,26 @@ class NCNN_EXPORT VkMat void create_like(const Mat& m, VkAllocator* allocator); // allocate like void create_like(const VkMat& m, VkAllocator* allocator); + // allocate like with batch count, copying shape from m + void create_like_batch(const Mat& m, int batch, VkAllocator* allocator); + // allocate like with batch count, copying shape from VkMat + void create_like_batch(const VkMat& m, int batch, VkAllocator* allocator); // allocate like void create_like(const VkImageMat& im, VkAllocator* 
allocator); + // allocate batch vec + void create_batch(int w, int batch, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate batch image + void create_batch(int w, int h, int batch, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate batch dim + void create_batch(int w, int h, int c, int batch, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate batch packed cube + void create_batch(int w, int h, int d, int c, int batch, size_t elemsize, int elempack, VkAllocator* allocator); + + // batch reference + VkMat batch(int b); + const VkMat batch(int b) const; + VkMat batch_range(int b, int batches); + const VkMat batch_range(int b, int batches) const; // mapped Mat mapped() const; @@ -459,6 +500,14 @@ class NCNN_EXPORT VkMat int c; size_t cstep; + + // batch count, default 1 + int n; + // element step from one batch to the next (4K-byte aligned) + // for non-batch VkMat, equals data->capacity / elemsize + size_t nstep; + // byte offset relative to data->offset (for batch sub-views) + size_t offset; }; class NCNN_EXPORT VkImageMat @@ -797,108 +846,108 @@ NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scal NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option()); NCNN_FORCEINLINE Mat::Mat() - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { } NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), 
nstep(0) { create(_w, _elemsize, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _elemsize, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _c, _elemsize, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _d, _c, _elemsize, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, 
Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _c, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE Mat::Mat(const Mat& m) - : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep), n(m.n), nstep(m.nstep) { addref(); } NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), n(1), nstep(0) { cstep = alignSize(w * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), n(1), nstep(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int 
_c, void* _data, size_t _elemsize, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), n(1), nstep(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), n(1), nstep(0) { cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), n(1), nstep(0) { cstep = alignSize(w * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), n(1), nstep(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), 
elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), n(1), nstep(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), n(1), nstep(0) { cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; } @@ -1192,6 +1241,9 @@ NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m) cstep = m.cstep; + n = m.n; + nstep = m.nstep; + return *this; } @@ -1224,6 +1276,9 @@ NCNN_FORCEINLINE void Mat::release() cstep = 0; + n = 1; + nstep = 0; + refcount = 0; } @@ -1366,6 +1421,42 @@ NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const return m; } +NCNN_FORCEINLINE Mat Mat::batch(int b) +{ + Mat m(w, h, d, c, (unsigned char*)data + nstep * b * elemsize, elemsize, elempack, allocator); + m.dims = dims; + m.cstep = cstep; + return m; +} + +NCNN_FORCEINLINE const Mat Mat::batch(int b) const +{ + Mat m(w, h, d, c, (unsigned char*)data + nstep * b * elemsize, elemsize, elempack, allocator); + m.dims = dims; + m.cstep = cstep; + return m; +} + +NCNN_FORCEINLINE Mat Mat::batch_range(int b, int batches) +{ + Mat m(w, h, d, c, (unsigned char*)data + nstep * b * elemsize, elemsize, elempack, allocator); + m.dims = dims; + m.cstep = cstep; + m.n = batches; + m.nstep = nstep; + return m; +} + +NCNN_FORCEINLINE const Mat Mat::batch_range(int b, int batches) const +{ + Mat m(w, h, d, c, (unsigned char*)data + nstep * b * elemsize, elemsize, elempack, allocator); + m.dims = dims; + m.cstep = cstep; + m.n = batches; + m.nstep = nstep; + return m; +} + template NCNN_FORCEINLINE Mat::operator T*() { @@ -1391,54 +1482,54 @@ NCNN_FORCEINLINE const float& 
Mat::operator[](size_t i) const #if NCNN_VULKAN NCNN_FORCEINLINE VkMat::VkMat() - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { } NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _elemsize, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _elemsize, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _c, _elemsize, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _d, _c, _elemsize, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), 
allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _c, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); } @@ -1449,54 +1540,65 @@ NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m) addref(); cstep = m.cstep; + n = m.n; + nstep = m.nstep; + offset = m.offset; } NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), n(1), nstep(0), 
offset(0) { cstep = alignSize(w * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), n(1), nstep(0), offset(0) { cstep = 
alignSize(w * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::~VkMat() @@ -1528,6 +1630,10 @@ NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m) cstep = m.cstep; + n = m.n; + nstep = m.nstep; + offset = m.offset; + return *this; } @@ -1556,7 +1662,7 @@ NCNN_FORCEINLINE void* VkMat::mapped_ptr() const if (!allocator->mappable) return 0; - return (unsigned 
char*)data->mapped_ptr + data->offset; + return (unsigned char*)data->mapped_ptr + data->offset + offset; } NCNN_FORCEINLINE void VkMat::addref() @@ -1588,6 +1694,10 @@ NCNN_FORCEINLINE void VkMat::release() cstep = 0; + n = 1; + nstep = 0; + offset = 0; + refcount = 0; } @@ -1627,12 +1737,92 @@ NCNN_FORCEINLINE VkBuffer VkMat::buffer() const NCNN_FORCEINLINE size_t VkMat::buffer_offset() const { - return data->offset; + return data->offset + offset; } NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const { - return data->capacity; + return nstep * elemsize; +} + +NCNN_FORCEINLINE VkMat VkMat::batch(int b) +{ + VkMat m; + m.data = data; + m.refcount = 0; + m.elemsize = elemsize; + m.elempack = elempack; + m.allocator = allocator; + m.dims = dims; + m.w = w; + m.h = h; + m.d = d; + m.c = c; + m.cstep = cstep; + m.n = 1; + m.nstep = nstep; + m.offset = offset + nstep * b * elemsize; + return m; +} + +NCNN_FORCEINLINE const VkMat VkMat::batch(int b) const +{ + VkMat m; + m.data = data; + m.refcount = 0; + m.elemsize = elemsize; + m.elempack = elempack; + m.allocator = allocator; + m.dims = dims; + m.w = w; + m.h = h; + m.d = d; + m.c = c; + m.cstep = cstep; + m.n = 1; + m.nstep = nstep; + m.offset = offset + nstep * b * elemsize; + return m; +} + +NCNN_FORCEINLINE VkMat VkMat::batch_range(int b, int batches) +{ + VkMat m; + m.data = data; + m.refcount = 0; + m.elemsize = elemsize; + m.elempack = elempack; + m.allocator = allocator; + m.dims = dims; + m.w = w; + m.h = h; + m.d = d; + m.c = c; + m.cstep = cstep; + m.n = batches; + m.nstep = nstep; + m.offset = offset + nstep * b * elemsize; + return m; +} + +NCNN_FORCEINLINE const VkMat VkMat::batch_range(int b, int batches) const +{ + VkMat m; + m.data = data; + m.refcount = 0; + m.elemsize = elemsize; + m.elempack = elempack; + m.allocator = allocator; + m.dims = dims; + m.w = w; + m.h = h; + m.d = d; + m.c = c; + m.cstep = cstep; + m.n = batches; + m.nstep = nstep; + m.offset = offset + nstep * b * elemsize; + 
return m; } NCNN_FORCEINLINE VkImageMat::VkImageMat() diff --git a/src/net.cpp b/src/net.cpp index 4394132040ef..de91cac424e3 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -629,33 +629,86 @@ int NetPrivate::do_forward_layer(const Layer* layer, std::vector& blob_mats if (ret != 0) return ret; - // forward - if (opt.lightmode && layer->support_inplace) + // batch forward + if (bottom_blob.n > 1) { - Mat& bottom_top_blob = bottom_blob; - int ret = layer->forward_inplace(bottom_top_blob, opt); - if (ret != 0) - return ret; + const int B = bottom_blob.n; + + if (opt.lightmode && layer->support_inplace) + { + for (int b = 0; b < B; b++) + { + Mat batch_view = bottom_blob.batch(b); + int ret = layer->forward_inplace(batch_view, opt); + if (ret != 0) + return ret; + } - // store top blob - blob_mats[top_blob_index] = bottom_top_blob; + // store top blob (whole batch, inplace modified) + blob_mats[top_blob_index] = bottom_blob; + } + else + { + Mat top_batch; + for (int b = 0; b < B; b++) + { + Mat bottom_b = bottom_blob.batch(b); + Mat top_b; + int ret = layer->forward(bottom_b, top_b, opt); + if (ret != 0) + return ret; + + if (b == 0) + { + top_batch.create_like_batch(top_b, B, opt.blob_allocator); + if (top_batch.empty()) + return -100; + } + + size_t batch_data_size = top_b.cstep * top_b.c * top_b.elemsize; + memcpy(top_batch.batch(b).data, top_b.data, batch_data_size); + } + + // store top blob + blob_mats[top_blob_index] = top_batch; + } + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + } } else { - Mat top_blob; - int ret = layer->forward(bottom_blob, top_blob, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + Mat& bottom_top_blob = bottom_blob; + int ret = layer->forward_inplace(bottom_top_blob, opt); + if (ret != 0) + return ret; - // store top blob - blob_mats[top_blob_index] = top_blob; - } + // store top blob + blob_mats[top_blob_index] = 
bottom_top_blob; + } + else + { + Mat top_blob; + int ret = layer->forward(bottom_blob, top_blob, opt); + if (ret != 0) + return ret; - if (opt.lightmode) - { - // delete after taken in light mode - blob_mats[bottom_blob_index].release(); - } + // store top blob + blob_mats[top_blob_index] = top_blob; + } + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + } + } // n == 1 } else { @@ -687,48 +740,135 @@ int NetPrivate::do_forward_layer(const Layer* layer, std::vector& blob_mats return ret; } - // forward - if (opt.lightmode && layer->support_inplace) + // detect batch + int B = 1; + for (size_t i = 0; i < bottom_blobs.size(); i++) { - std::vector& bottom_top_blobs = bottom_blobs; - int ret = layer->forward_inplace(bottom_top_blobs, opt); - if (ret != 0) - return ret; + if (bottom_blobs[i].n > 1) + { + B = bottom_blobs[i].n; + break; + } + } - // store top blobs - for (size_t i = 0; i < layer->tops.size(); i++) + if (B > 1) + { + if (opt.lightmode && layer->support_inplace) { - int top_blob_index = layer->tops[i]; + for (int b = 0; b < B; b++) + { + std::vector batch_views(bottom_blobs.size()); + for (size_t i = 0; i < bottom_blobs.size(); i++) + { + batch_views[i] = bottom_blobs[i].n > 1 ? bottom_blobs[i].batch(b) : bottom_blobs[i]; + } + int ret = layer->forward_inplace(batch_views, opt); + if (ret != 0) + return ret; + } - blob_mats[top_blob_index] = bottom_top_blobs[i]; + // store top blobs (whole batch, inplace modified) + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; + blob_mats[top_blob_index] = bottom_blobs[i]; + } + } + else + { + std::vector top_batches(layer->tops.size()); + for (int b = 0; b < B; b++) + { + std::vector bottom_b(bottom_blobs.size()); + for (size_t i = 0; i < bottom_blobs.size(); i++) + { + bottom_b[i] = bottom_blobs[i].n > 1 ? 
bottom_blobs[i].batch(b) : bottom_blobs[i]; + } + + std::vector top_b(layer->tops.size()); + int ret = layer->forward(bottom_b, top_b, opt); + if (ret != 0) + return ret; + + if (b == 0) + { + for (size_t i = 0; i < top_b.size(); i++) + { + top_batches[i].create_like_batch(top_b[i], B, opt.blob_allocator); + if (top_batches[i].empty()) + return -100; + } + } + + for (size_t i = 0; i < top_b.size(); i++) + { + size_t batch_data_size = top_b[i].cstep * top_b[i].c * top_b[i].elemsize; + memcpy(top_batches[i].batch(b).data, top_b[i].data, batch_data_size); + } + } + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; + blob_mats[top_blob_index] = top_batches[i]; + } + } + + if (opt.lightmode) + { + for (size_t i = 0; i < layer->bottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + blob_mats[bottom_blob_index].release(); + } } } else { - std::vector top_blobs(layer->tops.size()); - int ret = layer->forward(bottom_blobs, top_blobs, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + std::vector& bottom_top_blobs = bottom_blobs; + int ret = layer->forward_inplace(bottom_top_blobs, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; - // store top blobs - for (size_t i = 0; i < layer->tops.size(); i++) + blob_mats[top_blob_index] = bottom_top_blobs[i]; + } + } + else { - int top_blob_index = layer->tops[i]; + std::vector top_blobs(layer->tops.size()); + int ret = layer->forward(bottom_blobs, top_blobs, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; - blob_mats[top_blob_index] = top_blobs[i]; + blob_mats[top_blob_index] = top_blobs[i]; + } } - } - if (opt.lightmode) - { - for (size_t i = 0; i < layer->bottoms.size(); i++) + if (opt.lightmode) { 
- int bottom_blob_index = layer->bottoms[i]; + for (size_t i = 0; i < layer->bottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; - // delete after taken in light mode - blob_mats[bottom_blob_index].release(); + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + } } - } + } // B == 1 } return 0; @@ -764,33 +904,83 @@ int NetPrivate::do_forward_layer(const Layer* layer, std::vector& blob_ma if (ret != 0) return ret; - // forward - if (opt.lightmode && layer->support_inplace) + // batch forward + if (bottom_blob.n > 1) { - VkMat& bottom_top_blob = bottom_blob; - int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); - if (ret != 0) - return ret; + const int B = bottom_blob.n; + + if (opt.lightmode && layer->support_inplace) + { + for (int b = 0; b < B; b++) + { + VkMat batch_view = bottom_blob.batch(b); + int ret = layer->forward_inplace(batch_view, cmd, opt); + if (ret != 0) + return ret; + } - // store top blob - blob_mats_gpu[top_blob_index] = bottom_top_blob; + blob_mats_gpu[top_blob_index] = bottom_blob; + } + else + { + VkMat top_batch; + for (int b = 0; b < B; b++) + { + VkMat bottom_b = bottom_blob.batch(b); + VkMat top_b; + int ret = layer->forward(bottom_b, top_b, cmd, opt); + if (ret != 0) + return ret; + + if (b == 0) + { + top_batch.create_like_batch(top_b, B, opt.blob_vkallocator); + if (top_batch.empty()) + return -100; + } + + VkMat top_batch_slot = top_batch.batch(b); + cmd.record_clone(top_b, top_batch_slot, opt); + } + + blob_mats_gpu[top_blob_index] = top_batch; + } + + if (opt.lightmode) + { + blob_mats_gpu[bottom_blob_index].release(); + } } else { - VkMat top_blob; - int ret = layer->forward(bottom_blob, top_blob, cmd, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + VkMat& bottom_top_blob = bottom_blob; + int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); + if (ret != 0) + return ret; - // store top blob - 
blob_mats_gpu[top_blob_index] = top_blob; - } + // store top blob + blob_mats_gpu[top_blob_index] = bottom_top_blob; + } + else + { + VkMat top_blob; + int ret = layer->forward(bottom_blob, top_blob, cmd, opt); + if (ret != 0) + return ret; - if (opt.lightmode) - { - // delete after taken in light mode - blob_mats_gpu[bottom_blob_index].release(); - } + // store top blob + blob_mats_gpu[top_blob_index] = top_blob; + } + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } + } // n == 1 } else { @@ -822,48 +1012,133 @@ int NetPrivate::do_forward_layer(const Layer* layer, std::vector& blob_ma return ret; } - // forward - if (opt.lightmode && layer->support_inplace) + // detect batch + int B = 1; + for (size_t i = 0; i < bottom_blobs.size(); i++) { - std::vector& bottom_top_blobs = bottom_blobs; - int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); - if (ret != 0) - return ret; + if (bottom_blobs[i].n > 1) + { + B = bottom_blobs[i].n; + break; + } + } - // store top blobs - for (size_t i = 0; i < layer->tops.size(); i++) + if (B > 1) + { + if (opt.lightmode && layer->support_inplace) { - int top_blob_index = layer->tops[i]; + for (int b = 0; b < B; b++) + { + std::vector batch_views(bottom_blobs.size()); + for (size_t i = 0; i < bottom_blobs.size(); i++) + { + batch_views[i] = bottom_blobs[i].n > 1 ? bottom_blobs[i].batch(b) : bottom_blobs[i]; + } + int ret = layer->forward_inplace(batch_views, cmd, opt); + if (ret != 0) + return ret; + } - blob_mats_gpu[top_blob_index] = bottom_top_blobs[i]; + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; + blob_mats_gpu[top_blob_index] = bottom_blobs[i]; + } + } + else + { + std::vector top_batches(layer->tops.size()); + for (int b = 0; b < B; b++) + { + std::vector bottom_b(bottom_blobs.size()); + for (size_t i = 0; i < bottom_blobs.size(); i++) + { + bottom_b[i] = bottom_blobs[i].n > 1 ? 
bottom_blobs[i].batch(b) : bottom_blobs[i]; + } + + std::vector top_b(layer->tops.size()); + int ret = layer->forward(bottom_b, top_b, cmd, opt); + if (ret != 0) + return ret; + + if (b == 0) + { + for (size_t i = 0; i < top_b.size(); i++) + { + top_batches[i].create_like_batch(top_b[i], B, opt.blob_vkallocator); + if (top_batches[i].empty()) + return -100; + } + } + + for (size_t i = 0; i < top_b.size(); i++) + { + VkMat top_batch_slot = top_batches[i].batch(b); + cmd.record_clone(top_b[i], top_batch_slot, opt); + } + } + + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; + blob_mats_gpu[top_blob_index] = top_batches[i]; + } + } + + if (opt.lightmode) + { + for (size_t i = 0; i < layer->bottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + blob_mats_gpu[bottom_blob_index].release(); + } } } else { - std::vector top_blobs(layer->tops.size()); - int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + std::vector& bottom_top_blobs = bottom_blobs; + int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; - // store top blobs - for (size_t i = 0; i < layer->tops.size(); i++) + blob_mats_gpu[top_blob_index] = bottom_top_blobs[i]; + } + } + else { - int top_blob_index = layer->tops[i]; + std::vector top_blobs(layer->tops.size()); + int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; - blob_mats_gpu[top_blob_index] = top_blobs[i]; + blob_mats_gpu[top_blob_index] = top_blobs[i]; + } } - } - if (opt.lightmode) - { - for (size_t i = 0; i < layer->bottoms.size(); i++) + if (opt.lightmode) { - int bottom_blob_index = 
layer->bottoms[i]; + for (size_t i = 0; i < layer->bottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; - // delete after taken in light mode - blob_mats_gpu[bottom_blob_index].release(); + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } } - } + } // B == 1 } return 0; @@ -2685,9 +2960,9 @@ int Extractor::extract(int blob_index, Mat& feat, int type) { if (d->opt.use_packing_layout && (type == 0) && feat.elempack != 1) { - Mat bottom_blob_unpacked; - convert_packing(feat, bottom_blob_unpacked, 1, d->opt); - feat = bottom_blob_unpacked; + Mat feat_unpacked; + convert_packing(feat, feat_unpacked, 1, d->opt); + feat = feat_unpacked; if (feat.empty()) return -100; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e72e6d02b86e..761cf9264900 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -62,6 +62,7 @@ ncnn_add_test(c_api) ncnn_add_test(cpu) ncnn_add_test(expression) ncnn_add_test(paramdict) +ncnn_add_test(mat_batch) if(NCNN_VULKAN) ncnn_add_test(command) diff --git a/tests/test_mat_batch.cpp b/tests/test_mat_batch.cpp new file mode 100644 index 000000000000..72feb2ce9ad7 --- /dev/null +++ b/tests/test_mat_batch.cpp @@ -0,0 +1,1256 @@ +// Copyright 2025 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "mat.h" +#include "net.h" + +#if NCNN_VULKAN +#include "gpu.h" +#include "command.h" +#endif + +#include +#include +#include + +static int test_create_batch_basic() +{ + // create a batch of 4 images, 3 channels, 8x6 spatial + ncnn::Mat m; + m.create_batch(8, 6, 3, 4, 4u, 1); + + if (m.dims != 3) + { + fprintf(stderr, "test_create_batch_basic dims expect 3 got %d\n", m.dims); + return -1; + } + if (m.w != 8 || m.h != 6 || m.c != 3) + { + fprintf(stderr, "test_create_batch_basic shape mismatch w=%d h=%d c=%d\n", m.w, m.h, m.c); + return -1; + } + if (m.n != 4) + { + fprintf(stderr, "test_create_batch_basic n expect 4 got %d\n", m.n); + return -1; + } + if (m.data == 0) + { + 
fprintf(stderr, "test_create_batch_basic data is null\n"); + return -1; + } + if (m.refcount == 0 || *m.refcount != 1) + { + fprintf(stderr, "test_create_batch_basic refcount error\n"); + return -1; + } + + return 0; +} + +static int test_nstep_alignment() +{ + // verify nstep * elemsize is 4K aligned + { + ncnn::Mat m; + m.create_batch(8, 6, 3, 4, 4u, 1); + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_nstep_alignment 3D failed: nstep_bytes=%zu\n", nstep_bytes); + return -1; + } + } + + // odd spatial dims + { + ncnn::Mat m; + m.create_batch(7, 5, 13, 2, 4u, 1); + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_nstep_alignment odd failed: nstep_bytes=%zu\n", nstep_bytes); + return -1; + } + } + + // 4D with depth + { + ncnn::Mat m; + m.create_batch(5, 4, 3, 2, 8, 4u, 1, 0); + if (m.dims != 4) + { + fprintf(stderr, "test_nstep_alignment 4D dims expect 4 got %d\n", m.dims); + return -1; + } + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_nstep_alignment 4D failed: nstep_bytes=%zu\n", nstep_bytes); + return -1; + } + } + + // packed elempack=4 + { + ncnn::Mat m; + m.create_batch(8, 6, 1, 12, 4, 16u, 4, 0); + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_nstep_alignment packed failed: nstep_bytes=%zu\n", nstep_bytes); + return -1; + } + } + + return 0; +} + +static int test_batch_subview_zero_copy() +{ + ncnn::Mat m; + m.create_batch(4, 3, 2, 3, 4u, 1); + + // fill each batch with distinct value + for (int b = 0; b < m.n; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b + 1)); + } + + // read back and verify + for (int b = 0; b < m.n; b++) + { + const ncnn::Mat sub = m.batch(b); + + // verify sub-view properties + if (sub.dims != m.dims || sub.w != m.w || sub.h != m.h || sub.c != m.c) + { + fprintf(stderr, "test_batch_subview shape 
mismatch at batch %d\n", b); + return -1; + } + if (sub.cstep != m.cstep) + { + fprintf(stderr, "test_batch_subview cstep mismatch at batch %d\n", b); + return -1; + } + if (sub.n != 1) + { + fprintf(stderr, "test_batch_subview n expect 1 got %d\n", sub.n); + return -1; + } + if (sub.refcount != 0) + { + fprintf(stderr, "test_batch_subview refcount should be NULL (zero-copy)\n"); + return -1; + } + + // verify data pointer is at correct offset + unsigned char* expected_ptr = (unsigned char*)m.data + m.nstep * b * m.elemsize; + if ((unsigned char*)sub.data != expected_ptr) + { + fprintf(stderr, "test_batch_subview data pointer mismatch at batch %d\n", b); + return -1; + } + + // verify values + float expected = (float)(b + 1); + for (int q = 0; q < sub.c; q++) + { + const float* ptr = sub.channel(q); + for (int i = 0; i < sub.w * sub.h; i++) + { + if (ptr[i] != expected) + { + fprintf(stderr, "test_batch_subview value mismatch at batch %d ch %d idx %d: got %f expect %f\n", + b, q, i, ptr[i], expected); + return -1; + } + } + } + } + + return 0; +} + +static int test_batch_range() +{ + ncnn::Mat m; + m.create_batch(4, 3, 2, 4, 4u, 1); + + // fill with batch index + for (int b = 0; b < 4; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b * 10)); + } + + // get range [1, 2) batches + ncnn::Mat range = m.batch_range(1, 2); + if (range.n != 2) + { + fprintf(stderr, "test_batch_range n expect 2 got %d\n", range.n); + return -1; + } + if (range.nstep != m.nstep) + { + fprintf(stderr, "test_batch_range nstep mismatch\n"); + return -1; + } + + // verify range.batch(0) == m.batch(1) + const ncnn::Mat r0 = range.batch(0); + const float* r0_ptr = r0.channel(0); + if (r0_ptr[0] != 10.f) + { + fprintf(stderr, "test_batch_range batch(0) value expect 10 got %f\n", r0_ptr[0]); + return -1; + } + + // verify range.batch(1) == m.batch(2) + const ncnn::Mat r1 = range.batch(1); + const float* r1_ptr = r1.channel(0); + if (r1_ptr[0] != 20.f) + { + fprintf(stderr, 
"test_batch_range batch(1) value expect 20 got %f\n", r1_ptr[0]); + return -1; + } + + return 0; +} + +static int test_batch_data_isolation() +{ + ncnn::Mat m; + m.create_batch(16, 16, 3, 4, 4u, 1); + + // write unique pattern to each batch + for (int b = 0; b < 4; b++) + { + ncnn::Mat sub = m.batch(b); + for (int q = 0; q < sub.c; q++) + { + float* ptr = sub.channel(q); + for (int i = 0; i < sub.w * sub.h; i++) + { + ptr[i] = (float)(b * 1000 + q * 100 + i); + } + } + } + + // verify no cross-contamination + for (int b = 0; b < 4; b++) + { + const ncnn::Mat sub = m.batch(b); + for (int q = 0; q < sub.c; q++) + { + const float* ptr = sub.channel(q); + for (int i = 0; i < sub.w * sub.h; i++) + { + float expected = (float)(b * 1000 + q * 100 + i); + if (ptr[i] != expected) + { + fprintf(stderr, "test_batch_data_isolation mismatch at b=%d q=%d i=%d: got %f expect %f\n", + b, q, i, ptr[i], expected); + return -1; + } + } + } + } + + return 0; +} + +static int test_batch_clone() +{ + ncnn::Mat m; + m.create_batch(8, 6, 3, 4, 4u, 1); + + // fill with data + for (int b = 0; b < 4; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b + 1)); + } + + // clone + ncnn::Mat m2 = m.clone(); + + // verify deep copy + if (m2.data == m.data) + { + fprintf(stderr, "test_batch_clone data should be different (deep copy)\n"); + return -1; + } + if (m2.n != m.n) + { + fprintf(stderr, "test_batch_clone n mismatch\n"); + return -1; + } + if (m2.nstep != m.nstep) + { + fprintf(stderr, "test_batch_clone nstep mismatch\n"); + return -1; + } + if (m2.dims != m.dims || m2.w != m.w || m2.h != m.h || m2.c != m.c) + { + fprintf(stderr, "test_batch_clone shape mismatch\n"); + return -1; + } + + // verify values match + for (int b = 0; b < 4; b++) + { + const ncnn::Mat s2 = m2.batch(b); + float expected = (float)(b + 1); + const float* p2 = s2.channel(0); + if (p2[0] != expected) + { + fprintf(stderr, "test_batch_clone value mismatch at batch %d\n", b); + return -1; + } + } + + // verify 
independence: modify original, clone should not change + m.batch(0).fill(999.f); + const float* p2 = m2.batch(0).channel(0); + if (p2[0] != 1.f) + { + fprintf(stderr, "test_batch_clone not independent after modify\n"); + return -1; + } + + return 0; +} + +static int test_batch_release() +{ + ncnn::Mat m; + m.create_batch(4, 3, 2, 4, 4u, 1); + + m.release(); + + if (m.dims != 0) + { + fprintf(stderr, "test_batch_release dims expect 0 got %d\n", m.dims); + return -1; + } + if (m.n != 1) + { + fprintf(stderr, "test_batch_release n expect 1 got %d\n", m.n); + return -1; + } + if (m.nstep != 0) + { + fprintf(stderr, "test_batch_release nstep expect 0 got %zu\n", m.nstep); + return -1; + } + if (m.data != 0) + { + fprintf(stderr, "test_batch_release data should be null\n"); + return -1; + } + + return 0; +} + +static int test_backward_compatibility() +{ + // regular Mat should have n=1 + ncnn::Mat m1(8, 6, 3); + if (m1.n != 1) + { + fprintf(stderr, "test_backward_compat n expect 1 got %d\n", m1.n); + return -1; + } + + // channel() and row() still work + m1.fill(42.f); + ncnn::Mat ch0 = m1.channel(0); + if (ch0.w != 8 || ch0.h != 6) + { + fprintf(stderr, "test_backward_compat channel shape mismatch\n"); + return -1; + } + const float* row0 = ch0.row(0); + if (row0[0] != 42.f) + { + fprintf(stderr, "test_backward_compat channel value mismatch\n"); + return -1; + } + + // copy ctor preserves n + ncnn::Mat m2 = m1; + if (m2.n != 1) + { + fprintf(stderr, "test_backward_compat copy n mismatch\n"); + return -1; + } + + // empty Mat has n=1 + ncnn::Mat m3; + if (m3.n != 1) + { + fprintf(stderr, "test_backward_compat empty n expect 1 got %d\n", m3.n); + return -1; + } + + return 0; +} + +static int test_create_batch_single() +{ + // create_batch with batch=1 should fall back to regular create + ncnn::Mat m; + m.create_batch(8, 6, 3, 1, 4u, 1); + + if (m.dims != 3) + { + fprintf(stderr, "test_create_batch_single dims expect 3 got %d\n", m.dims); + return -1; + } + if (m.n != 1) + 
{ + fprintf(stderr, "test_create_batch_single n expect 1 got %d\n", m.n); + return -1; + } + if (m.w != 8 || m.h != 6 || m.c != 3) + { + fprintf(stderr, "test_create_batch_single shape mismatch\n"); + return -1; + } + + // should work like normal Mat + m.fill(7.f); + if (((const float*)m.data)[0] != 7.f) + { + fprintf(stderr, "test_create_batch_single fill failed\n"); + return -1; + } + + return 0; +} + +static int test_create_batch_1d() +{ + // create a batch of 4 1D vectors, w=100 + ncnn::Mat m; + m.create_batch(100, 4, 4u, 1); + + if (m.dims != 1) + { + fprintf(stderr, "test_create_batch_1d dims expect 1 got %d\n", m.dims); + return -1; + } + if (m.w != 100 || m.h != 1 || m.d != 1 || m.c != 1) + { + fprintf(stderr, "test_create_batch_1d shape mismatch w=%d h=%d d=%d c=%d\n", m.w, m.h, m.d, m.c); + return -1; + } + if (m.n != 4) + { + fprintf(stderr, "test_create_batch_1d n expect 4 got %d\n", m.n); + return -1; + } + if (m.data == 0) + { + fprintf(stderr, "test_create_batch_1d data is null\n"); + return -1; + } + + // verify nstep alignment + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_create_batch_1d nstep_bytes=%zu not 4K aligned\n", nstep_bytes); + return -1; + } + + // fill and verify subview zero-copy + for (int b = 0; b < m.n; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b + 10)); + } + for (int b = 0; b < m.n; b++) + { + const ncnn::Mat sub = m.batch(b); + if (sub.dims != 1 || sub.w != 100 || sub.n != 1) + { + fprintf(stderr, "test_create_batch_1d subview shape mismatch at batch %d\n", b); + return -1; + } + if (sub.refcount != 0) + { + fprintf(stderr, "test_create_batch_1d subview should be zero-copy\n"); + return -1; + } + float expected = (float)(b + 10); + const float* ptr = (const float*)sub.data; + if (ptr[0] != expected || ptr[99] != expected) + { + fprintf(stderr, "test_create_batch_1d value mismatch at batch %d\n", b); + return -1; + } + } + + return 0; +} + +static int 
test_create_batch_2d() +{ + // create a batch of 3 2D matrices, 10x20 + ncnn::Mat m; + m.create_batch(10, 20, 3, 4u, 1); + + if (m.dims != 2) + { + fprintf(stderr, "test_create_batch_2d dims expect 2 got %d\n", m.dims); + return -1; + } + if (m.w != 10 || m.h != 20 || m.d != 1 || m.c != 1) + { + fprintf(stderr, "test_create_batch_2d shape mismatch w=%d h=%d d=%d c=%d\n", m.w, m.h, m.d, m.c); + return -1; + } + if (m.n != 3) + { + fprintf(stderr, "test_create_batch_2d n expect 3 got %d\n", m.n); + return -1; + } + + // verify nstep alignment + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_create_batch_2d nstep_bytes=%zu not 4K aligned\n", nstep_bytes); + return -1; + } + + // fill and verify subview zero-copy + for (int b = 0; b < m.n; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b + 100)); + } + for (int b = 0; b < m.n; b++) + { + const ncnn::Mat sub = m.batch(b); + if (sub.dims != 2 || sub.w != 10 || sub.h != 20 || sub.n != 1) + { + fprintf(stderr, "test_create_batch_2d subview shape mismatch at batch %d\n", b); + return -1; + } + if (sub.refcount != 0) + { + fprintf(stderr, "test_create_batch_2d subview should be zero-copy\n"); + return -1; + } + float expected = (float)(b + 100); + const float* ptr = (const float*)sub.data; + if (ptr[0] != expected || ptr[10 * 20 - 1] != expected) + { + fprintf(stderr, "test_create_batch_2d value mismatch at batch %d\n", b); + return -1; + } + } + + return 0; +} + +static int test_batch_forward_relu() +{ + // Build a minimal Input -> ReLU network + // ReLU with slope=0.1 (leaky relu) + const char param_str[] = "7767517\n" + "2 2\n" + "Input input 0 1 data\n" + "ReLU relu 1 1 data output 0=1.000000e-01\n"; + + ncnn::Net net; + net.load_param_mem(param_str); + + const int B = 4; + const int C = 3; + const int H = 3; + const int W = 4; + + ncnn::Mat input_batch; + input_batch.create_batch(W, H, C, B, 4u, 1); + if (input_batch.empty()) + { + fprintf(stderr, 
"test_batch_forward_relu create_batch failed\n"); + return -1; + } + + // fill: batch b gets value (b - 1.5), some negative, some positive + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = input_batch.batch(b); + sub.fill((float)(b - 1.5f)); + } + + ncnn::Extractor ex = net.create_extractor(); + ex.input("data", input_batch); + + ncnn::Mat output_batch; + int ret = ex.extract("output", output_batch); + if (ret != 0) + { + fprintf(stderr, "test_batch_forward_relu extract failed ret=%d\n", ret); + return -1; + } + + if (output_batch.n != B) + { + fprintf(stderr, "test_batch_forward_relu output n expect %d got %d\n", B, output_batch.n); + return -1; + } + if (output_batch.w != W || output_batch.h != H || output_batch.c != C) + { + fprintf(stderr, "test_batch_forward_relu output shape mismatch\n"); + return -1; + } + + // verify leaky relu: max(x, 0.1*x) + for (int b = 0; b < B; b++) + { + const ncnn::Mat out_sub = output_batch.batch(b); + float input_val = (float)(b - 1.5f); + float expected = input_val > 0 ? 
input_val : input_val * 0.1f; + + for (int q = 0; q < C; q++) + { + const float* ptr = out_sub.channel(q); + for (int i = 0; i < W * H; i++) + { + if (fabsf(ptr[i] - expected) > 1e-5f) + { + fprintf(stderr, "test_batch_forward_relu value mismatch at b=%d q=%d i=%d: got %f expect %f\n", + b, q, i, ptr[i], expected); + return -1; + } + } + } + } + + return 0; +} + +static int test_batch_forward_pooling() +{ + // Input -> Pooling(max, 2x2, stride=2) + const char param_str[] = "7767517\n" + "2 2\n" + "Input input 0 1 data\n" + "Pooling pooling 1 1 data output 0=0 1=2 2=2\n"; + + ncnn::Net net; + net.load_param_mem(param_str); + + const int B = 2; + const int C = 2; + const int H = 4; + const int W = 4; + + ncnn::Mat input_batch; + input_batch.create_batch(W, H, C, B, 4u, 1); + + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = input_batch.batch(b); + for (int q = 0; q < C; q++) + { + float* ptr = sub.channel(q); + for (int i = 0; i < W * H; i++) + { + ptr[i] = (float)(b * 100 + q * 10 + i); + } + } + } + + ncnn::Extractor ex = net.create_extractor(); + ex.input("data", input_batch); + + ncnn::Mat output_batch; + int ret = ex.extract("output", output_batch); + if (ret != 0) + { + fprintf(stderr, "test_batch_forward_pooling extract failed ret=%d\n", ret); + return -1; + } + + if (output_batch.n != B) + { + fprintf(stderr, "test_batch_forward_pooling output n expect %d got %d\n", B, output_batch.n); + return -1; + } + if (output_batch.w != 2 || output_batch.h != 2 || output_batch.c != C) + { + fprintf(stderr, "test_batch_forward_pooling output shape expect 2x2x%d got %dx%dx%d\n", + C, output_batch.w, output_batch.h, output_batch.c); + return -1; + } + + // verify max pooling for batch 0, channel 0 + // input 4x4: [ 0 1 2 3 / 4 5 6 7 / 8 9 10 11 / 12 13 14 15 ] + // max pool 2x2 stride 2 -> [ 5 7 / 13 15 ] + { + const ncnn::Mat out0 = output_batch.batch(0); + const float* ptr = out0.channel(0); + float expected[4] = {5.f, 7.f, 13.f, 15.f}; + for (int i = 0; i < 4; i++) + 
{ + if (fabsf(ptr[i] - expected[i]) > 1e-5f) + { + fprintf(stderr, "test_batch_forward_pooling b0 mismatch at i=%d: got %f expect %f\n", + i, ptr[i], expected[i]); + return -1; + } + } + } + + // verify batch 1, channel 0: input 100+i -> max pool -> [105, 107, 113, 115] + { + const ncnn::Mat out1 = output_batch.batch(1); + const float* ptr = out1.channel(0); + float expected[4] = {105.f, 107.f, 113.f, 115.f}; + for (int i = 0; i < 4; i++) + { + if (fabsf(ptr[i] - expected[i]) > 1e-5f) + { + fprintf(stderr, "test_batch_forward_pooling b1 mismatch at i=%d: got %f expect %f\n", + i, ptr[i], expected[i]); + return -1; + } + } + } + + return 0; +} + +#if NCNN_VULKAN +static int test_vkmat_create_batch_basic() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(8, 6, 3, 4, 4u, 1, blob_allocator); + + if (m.dims != 3) + { + fprintf(stderr, "test_vkmat_create_batch_basic dims expect 3 got %d\n", m.dims); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (m.w != 8 || m.h != 6 || m.c != 3) + { + fprintf(stderr, "test_vkmat_create_batch_basic shape mismatch\n"); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (m.n != 4) + { + fprintf(stderr, "test_vkmat_create_batch_basic n expect 4 got %d\n", m.n); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (m.data == 0) + { + fprintf(stderr, "test_vkmat_create_batch_basic data is null\n"); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_nstep_alignment() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(7, 5, 13, 4, 4u, 1, blob_allocator); + + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + 
fprintf(stderr, "test_vkmat_nstep_alignment failed: nstep_bytes=%zu\n", nstep_bytes); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_subview() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(4, 3, 2, 3, 4u, 1, blob_allocator); + + for (int b = 0; b < m.n; b++) + { + const ncnn::VkMat sub = m.batch(b); + + // verify sub-view properties + if (sub.dims != m.dims || sub.w != m.w || sub.h != m.h || sub.c != m.c) + { + fprintf(stderr, "test_vkmat_batch_subview shape mismatch at batch %d\n", b); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (sub.cstep != m.cstep) + { + fprintf(stderr, "test_vkmat_batch_subview cstep mismatch at batch %d\n", b); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (sub.n != 1) + { + fprintf(stderr, "test_vkmat_batch_subview n expect 1 got %d\n", sub.n); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (sub.refcount != 0) + { + fprintf(stderr, "test_vkmat_batch_subview refcount should be NULL (zero-copy)\n"); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify buffer_offset is correct + size_t expected_offset = m.buffer_offset() + m.nstep * b * m.elemsize; + if (sub.buffer_offset() != expected_offset) + { + fprintf(stderr, "test_vkmat_batch_subview buffer_offset mismatch at batch %d: got %zu expect %zu\n", + b, sub.buffer_offset(), expected_offset); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify same underlying VkBuffer + if (sub.buffer() != m.buffer()) + { + fprintf(stderr, "test_vkmat_batch_subview buffer handle mismatch at batch %d\n", b); + m.release(); + 
vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + } + + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_range() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(4, 3, 2, 4, 4u, 1, blob_allocator); + + ncnn::VkMat range = m.batch_range(1, 2); + if (range.n != 2) + { + fprintf(stderr, "test_vkmat_batch_range n expect 2 got %d\n", range.n); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (range.nstep != m.nstep) + { + fprintf(stderr, "test_vkmat_batch_range nstep mismatch\n"); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify range.batch(0) buffer_offset == m.batch(1) buffer_offset + if (range.batch(0).buffer_offset() != m.batch(1).buffer_offset()) + { + fprintf(stderr, "test_vkmat_batch_range offset mismatch at range batch 0\n"); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify range.batch(1) buffer_offset == m.batch(2) buffer_offset + if (range.batch(1).buffer_offset() != m.batch(2).buffer_offset()) + { + fprintf(stderr, "test_vkmat_batch_range offset mismatch at range batch 1\n"); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_release() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(4, 3, 2, 4, 4u, 1, blob_allocator); + m.release(); + + if (m.dims != 0) + { + fprintf(stderr, "test_vkmat_batch_release dims expect 0 got %d\n", m.dims); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (m.n != 1) + { + fprintf(stderr, "test_vkmat_batch_release n expect 1 got %d\n", m.n); + 
vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_upload_download() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_allocator = vkdev->acquire_staging_allocator(); + + const int B = 3; + const int W = 4; + const int H = 3; + const int C = 2; + + // create and fill cpu batch + ncnn::Mat cpu_batch; + cpu_batch.create_batch(W, H, C, B, 4u, 1); + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = cpu_batch.batch(b); + for (int q = 0; q < C; q++) + { + float* ptr = sub.channel(q); + for (int i = 0; i < W * H; i++) + { + ptr[i] = (float)(b * 100 + q * 10 + i); + } + } + } + + // upload each batch, assemble on gpu, download back + ncnn::VkCompute cmd(vkdev); + + ncnn::Option opt; + opt.blob_vkallocator = blob_allocator; + opt.workspace_vkallocator = blob_allocator; + opt.staging_vkallocator = staging_allocator; + opt.use_vulkan_compute = true; + + ncnn::VkMat gpu_batch; + for (int b = 0; b < B; b++) + { + ncnn::Mat cpu_b = cpu_batch.batch(b); + ncnn::VkMat gpu_b; + cmd.record_upload(cpu_b, gpu_b, opt); + + if (b == 0) + { + gpu_batch.create_like_batch(gpu_b, B, blob_allocator); + } + + ncnn::VkMat gpu_batch_slot = gpu_batch.batch(b); + cmd.record_clone(gpu_b, gpu_batch_slot, opt); + } + + // download each batch back + std::vector<ncnn::Mat> cpu_results(B); + for (int b = 0; b < B; b++) + { + ncnn::VkMat gpu_b = gpu_batch.batch(b); + cmd.record_download(gpu_b, cpu_results[b], opt); + } + + int ret = cmd.submit_and_wait(); + if (ret != 0) + { + fprintf(stderr, "test_vkmat_batch_upload_download submit failed ret=%d\n", ret); + vkdev->reclaim_staging_allocator(staging_allocator); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify downloaded data matches original + for (int b = 0; b < B; b++) + { + const ncnn::Mat& result = cpu_results[b]; + if 
(result.w != W || result.h != H || result.c != C) + { + fprintf(stderr, "test_vkmat_batch_upload_download shape mismatch at batch %d\n", b); + vkdev->reclaim_staging_allocator(staging_allocator); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + const ncnn::Mat orig = cpu_batch.batch(b); + for (int q = 0; q < C; q++) + { + const float* orig_ptr = orig.channel(q); + const float* result_ptr = result.channel(q); + for (int i = 0; i < W * H; i++) + { + if (fabsf(orig_ptr[i] - result_ptr[i]) > 1e-5f) + { + fprintf(stderr, "test_vkmat_batch_upload_download value mismatch at b=%d q=%d i=%d: got %f expect %f\n", + b, q, i, result_ptr[i], orig_ptr[i]); + vkdev->reclaim_staging_allocator(staging_allocator); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + } + } + } + + vkdev->reclaim_staging_allocator(staging_allocator); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_forward_relu() +{ + const char param_str[] = "7767517\n" + "2 2\n" + "Input input 0 1 data\n" + "ReLU relu 1 1 data output 0=1.000000e-01\n"; + + ncnn::Net net; + ncnn::Option opt; + opt.use_vulkan_compute = true; + net.opt = opt; + net.load_param_mem(param_str); + net.load_model((const unsigned char*)""); + + const int B = 4; + const int C = 3; + const int H = 3; + const int W = 4; + + ncnn::Mat input_batch; + input_batch.create_batch(W, H, C, B, 4u, 1); + if (input_batch.empty()) + { + fprintf(stderr, "test_vkmat_batch_forward_relu create_batch failed\n"); + return -1; + } + + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = input_batch.batch(b); + sub.fill((float)(b - 1.5f)); + } + + ncnn::Extractor ex = net.create_extractor(); + ex.input("data", input_batch); + + ncnn::Mat output_batch; + int ret = ex.extract("output", output_batch); + if (ret != 0) + { + fprintf(stderr, "test_vkmat_batch_forward_relu extract failed ret=%d\n", ret); + return -1; + } + + if (output_batch.n != B) + { + fprintf(stderr, 
"test_vkmat_batch_forward_relu output n expect %d got %d\n", B, output_batch.n); + return -1; + } + if (output_batch.w != W || output_batch.h != H || output_batch.c != C) + { + fprintf(stderr, "test_vkmat_batch_forward_relu output shape mismatch\n"); + return -1; + } + + for (int b = 0; b < B; b++) + { + const ncnn::Mat out_sub = output_batch.batch(b); + float input_val = (float)(b - 1.5f); + float expected = input_val > 0 ? input_val : input_val * 0.1f; + + for (int q = 0; q < C; q++) + { + const float* ptr = out_sub.channel(q); + for (int i = 0; i < W * H; i++) + { + if (fabsf(ptr[i] - expected) > 1e-4f) + { + fprintf(stderr, "test_vkmat_batch_forward_relu value mismatch at b=%d q=%d i=%d: got %f expect %f\n", + b, q, i, ptr[i], expected); + return -1; + } + } + } + } + + return 0; +} + +static int test_vkmat_batch_forward_pooling() +{ + const char param_str[] = "7767517\n" + "2 2\n" + "Input input 0 1 data\n" + "Pooling pooling 1 1 data output 0=0 1=2 2=2\n"; + + ncnn::Net net; + ncnn::Option opt; + opt.use_vulkan_compute = true; + net.opt = opt; + net.load_param_mem(param_str); + net.load_model((const unsigned char*)""); + + const int B = 2; + const int C = 2; + const int H = 4; + const int W = 4; + + ncnn::Mat input_batch; + input_batch.create_batch(W, H, C, B, 4u, 1); + + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = input_batch.batch(b); + for (int q = 0; q < C; q++) + { + float* ptr = sub.channel(q); + for (int i = 0; i < W * H; i++) + { + ptr[i] = (float)(b * 100 + q * 10 + i); + } + } + } + + ncnn::Extractor ex = net.create_extractor(); + ex.input("data", input_batch); + + ncnn::Mat output_batch; + int ret = ex.extract("output", output_batch); + if (ret != 0) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling extract failed ret=%d\n", ret); + return -1; + } + + if (output_batch.n != B) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling output n expect %d got %d\n", B, output_batch.n); + return -1; + } + if (output_batch.w != 2 || 
output_batch.h != 2 || output_batch.c != C) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling output shape expect 2x2x%d got %dx%dx%d\n", + C, output_batch.w, output_batch.h, output_batch.c); + return -1; + } + + // verify max pooling for batch 0, channel 0 + // input 4x4: [ 0 1 2 3 / 4 5 6 7 / 8 9 10 11 / 12 13 14 15 ] + // max pool 2x2 stride 2 -> [ 5 7 / 13 15 ] + { + const ncnn::Mat out0 = output_batch.batch(0); + const float* ptr = out0.channel(0); + float expected[4] = {5.f, 7.f, 13.f, 15.f}; + for (int i = 0; i < 4; i++) + { + if (fabsf(ptr[i] - expected[i]) > 1e-4f) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling b0 mismatch at i=%d: got %f expect %f\n", + i, ptr[i], expected[i]); + return -1; + } + } + } + + // verify batch 1, channel 0: input 100+i -> max pool -> [105, 107, 113, 115] + { + const ncnn::Mat out1 = output_batch.batch(1); + const float* ptr = out1.channel(0); + float expected[4] = {105.f, 107.f, 113.f, 115.f}; + for (int i = 0; i < 4; i++) + { + if (fabsf(ptr[i] - expected[i]) > 1e-4f) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling b1 mismatch at i=%d: got %f expect %f\n", + i, ptr[i], expected[i]); + return -1; + } + } + } + + return 0; +} +#endif // NCNN_VULKAN + +int main() +{ + int ret = 0; + + ret |= test_create_batch_basic(); + ret |= test_nstep_alignment(); + ret |= test_batch_subview_zero_copy(); + ret |= test_batch_range(); + ret |= test_batch_data_isolation(); + ret |= test_batch_clone(); + ret |= test_batch_release(); + ret |= test_backward_compatibility(); + ret |= test_create_batch_single(); + ret |= test_create_batch_1d(); + ret |= test_create_batch_2d(); + ret |= test_batch_forward_relu(); + ret |= test_batch_forward_pooling(); + +#if NCNN_VULKAN + ncnn::create_gpu_instance(); + if (ncnn::get_gpu_count() > 0) + { + ret |= test_vkmat_create_batch_basic(); + ret |= test_vkmat_nstep_alignment(); + ret |= test_vkmat_batch_subview(); + ret |= test_vkmat_batch_range(); + ret |= test_vkmat_batch_release(); + ret 
|= test_vkmat_batch_upload_download(); + ret |= test_vkmat_batch_forward_relu(); + ret |= test_vkmat_batch_forward_pooling(); + } + else + { + fprintf(stderr, "no vulkan device, skip vkmat batch tests\n"); + } + ncnn::destroy_gpu_instance(); +#endif // NCNN_VULKAN + + if (ret == 0) + fprintf(stderr, "test_mat_batch passed\n"); + + return ret; +} diff --git a/tests/test_squeezenet.cpp b/tests/test_squeezenet.cpp index 887bfac6d20a..8877d7f8a5a1 100644 --- a/tests/test_squeezenet.cpp +++ b/tests/test_squeezenet.cpp @@ -404,6 +404,95 @@ static int test_squeezenet_overwrite_softmax(const ncnn::Option& opt, int load_m return check_top2(cls_scores, epsilon); } +static int test_squeezenet_batch(const ncnn::Option& opt, float epsilon = 0.001) +{ + ncnn::Net squeezenet; + + squeezenet.opt = opt; + + squeezenet.load_param(MODEL_DIR "/squeezenet_v1.1.param"); + squeezenet.load_model(MODEL_DIR "/squeezenet_v1.1.bin"); + + ncnn::Mat in = generate_ncnn_logo(ncnn::Mat::PIXEL_BGR, 227, 227); + + const float mean_vals[3] = {104.f, 117.f, 123.f}; + in.substract_mean_normalize(mean_vals, 0); + + // single inference for reference + ncnn::Mat ref_out; + { + ncnn::Extractor ex = squeezenet.create_extractor(); + ex.input("data", in); + ex.extract("prob", ref_out); + } + + if (ref_out.empty() || ref_out.w != 1000) + { + fprintf(stderr, "test_squeezenet_batch reference output invalid w=%d\n", ref_out.w); + return -1; + } + + // create batch input (3 copies of the same image) + const int B = 3; + ncnn::Mat in_batch; + in_batch.create_batch(in.w, in.h, in.c, B, in.elemsize, in.elempack); + if (in_batch.empty()) + { + fprintf(stderr, "test_squeezenet_batch create_batch failed\n"); + return -1; + } + + size_t single_size = in.cstep * in.c * in.elemsize; + for (int b = 0; b < B; b++) + { + memcpy(in_batch.batch(b).data, in.data, single_size); + } + + // batch inference + ncnn::Mat out_batch; + { + ncnn::Extractor ex = squeezenet.create_extractor(); + ex.input("data", in_batch); + int ret = 
ex.extract("prob", out_batch); + if (ret != 0) + { + fprintf(stderr, "test_squeezenet_batch extract failed ret=%d\n", ret); + return -1; + } + } + + if (out_batch.n != B) + { + fprintf(stderr, "test_squeezenet_batch output n expect %d got %d\n", B, out_batch.n); + return -1; + } + if (out_batch.dims != 1 || out_batch.w != 1000) + { + fprintf(stderr, "test_squeezenet_batch output shape mismatch dims=%d w=%d\n", out_batch.dims, out_batch.w); + return -1; + } + + // compare each batch output against reference + for (int b = 0; b < B; b++) + { + const ncnn::Mat out_b = out_batch.batch(b); + const float* ref_ptr = (const float*)ref_out.data; + const float* out_ptr = (const float*)out_b.data; + + for (int j = 0; j < 1000; j++) + { + if (!NearlyEqual(out_ptr[j], ref_ptr[j], epsilon)) + { + fprintf(stderr, "test_squeezenet_batch mismatch at batch %d index %d: got %f expect %f\n", + b, j, out_ptr[j], ref_ptr[j]); + return -1; + } + } + } + + return 0; +} + int main() { SRAND(7767517); @@ -508,5 +597,43 @@ int main() #endif // NCNN_VULKAN } + // batch inference tests + for (int i = 0; i < 4; i++) + { + const ncnn::Option& opt = opts[i]; + + float epsilon; + if (opt.use_bf16_storage || opt.use_fp16_packed || opt.use_fp16_storage) + { + epsilon = 0.1; + } + else + { + epsilon = 0.01; + } + + int ret; + + ncnn::Option opt_cpu = opt; + opt_cpu.use_vulkan_compute = false; + ret = test_squeezenet_batch(opt_cpu, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_squeezenet_batch cpu failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_bf16_storage=%d\n", opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_bf16_storage); + return ret; + } + +#if NCNN_VULKAN + ncnn::Option opt_gpu = opt; + opt_gpu.use_vulkan_compute = true; + ret = test_squeezenet_batch(opt_gpu, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_squeezenet_batch gpu failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_bf16_storage=%d\n", 
opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_bf16_storage); + return ret; + } +#endif // NCNN_VULKAN + } + return 0; }