diff --git a/src/command.cpp b/src/command.cpp index 1b9978bfa496..7ea927ed2360 100644 --- a/src/command.cpp +++ b/src/command.cpp @@ -359,10 +359,12 @@ void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_upload buffer"); + const int B = src.n; + + // cpu cast to fp16 (discrete gpu) Mat src_fp16; if (src.elemsize == src.elempack * 4u) { - // cpu cast to fp16 (discrete gpu) if (vkdev->info.type() == 0 && (opt.use_bf16_storage || opt.use_bf16_packed)) { ncnn::cast_float32_to_bfloat16(src, src_fp16, opt); @@ -389,26 +391,32 @@ void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt) src_fp16 = src_fp16_pack4; } - // upload + // upload staging buffer VkMat dst_staging; - dst_staging.create_like(src_fp16, opt.staging_vkallocator); + if (B > 1) + dst_staging.create_like_batch(src_fp16.batch(0), B, opt.staging_vkallocator); + else + dst_staging.create_like(src_fp16, opt.staging_vkallocator); if (dst_staging.empty()) return; // stash staging d->upload_staging_buffers.push_back(dst_staging); - // NCNN_LOGE("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); - // memcpy src to device - memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize); + for (int b = 0; b < B; b++) + { + const Mat src_b = src_fp16.batch(b); + VkMat staging_b = dst_staging.batch(b); + memcpy(staging_b.mapped_ptr(), src_b.data, src_b.total() * src_b.elemsize); + } dst_staging.allocator->flush(dst_staging.data); // mark device host-write @ null dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; - // resolve dst_elempack + // resolve dst_elempack (from single sample dimensions) int dims = src_fp16.dims; int elemcount = 0; if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w; @@ -435,6 +443,8 @@ void VkCompute::record_download(const VkMat& src, Mat& 
dst, const Option& opt) { // NCNN_LOGE("record_download buffer"); + const int B = src.n; + // resolve dst_elempack int dims = src.dims; int elemcount = 0; @@ -480,7 +490,7 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = dst_staging.buffer(); barriers[0].offset = dst_staging.buffer_offset(); - barriers[0].size = dst_staging.buffer_capacity(); + barriers[0].size = B > 1 ? dst_staging.nstep * B * dst_staging.elemsize : dst_staging.buffer_capacity(); VkPipelineStageFlags src_stage = dst_staging.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; @@ -509,7 +519,10 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) // create dst Mat dst_fp16; - dst_fp16.create_like(dst_staging, opt.blob_allocator); + if (B > 1) + dst_fp16.create_like_batch(dst_staging.batch(0), B, opt.blob_allocator); + else + dst_fp16.create_like(dst_staging, opt.blob_allocator); if (dst_fp16.empty()) return; @@ -530,48 +543,42 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) // cast to fp32 (discrete gpu) if (dst_fp16.elemsize == dst_fp16.elempack * 2u) { + int post_cast_type = 0; // 0=none, 1=bf16, 2=fp16 if (vkdev->info.type() == 0 && (opt.use_bf16_storage || opt.use_bf16_packed)) - { - int dims = dst_fp16.dims; - if (dims == 1) - dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 2) - dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 3) - dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 4) - dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - - d->download_post_mats.push_back(dst); - - 
VkComputePrivate::record r; - r.type = VkComputePrivate::record::TYPE_post_cast_bfloat16_to_float32; - r.command_buffer = 0; - r.post_cast_bfloat16_to_float32.download_post_mat_bf16_offset = d->download_post_mats_fp16.size() - 1; - r.post_cast_bfloat16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; - r.post_cast_bfloat16_to_float32.num_threads = opt.num_threads; - d->delayed_records.push_back(r); - } + post_cast_type = 1; else if (vkdev->info.type() == 0 && (opt.use_fp16_storage || opt.use_fp16_packed)) + post_cast_type = 2; + + if (post_cast_type > 0) { - int dims = dst_fp16.dims; - if (dims == 1) - dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 2) - dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 3) - dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); - if (dims == 4) - dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + size_t fp32_elemsize = (size_t)(dst_fp16.elempack * 4u); + if (dst_fp16.dims == 1) + dst.create_batch(dst_fp16.w, B, fp32_elemsize, dst_fp16.elempack, opt.blob_allocator); + else if (dst_fp16.dims == 2) + dst.create_batch(dst_fp16.w, dst_fp16.h, B, fp32_elemsize, dst_fp16.elempack, opt.blob_allocator); + else if (dst_fp16.dims == 3) + dst.create_batch(dst_fp16.w, dst_fp16.h, dst_fp16.c, B, fp32_elemsize, dst_fp16.elempack, opt.blob_allocator); + else if (dst_fp16.dims == 4) + dst.create_batch(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, B, fp32_elemsize, dst_fp16.elempack, opt.blob_allocator); d->download_post_mats.push_back(dst); VkComputePrivate::record r; - r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32; r.command_buffer = 0; - r.post_cast_float16_to_float32.download_post_mat_fp16_offset = 
d->download_post_mats_fp16.size() - 1; - r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; - r.post_cast_float16_to_float32.num_threads = opt.num_threads; + if (post_cast_type == 1) + { + r.type = VkComputePrivate::record::TYPE_post_cast_bfloat16_to_float32; + r.post_cast_bfloat16_to_float32.download_post_mat_bf16_offset = d->download_post_mats_fp16.size() - 1; + r.post_cast_bfloat16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; + r.post_cast_bfloat16_to_float32.num_threads = opt.num_threads; + } + else + { + r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32; + r.post_cast_float16_to_float32.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1; + r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1; + r.post_cast_float16_to_float32.num_threads = opt.num_threads; + } d->delayed_records.push_back(r); } else @@ -589,14 +596,24 @@ void VkCompute::record_clone(const Mat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_clone host to buffer"); + const int B = src.n; + // host to staging VkMat dst_staging; - dst_staging.create_like(src, opt.staging_vkallocator); + if (B > 1) + dst_staging.create_like_batch(src.batch(0), B, opt.staging_vkallocator); + else + dst_staging.create_like(src, opt.staging_vkallocator); if (dst_staging.empty()) return; // memcpy src to device - memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize); + for (int b = 0; b < B; b++) + { + const Mat src_b = src.batch(b); + VkMat staging_b = dst_staging.batch(b); + memcpy(staging_b.mapped_ptr(), src_b.data, src_b.total() * src_b.elemsize); + } dst_staging.allocator->flush(dst_staging.data); // mark device host-write @ null @@ -631,6 +648,8 @@ void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt) { // NCNN_LOGE("record_clone buffer to host"); + const int B = src.n; + if (!src.allocator->mappable) { // device to 
staging @@ -646,7 +665,10 @@ void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt) } // create dst - dst.create_like(src, opt.blob_allocator); + if (B > 1) + dst.create_like_batch(src.batch(0), B, opt.blob_allocator); + else + dst.create_like(src, opt.blob_allocator); if (dst.empty()) return; @@ -662,7 +684,7 @@ void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt) barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = src.buffer(); barriers[0].offset = src.buffer_offset(); - barriers[0].size = src.buffer_capacity(); + barriers[0].size = B > 1 ? src.nstep * B * src.elemsize : src.buffer_capacity(); VkPipelineStageFlags src_stage = src.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; @@ -722,8 +744,13 @@ void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) { // NCNN_LOGE("record_clone buffer to buffer"); + const int B = src.n; + // create dst - dst.create_like(src, opt.blob_vkallocator); + if (B > 1) + dst.create_like_batch(src.batch(0), B, opt.blob_vkallocator); + else + dst.create_like(src, opt.blob_vkallocator); if (dst.empty()) return; @@ -739,7 +766,7 @@ void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].buffer = src.buffer(); barriers[0].offset = src.buffer_offset(); - barriers[0].size = src.buffer_capacity(); + barriers[0].size = B > 1 ? src.nstep * B * src.elemsize : src.buffer_capacity(); VkPipelineStageFlags src_stage = src.data->stage_flags; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; @@ -774,12 +801,14 @@ void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; } - // record device to staging + // record copy { + VkDeviceSize copy_size = B > 1 ? 
src.nstep * B * src.elemsize : std::min(src.buffer_capacity(), dst.buffer_capacity()); + VkBufferCopy* regions = new VkBufferCopy[1]; regions[0].srcOffset = src.buffer_offset(); regions[0].dstOffset = dst.buffer_offset(); - regions[0].size = std::min(src.buffer_capacity(), dst.buffer_capacity()); + regions[0].size = copy_size; if (vkdev->info.support_VK_KHR_push_descriptor()) { @@ -1985,7 +2014,12 @@ int VkCompute::submit_and_wait() // NCNN_LOGE("post_download %p +%d ~%d -> %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data); src.allocator->invalidate(src.data); - memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize); + for (int b = 0; b < dst.n; b++) + { + Mat dst_b = dst.batch(b); + size_t src_batch_offset = src.nstep * b * src.elemsize; + memcpy(dst_b.data, (const unsigned char*)src.mapped_ptr() + src_batch_offset, dst_b.total() * dst_b.elemsize); + } break; } case VkComputePrivate::record::TYPE_post_cast_float16_to_float32: diff --git a/src/layer.cpp b/src/layer.cpp index a00c937c5643..a12da2537b9d 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -27,6 +27,8 @@ Layer::Layer() support_any_packing = false; support_vulkan_any_packing = false; + support_batch = false; + featmask = 0; #if NCNN_VULKAN @@ -240,6 +242,7 @@ class Layer_final : public Layer support_fp16_storage = layer_cpu->support_fp16_storage; support_int8_storage = layer_cpu->support_int8_storage; support_any_packing = layer_cpu->support_any_packing; + support_batch = layer_cpu->support_batch; support_vulkan = false; support_tensor_storage = false; diff --git a/src/layer.h b/src/layer.h index 9fa45d7c47a3..ac11176bfe44 100644 --- a/src/layer.h +++ b/src/layer.h @@ -75,7 +75,8 @@ class NCNN_EXPORT Layer // vulkan accept input blob with any elempack bool support_vulkan_any_packing; - bool support_reserved_1; + // support batched input (n > 1), replaces support_reserved_1 + bool support_batch; bool support_reserved_2; bool support_reserved_3; bool support_reserved_4; diff 
--git a/src/layer/arm/cast_arm.cpp b/src/layer/arm/cast_arm.cpp index f028ad7e10ac..966517f85bb2 100644 --- a/src/layer/arm/cast_arm.cpp +++ b/src/layer/arm/cast_arm.cpp @@ -40,13 +40,14 @@ int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) int dims = bottom_blob.dims; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; + int batch = bottom_blob.n; size_t out_elemsize = elemsize; if (type_to == 1) { if (type_from == 3) { - Cast::forward(bottom_blob, top_blob, opt); + return Cast::forward(bottom_blob, top_blob, opt); } // float32 @@ -69,21 +70,13 @@ int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -101,11 +94,14 @@ int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* 
outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { diff --git a/src/layer/arm/cast_bf16.h b/src/layer/arm/cast_bf16.h index 453ba7e8c182..30ec4fe2c1f9 100644 --- a/src/layer/arm/cast_bf16.h +++ b/src/layer/arm/cast_bf16.h @@ -21,17 +21,21 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); #if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC - __bf16* outptr = top_blob.channel(q); + __bf16* outptr = top_blob.batch(b).channel(q); #else - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); #endif int i = 0; @@ -185,18 +189,22 @@ static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { + int b = bc / channels; + int q = bc % channels; #if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC - const __bf16* ptr = bottom_blob.channel(q); + const __bf16* ptr = bottom_blob.batch(b).channel(q); #else - const unsigned short* ptr = bottom_blob.channel(q); + const unsigned short* ptr = bottom_blob.batch(b).channel(q); #endif - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __ARM_NEON 
diff --git a/src/layer/arm/cast_fp16.h b/src/layer/arm/cast_fp16.h index 929d4b58f7a9..3b9e67965bb3 100644 --- a/src/layer/arm/cast_fp16.h +++ b/src/layer/arm/cast_fp16.h @@ -21,14 +21,18 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if (__ARM_FP & 2) @@ -179,14 +183,18 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if (__ARM_FP & 2) diff --git a/src/layer/arm/packing_arm.cpp b/src/layer/arm/packing_arm.cpp index 0b325a94176b..f4963afc76b5 100644 --- a/src/layer/arm/packing_arm.cpp +++ b/src/layer/arm/packing_arm.cpp @@ -69,6 +69,7 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op int d = bottom_blob.d; int channels = bottom_blob.c; int 
dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -97,6 +98,7 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -105,21 +107,24 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __ARM_NEON @@ -152,15 +157,18 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* 
outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __ARM_NEON @@ -201,23 +209,26 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __ARM_NEON @@ -250,15 +261,18 @@ int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 
0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __ARM_NEON @@ -328,6 +342,7 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -356,6 +371,7 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -364,21 +380,24 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 4); - const unsigned short* r1 = bottom_blob.row(i * 4 + 1); - const unsigned short* r2 = bottom_blob.row(i * 4 + 2); - const unsigned short* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; 
+ int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 4); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 4 + 3); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; #if __ARM_NEON @@ -411,15 +430,18 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 4); - unsigned short* outptr1 = top_blob.row(i * 4 + 1); - unsigned short* outptr2 = top_blob.row(i * 4 + 2); - unsigned short* outptr3 = top_blob.row(i * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).row(i * 4); + unsigned short* outptr1 = top_blob.batch(b).row(i * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __ARM_NEON @@ -451,19 +473,22 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 8); - const unsigned short* r1 = bottom_blob.row(i * 8 + 1); - const unsigned short* r2 = bottom_blob.row(i * 8 + 2); - const unsigned short* r3 = bottom_blob.row(i * 8 + 3); - const unsigned short* r4 = bottom_blob.row(i * 8 + 4); - const unsigned short* r5 = bottom_blob.row(i * 8 + 5); - const unsigned short* r6 = bottom_blob.row(i * 8 + 6); - const 
unsigned short* r7 = bottom_blob.row(i * 8 + 7); - - unsigned short* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 8); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; #if __ARM_NEON @@ -615,19 +640,22 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); - - unsigned short* outptr0 = top_blob.row(i * 8); - unsigned short* outptr1 = top_blob.row(i * 8 + 1); - unsigned short* outptr2 = top_blob.row(i * 8 + 2); - unsigned short* outptr3 = top_blob.row(i * 8 + 3); - unsigned short* outptr4 = top_blob.row(i * 8 + 4); - unsigned short* outptr5 = top_blob.row(i * 8 + 5); - unsigned short* outptr6 = top_blob.row(i * 8 + 6); - unsigned short* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); + + unsigned short* outptr0 = top_blob.batch(b).row(i * 8); + unsigned short* outptr1 = top_blob.batch(b).row(i * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).row(i * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).row(i * 8 + 5); + unsigned short* outptr6 = 
top_blob.batch(b).row(i * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; #if __ARM_NEON @@ -769,13 +797,16 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 2); - const unsigned short* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 2); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 2 + 1); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; #if NCNN_GNU_INLINE_ASM @@ -836,13 +867,16 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 2); - unsigned short* outptr1 = top_blob.row(i * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).row(i * 2); + unsigned short* outptr1 = top_blob.batch(b).row(i * 2 + 1); int j = 0; #if NCNN_GNU_INLINE_ASM @@ -912,23 +946,26 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, 
outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 4); - const unsigned short* r1 = bottom_blob.channel(q * 4 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 4 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 4); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __ARM_NEON @@ -961,15 +998,18 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 4); - unsigned short* outptr1 = top_blob.channel(q * 4 + 1); - unsigned short* outptr2 = top_blob.channel(q * 4 + 2); - unsigned short* outptr3 = top_blob.channel(q * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 4); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __ARM_NEON @@ -1001,19 +1041,22 @@ int 
Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 8); - const unsigned short* r1 = bottom_blob.channel(q * 8 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 8 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 8 + 3); - const unsigned short* r4 = bottom_blob.channel(q * 8 + 4); - const unsigned short* r5 = bottom_blob.channel(q * 8 + 5); - const unsigned short* r6 = bottom_blob.channel(q * 8 + 6); - const unsigned short* r7 = bottom_blob.channel(q * 8 + 7); - - unsigned short* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 8); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __ARM_NEON @@ -1165,19 +1208,22 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); - - unsigned short* outptr0 = top_blob.channel(q * 8); - unsigned short* outptr1 = top_blob.channel(q * 8 + 1); - unsigned short* outptr2 = top_blob.channel(q * 8 + 2); - 
unsigned short* outptr3 = top_blob.channel(q * 8 + 3); - unsigned short* outptr4 = top_blob.channel(q * 8 + 4); - unsigned short* outptr5 = top_blob.channel(q * 8 + 5); - unsigned short* outptr6 = top_blob.channel(q * 8 + 6); - unsigned short* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); + + unsigned short* outptr0 = top_blob.batch(b).channel(q * 8); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; #if __ARM_NEON @@ -1319,13 +1365,16 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 2); - const unsigned short* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 2); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if NCNN_GNU_INLINE_ASM @@ -1386,13 +1435,16 @@ int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = 
bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 2); - unsigned short* outptr1 = top_blob.channel(q * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 2); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 2 + 1); int i = 0; #if NCNN_GNU_INLINE_ASM @@ -1487,6 +1539,7 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -1515,6 +1568,7 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -1523,25 +1577,28 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed 
char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -1561,19 +1618,22 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -1602,27 +1662,30 @@ 
int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q * 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -1642,19 +1705,22 @@ int Packing_arm::forward_int8(const Mat& bottom_blob, 
Mat& top_blob, const Optio } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git a/src/layer/cast.cpp b/src/layer/cast.cpp index 3dcff38f3cac..15a34ffcc764 100644 --- a/src/layer/cast.cpp +++ b/src/layer/cast.cpp @@ -50,6 +50,7 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; @@ -76,33 +77,29 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, 
out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; int size = w * h * d * elempack; + int total_bc = batch * channels; + if (type_from == 1 && type_to == 2) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -114,10 +111,12 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (type_from == 2 && type_to == 1) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -129,10 +128,12 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (type_from == 3 && type_to == 1) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 
0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -144,10 +145,12 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (type_from == 1 && type_to == 4) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -159,10 +162,12 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (type_from == 4 && type_to == 1) { #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { diff --git a/src/layer/loongarch/cast_loongarch.cpp b/src/layer/loongarch/cast_loongarch.cpp index b2a0bb8ac58c..bd1b0acdbf6f 100644 --- a/src/layer/loongarch/cast_loongarch.cpp +++ b/src/layer/loongarch/cast_loongarch.cpp @@ -29,13 +29,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int dims = bottom_blob.dims; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; + int batch = bottom_blob.n; size_t out_elemsize = elemsize; if (type_to == 1) { if (type_from == 3) { - Cast::forward(bottom_blob, 
top_blob, opt); + return Cast::forward(bottom_blob, top_blob, opt); } // float32 @@ -58,21 +59,13 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -80,11 +73,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 1 && type_to == 2) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __loongarch_sx @@ -111,11 +107,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 2 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; 
+ int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __loongarch_sx @@ -143,11 +142,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -158,11 +160,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 4 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -176,11 +181,14 @@ int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (type_from == 1 && type_to == 4) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) diff --git 
a/src/layer/loongarch/packing_loongarch.cpp b/src/layer/loongarch/packing_loongarch.cpp index 6225dd49f23b..9f6af1d015d4 100644 --- a/src/layer/loongarch/packing_loongarch.cpp +++ b/src/layer/loongarch/packing_loongarch.cpp @@ -54,6 +54,7 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -82,6 +83,7 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -90,21 +92,24 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __loongarch_sx @@ -150,15 +155,18 @@ int Packing_loongarch::forward(const Mat& 
bottom_blob, Mat& top_blob, const Opti } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __loongarch_sx @@ -213,23 +221,26 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = 
bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __loongarch_sx @@ -275,15 +286,18 @@ int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __loongarch_sx @@ -363,6 +377,7 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -391,6 +406,7 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -399,25 +415,28 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return 
-100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -437,19 +456,22 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 
8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -478,27 +500,30 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q * 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed 
char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -518,19 +543,22 @@ int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git 
a/src/layer/mips/cast_mips.cpp b/src/layer/mips/cast_mips.cpp index deb74834ea18..667292e59977 100644 --- a/src/layer/mips/cast_mips.cpp +++ b/src/layer/mips/cast_mips.cpp @@ -29,13 +29,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) int dims = bottom_blob.dims; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; + int batch = bottom_blob.n; size_t out_elemsize = elemsize; if (type_to == 1) { if (type_from == 3) { - Cast::forward(bottom_blob, top_blob, opt); + return Cast::forward(bottom_blob, top_blob, opt); } // float32 @@ -58,21 +59,13 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -80,11 +73,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 1 && type_to == 2) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + 
unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __mips_msa @@ -111,11 +107,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 2 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __mips_msa @@ -143,11 +142,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -158,11 +160,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 4 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -176,11 +181,14 @@ int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 1 && 
type_to == 4) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) diff --git a/src/layer/mips/packing_mips.cpp b/src/layer/mips/packing_mips.cpp index a4cea20e1c6c..9e435227f901 100644 --- a/src/layer/mips/packing_mips.cpp +++ b/src/layer/mips/packing_mips.cpp @@ -54,6 +54,7 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -82,6 +83,7 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -90,21 +92,24 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + 
int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __mips_msa @@ -150,15 +155,18 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __mips_msa @@ -213,23 +221,26 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 
4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __mips_msa @@ -275,15 +286,18 @@ int Packing_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __mips_msa @@ -363,6 +377,7 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -391,6 +406,7 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ 
-399,25 +415,28 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -437,19 +456,22 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = 
bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -478,27 +500,30 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); - else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q 
* 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -518,19 +543,22 @@ int Packing_mips::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = 
top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git a/src/layer/packing.cpp b/src/layer/packing.cpp index 1ec3a332f84a..3bb7a8da729d 100644 --- a/src/layer/packing.cpp +++ b/src/layer/packing.cpp @@ -37,6 +37,7 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; size_t elemsize = bottom_blob.elemsize; if (!use_padding) @@ -68,17 +69,22 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c top_blob.cstep = bottom_blob.cstep * elempack; top_blob.elemsize = elemsize / elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack; return 0; } int outw = (w * elempack + out_elempack - 1) / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(outw, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - memcpy(top_blob.data, bottom_blob.data, w * elemsize); + #pragma omp parallel for num_threads(opt.num_threads) + for (int b = 0; b < batch; b++) + { + memcpy((unsigned char*)top_blob.batch(b).data, (unsigned char*)bottom_blob.batch(b).data, w * elemsize); + } return 0; } @@ -89,14 +95,19 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c size_t out_elemsize = elemsize / elempack * out_elempack; size_t lane_size = out_elemsize / out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, 
batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; + int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - unsigned char* outptr = (unsigned char*)top_blob + (size_t)i * w * out_elemsize; + int b = bi / outh; + int i = bi % outh; + + const unsigned char* bottom_ptr = (const unsigned char*)bottom_blob.batch(b).data; + unsigned char* outptr = (unsigned char*)top_blob.batch(b) + (size_t)i * w * out_elemsize; for (int j = 0; j < w; j++) { @@ -110,7 +121,7 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c int srck = (i * out_elempack + k) % elempack; - const unsigned char* ptr = (const unsigned char*)bottom_blob + (size_t)srcy * w * elemsize; + const unsigned char* ptr = bottom_ptr + (size_t)srcy * w * elemsize; const unsigned char* elem_ptr = ptr + j * elemsize; memcpy(out_elem_ptr + k * lane_size, elem_ptr + srck * lane_size, lane_size); @@ -127,14 +138,20 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c size_t out_elemsize = elemsize / elempack * out_elempack; size_t lane_size = out_elemsize / out_elempack; - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; + int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - Mat out = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + + Mat out = top_blob.batch(b).channel(q); + + const Mat bottom_batch = bottom_blob.batch(b); for (int i = 0; i < h; i++) { @@ -152,7 +169,7 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c int srck = (q * out_elempack + k) % elempack; - const Mat m = bottom_blob.channel(srcq); + const 
Mat m = bottom_batch.channel(srcq); const unsigned char* ptr = (const unsigned char*)m + (size_t)i * w * elemsize; const unsigned char* elem_ptr = ptr + j * elemsize; @@ -171,14 +188,20 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c size_t out_elemsize = elemsize / elempack * out_elempack; size_t lane_size = out_elemsize / out_elempack; - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; + int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - Mat out = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + + Mat out = top_blob.batch(b).channel(q); + + const Mat bottom_batch = bottom_blob.batch(b); for (int z = 0; z < d; z++) { @@ -198,7 +221,7 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c int srck = (q * out_elempack + k) % elempack; - const Mat m = bottom_blob.channel(srcq); + const Mat m = bottom_batch.channel(srcq); const unsigned char* ptr = (const unsigned char*)m + (size_t)(z * h + i) * w * elemsize; const unsigned char* elem_ptr = ptr + j * elemsize; diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index da4e74f242b8..4809cae0a709 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -33,6 +33,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int dims = bottom_blob.dims; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; + int batch = bottom_blob.n; size_t out_elemsize = elemsize; if (type_to == 1) @@ -57,21 +58,13 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + 
top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -107,11 +100,14 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { diff --git a/src/layer/riscv/cast_riscv_zfh.cpp b/src/layer/riscv/cast_riscv_zfh.cpp index 103e8bd48c2a..dfd9c53c3722 100644 --- a/src/layer/riscv/cast_riscv_zfh.cpp +++ b/src/layer/riscv/cast_riscv_zfh.cpp @@ -15,14 +15,18 @@ void Cast_riscv::cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const const int size = w * h * d * elempack; + const int batch = bottom_blob.n; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); #if 
__riscv_zfh - __fp16* outptr = top_blob.channel(q); + __fp16* outptr = top_blob.batch(b).channel(q); #else - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); #endif #if __riscv_zvfh @@ -62,15 +66,19 @@ void Cast_riscv::cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const const int size = w * h * d * elempack; + const int batch = bottom_blob.n; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { + int b = bc / channels; + int q = bc % channels; #if __riscv_zfh - const __fp16* ptr = bottom_blob.channel(q); + const __fp16* ptr = bottom_blob.batch(b).channel(q); #else - const unsigned short* ptr = bottom_blob.channel(q); + const unsigned short* ptr = bottom_blob.batch(b).channel(q); #endif - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); #if __riscv_zvfh int n = size; diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index d1e51d504ebe..9d3ebe569020 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -78,6 +78,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -106,6 +107,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -114,21 +116,24 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, 
out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -164,15 +169,18 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); #if __riscv_vector int n = w; @@ -209,19 +217,22 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; 
bi++) { - const float* r0 = bottom_blob.row(i * 8); - const float* r1 = bottom_blob.row(i * 8 + 1); - const float* r2 = bottom_blob.row(i * 8 + 2); - const float* r3 = bottom_blob.row(i * 8 + 3); - const float* r4 = bottom_blob.row(i * 8 + 4); - const float* r5 = bottom_blob.row(i * 8 + 5); - const float* r6 = bottom_blob.row(i * 8 + 6); - const float* r7 = bottom_blob.row(i * 8 + 7); - - float* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 8); + const float* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const float* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const float* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const float* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const float* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + float* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -269,19 +280,22 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); - - float* outptr0 = top_blob.row(i * 8); - float* outptr1 = top_blob.row(i * 8 + 1); - float* outptr2 = top_blob.row(i * 8 + 2); - float* outptr3 = top_blob.row(i * 8 + 3); - float* outptr4 = top_blob.row(i * 8 + 4); - float* outptr5 = top_blob.row(i * 8 + 5); - float* outptr6 = top_blob.row(i * 8 + 6); - float* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); + + float* outptr0 = top_blob.batch(b).row(i * 8); + float* outptr1 = top_blob.batch(b).row(i * 8 + 1); + float* outptr2 = top_blob.batch(b).row(i * 8 + 2); + float* outptr3 = top_blob.batch(b).row(i * 8 + 3); + float* outptr4 = 
top_blob.batch(b).row(i * 8 + 4); + float* outptr5 = top_blob.batch(b).row(i * 8 + 5); + float* outptr6 = top_blob.batch(b).row(i * 8 + 6); + float* outptr7 = top_blob.batch(b).row(i * 8 + 7); #if __riscv_vector int n = w; @@ -329,13 +343,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 2); - const float* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 2); + const float* r1 = bottom_blob.batch(b).row(i * 2 + 1); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -383,13 +400,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 2); - float* outptr1 = top_blob.row(i * 2 + 1); + float* outptr0 = top_blob.batch(b).row(i * 2); + float* outptr1 = top_blob.batch(b).row(i * 2 + 1); #if __riscv_vector int n = w; @@ -446,23 +466,26 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, 
out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -499,15 +522,18 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); #if __riscv_vector int n = size; @@ -543,19 +569,22 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int 
bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 8); - const float* r1 = bottom_blob.channel(q * 8 + 1); - const float* r2 = bottom_blob.channel(q * 8 + 2); - const float* r3 = bottom_blob.channel(q * 8 + 3); - const float* r4 = bottom_blob.channel(q * 8 + 4); - const float* r5 = bottom_blob.channel(q * 8 + 5); - const float* r6 = bottom_blob.channel(q * 8 + 6); - const float* r7 = bottom_blob.channel(q * 8 + 7); - - float* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 8); + const float* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const float* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const float* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const float* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const float* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + float* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -603,19 +632,22 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); - - float* outptr0 = top_blob.channel(q * 8); - float* outptr1 = top_blob.channel(q * 8 + 1); - float* outptr2 = top_blob.channel(q * 8 + 2); - float* outptr3 = top_blob.channel(q * 8 + 3); - float* outptr4 = top_blob.channel(q * 8 + 4); - float* outptr5 = top_blob.channel(q * 8 + 5); - float* outptr6 = top_blob.channel(q * 8 + 6); - float* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); + + float* outptr0 = top_blob.batch(b).channel(q * 8); + float* outptr1 = 
top_blob.batch(b).channel(q * 8 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + float* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + float* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + float* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + float* outptr7 = top_blob.batch(b).channel(q * 8 + 7); #if __riscv_vector int n = size; @@ -663,13 +695,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 2); - const float* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 2); + const float* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -717,13 +752,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 2); - float* outptr1 = top_blob.channel(q * 2 + 1); + float* outptr0 = top_blob.batch(b).channel(q * 2); + float* outptr1 = top_blob.batch(b).channel(q * 2 + 1); #if __riscv_vector int n = size; @@ -809,6 +847,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -837,6 
+876,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -845,21 +885,24 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 4); - const unsigned short* r1 = bottom_blob.row(i * 4 + 1); - const unsigned short* r2 = bottom_blob.row(i * 4 + 2); - const unsigned short* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 4); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 4 + 3); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -895,15 +938,18 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* 
outptr0 = top_blob.row(i * 4); - unsigned short* outptr1 = top_blob.row(i * 4 + 1); - unsigned short* outptr2 = top_blob.row(i * 4 + 2); - unsigned short* outptr3 = top_blob.row(i * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).row(i * 4); + unsigned short* outptr1 = top_blob.batch(b).row(i * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 4 + 3); #if __riscv_vector int n = w; @@ -939,19 +985,22 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 8); - const unsigned short* r1 = bottom_blob.row(i * 8 + 1); - const unsigned short* r2 = bottom_blob.row(i * 8 + 2); - const unsigned short* r3 = bottom_blob.row(i * 8 + 3); - const unsigned short* r4 = bottom_blob.row(i * 8 + 4); - const unsigned short* r5 = bottom_blob.row(i * 8 + 5); - const unsigned short* r6 = bottom_blob.row(i * 8 + 6); - const unsigned short* r7 = bottom_blob.row(i * 8 + 7); - - unsigned short* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 8); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -999,19 +1048,22 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& 
top_blob, co } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); - - unsigned short* outptr0 = top_blob.row(i * 8); - unsigned short* outptr1 = top_blob.row(i * 8 + 1); - unsigned short* outptr2 = top_blob.row(i * 8 + 2); - unsigned short* outptr3 = top_blob.row(i * 8 + 3); - unsigned short* outptr4 = top_blob.row(i * 8 + 4); - unsigned short* outptr5 = top_blob.row(i * 8 + 5); - unsigned short* outptr6 = top_blob.row(i * 8 + 6); - unsigned short* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); + + unsigned short* outptr0 = top_blob.batch(b).row(i * 8); + unsigned short* outptr1 = top_blob.batch(b).row(i * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).row(i * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).row(i * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).row(i * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).row(i * 8 + 7); #if __riscv_vector int n = w; @@ -1059,13 +1111,16 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 2); - const unsigned short* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 2); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 2 + 1); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); #if __riscv_vector int n = w; @@ -1113,13 +1168,16 @@ 
int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 2); - unsigned short* outptr1 = top_blob.row(i * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).row(i * 2); + unsigned short* outptr1 = top_blob.batch(b).row(i * 2 + 1); #if __riscv_vector int n = w; @@ -1176,23 +1234,26 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 4); - const unsigned short* r1 = bottom_blob.channel(q * 4 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 4 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 4); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - unsigned short* 
outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -1228,15 +1289,18 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 4); - unsigned short* outptr1 = top_blob.channel(q * 4 + 1); - unsigned short* outptr2 = top_blob.channel(q * 4 + 2); - unsigned short* outptr3 = top_blob.channel(q * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 4); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 4 + 3); #if __riscv_vector int n = size; @@ -1272,19 +1336,22 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 8); - const unsigned short* r1 = bottom_blob.channel(q * 8 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 8 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 8 + 3); - const unsigned short* r4 = bottom_blob.channel(q * 8 + 4); - const unsigned short* r5 = bottom_blob.channel(q * 8 + 5); - const unsigned short* r6 = bottom_blob.channel(q * 8 + 6); - const unsigned short* r7 = bottom_blob.channel(q * 8 + 7); - - unsigned short* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = 
bottom_blob.batch(b).channel(q * 8); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -1332,19 +1399,22 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); - - unsigned short* outptr0 = top_blob.channel(q * 8); - unsigned short* outptr1 = top_blob.channel(q * 8 + 1); - unsigned short* outptr2 = top_blob.channel(q * 8 + 2); - unsigned short* outptr3 = top_blob.channel(q * 8 + 3); - unsigned short* outptr4 = top_blob.channel(q * 8 + 4); - unsigned short* outptr5 = top_blob.channel(q * 8 + 5); - unsigned short* outptr6 = top_blob.channel(q * 8 + 6); - unsigned short* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); + + unsigned short* outptr0 = top_blob.batch(b).channel(q * 8); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + 
unsigned short* outptr7 = top_blob.batch(b).channel(q * 8 + 7); #if __riscv_vector int n = size; @@ -1392,13 +1462,16 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 2); - const unsigned short* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 2); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); #if __riscv_vector int n = size; @@ -1445,13 +1518,16 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 2); - unsigned short* outptr1 = top_blob.channel(q * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 2); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 2 + 1); #if __riscv_vector int n = size; @@ -1533,6 +1609,7 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -1561,6 +1638,7 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * 
out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -1569,25 +1647,28 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -1607,19 +1688,22 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for 
num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -1648,27 +1732,30 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = 
bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q * 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -1688,19 +1775,22 @@ int Packing_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* 
outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git a/src/layer/vulkan/packing_vulkan.cpp b/src/layer/vulkan/packing_vulkan.cpp index ee93534b9b80..84d48ed74727 100644 --- a/src/layer/vulkan/packing_vulkan.cpp +++ b/src/layer/vulkan/packing_vulkan.cpp @@ -190,7 +190,8 @@ int Packing_vulkan::destroy_pipeline(const Option& /*opt*/) int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { const int elempack = bottom_blob.elempack; - // NCNN_LOGE("Packing_vulkan b2b %d %d %d %d", elempack, out_elempack, cast_type_from, cast_type_to); + const int B = bottom_blob.n; + // NCNN_LOGE("Packing_vulkan b2b %d %d %d %d n=%d", elempack, out_elempack, cast_type_from, cast_type_to, B); if (elempack == out_elempack && cast_type_from == cast_type_to && bottom_blob.allocator == opt.blob_vkallocator) { @@ -258,12 +259,18 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute top_blob.cstep = bottom_blob.cstep * elempack; top_blob.elemsize = bottom_blob.elemsize / elempack; top_blob.elempack = out_elempack; + // preserve byte stride per batch when element size changes + if (B > 1) + top_blob.nstep = bottom_blob.nstep * bottom_blob.elemsize / top_blob.elemsize; return 0; } int outw = (w * elempack + out_elempack - 1) / out_elempack; - top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); + if (B > 1) + top_blob.create_batch(outw, B, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(outw, 
out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -272,7 +279,10 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { int outh = (h * elempack + out_elempack - 1) / out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); + if (B > 1) + top_blob.create_batch(w, outh, B, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -281,7 +291,10 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { int outc = (channels * elempack + out_elempack - 1) / out_elempack; - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); + if (B > 1) + top_blob.create_batch(w, h, outc, B, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -290,145 +303,151 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { int outc = (channels * elempack + out_elempack - 1) / out_elempack; - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_vkallocator); + if (B > 1) + top_blob.create_batch(w, h, d, outc, B, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } - std::vector buffer_bindings(4); - buffer_bindings[0] = bottom_blob; - buffer_bindings[1] = bottom_blob; - buffer_bindings[2] = top_blob; - buffer_bindings[3] = top_blob; - - if (elempack == out_elempack) + // dispatch per batch, writing directly to batch sub-views + for (int b = 0; b < B; b++) { - size_t n = 0; - size_t c = 0; - size_t stride = 0; - if (cast_type_from == 1) + const VkMat bottom_b = B > 1 ? 
bottom_blob.batch(b) : bottom_blob; + const VkMat top_b = B > 1 ? top_blob.batch(b) : top_blob; + + std::vector buffer_bindings(4); + buffer_bindings[0] = bottom_b; + buffer_bindings[1] = bottom_b; + buffer_bindings[2] = top_b; + buffer_bindings[3] = top_b; + + if (elempack == out_elempack) { - if (dims == 1 || dims == 2) + size_t n = 0; + size_t c = 0; + size_t stride = 0; + if (cast_type_from == 1) { - n = bottom_blob.cstep * elempack; - c = 1; - stride = top_blob.cstep * out_elempack; + if (dims == 1 || dims == 2) + { + n = bottom_b.cstep * elempack; + c = 1; + stride = top_b.cstep * out_elempack; + } + if (dims == 3 || dims == 4) + { + n = bottom_b.cstep * elempack; + c = bottom_b.c; + stride = top_b.cstep * out_elempack; + } } - if (dims == 3 || dims == 4) + else // if (cast_type_to == 1) { - n = bottom_blob.cstep * elempack; - c = bottom_blob.c; - stride = top_blob.cstep * out_elempack; + if (dims == 1 || dims == 2) + { + n = top_b.cstep * out_elempack; + c = 1; + stride = bottom_b.cstep * elempack; + } + if (dims == 3 || dims == 4) + { + n = top_b.cstep * out_elempack; + c = top_b.c; + stride = bottom_b.cstep * elempack; + } } + + std::vector constants(3); + constants[0].u32 = n / 4; + constants[1].u32 = c; + constants[2].u32 = stride / 4; + + VkMat dispatcher; + dispatcher.w = n / 4; + dispatcher.h = c; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_packing, buffer_bindings, constants, dispatcher); } - else // if (cast_type_to == 1) + if (elempack < out_elempack) { - if (dims == 1 || dims == 2) + size_t n = 0; + size_t c = 0; + size_t stride = 0; + if (dims == 1) { - n = top_blob.cstep * out_elempack; - c = 1; - stride = bottom_blob.cstep * elempack; + n = 1; + c = top_b.w; + stride = 1; + } + if (dims == 2) + { + n = top_b.w; + c = top_b.h; + stride = bottom_b.w; } if (dims == 3 || dims == 4) { - n = top_blob.cstep * out_elempack; - c = top_blob.c; - stride = bottom_blob.cstep * elempack; + n = top_b.cstep; + c = top_b.c; + stride = bottom_b.cstep; } 
- } - - std::vector constants(3); - constants[0].u32 = n / 4; - constants[1].u32 = c; - constants[2].u32 = stride / 4; - - VkMat dispatcher; - dispatcher.w = n / 4; - dispatcher.h = c; - dispatcher.c = 1; - - cmd.record_pipeline(pipeline_packing, buffer_bindings, constants, dispatcher); - } - if (elempack < out_elempack) - { - size_t n = 0; - size_t c = 0; - size_t stride = 0; - if (dims == 1) - { - n = 1; - c = top_blob.w; - stride = 1; - } - if (dims == 2) - { - n = top_blob.w; - c = top_blob.h; - stride = bottom_blob.w; - } - if (dims == 3 || dims == 4) - { - n = top_blob.cstep; - c = top_blob.c; - stride = bottom_blob.cstep; - } - - std::vector constants(3); - constants[0].u32 = n; - constants[1].u32 = c; - constants[2].u32 = stride; - // NCNN_LOGE("n = %u c = %u stride = %u", n, c, stride); + std::vector constants(3); + constants[0].u32 = n; + constants[1].u32 = c; + constants[2].u32 = stride; - VkMat dispatcher; - dispatcher.w = n; - dispatcher.h = c; - dispatcher.c = 1; + VkMat dispatcher; + dispatcher.w = n; + dispatcher.h = c; + dispatcher.c = 1; - if (elempack == 1 && out_elempack == 4) - { - cmd.record_pipeline(pipeline_packing_pack1to4, buffer_bindings, constants, dispatcher); - } - } - if (elempack > out_elempack) - { - size_t n = 0; - size_t c = 0; - size_t stride = 0; - if (dims == 1) - { - n = 1; - c = bottom_blob.w; - stride = 1; + if (elempack == 1 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_packing_pack1to4, buffer_bindings, constants, dispatcher); + } } - if (dims == 2) + if (elempack > out_elempack) { - n = bottom_blob.w; - c = bottom_blob.h; - stride = top_blob.w; - } - if (dims == 3 || dims == 4) - { - n = bottom_blob.cstep; - c = bottom_blob.c; - stride = top_blob.cstep; - } - - std::vector constants(3); - constants[0].u32 = n; - constants[1].u32 = c; - constants[2].u32 = stride; + size_t n = 0; + size_t c = 0; + size_t stride = 0; + if (dims == 1) + { + n = 1; + c = bottom_b.w; + stride = 1; + } + if (dims == 2) + { + n = 
bottom_b.w; + c = bottom_b.h; + stride = top_b.w; + } + if (dims == 3 || dims == 4) + { + n = bottom_b.cstep; + c = bottom_b.c; + stride = top_b.cstep; + } - // NCNN_LOGE("n = %u c = %u stride = %u", n, c, stride); + std::vector constants(3); + constants[0].u32 = n; + constants[1].u32 = c; + constants[2].u32 = stride; - VkMat dispatcher; - dispatcher.w = n; - dispatcher.h = c; - dispatcher.c = 1; + VkMat dispatcher; + dispatcher.w = n; + dispatcher.h = c; + dispatcher.c = 1; - if (elempack == 4 && out_elempack == 1) - { - cmd.record_pipeline(pipeline_packing_pack4to1, buffer_bindings, constants, dispatcher); + if (elempack == 4 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_packing_pack4to1, buffer_bindings, constants, dispatcher); + } } } diff --git a/src/layer/x86/cast_bf16.h b/src/layer/x86/cast_bf16.h index fbf6d8693f74..872a3ad9172f 100644 --- a/src/layer/x86/cast_bf16.h +++ b/src/layer/x86/cast_bf16.h @@ -35,13 +35,17 @@ static void cast_fp32_to_bf16_sse(const Mat& bottom_blob, Mat& top_blob, const O const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __SSE2__ @@ -107,13 +111,17 @@ static void cast_bf16_to_fp32_sse(const Mat& bottom_blob, Mat& top_blob, const O const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int 
q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __SSE2__ diff --git a/src/layer/x86/cast_fp16.h b/src/layer/x86/cast_fp16.h index 6739700ed2b6..85deabc6704d 100644 --- a/src/layer/x86/cast_fp16.h +++ b/src/layer/x86/cast_fp16.h @@ -22,13 +22,17 @@ static void cast_fp32_to_fp16_sse(const Mat& bottom_blob, Mat& top_blob, const O const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const float* ptr = bottom_blob.channel(q); - unsigned short* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const float* ptr = bottom_blob.batch(b).channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __F16C__ @@ -82,13 +86,17 @@ static void cast_fp16_to_fp32_sse(const Mat& bottom_blob, Mat& top_blob, const O const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; + const int batch = bottom_blob.n; const int size = w * h * d * elempack; + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const unsigned short* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const unsigned short* ptr = bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __F16C__ diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp 
index 1d252acf4a54..86b2c17f0d81 100644 --- a/src/layer/x86/cast_x86.cpp +++ b/src/layer/x86/cast_x86.cpp @@ -36,6 +36,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; @@ -44,7 +45,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) { if (type_from == 3) { - Cast::forward(bottom_blob, top_blob, opt); + return Cast::forward(bottom_blob, top_blob, opt); } // float32 @@ -67,21 +68,13 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (dims == 1) - { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 2) - { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 3) - { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, channels, batch, out_elemsize, elempack, opt.blob_allocator); else if (dims == 4) - { - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); - } + top_blob.create_batch(w, h, d, channels, batch, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -99,11 +92,14 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (type_from == 3 && type_to == 1) { + const int total_bc = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bc = 0; bc < total_bc; bc++) { - const signed char* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); + int b = bc / channels; + int q = bc % channels; + const signed char* ptr = 
bottom_blob.batch(b).channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { diff --git a/src/layer/x86/packing_x86.cpp b/src/layer/x86/packing_x86.cpp index b6211419d84c..0ce08d11934c 100644 --- a/src/layer/x86/packing_x86.cpp +++ b/src/layer/x86/packing_x86.cpp @@ -70,6 +70,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -98,6 +99,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -106,21 +108,24 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __SSE2__ 
@@ -159,15 +164,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __SSE2__ @@ -206,19 +214,22 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 8); - const float* r1 = bottom_blob.row(i * 8 + 1); - const float* r2 = bottom_blob.row(i * 8 + 2); - const float* r3 = bottom_blob.row(i * 8 + 3); - const float* r4 = bottom_blob.row(i * 8 + 4); - const float* r5 = bottom_blob.row(i * 8 + 5); - const float* r6 = bottom_blob.row(i * 8 + 6); - const float* r7 = bottom_blob.row(i * 8 + 7); - - float* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 8); + const float* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const float* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const float* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const float* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const float* r7 = bottom_blob.batch(b).row(i * 
8 + 7); + + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __AVX__ @@ -269,19 +280,22 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); - - float* outptr0 = top_blob.row(i * 8); - float* outptr1 = top_blob.row(i * 8 + 1); - float* outptr2 = top_blob.row(i * 8 + 2); - float* outptr3 = top_blob.row(i * 8 + 3); - float* outptr4 = top_blob.row(i * 8 + 4); - float* outptr5 = top_blob.row(i * 8 + 5); - float* outptr6 = top_blob.row(i * 8 + 6); - float* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); + + float* outptr0 = top_blob.batch(b).row(i * 8); + float* outptr1 = top_blob.batch(b).row(i * 8 + 1); + float* outptr2 = top_blob.batch(b).row(i * 8 + 2); + float* outptr3 = top_blob.batch(b).row(i * 8 + 3); + float* outptr4 = top_blob.batch(b).row(i * 8 + 4); + float* outptr5 = top_blob.batch(b).row(i * 8 + 5); + float* outptr6 = top_blob.batch(b).row(i * 8 + 6); + float* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; #if __AVX__ @@ -333,13 +347,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 2); - const float* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 2); + const float* r1 = bottom_blob.batch(b).row(i * 2 + 1); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -360,13 +377,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& 
top_blob, const Option& op } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 2); - float* outptr1 = top_blob.row(i * 2 + 1); + float* outptr0 = top_blob.batch(b).row(i * 2); + float* outptr1 = top_blob.batch(b).row(i * 2 + 1); for (int j = 0; j < w; j++) { @@ -387,27 +407,30 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack1to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 16); - const float* r1 = bottom_blob.row(i * 16 + 1); - const float* r2 = bottom_blob.row(i * 16 + 2); - const float* r3 = bottom_blob.row(i * 16 + 3); - const float* r4 = bottom_blob.row(i * 16 + 4); - const float* r5 = bottom_blob.row(i * 16 + 5); - const float* r6 = bottom_blob.row(i * 16 + 6); - const float* r7 = bottom_blob.row(i * 16 + 7); - const float* r8 = bottom_blob.row(i * 16 + 8); - const float* r9 = bottom_blob.row(i * 16 + 9); - const float* ra = bottom_blob.row(i * 16 + 10); - const float* rb = bottom_blob.row(i * 16 + 11); - const float* rc = bottom_blob.row(i * 16 + 12); - const float* rd = bottom_blob.row(i * 16 + 13); - const float* re = bottom_blob.row(i * 16 + 14); - const float* rf = bottom_blob.row(i * 16 + 15); - - float* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 16); + const float* r1 = bottom_blob.batch(b).row(i * 16 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 16 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 16 + 3); + const float* r4 = bottom_blob.batch(b).row(i * 16 + 4); + const float* 
r5 = bottom_blob.batch(b).row(i * 16 + 5); + const float* r6 = bottom_blob.batch(b).row(i * 16 + 6); + const float* r7 = bottom_blob.batch(b).row(i * 16 + 7); + const float* r8 = bottom_blob.batch(b).row(i * 16 + 8); + const float* r9 = bottom_blob.batch(b).row(i * 16 + 9); + const float* ra = bottom_blob.batch(b).row(i * 16 + 10); + const float* rb = bottom_blob.batch(b).row(i * 16 + 11); + const float* rc = bottom_blob.batch(b).row(i * 16 + 12); + const float* rd = bottom_blob.batch(b).row(i * 16 + 13); + const float* re = bottom_blob.batch(b).row(i * 16 + 14); + const float* rf = bottom_blob.batch(b).row(i * 16 + 15); + + float* outptr = top_blob.batch(b).row(i); int j = 0; #if __AVX512F__ @@ -490,27 +513,30 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); - - float* outptr0 = top_blob.row(i * 16); - float* outptr1 = top_blob.row(i * 16 + 1); - float* outptr2 = top_blob.row(i * 16 + 2); - float* outptr3 = top_blob.row(i * 16 + 3); - float* outptr4 = top_blob.row(i * 16 + 4); - float* outptr5 = top_blob.row(i * 16 + 5); - float* outptr6 = top_blob.row(i * 16 + 6); - float* outptr7 = top_blob.row(i * 16 + 7); - float* outptr8 = top_blob.row(i * 16 + 8); - float* outptr9 = top_blob.row(i * 16 + 9); - float* outptra = top_blob.row(i * 16 + 10); - float* outptrb = top_blob.row(i * 16 + 11); - float* outptrc = top_blob.row(i * 16 + 12); - float* outptrd = top_blob.row(i * 16 + 13); - float* outptre = top_blob.row(i * 16 + 14); - float* outptrf = top_blob.row(i * 16 + 15); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); + + float* outptr0 = top_blob.batch(b).row(i * 16); + float* outptr1 = top_blob.batch(b).row(i * 16 + 1); + float* outptr2 = top_blob.batch(b).row(i * 16 + 2); + 
float* outptr3 = top_blob.batch(b).row(i * 16 + 3); + float* outptr4 = top_blob.batch(b).row(i * 16 + 4); + float* outptr5 = top_blob.batch(b).row(i * 16 + 5); + float* outptr6 = top_blob.batch(b).row(i * 16 + 6); + float* outptr7 = top_blob.batch(b).row(i * 16 + 7); + float* outptr8 = top_blob.batch(b).row(i * 16 + 8); + float* outptr9 = top_blob.batch(b).row(i * 16 + 9); + float* outptra = top_blob.batch(b).row(i * 16 + 10); + float* outptrb = top_blob.batch(b).row(i * 16 + 11); + float* outptrc = top_blob.batch(b).row(i * 16 + 12); + float* outptrd = top_blob.batch(b).row(i * 16 + 13); + float* outptre = top_blob.batch(b).row(i * 16 + 14); + float* outptrf = top_blob.batch(b).row(i * 16 + 15); int j = 0; #if __AVX512F__ @@ -594,15 +620,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 4); - const float* r1 = bottom_blob.row(i * 4 + 1); - const float* r2 = bottom_blob.row(i * 4 + 2); - const float* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 4); + const float* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const float* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const float* r3 = bottom_blob.batch(b).row(i * 4 + 3); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -633,15 +662,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* 
outptr0 = top_blob.row(i * 4); - float* outptr1 = top_blob.row(i * 4 + 1); - float* outptr2 = top_blob.row(i * 4 + 2); - float* outptr3 = top_blob.row(i * 4 + 3); + float* outptr0 = top_blob.batch(b).row(i * 4); + float* outptr1 = top_blob.batch(b).row(i * 4 + 1); + float* outptr2 = top_blob.batch(b).row(i * 4 + 2); + float* outptr3 = top_blob.batch(b).row(i * 4 + 3); for (int j = 0; j < w; j++) { @@ -672,13 +704,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i * 2); - const float* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const float* r0 = bottom_blob.batch(b).row(i * 2); + const float* r1 = bottom_blob.batch(b).row(i * 2 + 1); - float* outptr = top_blob.row(i); + float* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -707,13 +742,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to8) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const float* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const float* r0 = bottom_blob.batch(b).row(i); - float* outptr0 = top_blob.row(i * 2); - float* outptr1 = top_blob.row(i * 2 + 1); + float* outptr0 = top_blob.batch(b).row(i * 2); + float* outptr1 = top_blob.batch(b).row(i * 2 + 1); for (int j = 0; j < w; j++) { @@ -751,23 +789,26 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, 
opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __SSE2__ @@ -806,15 +847,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __SSE2__ @@ -853,19 +897,22 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, 
const Option& op } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 8); - const float* r1 = bottom_blob.channel(q * 8 + 1); - const float* r2 = bottom_blob.channel(q * 8 + 2); - const float* r3 = bottom_blob.channel(q * 8 + 3); - const float* r4 = bottom_blob.channel(q * 8 + 4); - const float* r5 = bottom_blob.channel(q * 8 + 5); - const float* r6 = bottom_blob.channel(q * 8 + 6); - const float* r7 = bottom_blob.channel(q * 8 + 7); - - float* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 8); + const float* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const float* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const float* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const float* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const float* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __AVX__ @@ -916,19 +963,22 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); - - float* outptr0 = top_blob.channel(q * 8); - float* outptr1 = top_blob.channel(q * 8 + 1); - float* outptr2 = top_blob.channel(q * 8 + 2); - float* outptr3 = top_blob.channel(q * 8 + 3); - float* outptr4 = top_blob.channel(q * 8 + 4); - float* outptr5 = top_blob.channel(q * 8 + 5); - float* outptr6 = top_blob.channel(q * 8 + 6); - float* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; 
+ int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); + + float* outptr0 = top_blob.batch(b).channel(q * 8); + float* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + float* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + float* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + float* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + float* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; #if __AVX__ @@ -980,13 +1030,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 2); - const float* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 2); + const float* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -1007,13 +1060,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 2); - float* outptr1 = top_blob.channel(q * 2 + 1); + float* outptr0 = top_blob.batch(b).channel(q * 2); + float* outptr1 = top_blob.batch(b).channel(q * 2 + 1); for (int i = 0; i < size; i++) { @@ -1034,27 +1090,30 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const 
Option& op } if (pack1to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 16); - const float* r1 = bottom_blob.channel(q * 16 + 1); - const float* r2 = bottom_blob.channel(q * 16 + 2); - const float* r3 = bottom_blob.channel(q * 16 + 3); - const float* r4 = bottom_blob.channel(q * 16 + 4); - const float* r5 = bottom_blob.channel(q * 16 + 5); - const float* r6 = bottom_blob.channel(q * 16 + 6); - const float* r7 = bottom_blob.channel(q * 16 + 7); - const float* r8 = bottom_blob.channel(q * 16 + 8); - const float* r9 = bottom_blob.channel(q * 16 + 9); - const float* ra = bottom_blob.channel(q * 16 + 10); - const float* rb = bottom_blob.channel(q * 16 + 11); - const float* rc = bottom_blob.channel(q * 16 + 12); - const float* rd = bottom_blob.channel(q * 16 + 13); - const float* re = bottom_blob.channel(q * 16 + 14); - const float* rf = bottom_blob.channel(q * 16 + 15); - - float* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 16); + const float* r1 = bottom_blob.batch(b).channel(q * 16 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 16 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 16 + 3); + const float* r4 = bottom_blob.batch(b).channel(q * 16 + 4); + const float* r5 = bottom_blob.batch(b).channel(q * 16 + 5); + const float* r6 = bottom_blob.batch(b).channel(q * 16 + 6); + const float* r7 = bottom_blob.batch(b).channel(q * 16 + 7); + const float* r8 = bottom_blob.batch(b).channel(q * 16 + 8); + const float* r9 = bottom_blob.batch(b).channel(q * 16 + 9); + const float* ra = bottom_blob.batch(b).channel(q * 16 + 10); + const float* rb = bottom_blob.batch(b).channel(q * 16 + 11); + const float* rc = bottom_blob.batch(b).channel(q * 16 + 12); + const float* rd = bottom_blob.batch(b).channel(q * 16 + 13); + 
const float* re = bottom_blob.batch(b).channel(q * 16 + 14); + const float* rf = bottom_blob.batch(b).channel(q * 16 + 15); + + float* outptr = top_blob.batch(b).channel(q); int i = 0; #if __AVX512F__ @@ -1137,27 +1196,30 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); - - float* outptr0 = top_blob.channel(q * 16); - float* outptr1 = top_blob.channel(q * 16 + 1); - float* outptr2 = top_blob.channel(q * 16 + 2); - float* outptr3 = top_blob.channel(q * 16 + 3); - float* outptr4 = top_blob.channel(q * 16 + 4); - float* outptr5 = top_blob.channel(q * 16 + 5); - float* outptr6 = top_blob.channel(q * 16 + 6); - float* outptr7 = top_blob.channel(q * 16 + 7); - float* outptr8 = top_blob.channel(q * 16 + 8); - float* outptr9 = top_blob.channel(q * 16 + 9); - float* outptra = top_blob.channel(q * 16 + 10); - float* outptrb = top_blob.channel(q * 16 + 11); - float* outptrc = top_blob.channel(q * 16 + 12); - float* outptrd = top_blob.channel(q * 16 + 13); - float* outptre = top_blob.channel(q * 16 + 14); - float* outptrf = top_blob.channel(q * 16 + 15); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); + + float* outptr0 = top_blob.batch(b).channel(q * 16); + float* outptr1 = top_blob.batch(b).channel(q * 16 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 16 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 16 + 3); + float* outptr4 = top_blob.batch(b).channel(q * 16 + 4); + float* outptr5 = top_blob.batch(b).channel(q * 16 + 5); + float* outptr6 = top_blob.batch(b).channel(q * 16 + 6); + float* outptr7 = top_blob.batch(b).channel(q * 16 + 7); + float* outptr8 = top_blob.batch(b).channel(q * 16 + 8); + float* outptr9 = 
top_blob.batch(b).channel(q * 16 + 9); + float* outptra = top_blob.batch(b).channel(q * 16 + 10); + float* outptrb = top_blob.batch(b).channel(q * 16 + 11); + float* outptrc = top_blob.batch(b).channel(q * 16 + 12); + float* outptrd = top_blob.batch(b).channel(q * 16 + 13); + float* outptre = top_blob.batch(b).channel(q * 16 + 14); + float* outptrf = top_blob.batch(b).channel(q * 16 + 15); int i = 0; #if __AVX512F__ @@ -1241,15 +1303,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack4to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 4); - const float* r1 = bottom_blob.channel(q * 4 + 1); - const float* r2 = bottom_blob.channel(q * 4 + 2); - const float* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 4); + const float* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const float* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const float* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -1280,15 +1345,18 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 4); - float* outptr1 = top_blob.channel(q * 4 + 1); - float* outptr2 = top_blob.channel(q * 4 + 2); - float* outptr3 = top_blob.channel(q * 4 + 3); + float* outptr0 = top_blob.batch(b).channel(q * 
4); + float* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + float* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + float* outptr3 = top_blob.batch(b).channel(q * 4 + 3); for (int i = 0; i < size; i++) { @@ -1319,13 +1387,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack8to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q * 2); - const float* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const float* r0 = bottom_blob.batch(b).channel(q * 2); + const float* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - float* outptr = top_blob.channel(q); + float* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -1354,13 +1425,16 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } if (pack16to8) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const float* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const float* r0 = bottom_blob.batch(b).channel(q); - float* outptr0 = top_blob.channel(q * 2); - float* outptr1 = top_blob.channel(q * 2 + 1); + float* outptr0 = top_blob.batch(b).channel(q * 2); + float* outptr1 = top_blob.batch(b).channel(q * 2 + 1); for (int i = 0; i < size; i++) { @@ -1433,6 +1507,7 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -1461,6 +1536,7 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * 
out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -1469,21 +1545,24 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 4); - const unsigned short* r1 = bottom_blob.row(i * 4 + 1); - const unsigned short* r2 = bottom_blob.row(i * 4 + 2); - const unsigned short* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 4); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 4 + 3); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; #if __SSE2__ @@ -1524,15 +1603,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 4); - unsigned short* outptr1 = top_blob.row(i * 4 + 1); - unsigned short* outptr2 = top_blob.row(i * 4 + 2); - unsigned short* outptr3 = top_blob.row(i * 4 + 3); + 
unsigned short* outptr0 = top_blob.batch(b).row(i * 4); + unsigned short* outptr1 = top_blob.batch(b).row(i * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 4 + 3); int j = 0; #if __SSE2__ @@ -1573,19 +1655,22 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 8); - const unsigned short* r1 = bottom_blob.row(i * 8 + 1); - const unsigned short* r2 = bottom_blob.row(i * 8 + 2); - const unsigned short* r3 = bottom_blob.row(i * 8 + 3); - const unsigned short* r4 = bottom_blob.row(i * 8 + 4); - const unsigned short* r5 = bottom_blob.row(i * 8 + 5); - const unsigned short* r6 = bottom_blob.row(i * 8 + 6); - const unsigned short* r7 = bottom_blob.row(i * 8 + 7); - - unsigned short* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 8); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -1605,19 +1690,22 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const 
unsigned short* r0 = bottom_blob.row(i); - - unsigned short* outptr0 = top_blob.row(i * 8); - unsigned short* outptr1 = top_blob.row(i * 8 + 1); - unsigned short* outptr2 = top_blob.row(i * 8 + 2); - unsigned short* outptr3 = top_blob.row(i * 8 + 3); - unsigned short* outptr4 = top_blob.row(i * 8 + 4); - unsigned short* outptr5 = top_blob.row(i * 8 + 5); - unsigned short* outptr6 = top_blob.row(i * 8 + 6); - unsigned short* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); + + unsigned short* outptr0 = top_blob.batch(b).row(i * 8); + unsigned short* outptr1 = top_blob.batch(b).row(i * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).row(i * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).row(i * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).row(i * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -1637,13 +1725,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 2); - const unsigned short* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 2); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 2 + 1); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -1664,13 +1755,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int 
i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 2); - unsigned short* outptr1 = top_blob.row(i * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).row(i * 2); + unsigned short* outptr1 = top_blob.batch(b).row(i * 2 + 1); for (int j = 0; j < w; j++) { @@ -1691,27 +1785,30 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 16); - const unsigned short* r1 = bottom_blob.row(i * 16 + 1); - const unsigned short* r2 = bottom_blob.row(i * 16 + 2); - const unsigned short* r3 = bottom_blob.row(i * 16 + 3); - const unsigned short* r4 = bottom_blob.row(i * 16 + 4); - const unsigned short* r5 = bottom_blob.row(i * 16 + 5); - const unsigned short* r6 = bottom_blob.row(i * 16 + 6); - const unsigned short* r7 = bottom_blob.row(i * 16 + 7); - const unsigned short* r8 = bottom_blob.row(i * 16 + 8); - const unsigned short* r9 = bottom_blob.row(i * 16 + 9); - const unsigned short* ra = bottom_blob.row(i * 16 + 10); - const unsigned short* rb = bottom_blob.row(i * 16 + 11); - const unsigned short* rc = bottom_blob.row(i * 16 + 12); - const unsigned short* rd = bottom_blob.row(i * 16 + 13); - const unsigned short* re = bottom_blob.row(i * 16 + 14); - const unsigned short* rf = bottom_blob.row(i * 16 + 15); - - unsigned short* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 16); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 16 + 1); + const unsigned short* r2 = bottom_blob.batch(b).row(i * 16 + 2); + const unsigned short* r3 = 
bottom_blob.batch(b).row(i * 16 + 3); + const unsigned short* r4 = bottom_blob.batch(b).row(i * 16 + 4); + const unsigned short* r5 = bottom_blob.batch(b).row(i * 16 + 5); + const unsigned short* r6 = bottom_blob.batch(b).row(i * 16 + 6); + const unsigned short* r7 = bottom_blob.batch(b).row(i * 16 + 7); + const unsigned short* r8 = bottom_blob.batch(b).row(i * 16 + 8); + const unsigned short* r9 = bottom_blob.batch(b).row(i * 16 + 9); + const unsigned short* ra = bottom_blob.batch(b).row(i * 16 + 10); + const unsigned short* rb = bottom_blob.batch(b).row(i * 16 + 11); + const unsigned short* rc = bottom_blob.batch(b).row(i * 16 + 12); + const unsigned short* rd = bottom_blob.batch(b).row(i * 16 + 13); + const unsigned short* re = bottom_blob.batch(b).row(i * 16 + 14); + const unsigned short* rf = bottom_blob.batch(b).row(i * 16 + 15); + + unsigned short* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -1739,27 +1836,30 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); - - unsigned short* outptr0 = top_blob.row(i * 16); - unsigned short* outptr1 = top_blob.row(i * 16 + 1); - unsigned short* outptr2 = top_blob.row(i * 16 + 2); - unsigned short* outptr3 = top_blob.row(i * 16 + 3); - unsigned short* outptr4 = top_blob.row(i * 16 + 4); - unsigned short* outptr5 = top_blob.row(i * 16 + 5); - unsigned short* outptr6 = top_blob.row(i * 16 + 6); - unsigned short* outptr7 = top_blob.row(i * 16 + 7); - unsigned short* outptr8 = top_blob.row(i * 16 + 8); - unsigned short* outptr9 = top_blob.row(i * 16 + 9); - unsigned short* outptra = top_blob.row(i * 16 + 10); - unsigned short* outptrb = top_blob.row(i * 16 + 11); - unsigned short* outptrc = top_blob.row(i * 16 + 12); - unsigned short* outptrd = 
top_blob.row(i * 16 + 13); - unsigned short* outptre = top_blob.row(i * 16 + 14); - unsigned short* outptrf = top_blob.row(i * 16 + 15); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); + + unsigned short* outptr0 = top_blob.batch(b).row(i * 16); + unsigned short* outptr1 = top_blob.batch(b).row(i * 16 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 16 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 16 + 3); + unsigned short* outptr4 = top_blob.batch(b).row(i * 16 + 4); + unsigned short* outptr5 = top_blob.batch(b).row(i * 16 + 5); + unsigned short* outptr6 = top_blob.batch(b).row(i * 16 + 6); + unsigned short* outptr7 = top_blob.batch(b).row(i * 16 + 7); + unsigned short* outptr8 = top_blob.batch(b).row(i * 16 + 8); + unsigned short* outptr9 = top_blob.batch(b).row(i * 16 + 9); + unsigned short* outptra = top_blob.batch(b).row(i * 16 + 10); + unsigned short* outptrb = top_blob.batch(b).row(i * 16 + 11); + unsigned short* outptrc = top_blob.batch(b).row(i * 16 + 12); + unsigned short* outptrd = top_blob.batch(b).row(i * 16 + 13); + unsigned short* outptre = top_blob.batch(b).row(i * 16 + 14); + unsigned short* outptrf = top_blob.batch(b).row(i * 16 + 15); int j = 0; for (; j < w; j++) @@ -1787,15 +1887,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 4); - const unsigned short* r1 = bottom_blob.row(i * 4 + 1); - const unsigned short* r2 = bottom_blob.row(i * 4 + 2); - const unsigned short* r3 = bottom_blob.row(i * 4 + 3); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 4); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 4 + 1); + const unsigned short* r2 = 
bottom_blob.batch(b).row(i * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).row(i * 4 + 3); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -1826,15 +1929,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to4) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 4); - unsigned short* outptr1 = top_blob.row(i * 4 + 1); - unsigned short* outptr2 = top_blob.row(i * 4 + 2); - unsigned short* outptr3 = top_blob.row(i * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).row(i * 4); + unsigned short* outptr1 = top_blob.batch(b).row(i * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).row(i * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).row(i * 4 + 3); for (int j = 0; j < w; j++) { @@ -1865,13 +1971,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to16) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i * 2); - const unsigned short* r1 = bottom_blob.row(i * 2 + 1); + int b = bi / outh; + int i = bi % outh; + const unsigned short* r0 = bottom_blob.batch(b).row(i * 2); + const unsigned short* r1 = bottom_blob.batch(b).row(i * 2 + 1); - unsigned short* outptr = top_blob.row(i); + unsigned short* outptr = top_blob.batch(b).row(i); for (int j = 0; j < w; j++) { @@ -1900,13 +2009,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to8) { + const int total_bi = batch * h; 
#pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const unsigned short* r0 = bottom_blob.row(i); + int b = bi / h; + int i = bi % h; + const unsigned short* r0 = bottom_blob.batch(b).row(i); - unsigned short* outptr0 = top_blob.row(i * 2); - unsigned short* outptr1 = top_blob.row(i * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).row(i * 2); + unsigned short* outptr1 = top_blob.batch(b).row(i * 2 + 1); for (int j = 0; j < w; j++) { @@ -1944,23 +2056,26 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to4) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 4); - const unsigned short* r1 = bottom_blob.channel(q * 4 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 4 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 4); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; #if __SSE2__ @@ -2000,15 +2115,18 @@ 
int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 4); - unsigned short* outptr1 = top_blob.channel(q * 4 + 1); - unsigned short* outptr2 = top_blob.channel(q * 4 + 2); - unsigned short* outptr3 = top_blob.channel(q * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 4); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 4 + 3); int i = 0; #if __SSE2__ @@ -2048,19 +2166,22 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 8); - const unsigned short* r1 = bottom_blob.channel(q * 8 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 8 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 8 + 3); - const unsigned short* r4 = bottom_blob.channel(q * 8 + 4); - const unsigned short* r5 = bottom_blob.channel(q * 8 + 5); - const unsigned short* r6 = bottom_blob.channel(q * 8 + 6); - const unsigned short* r7 = bottom_blob.channel(q * 8 + 7); - - unsigned short* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 8); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const unsigned short* r2 = 
bottom_blob.batch(b).channel(q * 8 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const unsigned short* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const unsigned short* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const unsigned short* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const unsigned short* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + unsigned short* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -2080,19 +2201,22 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); - - unsigned short* outptr0 = top_blob.channel(q * 8); - unsigned short* outptr1 = top_blob.channel(q * 8 + 1); - unsigned short* outptr2 = top_blob.channel(q * 8 + 2); - unsigned short* outptr3 = top_blob.channel(q * 8 + 3); - unsigned short* outptr4 = top_blob.channel(q * 8 + 4); - unsigned short* outptr5 = top_blob.channel(q * 8 + 5); - unsigned short* outptr6 = top_blob.channel(q * 8 + 6); - unsigned short* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); + + unsigned short* outptr0 = top_blob.batch(b).channel(q * 8); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + unsigned short* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + unsigned short* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + unsigned short* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + unsigned short* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) @@ -2112,13 +2236,16 @@ int 
Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 2); - const unsigned short* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 2); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -2139,13 +2266,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 2); - unsigned short* outptr1 = top_blob.channel(q * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 2); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 2 + 1); for (int i = 0; i < size; i++) { @@ -2166,27 +2296,30 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack1to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 16); - const unsigned short* r1 = bottom_blob.channel(q * 16 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 16 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 16 + 3); - const unsigned short* r4 = 
bottom_blob.channel(q * 16 + 4); - const unsigned short* r5 = bottom_blob.channel(q * 16 + 5); - const unsigned short* r6 = bottom_blob.channel(q * 16 + 6); - const unsigned short* r7 = bottom_blob.channel(q * 16 + 7); - const unsigned short* r8 = bottom_blob.channel(q * 16 + 8); - const unsigned short* r9 = bottom_blob.channel(q * 16 + 9); - const unsigned short* ra = bottom_blob.channel(q * 16 + 10); - const unsigned short* rb = bottom_blob.channel(q * 16 + 11); - const unsigned short* rc = bottom_blob.channel(q * 16 + 12); - const unsigned short* rd = bottom_blob.channel(q * 16 + 13); - const unsigned short* re = bottom_blob.channel(q * 16 + 14); - const unsigned short* rf = bottom_blob.channel(q * 16 + 15); - - unsigned short* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 16); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 16 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 16 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 16 + 3); + const unsigned short* r4 = bottom_blob.batch(b).channel(q * 16 + 4); + const unsigned short* r5 = bottom_blob.batch(b).channel(q * 16 + 5); + const unsigned short* r6 = bottom_blob.batch(b).channel(q * 16 + 6); + const unsigned short* r7 = bottom_blob.batch(b).channel(q * 16 + 7); + const unsigned short* r8 = bottom_blob.batch(b).channel(q * 16 + 8); + const unsigned short* r9 = bottom_blob.batch(b).channel(q * 16 + 9); + const unsigned short* ra = bottom_blob.batch(b).channel(q * 16 + 10); + const unsigned short* rb = bottom_blob.batch(b).channel(q * 16 + 11); + const unsigned short* rc = bottom_blob.batch(b).channel(q * 16 + 12); + const unsigned short* rd = bottom_blob.batch(b).channel(q * 16 + 13); + const unsigned short* re = bottom_blob.batch(b).channel(q * 16 + 14); + const unsigned short* rf = bottom_blob.batch(b).channel(q * 16 + 15); + + unsigned short* outptr = 
top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -2214,27 +2347,30 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); - - unsigned short* outptr0 = top_blob.channel(q * 16); - unsigned short* outptr1 = top_blob.channel(q * 16 + 1); - unsigned short* outptr2 = top_blob.channel(q * 16 + 2); - unsigned short* outptr3 = top_blob.channel(q * 16 + 3); - unsigned short* outptr4 = top_blob.channel(q * 16 + 4); - unsigned short* outptr5 = top_blob.channel(q * 16 + 5); - unsigned short* outptr6 = top_blob.channel(q * 16 + 6); - unsigned short* outptr7 = top_blob.channel(q * 16 + 7); - unsigned short* outptr8 = top_blob.channel(q * 16 + 8); - unsigned short* outptr9 = top_blob.channel(q * 16 + 9); - unsigned short* outptra = top_blob.channel(q * 16 + 10); - unsigned short* outptrb = top_blob.channel(q * 16 + 11); - unsigned short* outptrc = top_blob.channel(q * 16 + 12); - unsigned short* outptrd = top_blob.channel(q * 16 + 13); - unsigned short* outptre = top_blob.channel(q * 16 + 14); - unsigned short* outptrf = top_blob.channel(q * 16 + 15); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); + + unsigned short* outptr0 = top_blob.batch(b).channel(q * 16); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 16 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 16 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 16 + 3); + unsigned short* outptr4 = top_blob.batch(b).channel(q * 16 + 4); + unsigned short* outptr5 = top_blob.batch(b).channel(q * 16 + 5); + unsigned short* outptr6 = top_blob.batch(b).channel(q * 16 + 6); + unsigned short* outptr7 = top_blob.batch(b).channel(q * 16 + 7); + 
unsigned short* outptr8 = top_blob.batch(b).channel(q * 16 + 8); + unsigned short* outptr9 = top_blob.batch(b).channel(q * 16 + 9); + unsigned short* outptra = top_blob.batch(b).channel(q * 16 + 10); + unsigned short* outptrb = top_blob.batch(b).channel(q * 16 + 11); + unsigned short* outptrc = top_blob.batch(b).channel(q * 16 + 12); + unsigned short* outptrd = top_blob.batch(b).channel(q * 16 + 13); + unsigned short* outptre = top_blob.batch(b).channel(q * 16 + 14); + unsigned short* outptrf = top_blob.batch(b).channel(q * 16 + 15); int i = 0; for (; i < size; i++) @@ -2262,15 +2398,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack4to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 4); - const unsigned short* r1 = bottom_blob.channel(q * 4 + 1); - const unsigned short* r2 = bottom_blob.channel(q * 4 + 2); - const unsigned short* r3 = bottom_blob.channel(q * 4 + 3); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 4); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 4 + 1); + const unsigned short* r2 = bottom_blob.batch(b).channel(q * 4 + 2); + const unsigned short* r3 = bottom_blob.batch(b).channel(q * 4 + 3); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -2301,15 +2440,18 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to4) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = 
bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 4); - unsigned short* outptr1 = top_blob.channel(q * 4 + 1); - unsigned short* outptr2 = top_blob.channel(q * 4 + 2); - unsigned short* outptr3 = top_blob.channel(q * 4 + 3); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 4); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 4 + 1); + unsigned short* outptr2 = top_blob.batch(b).channel(q * 4 + 2); + unsigned short* outptr3 = top_blob.batch(b).channel(q * 4 + 3); for (int i = 0; i < size; i++) { @@ -2340,13 +2482,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack8to16) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q * 2); - const unsigned short* r1 = bottom_blob.channel(q * 2 + 1); + int b = bq / outc; + int q = bq % outc; + const unsigned short* r0 = bottom_blob.batch(b).channel(q * 2); + const unsigned short* r1 = bottom_blob.batch(b).channel(q * 2 + 1); - unsigned short* outptr = top_blob.channel(q); + unsigned short* outptr = top_blob.batch(b).channel(q); for (int i = 0; i < size; i++) { @@ -2375,13 +2520,16 @@ int Packing_x86::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons } if (pack16to8) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const unsigned short* r0 = bottom_blob.channel(q); + int b = bq / channels; + int q = bq % channels; + const unsigned short* r0 = bottom_blob.batch(b).channel(q); - unsigned short* outptr0 = top_blob.channel(q * 2); - unsigned short* outptr1 = top_blob.channel(q * 2 + 1); + unsigned short* outptr0 = top_blob.batch(b).channel(q * 2); + unsigned short* outptr1 = top_blob.batch(b).channel(q * 2 + 1); for (int i = 0; i < 
size; i++) { @@ -2444,6 +2592,7 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; + int batch = bottom_blob.n; if (!use_padding) { @@ -2472,6 +2621,7 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio top_blob.cstep = bottom_blob.cstep * elempack / out_elempack; top_blob.elemsize = elemsize / elempack * out_elempack; top_blob.elempack = out_elempack; + top_blob.nstep = bottom_blob.nstep * elempack / out_elempack; return 0; } @@ -2480,25 +2630,28 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio int outh = h * elempack / out_elempack; size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, outh, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bi = batch * outh; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i * 8); - const signed char* r1 = bottom_blob.row(i * 8 + 1); - const signed char* r2 = bottom_blob.row(i * 8 + 2); - const signed char* r3 = bottom_blob.row(i * 8 + 3); - const signed char* r4 = bottom_blob.row(i * 8 + 4); - const signed char* r5 = bottom_blob.row(i * 8 + 5); - const signed char* r6 = bottom_blob.row(i * 8 + 6); - const signed char* r7 = bottom_blob.row(i * 8 + 7); - - signed char* outptr = top_blob.row(i); + int b = bi / outh; + int i = bi % outh; + const signed char* r0 = bottom_blob.batch(b).row(i * 8); + const signed char* r1 = bottom_blob.batch(b).row(i * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).row(i * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).row(i * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).row(i * 8 + 4); + const 
signed char* r5 = bottom_blob.batch(b).row(i * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).row(i * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).row(i * 8 + 7); + + signed char* outptr = top_blob.batch(b).row(i); int j = 0; for (; j < w; j++) @@ -2518,19 +2671,22 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio } if (pack8to1) { + const int total_bi = batch * h; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int bi = 0; bi < total_bi; bi++) { - const signed char* r0 = bottom_blob.row(i); - - signed char* outptr0 = top_blob.row(i * 8); - signed char* outptr1 = top_blob.row(i * 8 + 1); - signed char* outptr2 = top_blob.row(i * 8 + 2); - signed char* outptr3 = top_blob.row(i * 8 + 3); - signed char* outptr4 = top_blob.row(i * 8 + 4); - signed char* outptr5 = top_blob.row(i * 8 + 5); - signed char* outptr6 = top_blob.row(i * 8 + 6); - signed char* outptr7 = top_blob.row(i * 8 + 7); + int b = bi / h; + int i = bi % h; + const signed char* r0 = bottom_blob.batch(b).row(i); + + signed char* outptr0 = top_blob.batch(b).row(i * 8); + signed char* outptr1 = top_blob.batch(b).row(i * 8 + 1); + signed char* outptr2 = top_blob.batch(b).row(i * 8 + 2); + signed char* outptr3 = top_blob.batch(b).row(i * 8 + 3); + signed char* outptr4 = top_blob.batch(b).row(i * 8 + 4); + signed char* outptr5 = top_blob.batch(b).row(i * 8 + 5); + signed char* outptr6 = top_blob.batch(b).row(i * 8 + 6); + signed char* outptr7 = top_blob.batch(b).row(i * 8 + 7); int j = 0; for (; j < w; j++) @@ -2559,27 +2715,30 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio size_t out_elemsize = elemsize / elempack * out_elempack; if (dims == 3) - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); else // if (dims == 4) - top_blob.create(w, h, d, outc, out_elemsize, 
out_elempack, opt.blob_allocator); + top_blob.create_batch(w, h, d, outc, batch, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; if (pack1to8) { + const int total_bq = batch * outc; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q * 8); - const signed char* r1 = bottom_blob.channel(q * 8 + 1); - const signed char* r2 = bottom_blob.channel(q * 8 + 2); - const signed char* r3 = bottom_blob.channel(q * 8 + 3); - const signed char* r4 = bottom_blob.channel(q * 8 + 4); - const signed char* r5 = bottom_blob.channel(q * 8 + 5); - const signed char* r6 = bottom_blob.channel(q * 8 + 6); - const signed char* r7 = bottom_blob.channel(q * 8 + 7); - - signed char* outptr = top_blob.channel(q); + int b = bq / outc; + int q = bq % outc; + const signed char* r0 = bottom_blob.batch(b).channel(q * 8); + const signed char* r1 = bottom_blob.batch(b).channel(q * 8 + 1); + const signed char* r2 = bottom_blob.batch(b).channel(q * 8 + 2); + const signed char* r3 = bottom_blob.batch(b).channel(q * 8 + 3); + const signed char* r4 = bottom_blob.batch(b).channel(q * 8 + 4); + const signed char* r5 = bottom_blob.batch(b).channel(q * 8 + 5); + const signed char* r6 = bottom_blob.batch(b).channel(q * 8 + 6); + const signed char* r7 = bottom_blob.batch(b).channel(q * 8 + 7); + + signed char* outptr = top_blob.batch(b).channel(q); int i = 0; for (; i < size; i++) @@ -2599,19 +2758,22 @@ int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio } if (pack8to1) { + const int total_bq = batch * channels; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int bq = 0; bq < total_bq; bq++) { - const signed char* r0 = bottom_blob.channel(q); - - signed char* outptr0 = top_blob.channel(q * 8); - signed char* outptr1 = top_blob.channel(q * 8 + 1); - signed char* outptr2 = 
top_blob.channel(q * 8 + 2); - signed char* outptr3 = top_blob.channel(q * 8 + 3); - signed char* outptr4 = top_blob.channel(q * 8 + 4); - signed char* outptr5 = top_blob.channel(q * 8 + 5); - signed char* outptr6 = top_blob.channel(q * 8 + 6); - signed char* outptr7 = top_blob.channel(q * 8 + 7); + int b = bq / channels; + int q = bq % channels; + const signed char* r0 = bottom_blob.batch(b).channel(q); + + signed char* outptr0 = top_blob.batch(b).channel(q * 8); + signed char* outptr1 = top_blob.batch(b).channel(q * 8 + 1); + signed char* outptr2 = top_blob.batch(b).channel(q * 8 + 2); + signed char* outptr3 = top_blob.batch(b).channel(q * 8 + 3); + signed char* outptr4 = top_blob.batch(b).channel(q * 8 + 4); + signed char* outptr5 = top_blob.batch(b).channel(q * 8 + 5); + signed char* outptr6 = top_blob.batch(b).channel(q * 8 + 6); + signed char* outptr7 = top_blob.batch(b).channel(q * 8 + 7); int i = 0; for (; i < size; i++) diff --git a/src/mat.cpp b/src/mat.cpp index f066a15c0417..052bdbbd1156 100644 --- a/src/mat.cpp +++ b/src/mat.cpp @@ -22,6 +22,25 @@ Mat Mat::clone(Allocator* _allocator) const return Mat(); Mat m; + if (n > 1) + { + m.create_like_batch(*this, n, _allocator); + + if (m.empty()) + return m; + + // copy batch by batch (nstep may include 4K padding) + size_t single_batch_size = cstep * c * elemsize; + for (int b = 0; b < n; b++) + { + const void* src = (const unsigned char*)data + nstep * b * elemsize; + void* dst = (unsigned char*)m.data + m.nstep * b * elemsize; + memcpy(dst, src, single_batch_size); + } + + return m; + } + if (dims == 1) m.create(w, elemsize, elempack, _allocator); else if (dims == 2) @@ -501,6 +520,12 @@ void Mat::create(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack void Mat::create_like(const Mat& m, Allocator* _allocator) { + if (m.n > 1) + { + create_like_batch(m, m.n, _allocator); + return; + } + int _dims = m.dims; if (_dims == 1) create(m.w, m.elemsize, m.elempack, _allocator); @@ -512,6 +537,178 
@@ void Mat::create_like(const Mat& m, Allocator* _allocator) create(m.w, m.h, m.d, m.c, m.elemsize, m.elempack, _allocator); } +void Mat::create_like_batch(const Mat& m, int _batch, Allocator* _allocator) +{ + if (m.dims == 1) + create_batch(m.w, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 2) + create_batch(m.w, m.h, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 3) + create_batch(m.w, m.h, m.c, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 4) + create_batch(m.w, m.h, m.d, m.c, _batch, m.elemsize, m.elempack, _allocator); +} + +void Mat::create_batch(int _w, int _batch, size_t _elemsize, int _elempack, Allocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + d = 1; + c = 1; + n = _batch; + + cstep = alignSize((size_t)w * elemsize, 16) / elemsize; + nstep = alignSize(cstep * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + } + + if (data) + { + refcount = (int*)(((unsigned char*)data) + totalsize); + *refcount = 1; + } +} + +void Mat::create_batch(int _w, int _h, int _batch, size_t _elemsize, int _elempack, Allocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + d = 1; + c = 1; + n = _batch; + + cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = alignSize(cstep * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + if (allocator) + data = 
allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + } + + if (data) + { + refcount = (int*)(((unsigned char*)data) + totalsize); + *refcount = 1; + } +} + +void Mat::create_batch(int _w, int _h, int _c, int _batch, size_t _elemsize, int _elempack, Allocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _c, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + d = 1; + c = _c; + n = _batch; + + cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = alignSize(cstep * c * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + } + + if (data) + { + refcount = (int*)(((unsigned char*)data) + totalsize); + *refcount = 1; + } +} + +void Mat::create_batch(int _w, int _h, int _d, int _c, int _batch, size_t _elemsize, int _elempack, Allocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 4; + w = _w; + h = _h; + d = _d; + c = _c; + n = _batch; + + cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; + nstep = alignSize(cstep * c * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + } + + if (data) + { + refcount = (int*)(((unsigned char*)data) + totalsize); + *refcount = 1; + } +} + #if NCNN_VULKAN void Mat::create_like(const VkMat& m, Allocator* _allocator) { @@ -526,6 
+723,18 @@ void Mat::create_like(const VkMat& m, Allocator* _allocator) create(m.w, m.h, m.d, m.c, m.elemsize, m.elempack, _allocator); } +void Mat::create_like_batch(const VkMat& m, int _batch, Allocator* _allocator) +{ + if (m.dims == 1) + create_batch(m.w, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 2) + create_batch(m.w, m.h, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 3) + create_batch(m.w, m.h, m.c, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 4) + create_batch(m.w, m.h, m.d, m.c, _batch, m.elemsize, m.elempack, _allocator); +} + void Mat::create_like(const VkImageMat& im, Allocator* _allocator) { int _dims = im.dims; @@ -571,6 +780,7 @@ void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -604,6 +814,7 @@ void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -637,6 +848,7 @@ void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _alloc { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -670,6 +882,7 @@ void VkMat::create(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -703,6 +916,7 @@ void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _alloca { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -736,6 +950,7 @@ void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* { refcount = (int*)((unsigned 
char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -769,6 +984,7 @@ void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAl { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } @@ -802,11 +1018,18 @@ void VkMat::create(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempa { refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); *refcount = 1; + nstep = data->capacity / elemsize; } } void VkMat::create_like(const Mat& m, VkAllocator* _allocator) { + if (m.n > 1) + { + create_like_batch(m, m.n, _allocator); + return; + } + int _dims = m.dims; if (_dims == 1) create(m.w, m.elemsize, m.elempack, _allocator); @@ -820,6 +1043,12 @@ void VkMat::create_like(const Mat& m, VkAllocator* _allocator) void VkMat::create_like(const VkMat& m, VkAllocator* _allocator) { + if (m.n > 1) + { + create_like_batch(m, m.n, _allocator); + return; + } + int _dims = m.dims; if (_dims == 1) create(m.w, m.elemsize, m.elempack, _allocator); @@ -831,6 +1060,30 @@ void VkMat::create_like(const VkMat& m, VkAllocator* _allocator) create(m.w, m.h, m.d, m.c, m.elemsize, m.elempack, _allocator); } +void VkMat::create_like_batch(const Mat& m, int _batch, VkAllocator* _allocator) +{ + if (m.dims == 1) + create_batch(m.w, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 2) + create_batch(m.w, m.h, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 3) + create_batch(m.w, m.h, m.c, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 4) + create_batch(m.w, m.h, m.d, m.c, _batch, m.elemsize, m.elempack, _allocator); +} + +void VkMat::create_like_batch(const VkMat& m, int _batch, VkAllocator* _allocator) +{ + if (m.dims == 1) + create_batch(m.w, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 2) + create_batch(m.w, m.h, _batch, m.elemsize, m.elempack, 
_allocator); + else if (m.dims == 3) + create_batch(m.w, m.h, m.c, _batch, m.elemsize, m.elempack, _allocator); + else if (m.dims == 4) + create_batch(m.w, m.h, m.d, m.c, _batch, m.elemsize, m.elempack, _allocator); +} + void VkMat::create_like(const VkImageMat& im, VkAllocator* _allocator) { int _dims = im.dims; @@ -844,6 +1097,154 @@ void VkMat::create_like(const VkImageMat& im, VkAllocator* _allocator) create(im.w, im.h, im.d, im.c, im.elemsize, im.elempack, _allocator); } +void VkMat::create_batch(int _w, int _batch, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + d = 1; + c = 1; + n = _batch; + + cstep = alignSize((size_t)w * elemsize, 16) / elemsize; + nstep = alignSize(cstep * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + data = allocator->fastMalloc(totalsize); + } + + if (data) + { + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +void VkMat::create_batch(int _w, int _h, int _batch, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + d = 1; + c = 1; + n = _batch; + + cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = alignSize(cstep * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + data = allocator->fastMalloc(totalsize); + } + + if (data) + { + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +void VkMat::create_batch(int _w, int _h, int _c, int _batch, 
size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _c, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + d = 1; + c = _c; + n = _batch; + + cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = alignSize(cstep * c * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + data = allocator->fastMalloc(totalsize); + } + + if (data) + { + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +void VkMat::create_batch(int _w, int _h, int _d, int _c, int _batch, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (_batch <= 1) + { + create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); + return; + } + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 4; + w = _w; + h = _h; + d = _d; + c = _c; + n = _batch; + + cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; + nstep = alignSize(cstep * c * elemsize, 4096) / elemsize; + + size_t totalsize = alignSize(nstep * n * elemsize, 4); + if (totalsize > 0) + { + data = allocator->fastMalloc(totalsize); + } + + if (data) + { + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + void VkImageMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) { if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator) diff --git a/src/mat.h b/src/mat.h index 9e353aa61d4e..1fc24af5f5e3 100644 --- a/src/mat.h +++ b/src/mat.h @@ -163,9 +163,21 @@ class NCNN_EXPORT Mat void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0); // allocate like void create_like(const Mat& m, Allocator* allocator = 0); + // allocate like with batch count, 
copying shape from m + void create_like_batch(const Mat& m, int batch, Allocator* allocator = 0); + // allocate batch vec + void create_batch(int w, int batch, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate batch image + void create_batch(int w, int h, int batch, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate batch dim + void create_batch(int w, int h, int c, int batch, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate batch packed cube + void create_batch(int w, int h, int d, int c, int batch, size_t elemsize, int elempack, Allocator* allocator = 0); #if NCNN_VULKAN // allocate like void create_like(const VkMat& m, Allocator* allocator = 0); + // allocate like with batch count, copying shape from VkMat + void create_like_batch(const VkMat& m, int batch, Allocator* allocator = 0); // allocate like void create_like(const VkImageMat& im, Allocator* allocator = 0); #endif // NCNN_VULKAN @@ -205,6 +217,12 @@ class NCNN_EXPORT Mat Mat range(int x, int n); const Mat range(int x, int n) const; + // batch reference + Mat batch(int b); + const Mat batch(int b) const; + Mat batch_range(int b, int batches); + const Mat batch_range(int b, int batches) const; + // access raw data template operator T*(); @@ -333,6 +351,11 @@ class NCNN_EXPORT Mat int c; size_t cstep; + + // batch count, default 1 + int n; + // element step from one batch to the next (4K-byte aligned) + size_t nstep; }; #if NCNN_VULKAN @@ -401,8 +424,26 @@ class NCNN_EXPORT VkMat void create_like(const Mat& m, VkAllocator* allocator); // allocate like void create_like(const VkMat& m, VkAllocator* allocator); + // allocate like with batch count, copying shape from m + void create_like_batch(const Mat& m, int batch, VkAllocator* allocator); + // allocate like with batch count, copying shape from VkMat + void create_like_batch(const VkMat& m, int batch, VkAllocator* allocator); // allocate like void create_like(const VkImageMat& im, VkAllocator* 
allocator); + // allocate batch vec + void create_batch(int w, int batch, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate batch image + void create_batch(int w, int h, int batch, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate batch dim + void create_batch(int w, int h, int c, int batch, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate batch packed cube + void create_batch(int w, int h, int d, int c, int batch, size_t elemsize, int elempack, VkAllocator* allocator); + + // batch reference + VkMat batch(int b); + const VkMat batch(int b) const; + VkMat batch_range(int b, int batches); + const VkMat batch_range(int b, int batches) const; // mapped Mat mapped() const; @@ -459,6 +500,14 @@ class NCNN_EXPORT VkMat int c; size_t cstep; + + // batch count, default 1 + int n; + // element step from one batch to the next (4K-byte aligned) + // for non-batch VkMat, equals data->capacity / elemsize + size_t nstep; + // byte offset relative to data->offset (for batch sub-views) + size_t offset; }; class NCNN_EXPORT VkImageMat @@ -797,108 +846,108 @@ NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scal NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option()); NCNN_FORCEINLINE Mat::Mat() - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { } NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), 
nstep(0) { create(_w, _elemsize, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _elemsize, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _c, _elemsize, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _d, _c, _elemsize, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, 
Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _c, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0) { create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE Mat::Mat(const Mat& m) - : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep), n(m.n), nstep(m.nstep) { addref(); } NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), n(1), nstep(0) { cstep = alignSize(w * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), n(1), nstep(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int 
_c, void* _data, size_t _elemsize, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), n(1), nstep(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), n(1), nstep(0) { cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), n(1), nstep(0) { cstep = alignSize(w * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), n(1), nstep(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), 
elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), n(1), nstep(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; } NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), n(1), nstep(0) { cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; } @@ -1192,6 +1241,9 @@ NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m) cstep = m.cstep; + n = m.n; + nstep = m.nstep; + return *this; } @@ -1224,6 +1276,9 @@ NCNN_FORCEINLINE void Mat::release() cstep = 0; + n = 1; + nstep = 0; + refcount = 0; } @@ -1366,6 +1421,42 @@ NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const return m; } +NCNN_FORCEINLINE Mat Mat::batch(int b) +{ + Mat m(w, h, d, c, (unsigned char*)data + nstep * b * elemsize, elemsize, elempack, allocator); + m.dims = dims; + m.cstep = cstep; + return m; +} + +NCNN_FORCEINLINE const Mat Mat::batch(int b) const +{ + Mat m(w, h, d, c, (unsigned char*)data + nstep * b * elemsize, elemsize, elempack, allocator); + m.dims = dims; + m.cstep = cstep; + return m; +} + +NCNN_FORCEINLINE Mat Mat::batch_range(int b, int batches) +{ + Mat m(w, h, d, c, (unsigned char*)data + nstep * b * elemsize, elemsize, elempack, allocator); + m.dims = dims; + m.cstep = cstep; + m.n = batches; + m.nstep = nstep; + return m; +} + +NCNN_FORCEINLINE const Mat Mat::batch_range(int b, int batches) const +{ + Mat m(w, h, d, c, (unsigned char*)data + nstep * b * elemsize, elemsize, elempack, allocator); + m.dims = dims; + m.cstep = cstep; + m.n = batches; + m.nstep = nstep; + return m; +} + template NCNN_FORCEINLINE Mat::operator T*() { @@ -1391,54 +1482,54 @@ NCNN_FORCEINLINE const float& 
Mat::operator[](size_t i) const #if NCNN_VULKAN NCNN_FORCEINLINE VkMat::VkMat() - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { } NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _elemsize, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _elemsize, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _c, _elemsize, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _d, _c, _elemsize, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), 
allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _c, _elemsize, _elempack, _allocator); } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0), n(1), nstep(0), offset(0) { create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); } @@ -1449,54 +1540,65 @@ NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m) addref(); cstep = m.cstep; + n = m.n; + nstep = m.nstep; + offset = m.offset; } NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), n(1), nstep(0), 
offset(0) { cstep = alignSize(w * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), n(1), nstep(0), offset(0) { cstep = 
alignSize(w * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) - : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), n(1), nstep(0), offset(0) { cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; + nstep = _data->capacity / _elemsize; } NCNN_FORCEINLINE VkMat::~VkMat() @@ -1528,6 +1630,10 @@ NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m) cstep = m.cstep; + n = m.n; + nstep = m.nstep; + offset = m.offset; + return *this; } @@ -1556,7 +1662,7 @@ NCNN_FORCEINLINE void* VkMat::mapped_ptr() const if (!allocator->mappable) return 0; - return (unsigned 
char*)data->mapped_ptr + data->offset; + return (unsigned char*)data->mapped_ptr + data->offset + offset; } NCNN_FORCEINLINE void VkMat::addref() @@ -1588,6 +1694,10 @@ NCNN_FORCEINLINE void VkMat::release() cstep = 0; + n = 1; + nstep = 0; + offset = 0; + refcount = 0; } @@ -1627,12 +1737,92 @@ NCNN_FORCEINLINE VkBuffer VkMat::buffer() const NCNN_FORCEINLINE size_t VkMat::buffer_offset() const { - return data->offset; + return data->offset + offset; } NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const { - return data->capacity; + return nstep * elemsize; +} + +NCNN_FORCEINLINE VkMat VkMat::batch(int b) +{ + VkMat m; + m.data = data; + m.refcount = 0; + m.elemsize = elemsize; + m.elempack = elempack; + m.allocator = allocator; + m.dims = dims; + m.w = w; + m.h = h; + m.d = d; + m.c = c; + m.cstep = cstep; + m.n = 1; + m.nstep = nstep; + m.offset = offset + nstep * b * elemsize; + return m; +} + +NCNN_FORCEINLINE const VkMat VkMat::batch(int b) const +{ + VkMat m; + m.data = data; + m.refcount = 0; + m.elemsize = elemsize; + m.elempack = elempack; + m.allocator = allocator; + m.dims = dims; + m.w = w; + m.h = h; + m.d = d; + m.c = c; + m.cstep = cstep; + m.n = 1; + m.nstep = nstep; + m.offset = offset + nstep * b * elemsize; + return m; +} + +NCNN_FORCEINLINE VkMat VkMat::batch_range(int b, int batches) +{ + VkMat m; + m.data = data; + m.refcount = 0; + m.elemsize = elemsize; + m.elempack = elempack; + m.allocator = allocator; + m.dims = dims; + m.w = w; + m.h = h; + m.d = d; + m.c = c; + m.cstep = cstep; + m.n = batches; + m.nstep = nstep; + m.offset = offset + nstep * b * elemsize; + return m; +} + +NCNN_FORCEINLINE const VkMat VkMat::batch_range(int b, int batches) const +{ + VkMat m; + m.data = data; + m.refcount = 0; + m.elemsize = elemsize; + m.elempack = elempack; + m.allocator = allocator; + m.dims = dims; + m.w = w; + m.h = h; + m.d = d; + m.c = c; + m.cstep = cstep; + m.n = batches; + m.nstep = nstep; + m.offset = offset + nstep * b * elemsize; + 
return m; } NCNN_FORCEINLINE VkImageMat::VkImageMat() diff --git a/src/net.cpp b/src/net.cpp index 4394132040ef..de91cac424e3 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -629,33 +629,86 @@ int NetPrivate::do_forward_layer(const Layer* layer, std::vector& blob_mats if (ret != 0) return ret; - // forward - if (opt.lightmode && layer->support_inplace) + // batch forward + if (bottom_blob.n > 1) { - Mat& bottom_top_blob = bottom_blob; - int ret = layer->forward_inplace(bottom_top_blob, opt); - if (ret != 0) - return ret; + const int B = bottom_blob.n; + + if (opt.lightmode && layer->support_inplace) + { + for (int b = 0; b < B; b++) + { + Mat batch_view = bottom_blob.batch(b); + int ret = layer->forward_inplace(batch_view, opt); + if (ret != 0) + return ret; + } - // store top blob - blob_mats[top_blob_index] = bottom_top_blob; + // store top blob (whole batch, inplace modified) + blob_mats[top_blob_index] = bottom_blob; + } + else + { + Mat top_batch; + for (int b = 0; b < B; b++) + { + Mat bottom_b = bottom_blob.batch(b); + Mat top_b; + int ret = layer->forward(bottom_b, top_b, opt); + if (ret != 0) + return ret; + + if (b == 0) + { + top_batch.create_like_batch(top_b, B, opt.blob_allocator); + if (top_batch.empty()) + return -100; + } + + size_t batch_data_size = top_b.cstep * top_b.c * top_b.elemsize; + memcpy(top_batch.batch(b).data, top_b.data, batch_data_size); + } + + // store top blob + blob_mats[top_blob_index] = top_batch; + } + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + } } else { - Mat top_blob; - int ret = layer->forward(bottom_blob, top_blob, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + Mat& bottom_top_blob = bottom_blob; + int ret = layer->forward_inplace(bottom_top_blob, opt); + if (ret != 0) + return ret; - // store top blob - blob_mats[top_blob_index] = top_blob; - } + // store top blob + blob_mats[top_blob_index] = 
bottom_top_blob; + } + else + { + Mat top_blob; + int ret = layer->forward(bottom_blob, top_blob, opt); + if (ret != 0) + return ret; - if (opt.lightmode) - { - // delete after taken in light mode - blob_mats[bottom_blob_index].release(); - } + // store top blob + blob_mats[top_blob_index] = top_blob; + } + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + } + } // n == 1 } else { @@ -687,48 +740,135 @@ int NetPrivate::do_forward_layer(const Layer* layer, std::vector& blob_mats return ret; } - // forward - if (opt.lightmode && layer->support_inplace) + // detect batch + int B = 1; + for (size_t i = 0; i < bottom_blobs.size(); i++) { - std::vector& bottom_top_blobs = bottom_blobs; - int ret = layer->forward_inplace(bottom_top_blobs, opt); - if (ret != 0) - return ret; + if (bottom_blobs[i].n > 1) + { + B = bottom_blobs[i].n; + break; + } + } - // store top blobs - for (size_t i = 0; i < layer->tops.size(); i++) + if (B > 1) + { + if (opt.lightmode && layer->support_inplace) { - int top_blob_index = layer->tops[i]; + for (int b = 0; b < B; b++) + { + std::vector batch_views(bottom_blobs.size()); + for (size_t i = 0; i < bottom_blobs.size(); i++) + { + batch_views[i] = bottom_blobs[i].n > 1 ? bottom_blobs[i].batch(b) : bottom_blobs[i]; + } + int ret = layer->forward_inplace(batch_views, opt); + if (ret != 0) + return ret; + } - blob_mats[top_blob_index] = bottom_top_blobs[i]; + // store top blobs (whole batch, inplace modified) + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; + blob_mats[top_blob_index] = bottom_blobs[i]; + } + } + else + { + std::vector top_batches(layer->tops.size()); + for (int b = 0; b < B; b++) + { + std::vector bottom_b(bottom_blobs.size()); + for (size_t i = 0; i < bottom_blobs.size(); i++) + { + bottom_b[i] = bottom_blobs[i].n > 1 ? 
bottom_blobs[i].batch(b) : bottom_blobs[i]; + } + + std::vector top_b(layer->tops.size()); + int ret = layer->forward(bottom_b, top_b, opt); + if (ret != 0) + return ret; + + if (b == 0) + { + for (size_t i = 0; i < top_b.size(); i++) + { + top_batches[i].create_like_batch(top_b[i], B, opt.blob_allocator); + if (top_batches[i].empty()) + return -100; + } + } + + for (size_t i = 0; i < top_b.size(); i++) + { + size_t batch_data_size = top_b[i].cstep * top_b[i].c * top_b[i].elemsize; + memcpy(top_batches[i].batch(b).data, top_b[i].data, batch_data_size); + } + } + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; + blob_mats[top_blob_index] = top_batches[i]; + } + } + + if (opt.lightmode) + { + for (size_t i = 0; i < layer->bottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + blob_mats[bottom_blob_index].release(); + } } } else { - std::vector top_blobs(layer->tops.size()); - int ret = layer->forward(bottom_blobs, top_blobs, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + std::vector& bottom_top_blobs = bottom_blobs; + int ret = layer->forward_inplace(bottom_top_blobs, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; - // store top blobs - for (size_t i = 0; i < layer->tops.size(); i++) + blob_mats[top_blob_index] = bottom_top_blobs[i]; + } + } + else { - int top_blob_index = layer->tops[i]; + std::vector top_blobs(layer->tops.size()); + int ret = layer->forward(bottom_blobs, top_blobs, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; - blob_mats[top_blob_index] = top_blobs[i]; + blob_mats[top_blob_index] = top_blobs[i]; + } } - } - if (opt.lightmode) - { - for (size_t i = 0; i < layer->bottoms.size(); i++) + if (opt.lightmode) { 
- int bottom_blob_index = layer->bottoms[i]; + for (size_t i = 0; i < layer->bottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; - // delete after taken in light mode - blob_mats[bottom_blob_index].release(); + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + } } - } + } // B == 1 } return 0; @@ -764,33 +904,83 @@ int NetPrivate::do_forward_layer(const Layer* layer, std::vector& blob_ma if (ret != 0) return ret; - // forward - if (opt.lightmode && layer->support_inplace) + // batch forward + if (bottom_blob.n > 1) { - VkMat& bottom_top_blob = bottom_blob; - int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); - if (ret != 0) - return ret; + const int B = bottom_blob.n; + + if (opt.lightmode && layer->support_inplace) + { + for (int b = 0; b < B; b++) + { + VkMat batch_view = bottom_blob.batch(b); + int ret = layer->forward_inplace(batch_view, cmd, opt); + if (ret != 0) + return ret; + } - // store top blob - blob_mats_gpu[top_blob_index] = bottom_top_blob; + blob_mats_gpu[top_blob_index] = bottom_blob; + } + else + { + VkMat top_batch; + for (int b = 0; b < B; b++) + { + VkMat bottom_b = bottom_blob.batch(b); + VkMat top_b; + int ret = layer->forward(bottom_b, top_b, cmd, opt); + if (ret != 0) + return ret; + + if (b == 0) + { + top_batch.create_like_batch(top_b, B, opt.blob_vkallocator); + if (top_batch.empty()) + return -100; + } + + VkMat top_batch_slot = top_batch.batch(b); + cmd.record_clone(top_b, top_batch_slot, opt); + } + + blob_mats_gpu[top_blob_index] = top_batch; + } + + if (opt.lightmode) + { + blob_mats_gpu[bottom_blob_index].release(); + } } else { - VkMat top_blob; - int ret = layer->forward(bottom_blob, top_blob, cmd, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + VkMat& bottom_top_blob = bottom_blob; + int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); + if (ret != 0) + return ret; - // store top blob - 
blob_mats_gpu[top_blob_index] = top_blob; - } + // store top blob + blob_mats_gpu[top_blob_index] = bottom_top_blob; + } + else + { + VkMat top_blob; + int ret = layer->forward(bottom_blob, top_blob, cmd, opt); + if (ret != 0) + return ret; - if (opt.lightmode) - { - // delete after taken in light mode - blob_mats_gpu[bottom_blob_index].release(); - } + // store top blob + blob_mats_gpu[top_blob_index] = top_blob; + } + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } + } // n == 1 } else { @@ -822,48 +1012,133 @@ int NetPrivate::do_forward_layer(const Layer* layer, std::vector& blob_ma return ret; } - // forward - if (opt.lightmode && layer->support_inplace) + // detect batch + int B = 1; + for (size_t i = 0; i < bottom_blobs.size(); i++) { - std::vector& bottom_top_blobs = bottom_blobs; - int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); - if (ret != 0) - return ret; + if (bottom_blobs[i].n > 1) + { + B = bottom_blobs[i].n; + break; + } + } - // store top blobs - for (size_t i = 0; i < layer->tops.size(); i++) + if (B > 1) + { + if (opt.lightmode && layer->support_inplace) { - int top_blob_index = layer->tops[i]; + for (int b = 0; b < B; b++) + { + std::vector batch_views(bottom_blobs.size()); + for (size_t i = 0; i < bottom_blobs.size(); i++) + { + batch_views[i] = bottom_blobs[i].n > 1 ? bottom_blobs[i].batch(b) : bottom_blobs[i]; + } + int ret = layer->forward_inplace(batch_views, cmd, opt); + if (ret != 0) + return ret; + } - blob_mats_gpu[top_blob_index] = bottom_top_blobs[i]; + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; + blob_mats_gpu[top_blob_index] = bottom_blobs[i]; + } + } + else + { + std::vector top_batches(layer->tops.size()); + for (int b = 0; b < B; b++) + { + std::vector bottom_b(bottom_blobs.size()); + for (size_t i = 0; i < bottom_blobs.size(); i++) + { + bottom_b[i] = bottom_blobs[i].n > 1 ? 
bottom_blobs[i].batch(b) : bottom_blobs[i]; + } + + std::vector top_b(layer->tops.size()); + int ret = layer->forward(bottom_b, top_b, cmd, opt); + if (ret != 0) + return ret; + + if (b == 0) + { + for (size_t i = 0; i < top_b.size(); i++) + { + top_batches[i].create_like_batch(top_b[i], B, opt.blob_vkallocator); + if (top_batches[i].empty()) + return -100; + } + } + + for (size_t i = 0; i < top_b.size(); i++) + { + VkMat top_batch_slot = top_batches[i].batch(b); + cmd.record_clone(top_b[i], top_batch_slot, opt); + } + } + + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; + blob_mats_gpu[top_blob_index] = top_batches[i]; + } + } + + if (opt.lightmode) + { + for (size_t i = 0; i < layer->bottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + blob_mats_gpu[bottom_blob_index].release(); + } } } else { - std::vector top_blobs(layer->tops.size()); - int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + std::vector& bottom_top_blobs = bottom_blobs; + int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; - // store top blobs - for (size_t i = 0; i < layer->tops.size(); i++) + blob_mats_gpu[top_blob_index] = bottom_top_blobs[i]; + } + } + else { - int top_blob_index = layer->tops[i]; + std::vector top_blobs(layer->tops.size()); + int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i = 0; i < layer->tops.size(); i++) + { + int top_blob_index = layer->tops[i]; - blob_mats_gpu[top_blob_index] = top_blobs[i]; + blob_mats_gpu[top_blob_index] = top_blobs[i]; + } } - } - if (opt.lightmode) - { - for (size_t i = 0; i < layer->bottoms.size(); i++) + if (opt.lightmode) { - int bottom_blob_index = 
layer->bottoms[i]; + for (size_t i = 0; i < layer->bottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; - // delete after taken in light mode - blob_mats_gpu[bottom_blob_index].release(); + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } } - } + } // B == 1 } return 0; @@ -2685,9 +2960,9 @@ int Extractor::extract(int blob_index, Mat& feat, int type) { if (d->opt.use_packing_layout && (type == 0) && feat.elempack != 1) { - Mat bottom_blob_unpacked; - convert_packing(feat, bottom_blob_unpacked, 1, d->opt); - feat = bottom_blob_unpacked; + Mat feat_unpacked; + convert_packing(feat, feat_unpacked, 1, d->opt); + feat = feat_unpacked; if (feat.empty()) return -100; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e72e6d02b86e..761cf9264900 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -62,6 +62,7 @@ ncnn_add_test(c_api) ncnn_add_test(cpu) ncnn_add_test(expression) ncnn_add_test(paramdict) +ncnn_add_test(mat_batch) if(NCNN_VULKAN) ncnn_add_test(command) diff --git a/tests/test_mat_batch.cpp b/tests/test_mat_batch.cpp new file mode 100644 index 000000000000..72feb2ce9ad7 --- /dev/null +++ b/tests/test_mat_batch.cpp @@ -0,0 +1,1256 @@ +// Copyright 2025 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "mat.h" +#include "net.h" + +#if NCNN_VULKAN +#include "gpu.h" +#include "command.h" +#endif + +#include +#include +#include + +static int test_create_batch_basic() +{ + // create a batch of 4 images, 3 channels, 8x6 spatial + ncnn::Mat m; + m.create_batch(8, 6, 3, 4, 4u, 1); + + if (m.dims != 3) + { + fprintf(stderr, "test_create_batch_basic dims expect 3 got %d\n", m.dims); + return -1; + } + if (m.w != 8 || m.h != 6 || m.c != 3) + { + fprintf(stderr, "test_create_batch_basic shape mismatch w=%d h=%d c=%d\n", m.w, m.h, m.c); + return -1; + } + if (m.n != 4) + { + fprintf(stderr, "test_create_batch_basic n expect 4 got %d\n", m.n); + return -1; + } + if (m.data == 0) + { + 
fprintf(stderr, "test_create_batch_basic data is null\n"); + return -1; + } + if (m.refcount == 0 || *m.refcount != 1) + { + fprintf(stderr, "test_create_batch_basic refcount error\n"); + return -1; + } + + return 0; +} + +static int test_nstep_alignment() +{ + // verify nstep * elemsize is 4K aligned + { + ncnn::Mat m; + m.create_batch(8, 6, 3, 4, 4u, 1); + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_nstep_alignment 3D failed: nstep_bytes=%zu\n", nstep_bytes); + return -1; + } + } + + // odd spatial dims + { + ncnn::Mat m; + m.create_batch(7, 5, 13, 2, 4u, 1); + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_nstep_alignment odd failed: nstep_bytes=%zu\n", nstep_bytes); + return -1; + } + } + + // 4D with depth + { + ncnn::Mat m; + m.create_batch(5, 4, 3, 2, 8, 4u, 1, 0); + if (m.dims != 4) + { + fprintf(stderr, "test_nstep_alignment 4D dims expect 4 got %d\n", m.dims); + return -1; + } + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_nstep_alignment 4D failed: nstep_bytes=%zu\n", nstep_bytes); + return -1; + } + } + + // packed elempack=4 + { + ncnn::Mat m; + m.create_batch(8, 6, 1, 12, 4, 16u, 4, 0); + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_nstep_alignment packed failed: nstep_bytes=%zu\n", nstep_bytes); + return -1; + } + } + + return 0; +} + +static int test_batch_subview_zero_copy() +{ + ncnn::Mat m; + m.create_batch(4, 3, 2, 3, 4u, 1); + + // fill each batch with distinct value + for (int b = 0; b < m.n; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b + 1)); + } + + // read back and verify + for (int b = 0; b < m.n; b++) + { + const ncnn::Mat sub = m.batch(b); + + // verify sub-view properties + if (sub.dims != m.dims || sub.w != m.w || sub.h != m.h || sub.c != m.c) + { + fprintf(stderr, "test_batch_subview shape 
mismatch at batch %d\n", b); + return -1; + } + if (sub.cstep != m.cstep) + { + fprintf(stderr, "test_batch_subview cstep mismatch at batch %d\n", b); + return -1; + } + if (sub.n != 1) + { + fprintf(stderr, "test_batch_subview n expect 1 got %d\n", sub.n); + return -1; + } + if (sub.refcount != 0) + { + fprintf(stderr, "test_batch_subview refcount should be NULL (zero-copy)\n"); + return -1; + } + + // verify data pointer is at correct offset + unsigned char* expected_ptr = (unsigned char*)m.data + m.nstep * b * m.elemsize; + if ((unsigned char*)sub.data != expected_ptr) + { + fprintf(stderr, "test_batch_subview data pointer mismatch at batch %d\n", b); + return -1; + } + + // verify values + float expected = (float)(b + 1); + for (int q = 0; q < sub.c; q++) + { + const float* ptr = sub.channel(q); + for (int i = 0; i < sub.w * sub.h; i++) + { + if (ptr[i] != expected) + { + fprintf(stderr, "test_batch_subview value mismatch at batch %d ch %d idx %d: got %f expect %f\n", + b, q, i, ptr[i], expected); + return -1; + } + } + } + } + + return 0; +} + +static int test_batch_range() +{ + ncnn::Mat m; + m.create_batch(4, 3, 2, 4, 4u, 1); + + // fill with batch index + for (int b = 0; b < 4; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b * 10)); + } + + // get range [1, 2) batches + ncnn::Mat range = m.batch_range(1, 2); + if (range.n != 2) + { + fprintf(stderr, "test_batch_range n expect 2 got %d\n", range.n); + return -1; + } + if (range.nstep != m.nstep) + { + fprintf(stderr, "test_batch_range nstep mismatch\n"); + return -1; + } + + // verify range.batch(0) == m.batch(1) + const ncnn::Mat r0 = range.batch(0); + const float* r0_ptr = r0.channel(0); + if (r0_ptr[0] != 10.f) + { + fprintf(stderr, "test_batch_range batch(0) value expect 10 got %f\n", r0_ptr[0]); + return -1; + } + + // verify range.batch(1) == m.batch(2) + const ncnn::Mat r1 = range.batch(1); + const float* r1_ptr = r1.channel(0); + if (r1_ptr[0] != 20.f) + { + fprintf(stderr, 
"test_batch_range batch(1) value expect 20 got %f\n", r1_ptr[0]); + return -1; + } + + return 0; +} + +static int test_batch_data_isolation() +{ + ncnn::Mat m; + m.create_batch(16, 16, 3, 4, 4u, 1); + + // write unique pattern to each batch + for (int b = 0; b < 4; b++) + { + ncnn::Mat sub = m.batch(b); + for (int q = 0; q < sub.c; q++) + { + float* ptr = sub.channel(q); + for (int i = 0; i < sub.w * sub.h; i++) + { + ptr[i] = (float)(b * 1000 + q * 100 + i); + } + } + } + + // verify no cross-contamination + for (int b = 0; b < 4; b++) + { + const ncnn::Mat sub = m.batch(b); + for (int q = 0; q < sub.c; q++) + { + const float* ptr = sub.channel(q); + for (int i = 0; i < sub.w * sub.h; i++) + { + float expected = (float)(b * 1000 + q * 100 + i); + if (ptr[i] != expected) + { + fprintf(stderr, "test_batch_data_isolation mismatch at b=%d q=%d i=%d: got %f expect %f\n", + b, q, i, ptr[i], expected); + return -1; + } + } + } + } + + return 0; +} + +static int test_batch_clone() +{ + ncnn::Mat m; + m.create_batch(8, 6, 3, 4, 4u, 1); + + // fill with data + for (int b = 0; b < 4; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b + 1)); + } + + // clone + ncnn::Mat m2 = m.clone(); + + // verify deep copy + if (m2.data == m.data) + { + fprintf(stderr, "test_batch_clone data should be different (deep copy)\n"); + return -1; + } + if (m2.n != m.n) + { + fprintf(stderr, "test_batch_clone n mismatch\n"); + return -1; + } + if (m2.nstep != m.nstep) + { + fprintf(stderr, "test_batch_clone nstep mismatch\n"); + return -1; + } + if (m2.dims != m.dims || m2.w != m.w || m2.h != m.h || m2.c != m.c) + { + fprintf(stderr, "test_batch_clone shape mismatch\n"); + return -1; + } + + // verify values match + for (int b = 0; b < 4; b++) + { + const ncnn::Mat s2 = m2.batch(b); + float expected = (float)(b + 1); + const float* p2 = s2.channel(0); + if (p2[0] != expected) + { + fprintf(stderr, "test_batch_clone value mismatch at batch %d\n", b); + return -1; + } + } + + // verify 
independence: modify original, clone should not change + m.batch(0).fill(999.f); + const float* p2 = m2.batch(0).channel(0); + if (p2[0] != 1.f) + { + fprintf(stderr, "test_batch_clone not independent after modify\n"); + return -1; + } + + return 0; +} + +static int test_batch_release() +{ + ncnn::Mat m; + m.create_batch(4, 3, 2, 4, 4u, 1); + + m.release(); + + if (m.dims != 0) + { + fprintf(stderr, "test_batch_release dims expect 0 got %d\n", m.dims); + return -1; + } + if (m.n != 1) + { + fprintf(stderr, "test_batch_release n expect 1 got %d\n", m.n); + return -1; + } + if (m.nstep != 0) + { + fprintf(stderr, "test_batch_release nstep expect 0 got %zu\n", m.nstep); + return -1; + } + if (m.data != 0) + { + fprintf(stderr, "test_batch_release data should be null\n"); + return -1; + } + + return 0; +} + +static int test_backward_compatibility() +{ + // regular Mat should have n=1 + ncnn::Mat m1(8, 6, 3); + if (m1.n != 1) + { + fprintf(stderr, "test_backward_compat n expect 1 got %d\n", m1.n); + return -1; + } + + // channel() and row() still work + m1.fill(42.f); + ncnn::Mat ch0 = m1.channel(0); + if (ch0.w != 8 || ch0.h != 6) + { + fprintf(stderr, "test_backward_compat channel shape mismatch\n"); + return -1; + } + const float* row0 = ch0.row(0); + if (row0[0] != 42.f) + { + fprintf(stderr, "test_backward_compat channel value mismatch\n"); + return -1; + } + + // copy ctor preserves n + ncnn::Mat m2 = m1; + if (m2.n != 1) + { + fprintf(stderr, "test_backward_compat copy n mismatch\n"); + return -1; + } + + // empty Mat has n=1 + ncnn::Mat m3; + if (m3.n != 1) + { + fprintf(stderr, "test_backward_compat empty n expect 1 got %d\n", m3.n); + return -1; + } + + return 0; +} + +static int test_create_batch_single() +{ + // create_batch with batch=1 should fall back to regular create + ncnn::Mat m; + m.create_batch(8, 6, 3, 1, 4u, 1); + + if (m.dims != 3) + { + fprintf(stderr, "test_create_batch_single dims expect 3 got %d\n", m.dims); + return -1; + } + if (m.n != 1) + 
{ + fprintf(stderr, "test_create_batch_single n expect 1 got %d\n", m.n); + return -1; + } + if (m.w != 8 || m.h != 6 || m.c != 3) + { + fprintf(stderr, "test_create_batch_single shape mismatch\n"); + return -1; + } + + // should work like normal Mat + m.fill(7.f); + if (((const float*)m.data)[0] != 7.f) + { + fprintf(stderr, "test_create_batch_single fill failed\n"); + return -1; + } + + return 0; +} + +static int test_create_batch_1d() +{ + // create a batch of 4 1D vectors, w=100 + ncnn::Mat m; + m.create_batch(100, 4, 4u, 1); + + if (m.dims != 1) + { + fprintf(stderr, "test_create_batch_1d dims expect 1 got %d\n", m.dims); + return -1; + } + if (m.w != 100 || m.h != 1 || m.d != 1 || m.c != 1) + { + fprintf(stderr, "test_create_batch_1d shape mismatch w=%d h=%d d=%d c=%d\n", m.w, m.h, m.d, m.c); + return -1; + } + if (m.n != 4) + { + fprintf(stderr, "test_create_batch_1d n expect 4 got %d\n", m.n); + return -1; + } + if (m.data == 0) + { + fprintf(stderr, "test_create_batch_1d data is null\n"); + return -1; + } + + // verify nstep alignment + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_create_batch_1d nstep_bytes=%zu not 4K aligned\n", nstep_bytes); + return -1; + } + + // fill and verify subview zero-copy + for (int b = 0; b < m.n; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b + 10)); + } + for (int b = 0; b < m.n; b++) + { + const ncnn::Mat sub = m.batch(b); + if (sub.dims != 1 || sub.w != 100 || sub.n != 1) + { + fprintf(stderr, "test_create_batch_1d subview shape mismatch at batch %d\n", b); + return -1; + } + if (sub.refcount != 0) + { + fprintf(stderr, "test_create_batch_1d subview should be zero-copy\n"); + return -1; + } + float expected = (float)(b + 10); + const float* ptr = (const float*)sub.data; + if (ptr[0] != expected || ptr[99] != expected) + { + fprintf(stderr, "test_create_batch_1d value mismatch at batch %d\n", b); + return -1; + } + } + + return 0; +} + +static int 
test_create_batch_2d() +{ + // create a batch of 3 2D matrices, 10x20 + ncnn::Mat m; + m.create_batch(10, 20, 3, 4u, 1); + + if (m.dims != 2) + { + fprintf(stderr, "test_create_batch_2d dims expect 2 got %d\n", m.dims); + return -1; + } + if (m.w != 10 || m.h != 20 || m.d != 1 || m.c != 1) + { + fprintf(stderr, "test_create_batch_2d shape mismatch w=%d h=%d d=%d c=%d\n", m.w, m.h, m.d, m.c); + return -1; + } + if (m.n != 3) + { + fprintf(stderr, "test_create_batch_2d n expect 3 got %d\n", m.n); + return -1; + } + + // verify nstep alignment + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + fprintf(stderr, "test_create_batch_2d nstep_bytes=%zu not 4K aligned\n", nstep_bytes); + return -1; + } + + // fill and verify subview zero-copy + for (int b = 0; b < m.n; b++) + { + ncnn::Mat sub = m.batch(b); + sub.fill((float)(b + 100)); + } + for (int b = 0; b < m.n; b++) + { + const ncnn::Mat sub = m.batch(b); + if (sub.dims != 2 || sub.w != 10 || sub.h != 20 || sub.n != 1) + { + fprintf(stderr, "test_create_batch_2d subview shape mismatch at batch %d\n", b); + return -1; + } + if (sub.refcount != 0) + { + fprintf(stderr, "test_create_batch_2d subview should be zero-copy\n"); + return -1; + } + float expected = (float)(b + 100); + const float* ptr = (const float*)sub.data; + if (ptr[0] != expected || ptr[10 * 20 - 1] != expected) + { + fprintf(stderr, "test_create_batch_2d value mismatch at batch %d\n", b); + return -1; + } + } + + return 0; +} + +static int test_batch_forward_relu() +{ + // Build a minimal Input -> ReLU network + // ReLU with slope=0.1 (leaky relu) + const char param_str[] = "7767517\n" + "2 2\n" + "Input input 0 1 data\n" + "ReLU relu 1 1 data output 0=1.000000e-01\n"; + + ncnn::Net net; + net.load_param_mem(param_str); + + const int B = 4; + const int C = 3; + const int H = 3; + const int W = 4; + + ncnn::Mat input_batch; + input_batch.create_batch(W, H, C, B, 4u, 1); + if (input_batch.empty()) + { + fprintf(stderr, 
"test_batch_forward_relu create_batch failed\n"); + return -1; + } + + // fill: batch b gets value (b - 1.5), some negative, some positive + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = input_batch.batch(b); + sub.fill((float)(b - 1.5f)); + } + + ncnn::Extractor ex = net.create_extractor(); + ex.input("data", input_batch); + + ncnn::Mat output_batch; + int ret = ex.extract("output", output_batch); + if (ret != 0) + { + fprintf(stderr, "test_batch_forward_relu extract failed ret=%d\n", ret); + return -1; + } + + if (output_batch.n != B) + { + fprintf(stderr, "test_batch_forward_relu output n expect %d got %d\n", B, output_batch.n); + return -1; + } + if (output_batch.w != W || output_batch.h != H || output_batch.c != C) + { + fprintf(stderr, "test_batch_forward_relu output shape mismatch\n"); + return -1; + } + + // verify leaky relu: max(x, 0.1*x) + for (int b = 0; b < B; b++) + { + const ncnn::Mat out_sub = output_batch.batch(b); + float input_val = (float)(b - 1.5f); + float expected = input_val > 0 ? 
input_val : input_val * 0.1f; + + for (int q = 0; q < C; q++) + { + const float* ptr = out_sub.channel(q); + for (int i = 0; i < W * H; i++) + { + if (fabsf(ptr[i] - expected) > 1e-5f) + { + fprintf(stderr, "test_batch_forward_relu value mismatch at b=%d q=%d i=%d: got %f expect %f\n", + b, q, i, ptr[i], expected); + return -1; + } + } + } + } + + return 0; +} + +static int test_batch_forward_pooling() +{ + // Input -> Pooling(max, 2x2, stride=2) + const char param_str[] = "7767517\n" + "2 2\n" + "Input input 0 1 data\n" + "Pooling pooling 1 1 data output 0=0 1=2 2=2\n"; + + ncnn::Net net; + net.load_param_mem(param_str); + + const int B = 2; + const int C = 2; + const int H = 4; + const int W = 4; + + ncnn::Mat input_batch; + input_batch.create_batch(W, H, C, B, 4u, 1); + + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = input_batch.batch(b); + for (int q = 0; q < C; q++) + { + float* ptr = sub.channel(q); + for (int i = 0; i < W * H; i++) + { + ptr[i] = (float)(b * 100 + q * 10 + i); + } + } + } + + ncnn::Extractor ex = net.create_extractor(); + ex.input("data", input_batch); + + ncnn::Mat output_batch; + int ret = ex.extract("output", output_batch); + if (ret != 0) + { + fprintf(stderr, "test_batch_forward_pooling extract failed ret=%d\n", ret); + return -1; + } + + if (output_batch.n != B) + { + fprintf(stderr, "test_batch_forward_pooling output n expect %d got %d\n", B, output_batch.n); + return -1; + } + if (output_batch.w != 2 || output_batch.h != 2 || output_batch.c != C) + { + fprintf(stderr, "test_batch_forward_pooling output shape expect 2x2x%d got %dx%dx%d\n", + C, output_batch.w, output_batch.h, output_batch.c); + return -1; + } + + // verify max pooling for batch 0, channel 0 + // input 4x4: [ 0 1 2 3 / 4 5 6 7 / 8 9 10 11 / 12 13 14 15 ] + // max pool 2x2 stride 2 -> [ 5 7 / 13 15 ] + { + const ncnn::Mat out0 = output_batch.batch(0); + const float* ptr = out0.channel(0); + float expected[4] = {5.f, 7.f, 13.f, 15.f}; + for (int i = 0; i < 4; i++) + 
{ + if (fabsf(ptr[i] - expected[i]) > 1e-5f) + { + fprintf(stderr, "test_batch_forward_pooling b0 mismatch at i=%d: got %f expect %f\n", + i, ptr[i], expected[i]); + return -1; + } + } + } + + // verify batch 1, channel 0: input 100+i -> max pool -> [105, 107, 113, 115] + { + const ncnn::Mat out1 = output_batch.batch(1); + const float* ptr = out1.channel(0); + float expected[4] = {105.f, 107.f, 113.f, 115.f}; + for (int i = 0; i < 4; i++) + { + if (fabsf(ptr[i] - expected[i]) > 1e-5f) + { + fprintf(stderr, "test_batch_forward_pooling b1 mismatch at i=%d: got %f expect %f\n", + i, ptr[i], expected[i]); + return -1; + } + } + } + + return 0; +} + +#if NCNN_VULKAN +static int test_vkmat_create_batch_basic() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(8, 6, 3, 4, 4u, 1, blob_allocator); + + if (m.dims != 3) + { + fprintf(stderr, "test_vkmat_create_batch_basic dims expect 3 got %d\n", m.dims); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (m.w != 8 || m.h != 6 || m.c != 3) + { + fprintf(stderr, "test_vkmat_create_batch_basic shape mismatch\n"); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (m.n != 4) + { + fprintf(stderr, "test_vkmat_create_batch_basic n expect 4 got %d\n", m.n); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (m.data == 0) + { + fprintf(stderr, "test_vkmat_create_batch_basic data is null\n"); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_nstep_alignment() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(7, 5, 13, 4, 4u, 1, blob_allocator); + + size_t nstep_bytes = m.nstep * m.elemsize; + if (nstep_bytes % 4096 != 0) + { + 
fprintf(stderr, "test_vkmat_nstep_alignment failed: nstep_bytes=%zu\n", nstep_bytes); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_subview() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(4, 3, 2, 3, 4u, 1, blob_allocator); + + for (int b = 0; b < m.n; b++) + { + const ncnn::VkMat sub = m.batch(b); + + // verify sub-view properties + if (sub.dims != m.dims || sub.w != m.w || sub.h != m.h || sub.c != m.c) + { + fprintf(stderr, "test_vkmat_batch_subview shape mismatch at batch %d\n", b); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (sub.cstep != m.cstep) + { + fprintf(stderr, "test_vkmat_batch_subview cstep mismatch at batch %d\n", b); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (sub.n != 1) + { + fprintf(stderr, "test_vkmat_batch_subview n expect 1 got %d\n", sub.n); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (sub.refcount != 0) + { + fprintf(stderr, "test_vkmat_batch_subview refcount should be NULL (zero-copy)\n"); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify buffer_offset is correct + size_t expected_offset = m.buffer_offset() + m.nstep * b * m.elemsize; + if (sub.buffer_offset() != expected_offset) + { + fprintf(stderr, "test_vkmat_batch_subview buffer_offset mismatch at batch %d: got %zu expect %zu\n", + b, sub.buffer_offset(), expected_offset); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify same underlying VkBuffer + if (sub.buffer() != m.buffer()) + { + fprintf(stderr, "test_vkmat_batch_subview buffer handle mismatch at batch %d\n", b); + m.release(); + 
vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + } + + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_range() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(4, 3, 2, 4, 4u, 1, blob_allocator); + + ncnn::VkMat range = m.batch_range(1, 2); + if (range.n != 2) + { + fprintf(stderr, "test_vkmat_batch_range n expect 2 got %d\n", range.n); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (range.nstep != m.nstep) + { + fprintf(stderr, "test_vkmat_batch_range nstep mismatch\n"); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify range.batch(0) buffer_offset == m.batch(1) buffer_offset + if (range.batch(0).buffer_offset() != m.batch(1).buffer_offset()) + { + fprintf(stderr, "test_vkmat_batch_range offset mismatch at range batch 0\n"); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify range.batch(1) buffer_offset == m.batch(2) buffer_offset + if (range.batch(1).buffer_offset() != m.batch(2).buffer_offset()) + { + fprintf(stderr, "test_vkmat_batch_range offset mismatch at range batch 1\n"); + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + m.release(); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_release() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + + ncnn::VkMat m; + m.create_batch(4, 3, 2, 4, 4u, 1, blob_allocator); + m.release(); + + if (m.dims != 0) + { + fprintf(stderr, "test_vkmat_batch_release dims expect 0 got %d\n", m.dims); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + if (m.n != 1) + { + fprintf(stderr, "test_vkmat_batch_release n expect 1 got %d\n", m.n); + 
vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_upload_download() +{ + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_allocator = vkdev->acquire_staging_allocator(); + + const int B = 3; + const int W = 4; + const int H = 3; + const int C = 2; + + // create and fill cpu batch + ncnn::Mat cpu_batch; + cpu_batch.create_batch(W, H, C, B, 4u, 1); + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = cpu_batch.batch(b); + for (int q = 0; q < C; q++) + { + float* ptr = sub.channel(q); + for (int i = 0; i < W * H; i++) + { + ptr[i] = (float)(b * 100 + q * 10 + i); + } + } + } + + // upload each batch, assemble on gpu, download back + ncnn::VkCompute cmd(vkdev); + + ncnn::Option opt; + opt.blob_vkallocator = blob_allocator; + opt.workspace_vkallocator = blob_allocator; + opt.staging_vkallocator = staging_allocator; + opt.use_vulkan_compute = true; + + ncnn::VkMat gpu_batch; + for (int b = 0; b < B; b++) + { + ncnn::Mat cpu_b = cpu_batch.batch(b); + ncnn::VkMat gpu_b; + cmd.record_upload(cpu_b, gpu_b, opt); + + if (b == 0) + { + gpu_batch.create_like_batch(gpu_b, B, blob_allocator); + } + + ncnn::VkMat gpu_batch_slot = gpu_batch.batch(b); + cmd.record_clone(gpu_b, gpu_batch_slot, opt); + } + + // download each batch back + std::vector<ncnn::Mat> cpu_results(B); + for (int b = 0; b < B; b++) + { + ncnn::VkMat gpu_b = gpu_batch.batch(b); + cmd.record_download(gpu_b, cpu_results[b], opt); + } + + int ret = cmd.submit_and_wait(); + if (ret != 0) + { + fprintf(stderr, "test_vkmat_batch_upload_download submit failed ret=%d\n", ret); + vkdev->reclaim_staging_allocator(staging_allocator); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + // verify downloaded data matches original + for (int b = 0; b < B; b++) + { + const ncnn::Mat& result = cpu_results[b]; + if 
(result.w != W || result.h != H || result.c != C) + { + fprintf(stderr, "test_vkmat_batch_upload_download shape mismatch at batch %d\n", b); + vkdev->reclaim_staging_allocator(staging_allocator); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + + const ncnn::Mat orig = cpu_batch.batch(b); + for (int q = 0; q < C; q++) + { + const float* orig_ptr = orig.channel(q); + const float* result_ptr = result.channel(q); + for (int i = 0; i < W * H; i++) + { + if (fabsf(orig_ptr[i] - result_ptr[i]) > 1e-5f) + { + fprintf(stderr, "test_vkmat_batch_upload_download value mismatch at b=%d q=%d i=%d: got %f expect %f\n", + b, q, i, result_ptr[i], orig_ptr[i]); + vkdev->reclaim_staging_allocator(staging_allocator); + vkdev->reclaim_blob_allocator(blob_allocator); + return -1; + } + } + } + } + + vkdev->reclaim_staging_allocator(staging_allocator); + vkdev->reclaim_blob_allocator(blob_allocator); + return 0; +} + +static int test_vkmat_batch_forward_relu() +{ + const char param_str[] = "7767517\n" + "2 2\n" + "Input input 0 1 data\n" + "ReLU relu 1 1 data output 0=1.000000e-01\n"; + + ncnn::Net net; + ncnn::Option opt; + opt.use_vulkan_compute = true; + net.opt = opt; + net.load_param_mem(param_str); + net.load_model((const unsigned char*)""); + + const int B = 4; + const int C = 3; + const int H = 3; + const int W = 4; + + ncnn::Mat input_batch; + input_batch.create_batch(W, H, C, B, 4u, 1); + if (input_batch.empty()) + { + fprintf(stderr, "test_vkmat_batch_forward_relu create_batch failed\n"); + return -1; + } + + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = input_batch.batch(b); + sub.fill((float)(b - 1.5f)); + } + + ncnn::Extractor ex = net.create_extractor(); + ex.input("data", input_batch); + + ncnn::Mat output_batch; + int ret = ex.extract("output", output_batch); + if (ret != 0) + { + fprintf(stderr, "test_vkmat_batch_forward_relu extract failed ret=%d\n", ret); + return -1; + } + + if (output_batch.n != B) + { + fprintf(stderr, 
"test_vkmat_batch_forward_relu output n expect %d got %d\n", B, output_batch.n); + return -1; + } + if (output_batch.w != W || output_batch.h != H || output_batch.c != C) + { + fprintf(stderr, "test_vkmat_batch_forward_relu output shape mismatch\n"); + return -1; + } + + for (int b = 0; b < B; b++) + { + const ncnn::Mat out_sub = output_batch.batch(b); + float input_val = (float)(b - 1.5f); + float expected = input_val > 0 ? input_val : input_val * 0.1f; + + for (int q = 0; q < C; q++) + { + const float* ptr = out_sub.channel(q); + for (int i = 0; i < W * H; i++) + { + if (fabsf(ptr[i] - expected) > 1e-4f) + { + fprintf(stderr, "test_vkmat_batch_forward_relu value mismatch at b=%d q=%d i=%d: got %f expect %f\n", + b, q, i, ptr[i], expected); + return -1; + } + } + } + } + + return 0; +} + +static int test_vkmat_batch_forward_pooling() +{ + const char param_str[] = "7767517\n" + "2 2\n" + "Input input 0 1 data\n" + "Pooling pooling 1 1 data output 0=0 1=2 2=2\n"; + + ncnn::Net net; + ncnn::Option opt; + opt.use_vulkan_compute = true; + net.opt = opt; + net.load_param_mem(param_str); + net.load_model((const unsigned char*)""); + + const int B = 2; + const int C = 2; + const int H = 4; + const int W = 4; + + ncnn::Mat input_batch; + input_batch.create_batch(W, H, C, B, 4u, 1); + + for (int b = 0; b < B; b++) + { + ncnn::Mat sub = input_batch.batch(b); + for (int q = 0; q < C; q++) + { + float* ptr = sub.channel(q); + for (int i = 0; i < W * H; i++) + { + ptr[i] = (float)(b * 100 + q * 10 + i); + } + } + } + + ncnn::Extractor ex = net.create_extractor(); + ex.input("data", input_batch); + + ncnn::Mat output_batch; + int ret = ex.extract("output", output_batch); + if (ret != 0) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling extract failed ret=%d\n", ret); + return -1; + } + + if (output_batch.n != B) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling output n expect %d got %d\n", B, output_batch.n); + return -1; + } + if (output_batch.w != 2 || 
output_batch.h != 2 || output_batch.c != C) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling output shape expect 2x2x%d got %dx%dx%d\n", + C, output_batch.w, output_batch.h, output_batch.c); + return -1; + } + + // verify max pooling for batch 0, channel 0 + // input 4x4: [ 0 1 2 3 / 4 5 6 7 / 8 9 10 11 / 12 13 14 15 ] + // max pool 2x2 stride 2 -> [ 5 7 / 13 15 ] + { + const ncnn::Mat out0 = output_batch.batch(0); + const float* ptr = out0.channel(0); + float expected[4] = {5.f, 7.f, 13.f, 15.f}; + for (int i = 0; i < 4; i++) + { + if (fabsf(ptr[i] - expected[i]) > 1e-4f) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling b0 mismatch at i=%d: got %f expect %f\n", + i, ptr[i], expected[i]); + return -1; + } + } + } + + // verify batch 1, channel 0: input 100+i -> max pool -> [105, 107, 113, 115] + { + const ncnn::Mat out1 = output_batch.batch(1); + const float* ptr = out1.channel(0); + float expected[4] = {105.f, 107.f, 113.f, 115.f}; + for (int i = 0; i < 4; i++) + { + if (fabsf(ptr[i] - expected[i]) > 1e-4f) + { + fprintf(stderr, "test_vkmat_batch_forward_pooling b1 mismatch at i=%d: got %f expect %f\n", + i, ptr[i], expected[i]); + return -1; + } + } + } + + return 0; +} +#endif // NCNN_VULKAN + +int main() +{ + int ret = 0; + + ret |= test_create_batch_basic(); + ret |= test_nstep_alignment(); + ret |= test_batch_subview_zero_copy(); + ret |= test_batch_range(); + ret |= test_batch_data_isolation(); + ret |= test_batch_clone(); + ret |= test_batch_release(); + ret |= test_backward_compatibility(); + ret |= test_create_batch_single(); + ret |= test_create_batch_1d(); + ret |= test_create_batch_2d(); + ret |= test_batch_forward_relu(); + ret |= test_batch_forward_pooling(); + +#if NCNN_VULKAN + ncnn::create_gpu_instance(); + if (ncnn::get_gpu_count() > 0) + { + ret |= test_vkmat_create_batch_basic(); + ret |= test_vkmat_nstep_alignment(); + ret |= test_vkmat_batch_subview(); + ret |= test_vkmat_batch_range(); + ret |= test_vkmat_batch_release(); + ret 
|= test_vkmat_batch_upload_download(); + ret |= test_vkmat_batch_forward_relu(); + ret |= test_vkmat_batch_forward_pooling(); + } + else + { + fprintf(stderr, "no vulkan device, skip vkmat batch tests\n"); + } + ncnn::destroy_gpu_instance(); +#endif // NCNN_VULKAN + + if (ret == 0) + fprintf(stderr, "test_mat_batch passed\n"); + + return ret; +} diff --git a/tests/test_squeezenet.cpp b/tests/test_squeezenet.cpp index 887bfac6d20a..8877d7f8a5a1 100644 --- a/tests/test_squeezenet.cpp +++ b/tests/test_squeezenet.cpp @@ -404,6 +404,95 @@ static int test_squeezenet_overwrite_softmax(const ncnn::Option& opt, int load_m return check_top2(cls_scores, epsilon); } +static int test_squeezenet_batch(const ncnn::Option& opt, float epsilon = 0.001) +{ + ncnn::Net squeezenet; + + squeezenet.opt = opt; + + squeezenet.load_param(MODEL_DIR "/squeezenet_v1.1.param"); + squeezenet.load_model(MODEL_DIR "/squeezenet_v1.1.bin"); + + ncnn::Mat in = generate_ncnn_logo(ncnn::Mat::PIXEL_BGR, 227, 227); + + const float mean_vals[3] = {104.f, 117.f, 123.f}; + in.substract_mean_normalize(mean_vals, 0); + + // single inference for reference + ncnn::Mat ref_out; + { + ncnn::Extractor ex = squeezenet.create_extractor(); + ex.input("data", in); + ex.extract("prob", ref_out); + } + + if (ref_out.empty() || ref_out.w != 1000) + { + fprintf(stderr, "test_squeezenet_batch reference output invalid w=%d\n", ref_out.w); + return -1; + } + + // create batch input (3 copies of the same image) + const int B = 3; + ncnn::Mat in_batch; + in_batch.create_batch(in.w, in.h, in.c, B, in.elemsize, in.elempack); + if (in_batch.empty()) + { + fprintf(stderr, "test_squeezenet_batch create_batch failed\n"); + return -1; + } + + size_t single_size = in.cstep * in.c * in.elemsize; + for (int b = 0; b < B; b++) + { + memcpy(in_batch.batch(b).data, in.data, single_size); + } + + // batch inference + ncnn::Mat out_batch; + { + ncnn::Extractor ex = squeezenet.create_extractor(); + ex.input("data", in_batch); + int ret = 
ex.extract("prob", out_batch); + if (ret != 0) + { + fprintf(stderr, "test_squeezenet_batch extract failed ret=%d\n", ret); + return -1; + } + } + + if (out_batch.n != B) + { + fprintf(stderr, "test_squeezenet_batch output n expect %d got %d\n", B, out_batch.n); + return -1; + } + if (out_batch.dims != 1 || out_batch.w != 1000) + { + fprintf(stderr, "test_squeezenet_batch output shape mismatch dims=%d w=%d\n", out_batch.dims, out_batch.w); + return -1; + } + + // compare each batch output against reference + for (int b = 0; b < B; b++) + { + const ncnn::Mat out_b = out_batch.batch(b); + const float* ref_ptr = (const float*)ref_out.data; + const float* out_ptr = (const float*)out_b.data; + + for (int j = 0; j < 1000; j++) + { + if (!NearlyEqual(out_ptr[j], ref_ptr[j], epsilon)) + { + fprintf(stderr, "test_squeezenet_batch mismatch at batch %d index %d: got %f expect %f\n", + b, j, out_ptr[j], ref_ptr[j]); + return -1; + } + } + } + + return 0; +} + int main() { SRAND(7767517); @@ -508,5 +597,43 @@ int main() #endif // NCNN_VULKAN } + // batch inference tests + for (int i = 0; i < 4; i++) + { + const ncnn::Option& opt = opts[i]; + + float epsilon; + if (opt.use_bf16_storage || opt.use_fp16_packed || opt.use_fp16_storage) + { + epsilon = 0.1; + } + else + { + epsilon = 0.01; + } + + int ret; + + ncnn::Option opt_cpu = opt; + opt_cpu.use_vulkan_compute = false; + ret = test_squeezenet_batch(opt_cpu, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_squeezenet_batch cpu failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_bf16_storage=%d\n", opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_bf16_storage); + return ret; + } + +#if NCNN_VULKAN + ncnn::Option opt_gpu = opt; + opt_gpu.use_vulkan_compute = true; + ret = test_squeezenet_batch(opt_gpu, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_squeezenet_batch gpu failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_bf16_storage=%d\n", 
opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_bf16_storage); + return ret; + } +#endif // NCNN_VULKAN + } + return 0; }