diff --git a/crates/bevy_pbr/src/render/allocate_uniforms.wgsl b/crates/bevy_pbr/src/render/allocate_uniforms.wgsl new file mode 100644 index 0000000000000..775926f070a46 --- /dev/null +++ b/crates/bevy_pbr/src/render/allocate_uniforms.wgsl @@ -0,0 +1,223 @@ +// A compute shader that allocates `MeshUniform`s. +// +// This shader runs before mesh preprocessing in order to determine the +// positions of `MeshUniform`s. Unlike `MeshInputUniform`s, which are scattered +// throughout the buffer, `MeshUniform`s are indexed by instance ID, and so we +// must place instances of the same mesh together in the buffer. One dispatch +// call corresponds to one batch set (i.e. one multidraw operation), and one +// thread corresponds to one bin (a.k.a. draw, a.k.a. batch). +// +// Essentially, the goal of this shader is to perform a prefix sum, using the +// "scan-then-fan" approach. It has three phases: +// +// 1. *Local scan*: Perform a [Hillis-Steele scan] on each chunk of draws, where +// the size of each chunk (i.e. the number of draws) is equal to the workgroup +// size (256). Write the total size for this chunk to the fan buffer. +// +// 2. *Global scan*: Do a Hillis-Steele scan on the fan buffer. Now we know the +// running total for each chunk. +// +// 3. *Fan*: Copy the running total for each chunk to every element of that +// chunk. +// +// Note that, for batch sets (i.e. multidraw indirect calls) that have fewer +// than 256 batches in them, we only need step (1). This is the common case. +// +// [Hillis-Steele scan]: https://en.wikipedia.org/wiki/Prefix_sum#Algorithm_1:_Shorter_span,_more_parallel + +#import bevy_pbr::mesh_preprocess_types::{BinMetadata, IndirectParametersMetadata} + +// Information needed to allocate `MeshUniform`s. +struct UniformAllocationMetadata { + // The index of this batch set in the `IndirectBatchSet` array. + // + // We write this into the `indirect_parameters_metadata`. + batch_set_index: u32, + + // The number of bins (a.k.a. 
draws, a.k.a. batches) in this batch set. + bin_count: u32, + + // The index of the first set of indirect parameters for this batch set. + // + // This is also the index of the first `IndirectParametersMetadata`, as + // that's a parallel array with the indirect parameters. + first_indirect_parameters_index: u32, + + // The index of the first `MeshUniform` slot for this batch set. + first_output_mesh_uniform_index: u32, + + // Padding. + pad: array, 15u>, +}; + +// The number of threads in a workgroup. +const WORKGROUP_SIZE: u32 = 256u; + +// Information needed to allocate `MeshUniform`s. +@group(0) @binding(0) var allocate_uniforms_metadata: UniformAllocationMetadata; + +// Information for each bin, including the indirect parameters offset and the +// instance count. +@group(0) @binding(1) var bin_metadata: array; + +// The array of indirect parameters metadata that we fill out, one for each +// batch. +@group(0) @binding(2) var indirect_parameters_metadata: + array; + +// A temporary buffer that stores the mesh uniform index of the last instance +// plus one for each workgroup (i.e. for each 256-bin chunk). +// +// This is accumulated in the second stage and written out in the third. +@group(0) @binding(3) var fan_buffer: array; + +// Scratch memory that stores the prefix sum for every element in our chunk. +var output_offsets: array; + +// The first step of the prefix sum. This computes the prefix sum for each +// 256-element chunk. +// +// Note that this will be the *only* step in the operation if the total number +// of bins in this batch set is 256 or fewer. Thus we must fill in the indirect +// parameters metadata for each batch here, as we can't guarantee that the +// following two steps will be run at all. 
+@compute @workgroup_size(256, 1, 1) +fn allocate_local_scan( + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) group_id: vec3, + @builtin(global_invocation_id) global_id: vec3 +) { + let bin_count = allocate_uniforms_metadata.bin_count; + + let block_start = group_id.x * WORKGROUP_SIZE; + let block_end = min(block_start + WORKGROUP_SIZE, bin_count); + + // If this is the first workgroup, take the first output index from the + // metadata into account. But if this is the second chunk or beyond, don't + // do that, as the second and third phases will add it in and we don't want + // to double-count it. + if (group_id.x == 0u) { + output_offsets[local_id.x] = allocate_uniforms_metadata.first_output_mesh_uniform_index; + } else { + output_offsets[local_id.x] = 0u; + } + workgroupBarrier(); + + // We're doing an inclusive sum, so put the instance count in the *next* bin. + if (global_id.x < block_end && local_id.x < WORKGROUP_SIZE - 1u) { + output_offsets[local_id.x + 1] = bin_metadata[global_id.x].instance_count; + } + workgroupBarrier(); + + // Prefix sum within our workgroup. + hillis_steele_scan(local_id.x); + + // Now write the indirect parameters metadata for this batch. We fill in the + // `base_output_index` with the value of the prefix sum (which might be + // incomplete if this isn't the first chunk). We also populate a few + // bookkeeping fields for later rendering passes to use. + if (global_id.x < block_end) { + let indirect_parameters_offset = + allocate_uniforms_metadata.first_indirect_parameters_index + + bin_metadata[global_id.x].indirect_parameters_offset; + indirect_parameters_metadata[indirect_parameters_offset].base_output_index = + output_offsets[local_id.x]; + indirect_parameters_metadata[indirect_parameters_offset].batch_set_index = + allocate_uniforms_metadata.batch_set_index; + // These parameters get filled in later. Initialize them to zero for now. 
+ // This is required in the case of the early/late instance counts + // because the mesh preprocessing shader will atomically increment them. + indirect_parameters_metadata[indirect_parameters_offset].mesh_index = 0u; + indirect_parameters_metadata[indirect_parameters_offset].early_instance_count = 0u; + indirect_parameters_metadata[indirect_parameters_offset].late_instance_count = 0u; + } + + // If this is the last element in the workgroup, put the total number of + // instances (plus the first output mesh uniform index if we're the first + // workgroup) in the fan buffer in preparation for the next phase. + if (local_id.x == WORKGROUP_SIZE - 1u) { + fan_buffer[group_id.x] = output_offsets[WORKGROUP_SIZE - 1u] + + bin_metadata[global_id.x].instance_count; + } +} + +// The second step of the prefix sum. +// +// This step takes the intermediate fan values computed in the previous step +// (i.e. the sum going out of each chunk) and performs one or more Hillis-Steele +// scans in order to compute the fan value going into each chunk. +// +// This step is omitted if there are 256 or fewer total draws. +@compute @workgroup_size(256, 1, 1) +fn allocate_global_scan(@builtin(local_invocation_id) local_id: vec3) { + var sum = 0u; + let chunk_count = div_ceil(allocate_uniforms_metadata.bin_count, WORKGROUP_SIZE); + + // Do a sequential loop over each block of 256 chunks. Because each + // iteration of this loop covers 64K meshes, the fact that it's sequential + // isn't going to be a problem in practice. + for (var block_start = 0u; block_start < chunk_count; block_start += WORKGROUP_SIZE) { + // Set up the Hillis-Steele scan. + let block_end = min(block_start + WORKGROUP_SIZE, chunk_count); + let global_id = block_start + local_id.x; + if (global_id < block_end) { + output_offsets[local_id.x] = sum + fan_buffer[global_id]; + } + workgroupBarrier(); + + // Perform the scan. + hillis_steele_scan(local_id.x); + + // Write the value back. 
+ if (global_id < block_end) { + fan_buffer[global_id] = output_offsets[local_id.x]; + } + + // Save the sum coming out of this block for the next one. + sum = output_offsets[WORKGROUP_SIZE - 1u]; + } +} + +// The third step of the prefix sum. +// +// We take the summed fan value computed in the previous step and add it in to +// each value of each chunk beyond the first. We dispatch one fewer workgroup +// here than in step (1), because there's nothing to do for the first chunk. +// +// This step is omitted if there are 256 or fewer total draws. +@compute @workgroup_size(256, 1, 1) +fn allocate_fan( + @builtin(workgroup_id) group_id: vec3, + @builtin(global_invocation_id) global_id: vec3 +) { + let id = global_id.x + WORKGROUP_SIZE; + let bin_count = allocate_uniforms_metadata.bin_count; + if (id >= bin_count) { + return; + } + + let fan_value = fan_buffer[group_id.x]; + let indirect_parameters_offset = + allocate_uniforms_metadata.first_indirect_parameters_index + + bin_metadata[id].indirect_parameters_offset; + indirect_parameters_metadata[indirect_parameters_offset].base_output_index += fan_value; +} + +// Calculates a running exclusive sum. +// https://en.wikipedia.org/wiki/Prefix_sum#Algorithm_1:_Shorter_span,_more_parallel +fn hillis_steele_scan(local_id: u32) { + for (var offset = 1u; offset < WORKGROUP_SIZE; offset *= 2u) { + var term = 0u; + if (local_id >= offset) { + term = output_offsets[local_id - offset]; + } + workgroupBarrier(); + output_offsets[local_id] += term; + workgroupBarrier(); + } +} + +// Divides unsigned integer a by b, rounding up. 
+fn div_ceil(a: u32, b: u32) -> u32 { + return (a + b - 1u) / b; +} diff --git a/crates/bevy_pbr/src/render/build_indirect_params.wgsl b/crates/bevy_pbr/src/render/build_indirect_params.wgsl index 5ca6d4c0ccfff..1f0050a464e50 100644 --- a/crates/bevy_pbr/src/render/build_indirect_params.wgsl +++ b/crates/bevy_pbr/src/render/build_indirect_params.wgsl @@ -12,8 +12,7 @@ IndirectBatchSet, IndirectParametersIndexed, IndirectParametersNonIndexed, - IndirectParametersCpuMetadata, - IndirectParametersGpuMetadata, + IndirectParametersMetadata, MeshInput } @@ -23,11 +22,8 @@ // Data that we use to generate the indirect parameters. // // The `mesh_preprocess.wgsl` shader emits these. -@group(0) @binding(1) var indirect_parameters_cpu_metadata: - array; - -@group(0) @binding(2) var indirect_parameters_gpu_metadata: - array; +@group(0) @binding(1) var indirect_parameters_metadata: + array; // Information about each batch set. // @@ -56,21 +52,21 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // Figure out our instance index (i.e. batch index). If this thread doesn't // correspond to any index, bail. let instance_index = global_invocation_id.x; - if (instance_index >= arrayLength(&indirect_parameters_cpu_metadata)) { + if (instance_index >= arrayLength(&indirect_parameters_metadata)) { return; } // Unpack the metadata for this batch. 
- let base_output_index = indirect_parameters_cpu_metadata[instance_index].base_output_index; - let batch_set_index = indirect_parameters_cpu_metadata[instance_index].batch_set_index; - let mesh_index = indirect_parameters_gpu_metadata[instance_index].mesh_index; + let base_output_index = indirect_parameters_metadata[instance_index].base_output_index; + let batch_set_index = indirect_parameters_metadata[instance_index].batch_set_index; + let mesh_index = indirect_parameters_metadata[instance_index].mesh_index; // If we aren't using `multi_draw_indirect_count`, we have a 1:1 fixed // assignment of batches to slots in the indirect parameters buffer, so we // can just use the instance index as the index of our indirect parameters. let early_instance_count = - indirect_parameters_gpu_metadata[instance_index].early_instance_count; - let late_instance_count = indirect_parameters_gpu_metadata[instance_index].late_instance_count; + indirect_parameters_metadata[instance_index].early_instance_count; + let late_instance_count = indirect_parameters_metadata[instance_index].late_instance_count; // If in the early phase, we draw only the early meshes. If in the late // phase, we draw only the late meshes. 
If in the main phase, draw all the diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 259678cc8cca4..1eb935780574e 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -35,18 +35,19 @@ use bevy_math::Vec4; use bevy_platform::collections::HashMap; use bevy_render::{ batching::gpu_preprocessing::{ - clear_bin_unpacking_buffers, BatchedInstanceBuffers, BinUnpackingBuffers, - BinUnpackingBuffersKey, BinUnpackingJob, BinUnpackingMetadataIndex, - GpuBinUnpackingMetadata, GpuOcclusionCullingWorkItemBuffers, GpuPreprocessingMode, - GpuPreprocessingSupport, IndirectBatchSet, IndirectParametersBuffers, - IndirectParametersCpuMetadata, IndirectParametersGpuMetadata, IndirectParametersIndexed, - IndirectParametersNonIndexed, LatePreprocessWorkItemIndirectParameters, PreprocessWorkItem, - PreprocessWorkItemBuffers, UntypedPhaseBatchedInstanceBuffers, + clear_scene_unpacking_buffers, BatchedInstanceBuffers, BinUnpackingMetadataIndex, + GpuBinMetadata, GpuBinUnpackingMetadata, GpuOcclusionCullingWorkItemBuffers, + GpuPreprocessingMode, GpuPreprocessingSupport, GpuUniformAllocationMetadata, + IndirectBatchSet, IndirectParametersBuffers, IndirectParametersIndexed, + IndirectParametersMetadata, IndirectParametersNonIndexed, + LatePreprocessWorkItemIndirectParameters, PreprocessWorkItem, PreprocessWorkItemBuffers, + SceneUnpackingBuffers, SceneUnpackingBuffersKey, SceneUnpackingJob, + UniformAllocationMetadataIndex, UntypedPhaseBatchedInstanceBuffers, UntypedPhaseIndirectParametersBuffers, }, diagnostic::RecordDiagnostics as _, occlusion_culling::OcclusionCulling, - render_phase::GpuRenderBinnedMeshInstance, + render_phase::{GpuRenderBinnedMeshInstance, UNIFORM_ALLOCATION_WORKGROUP_SIZE}, render_resource::{ binding_types::{storage_buffer, storage_buffer_read_only, texture_2d, uniform_buffer}, BindGroup, BindGroupEntries, BindGroupLayoutDescriptor, 
BindGroupLayoutEntries, @@ -130,6 +131,8 @@ pub struct PreprocessPipelines { pub main_phase: PreprocessPhasePipelines, /// Compute shader pipelines for the bin unpacking step. pub bin_unpacking: BinUnpackingPipeline, + /// Compute shader pipelines for the uniform allocation step. + pub uniform_allocation: UniformAllocationPipelines, } /// Compute shader pipelines for a specific phase: early, late, or main. @@ -206,6 +209,63 @@ pub struct BinUnpackingPipeline { pub pipeline_id: Option, } +/// Pipelines for the `allocate_uniforms` compute shader. +/// +/// This shader has three steps, so we have three pipelines. +/// +/// Although the `Handle` is the same among these three pipelines, they +/// have to be separate so that the `SpecializedComputePipeline` implementation +/// on each sub-pipeline can access it. +#[derive(Clone)] +pub struct UniformAllocationPipelines { + /// The pipeline for step 1: local scan. + pub local_scan: UniformAllocationLocalScanPipeline, + /// The pipeline for step 2: global scan. + pub global_scan: UniformAllocationGlobalScanPipeline, + /// The pipeline for step 3: fan. + pub fan: UniformAllocationFanPipeline, +} + +/// The pipeline for the first step of the `allocate_uniforms` shader. +#[derive(Clone)] +pub struct UniformAllocationLocalScanPipeline { + /// The bind group layout, shared among all the uniform allocation + /// pipelines. + pub bind_group_layout: BindGroupLayoutDescriptor, + /// The shader, also shared among all uniform allocation pipelines. + pub shader: Handle, + /// The pipeline ID for the first step of the `allocate_uniforms` shader. + pub pipeline_id_local_scan: Option, +} + +/// The pipeline for the second step of the `allocate_uniforms` shader. +/// +/// This step is skipped if the number of bins in the batch set is 256 or fewer. +#[derive(Clone)] +pub struct UniformAllocationGlobalScanPipeline { + /// The bind group layout, shared among all the uniform allocation + /// pipelines. 
+ pub bind_group_layout: BindGroupLayoutDescriptor, + /// The shader, also shared among all uniform allocation pipelines. + pub shader: Handle, + /// The pipeline ID for the second step of the `allocate_uniforms` shader. + pub pipeline_id_global_scan: Option, +} + +/// The pipeline for the third step of the `allocate_uniforms` shader. +/// +/// This step is skipped if the number of bins in the batch set is 256 or fewer. +#[derive(Clone)] +pub struct UniformAllocationFanPipeline { + /// The bind group layout, shared among all the uniform allocation + /// pipelines. + pub bind_group_layout: BindGroupLayoutDescriptor, + /// The shader, also shared among all uniform allocation pipelines. + pub shader: Handle, + /// The pipeline ID for the third step of the `allocate_uniforms` shader. + pub pipeline_id_fan: Option, +} + bitflags! { /// Specifies variants of the mesh preprocessing shader. #[derive(Clone, Copy, PartialEq, Eq, Hash)] @@ -351,16 +411,16 @@ pub struct PhaseBuildIndirectParametersBindGroups { /// mesh indexed-ness. #[derive(Clone, Resource, Default, Deref, DerefMut)] pub struct BinUnpackingBindGroups( - pub HashMap, + pub HashMap, ); /// The bind groups for the `unpack_bins` shader for a single (view, phase) /// combination. #[derive(Clone)] pub struct ViewPhaseBinUnpackingBindGroups { - /// The bind group for the indexed meshes. + /// The bind groups for the indexed meshes, one for each batch set. indexed: Vec, - /// The bind group for the non-indexed meshes. + /// The bind groups for the non-indexed meshes, one for each batch set. non_indexed: Vec, } @@ -369,7 +429,7 @@ pub struct ViewPhaseBinUnpackingBindGroups { #[derive(Clone)] pub struct ViewPhaseBinUnpackingBindGroup { /// The index of the metadata in the - /// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer. + /// [`SceneUnpackingBuffers::bin_unpacking_metadata`] buffer. pub metadata_index: BinUnpackingMetadataIndex, /// The actual shader bind group. 
pub bind_group: BindGroup, @@ -378,6 +438,39 @@ pub struct ViewPhaseBinUnpackingBindGroup { pub mesh_instance_count: u32, } +/// A resource, part of the render world, that stores all the bind groups for +/// the uniform allocation shader. +/// +/// There will be one such bind group for each combination of view, phase, and +/// mesh indexed-ness. +#[derive(Clone, Resource, Default, Deref, DerefMut)] +pub struct UniformAllocationBindGroups( + pub HashMap, +); + +/// The bind groups for the `allocate_uniforms` shader for a single (view, +/// phase) combination. +#[derive(Clone)] +pub struct ViewPhaseUniformAllocationBindGroups { + /// The bind groups for the indexed meshes, one for each batch set. + indexed: Vec, + /// The bind groups for the non-indexed meshes, one for each batch set. + non_indexed: Vec, +} + +/// The bind group for the `allocate_uniforms` shader for a single combination +/// of view, phase, and mesh indexed-ness. +#[derive(Clone)] +pub struct ViewPhaseUniformAllocationBindGroup { + /// The index of the metadata in the + /// [`SceneUnpackingBuffers::uniform_allocation_metadata`] buffer. + pub metadata_index: UniformAllocationMetadataIndex, + /// The actual shader bind group. + pub bind_group: BindGroup, + /// The total number of bins in this batch set. 
+ pub bin_count: u32, +} + /// Stops the `GpuPreprocessNode` attempting to generate the buffer for this view /// useful to avoid duplicating effort if the bind group is shared between views #[derive(Component, Default)] @@ -389,6 +482,7 @@ impl Plugin for GpuMeshPreprocessPlugin { embedded_asset!(app, "reset_indirect_batch_sets.wgsl"); embedded_asset!(app, "build_indirect_params.wgsl"); embedded_asset!(app, "unpack_bins.wgsl"); + embedded_asset!(app, "allocate_uniforms.wgsl"); } fn finish(&self, app: &mut App) { @@ -405,15 +499,19 @@ impl Plugin for GpuMeshPreprocessPlugin { render_app .init_gpu_resource::() + .init_gpu_resource::() .init_gpu_resource::() .init_gpu_resource::>() .init_gpu_resource::>() .init_gpu_resource::>() .init_gpu_resource::>() + .init_gpu_resource::>() + .init_gpu_resource::>() + .init_gpu_resource::>() .add_systems( Render, ( - clear_bin_unpacking_buffers.in_set(RenderSystems::PrepareResources), + clear_scene_unpacking_buffers.in_set(RenderSystems::PrepareResources), prepare_preprocess_pipelines.in_set(RenderSystems::Prepare), prepare_preprocess_bind_groups .run_if(resource_exists::>, - mut ctx: RenderContext, +/// A rendering system that invokes a compute shader for each batch set in order +/// to determine where `MeshUniform`s should be placed. +/// +/// This shader exists because a single batch set could contain many meshes. By +/// performing this on the GPU, we avoid having to traverse every visible mesh +/// on the CPU every frame. 
+pub fn allocate_uniforms( + current_view: ViewQuery, Without>, + view_query: Query<&ExtractedView, Without>, + batched_instance_buffers: Res>, + pipeline_cache: Res, + preprocess_pipelines: Res, + uniform_allocation_bind_groups: Res, + mut render_context: RenderContext, ) { - let Some(indirect_parameters_buffers) = indirect_parameters_buffers else { - return; - }; + let diagnostics = render_context.diagnostic_recorder(); + let diagnostics = diagnostics.as_deref(); - // Clear out each indexed and non-indexed GPU-side buffer. - for phase_indirect_parameters_buffers in indirect_parameters_buffers.values() { - if let Some(indexed_gpu_metadata_buffer) = phase_indirect_parameters_buffers - .indexed - .gpu_metadata_buffer() - { - ctx.command_encoder().clear_buffer( - indexed_gpu_metadata_buffer, - 0, - Some( - phase_indirect_parameters_buffers.indexed.batch_count() as u64 - * size_of::() as u64, - ), - ); - } + let command_encoder = render_context.command_encoder(); + let mut compute_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { + label: Some("uniform allocation"), + timestamp_writes: None, + }); - if let Some(non_indexed_gpu_metadata_buffer) = phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata_buffer() - { - ctx.command_encoder().clear_buffer( - non_indexed_gpu_metadata_buffer, - 0, - Some( - phase_indirect_parameters_buffers.non_indexed.batch_count() as u64 - * size_of::() as u64, - ), - ); + let pass_span = diagnostics.pass_span(&mut compute_pass, "uniform_allocation"); + + // Gather up all views. + let view_entity = current_view.entity(); + let shadow_cascade_views = current_view.into_inner(); + let all_views = gather_shadow_cascades_for_view(view_entity, shadow_cascade_views); + + // Don't run if the shaders haven't been compiled yet. 
+ if let ( + Some(uniform_allocation_local_scan_pipeline_id), + Some(uniform_allocation_global_scan_pipeline_id), + Some(uniform_allocation_fan_pipeline_id), + ) = ( + preprocess_pipelines + .uniform_allocation + .local_scan + .pipeline_id_local_scan, + preprocess_pipelines + .uniform_allocation + .global_scan + .pipeline_id_global_scan, + preprocess_pipelines.uniform_allocation.fan.pipeline_id_fan, + ) && let ( + Some(uniform_allocation_local_scan_pipeline), + Some(uniform_allocation_global_scan_pipeline), + Some(uniform_allocation_fan_pipeline), + ) = ( + pipeline_cache.get_compute_pipeline(uniform_allocation_local_scan_pipeline_id), + pipeline_cache.get_compute_pipeline(uniform_allocation_global_scan_pipeline_id), + pipeline_cache.get_compute_pipeline(uniform_allocation_fan_pipeline_id), + ) { + // Loop over each view… + for view_entity in all_views { + let Ok(view) = view_query.get(view_entity) else { + continue; + }; + + // …and each phase within each view. + for phase_type_id in batched_instance_buffers.phase_instance_buffers.keys() { + let uniform_allocation_buffers_key = SceneUnpackingBuffersKey { + phase: *phase_type_id, + view: view.retained_view_entity, + }; + + // Fetch the bind groups for this (view, phase) combination. + let Some(phase_uniform_allocation_bind_groups) = + uniform_allocation_bind_groups.get(&uniform_allocation_buffers_key) + else { + continue; + }; + + // Invoke the shader for all batch sets corresponding to indexed + // meshes and then for all batch sets corresponding to + // non-indexed meshes. + for uniform_allocation_bind_group in phase_uniform_allocation_bind_groups + .indexed + .iter() + .chain(phase_uniform_allocation_bind_groups.non_indexed.iter()) + { + // Invoke the local scan (step 1). 
+ compute_pass.set_pipeline(uniform_allocation_local_scan_pipeline); + compute_pass.set_bind_group(0, &uniform_allocation_bind_group.bind_group, &[]); + let local_scan_workgroup_count = uniform_allocation_bind_group + .bin_count + .div_ceil(UNIFORM_ALLOCATION_WORKGROUP_SIZE); + if local_scan_workgroup_count > 0 { + compute_pass.dispatch_workgroups(local_scan_workgroup_count, 1, 1); + } + + // If there are 256 or fewer draws in this batch, we're + // done. Otherwise, perform the other two steps. + if local_scan_workgroup_count > 1 { + // Invoke the global scan (step 2). + compute_pass.set_pipeline(uniform_allocation_global_scan_pipeline); + compute_pass.dispatch_workgroups(1, 1, 1); + + // Perform the fan operation (step 3). + compute_pass.set_pipeline(uniform_allocation_fan_pipeline); + let fan_workgroup_count = local_scan_workgroup_count - 1; + compute_pass.dispatch_workgroups(fan_workgroup_count, 1, 1); + } + } + } } } + + pass_span.end(&mut compute_pass); } /// A rendering system that invokes a compute shader for each batch set in order @@ -555,14 +726,14 @@ pub fn unpack_bins( // …and each phase within each view. for phase_type_id in batched_instance_buffers.phase_instance_buffers.keys() { - let bin_unpacking_buffers_key = BinUnpackingBuffersKey { + let scene_unpacking_buffers_key = SceneUnpackingBuffersKey { phase: *phase_type_id, view: view.retained_view_entity, }; // Fetch the bind groups for this (view, phase) combination. let Some(phase_bin_unpacking_bind_groups) = - bin_unpacking_bind_groups.get(&bin_unpacking_buffers_key) + bin_unpacking_bind_groups.get(&scene_unpacking_buffers_key) else { continue; }; @@ -1273,6 +1444,8 @@ impl FromWorld for PreprocessPipelines { .extend_sequential((storage_buffer::(false),)); let bin_unpacking_bind_group_layout_entries = bin_unpacking_bind_group_layout_entries(); + let uniform_allocation_bind_group_layout_entries = + uniform_allocation_bind_group_layout_entries(); // Create the bind group layouts. 
let direct_bind_group_layout = BindGroupLayoutDescriptor::new( @@ -1307,6 +1480,10 @@ impl FromWorld for PreprocessPipelines { "bin unpacking bind group layout", &bin_unpacking_bind_group_layout_entries, ); + let uniform_allocation_bind_group_layout = BindGroupLayoutDescriptor::new( + "uniform allocation bind group layout", + &uniform_allocation_bind_group_layout_entries, + ); let preprocess_shader = load_embedded_asset!(world, "mesh_preprocess.wgsl"); let reset_indirect_batch_sets_shader = @@ -1314,6 +1491,7 @@ impl FromWorld for PreprocessPipelines { let build_indirect_params_shader = load_embedded_asset!(world, "build_indirect_params.wgsl"); let bin_unpacking_shader = load_embedded_asset!(world, "unpack_bins.wgsl"); + let uniform_allocation_shader = load_embedded_asset!(world, "allocate_uniforms.wgsl"); let preprocess_phase_pipelines = PreprocessPhasePipelines { reset_indirect_batch_sets: ResetIndirectBatchSetsPipeline { @@ -1374,6 +1552,23 @@ impl FromWorld for PreprocessPipelines { shader: bin_unpacking_shader, pipeline_id: None, }, + uniform_allocation: UniformAllocationPipelines { + local_scan: UniformAllocationLocalScanPipeline { + bind_group_layout: uniform_allocation_bind_group_layout.clone(), + shader: uniform_allocation_shader.clone(), + pipeline_id_local_scan: None, + }, + global_scan: UniformAllocationGlobalScanPipeline { + bind_group_layout: uniform_allocation_bind_group_layout.clone(), + shader: uniform_allocation_shader.clone(), + pipeline_id_global_scan: None, + }, + fan: UniformAllocationFanPipeline { + bind_group_layout: uniform_allocation_bind_group_layout.clone(), + shader: uniform_allocation_shader.clone(), + pipeline_id_fan: None, + }, + }, } } } @@ -1405,15 +1600,17 @@ fn build_indirect_params_bind_group_layout_entries() -> DynamicBindGroupLayoutEn DynamicBindGroupLayoutEntries::new_with_indices( ShaderStages::COMPUTE, ( + // @group(0) @binding(0) var current_input: + // array; (0, storage_buffer_read_only::(false)), + // @group(0) 
@binding(1) var indirect_parameters_metadata: + // array; ( 1, - storage_buffer_read_only::(false), - ), - ( - 2, - storage_buffer_read_only::(false), + storage_buffer_read_only::(false), ), + // @group(0) @binding(3) var + // indirect_batch_sets: array; (3, storage_buffer::(false)), ), ) @@ -1425,17 +1622,11 @@ fn gpu_culling_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { // GPU culling bind group parameters are a superset of those in the CPU // culling (direct) shader. preprocess_direct_bind_group_layout_entries().extend_with_indices(( - // `indirect_parameters_cpu_metadata` + // @group(0) @binding(7) var indirect_parameters_metadata: + // array; ( 7, - storage_buffer_read_only::( - /* has_dynamic_offset= */ false, - ), - ), - // `indirect_parameters_gpu_metadata` - ( - 8, - storage_buffer::(/* has_dynamic_offset= */ false), + storage_buffer::(/* has_dynamic_offset= */ false), ), // `mesh_culling_data` ( @@ -1465,7 +1656,7 @@ fn gpu_occlusion_culling_bind_group_layout_entries() -> DynamicBindGroupLayoutEn /// Creates and returns bind group layout entries for the GPU bin unpacking /// shader (`unpack_bins`). -fn bin_unpacking_bind_group_layout_entries() -> BindGroupLayoutEntries<4> { +fn bin_unpacking_bind_group_layout_entries() -> BindGroupLayoutEntries<5> { BindGroupLayoutEntries::sequential( ShaderStages::COMPUTE, ( @@ -1478,13 +1669,37 @@ fn bin_unpacking_bind_group_layout_entries() -> BindGroupLayoutEntries<4> { // @group(0) @binding(2) var // preprocess_work_items: array; storage_buffer::(false), - // @group(0) @binding(3) var - // bin_index_to_indirect_parameters_offset: array; + // @group(0) @binding(3) var bin_metadata: + // array; + storage_buffer_read_only::(false), + // @group(0) @binding(4) var + // bin_index_to_bin_metadata_index: array; storage_buffer_read_only::(false), ), ) } +/// Creates and returns bind group layout entries for the GPU uniform allocation +/// shader (`allocate_uniforms`). 
+fn uniform_allocation_bind_group_layout_entries() -> BindGroupLayoutEntries<4> { + BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + // @group(0) @binding(0) var allocate_uniforms_metadata: + // AllocateUniformsMetadata; + uniform_buffer::(false), + // @group(0) @binding(1) var bin_metadata: array; + storage_buffer_read_only::(false), + // @group(0) @binding(2) var + // indirect_parameters_metadata: array; + storage_buffer::(false), + // @group(0) @binding(3) var fan_buffer: + // array; + storage_buffer::(false), + ), + ) +} + /// A system that specializes the pipelines relating to mesh preprocessing if /// necessary. /// @@ -1505,6 +1720,15 @@ pub fn prepare_preprocess_pipelines( mut specialized_bin_unpacking_pipelines: ResMut< SpecializedComputePipelines, >, + mut specialized_uniform_allocation_local_scan_pipelines: ResMut< + SpecializedComputePipelines, + >, + mut specialized_uniform_allocation_global_scan_pipelines: ResMut< + SpecializedComputePipelines, + >, + mut specialized_uniform_allocation_fan_pipelines: ResMut< + SpecializedComputePipelines, + >, preprocess_pipelines: ResMut, gpu_preprocessing_support: Res, ) { @@ -1617,6 +1841,14 @@ pub fn prepare_preprocess_pipelines( preprocess_pipelines .bin_unpacking .prepare(&pipeline_cache, &mut specialized_bin_unpacking_pipelines); + + // Prepare the uniform allocation compute pipeline. 
+ preprocess_pipelines.uniform_allocation.prepare( + &pipeline_cache, + &mut specialized_uniform_allocation_local_scan_pipelines, + &mut specialized_uniform_allocation_global_scan_pipelines, + &mut specialized_uniform_allocation_fan_pipelines, + ); } impl PreprocessPipeline { @@ -1714,6 +1946,51 @@ impl SpecializedComputePipeline for BinUnpackingPipeline { } } +impl SpecializedComputePipeline for UniformAllocationLocalScanPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("uniform allocation, local scan".into()), + layout: vec![self.bind_group_layout.clone()], + shader: self.shader.clone(), + shader_defs: vec![], + entry_point: Some("allocate_local_scan".into()), + ..Default::default() + } + } +} + +impl SpecializedComputePipeline for UniformAllocationGlobalScanPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("uniform allocation, global scan".into()), + layout: vec![self.bind_group_layout.clone()], + shader: self.shader.clone(), + shader_defs: vec![], + entry_point: Some("allocate_global_scan".into()), + ..Default::default() + } + } +} + +impl SpecializedComputePipeline for UniformAllocationFanPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("uniform allocation, fan".into()), + layout: vec![self.bind_group_layout.clone()], + shader: self.shader.clone(), + shader_defs: vec![], + entry_point: Some("allocate_fan".into()), + ..Default::default() + } + } +} + impl ResetIndirectBatchSetsPipeline { fn prepare( &mut self, @@ -1761,6 +2038,46 @@ impl BinUnpackingPipeline { } } +impl UniformAllocationPipelines { + /// Specializes all three pipelines that use the uniform allocation shader. 
+ fn prepare( + &mut self, + pipeline_cache: &PipelineCache, + uniform_allocation_local_scan_pipelines: &mut SpecializedComputePipelines< + UniformAllocationLocalScanPipeline, + >, + uniform_allocation_global_scan_pipelines: &mut SpecializedComputePipelines< + UniformAllocationGlobalScanPipeline, + >, + uniform_allocation_fan_pipelines: &mut SpecializedComputePipelines< + UniformAllocationFanPipeline, + >, + ) { + if self.local_scan.pipeline_id_local_scan.is_none() { + self.local_scan.pipeline_id_local_scan = + Some(uniform_allocation_local_scan_pipelines.specialize( + pipeline_cache, + &self.local_scan, + (), + )); + } + + if self.global_scan.pipeline_id_global_scan.is_none() { + self.global_scan.pipeline_id_global_scan = + Some(uniform_allocation_global_scan_pipelines.specialize( + pipeline_cache, + &self.global_scan, + (), + )); + } + + if self.fan.pipeline_id_fan.is_none() { + self.fan.pipeline_id_fan = + Some(uniform_allocation_fan_pipelines.specialize(pipeline_cache, &self.fan, ())); + } + } +} + /// A system that attaches buffers to bind groups for the variants of the /// compute shaders relating to mesh preprocessing. #[expect( @@ -1775,13 +2092,14 @@ pub fn prepare_preprocess_bind_groups( pipeline_cache: Res, batched_instance_buffers: Res>, indirect_parameters_buffers: Res, - bin_unpacking_buffers: Res, + scene_unpacking_buffers: Res, mesh_culling_data_buffer: Res, visibility_ranges: Res, view_uniforms: Res, previous_view_uniforms: Res, pipelines: Res, mut bin_unpacking_bind_groups: ResMut, + mut uniform_allocation_bind_groups: ResMut, ) { // Grab the `BatchedInstanceBuffers`. let BatchedInstanceBuffers { @@ -1912,7 +2230,7 @@ pub fn prepare_preprocess_bind_groups( } // Create the bind groups we'll need for each dispatch of the bin unpacking - // (`unpack_bins`) shader. + // (`unpack_bins`) and uniform allocation (`allocate_uniforms`) shaders. 
for (_, view) in &views { create_bin_unpacking_bind_groups( &mut bin_unpacking_bind_groups, @@ -1921,7 +2239,16 @@ pub fn prepare_preprocess_bind_groups( &pipelines, &indirect_parameters_buffers, phase_instance_buffers, - &bin_unpacking_buffers, + &scene_unpacking_buffers, + &view.retained_view_entity, + ); + create_uniform_allocation_bind_groups( + &mut uniform_allocation_bind_groups, + &render_device, + &pipeline_cache, + &pipelines, + &indirect_parameters_buffers, + &scene_unpacking_buffers, &view.retained_view_entity, ); } @@ -2074,17 +2401,13 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .indexed - .gpu_metadata_buffer(), + .metadata_buffer(), indexed_work_item_buffer.buffer(), late_indexed_work_item_buffer.buffer(), self.late_indexed_indirect_parameters_buffer.buffer(), ) { ( - Some(indexed_cpu_metadata_buffer), - Some(indexed_gpu_metadata_buffer), + Some(indexed_metadata_buffer), Some(indexed_work_item_gpu_buffer), Some(late_indexed_work_item_gpu_buffer), Some(late_indexed_indirect_parameters_buffer), @@ -2108,8 +2431,14 @@ impl<'a> PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) @binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2118,13 +2447,26 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: indexed_work_item_buffer_size, }), ), + // @group(0) @binding(6) var + // output: array; (6, self.data_buffer.as_entire_binding()), - (7, indexed_cpu_metadata_buffer.as_entire_binding()), - (8, indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, 
indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), + // @group(0) @binding(11) var depth_pyramid: + // texture_2d; (11, &view_depth_pyramid.all_mips), + // @group(0) @binding(2) var + // previous_view_uniforms: PreviousViewUniforms; ( 2, BufferBinding { @@ -2133,6 +2475,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: NonZeroU64::new(size_of::() as u64), }, ), + // @group(0) @binding(12) var + // late_preprocess_work_items: + // array; ( 12, BufferBinding { @@ -2141,6 +2486,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: indexed_work_item_buffer_size, }, ), + // @group(0) @binding(13) var + // late_preprocess_work_item_indirect_parameters: + // array; ( 13, BufferBinding { @@ -2176,17 +2524,13 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .non_indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata_buffer(), + .metadata_buffer(), non_indexed_work_item_buffer.buffer(), late_non_indexed_work_item_buffer.buffer(), self.late_non_indexed_indirect_parameters_buffer.buffer(), ) { ( - Some(non_indexed_cpu_metadata_buffer), - Some(non_indexed_gpu_metadata_buffer), + Some(non_indexed_metadata_buffer), Some(non_indexed_work_item_gpu_buffer), Some(late_non_indexed_work_item_buffer), Some(late_non_indexed_indirect_parameters_buffer), @@ -2210,8 +2554,14 @@ impl<'a> PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) 
@binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2221,12 +2571,23 @@ impl<'a> PreprocessBindGroupBuilder<'a> { }), ), (6, self.data_buffer.as_entire_binding()), - (7, non_indexed_cpu_metadata_buffer.as_entire_binding()), - (8, non_indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, non_indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), + // @group(0) @binding(11) var depth_pyramid: + // texture_2d; (11, &view_depth_pyramid.all_mips), + // @group(0) @binding(2) var + // previous_view_uniforms: PreviousViewUniforms; ( 2, BufferBinding { @@ -2235,6 +2596,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: NonZeroU64::new(size_of::() as u64), }, ), + // @group(0) @binding(12) var + // late_preprocess_work_items: + // array; ( 12, BufferBinding { @@ -2243,6 +2607,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: non_indexed_work_item_buffer_size, }, ), + // @group(0) @binding(13) var + // late_preprocess_work_item_indirect_parameters: + // array; ( 13, BufferBinding { @@ -2277,16 +2644,12 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .indexed - .gpu_metadata_buffer(), + .metadata_buffer(), late_indexed_work_item_buffer.buffer(), self.late_indexed_indirect_parameters_buffer.buffer(), ) { ( - Some(indexed_cpu_metadata_buffer), - Some(indexed_gpu_metadata_buffer), + Some(indexed_metadata_buffer), Some(late_indexed_work_item_gpu_buffer), Some(late_indexed_indirect_parameters_buffer), ) => { @@ -2309,8 +2672,14 @@ impl<'a> 
PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) @binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2319,13 +2688,26 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: late_indexed_work_item_buffer_size, }), ), + // @group(0) @binding(6) var + // output: array; (6, self.data_buffer.as_entire_binding()), - (7, indexed_cpu_metadata_buffer.as_entire_binding()), - (8, indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), + // @group(0) @binding(11) var depth_pyramid: + // texture_2d; (11, &view_depth_pyramid.all_mips), + // @group(0) @binding(2) var + // previous_view_uniforms: PreviousViewUniforms; ( 2, BufferBinding { @@ -2334,6 +2716,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: NonZeroU64::new(size_of::() as u64), }, ), + // @group(0) @binding(13) var + // late_preprocess_work_item_indirect_parameters: + // array; ( 13, BufferBinding { @@ -2368,16 +2753,12 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .non_indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata_buffer(), + .metadata_buffer(), late_non_indexed_work_item_buffer.buffer(), self.late_non_indexed_indirect_parameters_buffer.buffer(), ) { ( - Some(non_indexed_cpu_metadata_buffer), 
- Some(non_indexed_gpu_metadata_buffer), + Some(non_indexed_metadata_buffer), Some(non_indexed_work_item_gpu_buffer), Some(late_non_indexed_indirect_parameters_buffer), ) => { @@ -2400,8 +2781,14 @@ impl<'a> PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) @binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2410,13 +2797,26 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: non_indexed_work_item_buffer_size, }), ), + // @group(0) @binding(6) var + // output: array; (6, self.data_buffer.as_entire_binding()), - (7, non_indexed_cpu_metadata_buffer.as_entire_binding()), - (8, non_indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, non_indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), + // @group(0) @binding(11) var depth_pyramid: + // texture_2d; (11, &view_depth_pyramid.all_mips), + // @group(0) @binding(2) var + // previous_view_uniforms: PreviousViewUniforms; ( 2, BufferBinding { @@ -2425,6 +2825,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: NonZeroU64::new(size_of::() as u64), }, ), + // @group(0) @binding(13) var + // late_preprocess_work_item_indirect_parameters: + // array; ( 13, BufferBinding { @@ -2472,17 +2875,10 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .indexed - .cpu_metadata_buffer(), - 
self.phase_indirect_parameters_buffers - .indexed - .gpu_metadata_buffer(), + .metadata_buffer(), indexed_work_item_buffer.buffer(), ) { - ( - Some(indexed_cpu_metadata_buffer), - Some(indexed_gpu_metadata_buffer), - Some(indexed_work_item_gpu_buffer), - ) => { + (Some(indexed_metadata_buffer), Some(indexed_work_item_gpu_buffer)) => { // Don't use `as_entire_binding()` here; the shader reads the array // length and the underlying buffer may be longer than the actual size // of the vector. @@ -2513,8 +2909,7 @@ impl<'a> PreprocessBindGroupBuilder<'a> { }), ), (6, self.data_buffer.as_entire_binding()), - (7, indexed_cpu_metadata_buffer.as_entire_binding()), - (8, indexed_gpu_metadata_buffer.as_entire_binding()), + (7, indexed_metadata_buffer.as_entire_binding()), (9, mesh_culling_data_buffer.as_entire_binding()), (10, visibility_range_binding.clone()), (0, view_uniforms_binding.clone()), @@ -2539,17 +2934,10 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .non_indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata_buffer(), + .metadata_buffer(), non_indexed_work_item_buffer.buffer(), ) { - ( - Some(non_indexed_cpu_metadata_buffer), - Some(non_indexed_gpu_metadata_buffer), - Some(non_indexed_work_item_gpu_buffer), - ) => { + (Some(non_indexed_metadata_buffer), Some(non_indexed_work_item_gpu_buffer)) => { // Don't use `as_entire_binding()` here; the shader reads the array // length and the underlying buffer may be longer than the actual size // of the vector. 
@@ -2569,8 +2957,14 @@ impl<'a> PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) @binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2579,11 +2973,20 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: non_indexed_work_item_buffer_size, }), ), + // @group(0) @binding(6) var + // output: array; (6, self.data_buffer.as_entire_binding()), - (7, non_indexed_cpu_metadata_buffer.as_entire_binding()), - (8, non_indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, non_indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), )), ), @@ -2658,18 +3061,12 @@ fn create_build_indirect_parameters_bind_groups( }, build_indexed_indirect: match ( - phase_indirect_parameters_buffer - .indexed - .cpu_metadata_buffer(), - phase_indirect_parameters_buffer - .indexed - .gpu_metadata_buffer(), + phase_indirect_parameters_buffer.indexed.metadata_buffer(), phase_indirect_parameters_buffer.indexed.data_buffer(), phase_indirect_parameters_buffer.indexed.batch_sets_buffer(), ) { ( - Some(indexed_indirect_parameters_cpu_metadata_buffer), - Some(indexed_indirect_parameters_gpu_metadata_buffer), + Some(indexed_indirect_parameters_metadata_buffer), Some(indexed_indirect_parameters_data_buffer), Some(indexed_batch_sets_buffer), ) => Some( @@ -2682,30 +3079,39 @@ fn create_build_indirect_parameters_bind_groups( 
.gpu_frustum_culling_build_indexed_indirect_params .bind_group_layout, ), - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), + &BindGroupEntries::with_indices(( + // @group(0) @binding(0) var + // current_input: array; + (0, current_input_buffer.as_entire_binding()), + // @group(0) @binding(1) var + // indirect_parameters_metadata: + // array; + // // Don't use `as_entire_binding` here; the shader reads // the length and `RawBufferVec` overallocates. - BufferBinding { - buffer: indexed_indirect_parameters_cpu_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - phase_indirect_parameters_buffer.indexed.batch_count() - as u64 - * size_of::() as u64, - ), - }, - BufferBinding { - buffer: indexed_indirect_parameters_gpu_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - phase_indirect_parameters_buffer.indexed.batch_count() - as u64 - * size_of::() as u64, - ), - }, - indexed_batch_sets_buffer.as_entire_binding(), - indexed_indirect_parameters_data_buffer.as_entire_binding(), + ( + 1, + BufferBinding { + buffer: indexed_indirect_parameters_metadata_buffer, + offset: 0, + size: NonZeroU64::new( + phase_indirect_parameters_buffer.indexed.batch_count() + as u64 + * size_of::() as u64, + ), + }, + ), + // @group(0) @binding(3) var indirect_batch_sets: + // array; + (3, indexed_batch_sets_buffer.as_entire_binding()), + // @group(0) @binding(4) var indirect_parameters: + // array; + ( + 4, + indexed_indirect_parameters_data_buffer.as_entire_binding(), + ), )), ), ), @@ -2715,18 +3121,14 @@ fn create_build_indirect_parameters_bind_groups( build_non_indexed_indirect: match ( phase_indirect_parameters_buffer .non_indexed - .cpu_metadata_buffer(), - phase_indirect_parameters_buffer - .non_indexed - .gpu_metadata_buffer(), + .metadata_buffer(), phase_indirect_parameters_buffer.non_indexed.data_buffer(), phase_indirect_parameters_buffer .non_indexed .batch_sets_buffer(), ) { ( - Some(non_indexed_indirect_parameters_cpu_metadata_buffer), - 
Some(non_indexed_indirect_parameters_gpu_metadata_buffer), + Some(non_indexed_indirect_parameters_metadata_buffer), Some(non_indexed_indirect_parameters_data_buffer), Some(non_indexed_batch_sets_buffer), ) => Some( @@ -2739,30 +3141,41 @@ fn create_build_indirect_parameters_bind_groups( .gpu_frustum_culling_build_non_indexed_indirect_params .bind_group_layout, ), - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), + &BindGroupEntries::with_indices(( + // @group(0) @binding(0) var + // current_input: array; + (0, current_input_buffer.as_entire_binding()), + // @group(0) @binding(1) var + // indirect_parameters_metadata: + // array; + // // Don't use `as_entire_binding` here; the shader reads // the length and `RawBufferVec` overallocates. - BufferBinding { - buffer: non_indexed_indirect_parameters_cpu_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - phase_indirect_parameters_buffer.non_indexed.batch_count() - as u64 - * size_of::() as u64, - ), - }, - BufferBinding { - buffer: non_indexed_indirect_parameters_gpu_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - phase_indirect_parameters_buffer.non_indexed.batch_count() - as u64 - * size_of::() as u64, - ), - }, - non_indexed_batch_sets_buffer.as_entire_binding(), - non_indexed_indirect_parameters_data_buffer.as_entire_binding(), + ( + 1, + BufferBinding { + buffer: non_indexed_indirect_parameters_metadata_buffer, + offset: 0, + size: NonZeroU64::new( + phase_indirect_parameters_buffer + .non_indexed + .batch_count() + as u64 + * size_of::() as u64, + ), + }, + ), + // @group(0) @binding(3) var indirect_batch_sets: + // array; + (3, non_indexed_batch_sets_buffer.as_entire_binding()), + // @group(0) @binding(4) var indirect_parameters: + // array; + ( + 4, + non_indexed_indirect_parameters_data_buffer.as_entire_binding(), + ), )), ), ), @@ -2784,10 +3197,11 @@ fn create_bin_unpacking_bind_groups( preprocess_pipelines: &PreprocessPipelines, indirect_parameters_buffers: 
&IndirectParametersBuffers, phase_instance_buffers: &TypeIdMap>, - bin_unpacking_buffers: &BinUnpackingBuffers, + scene_unpacking_buffers: &SceneUnpackingBuffers, view_entity: &RetainedViewEntity, ) { - let Some(bin_unpacking_metadata_buffer) = bin_unpacking_buffers.bin_unpacking_metadata.buffer() + let Some(bin_unpacking_metadata_buffer) = + scene_unpacking_buffers.bin_unpacking_metadata.buffer() else { return; }; @@ -2804,13 +3218,12 @@ fn create_bin_unpacking_bind_groups( else { continue; }; - let Some(view_phase_bin_unpacking_buffers) = - bin_unpacking_buffers - .view_phase_buffers - .get(&BinUnpackingBuffersKey { - phase: *phase_type_id, - view: *view_entity, - }) + let Some(view_phase_bin_unpacking_buffers) = scene_unpacking_buffers + .view_phase_buffers + .get(&SceneUnpackingBuffersKey { + phase: *phase_type_id, + view: *view_entity, + }) else { continue; }; @@ -2829,7 +3242,7 @@ fn create_bin_unpacking_bind_groups( // Create the actual bind groups. bin_unpacking_bind_groups.insert( - BinUnpackingBuffersKey { + SceneUnpackingBuffersKey { phase: *phase_type_id, view: *view_entity, }, @@ -2881,7 +3294,7 @@ fn create_bin_unpacking_bind_group( render_device: &RenderDevice, preprocess_pipelines: &PreprocessPipelines, pipeline_cache: &PipelineCache, - job: &BinUnpackingJob, + job: &SceneUnpackingJob, bin_unpacking_metadata_buffer: &Buffer, work_item_buffer: &Buffer, indexed: bool, @@ -2911,10 +3324,12 @@ fn create_bin_unpacking_bind_group( // read_write> preprocess_work_items: // array; work_item_buffer.as_entire_binding(), - // @group(0) @binding(3) var - // bin_index_to_indirect_parameters_offset: - // array; - job.bin_index_to_indirect_parameters_offset_buffer + // @group(0) @binding(3) var bin_metadata: + // array; + job.bin_metadata_buffer.as_entire_binding(), + // @group(0) @binding(4) var + // bin_index_to_bin_metadata_index: array; + job.bin_index_to_bin_metadata_index_buffer .as_entire_binding(), )), ); @@ -2925,6 +3340,139 @@ fn 
create_bin_unpacking_bind_group( } } +/// Creates all bind groups needed to run the `allocate_uniforms` shader for all +/// the phases for a single view. +fn create_uniform_allocation_bind_groups( + uniform_allocation_bind_groups: &mut UniformAllocationBindGroups, + render_device: &RenderDevice, + pipeline_cache: &PipelineCache, + preprocess_pipelines: &PreprocessPipelines, + indirect_parameters_buffers: &IndirectParametersBuffers, + scene_unpacking_buffers: &SceneUnpackingBuffers, + view_entity: &RetainedViewEntity, +) { + let Some(uniform_allocation_metadata_buffer) = + scene_unpacking_buffers.uniform_allocation_metadata.buffer() + else { + return; + }; + + for (phase_type_id, phase_indirect_parameters_buffers) in indirect_parameters_buffers.iter() { + let Some(view_phase_bin_unpacking_buffers) = scene_unpacking_buffers + .view_phase_buffers + .get(&SceneUnpackingBuffersKey { + phase: *phase_type_id, + view: *view_entity, + }) + else { + continue; + }; + + // Create the actual bind groups. 
+ uniform_allocation_bind_groups.insert( + SceneUnpackingBuffersKey { + phase: *phase_type_id, + view: *view_entity, + }, + ViewPhaseUniformAllocationBindGroups { + indexed: match phase_indirect_parameters_buffers.indexed.metadata_buffer() { + None => vec![], + Some(indexed_indirect_parameters_metadata_buffer) => { + view_phase_bin_unpacking_buffers + .indexed_unpacking_jobs + .iter() + .map(|job| { + create_uniform_allocation_bind_group( + render_device, + preprocess_pipelines, + pipeline_cache, + job, + uniform_allocation_metadata_buffer, + indexed_indirect_parameters_metadata_buffer, + true, + ) + }) + .collect() + } + }, + non_indexed: match phase_indirect_parameters_buffers + .non_indexed + .metadata_buffer() + { + None => vec![], + Some(non_indexed_indirect_parameters_metadata_buffer) => { + view_phase_bin_unpacking_buffers + .non_indexed_unpacking_jobs + .iter() + .map(|job| { + create_uniform_allocation_bind_group( + render_device, + preprocess_pipelines, + pipeline_cache, + job, + uniform_allocation_metadata_buffer, + non_indexed_indirect_parameters_metadata_buffer, + false, + ) + }) + .collect() + } + }, + }, + ); + } +} + +/// Creates a bind group for the uniform allocation shader for a single (view, +/// phase, mesh indexed-ness) combination. +fn create_uniform_allocation_bind_group( + render_device: &RenderDevice, + preprocess_pipelines: &PreprocessPipelines, + pipeline_cache: &PipelineCache, + job: &SceneUnpackingJob, + uniform_allocation_metadata_buffer: &Buffer, + indirect_parameters_metadata_buffer: &Buffer, + indexed: bool, +) -> ViewPhaseUniformAllocationBindGroup { + let bind_group = render_device.create_bind_group( + if indexed { + "uniform allocation indexed bind group" + } else { + "uniform allocation non-indexed bind group" + }, + &pipeline_cache.get_bind_group_layout( + // All the pipelines' bind group layouts should be identical. 
+ &preprocess_pipelines + .uniform_allocation + .local_scan + .bind_group_layout, + ), + &BindGroupEntries::sequential(( + // @group(0) @binding(0) var allocate_uniforms_metadata: + // AllocateUniformsMetadata; + BindingResource::Buffer(BufferBinding { + buffer: uniform_allocation_metadata_buffer, + offset: job.uniform_allocation_metadata_index.uniform_offset() as u64, + size: NonZeroU64::new(size_of::() as u64), + }), + // @group(0) @binding(1) var bin_metadata: + // array; + job.bin_metadata_buffer.as_entire_binding(), + // @group(0) @binding(2) var + // indirect_parameters_metadata: array; + indirect_parameters_metadata_buffer.as_entire_binding(), + // @group(0) @binding(3) var fan_buffer: + // array; + job.fan_buffer.as_entire_binding(), + )), + ); + ViewPhaseUniformAllocationBindGroup { + metadata_index: job.uniform_allocation_metadata_index, + bind_group, + bin_count: job.bin_count, + } +} + /// Writes the information needed to do GPU mesh culling to the GPU. pub fn write_mesh_culling_data_buffer( render_device: Res, diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index f644b4f5bdb16..06139ca452615 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -49,7 +49,7 @@ use bevy_render::{ batching::{ gpu_preprocessing::{ self, GpuPreprocessingSupport, IndirectBatchSet, IndirectParametersBuffers, - IndirectParametersCpuMetadata, IndirectParametersIndexed, IndirectParametersNonIndexed, + IndirectParametersIndexed, IndirectParametersMetadata, IndirectParametersNonIndexed, InstanceInputUniformBuffer, UntypedPhaseIndirectParametersBuffers, }, no_gpu_preprocessing, GetBatchData, GetFullBatchData, NoAutomaticBatching, @@ -3005,12 +3005,16 @@ impl GetFullBatchData for MeshPipeline { phase_indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers, indirect_parameters_offset: u32, ) { - let indirect_parameters = IndirectParametersCpuMetadata { + let indirect_parameters = 
IndirectParametersMetadata { base_output_index, batch_set_index: match batch_set_index { Some(batch_set_index) => u32::from(batch_set_index), None => !0, }, + // These fields are filled in by the GPU: + mesh_index: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed { diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index 496cd6b6f84ab..ea939a92353ce 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -14,9 +14,7 @@ // are known as *early mesh preprocessing* and *late mesh preprocessing* // respectively. -#import bevy_pbr::mesh_preprocess_types::{ - IndirectParametersCpuMetadata, IndirectParametersGpuMetadata, MeshInput, PreprocessWorkItem -} +#import bevy_pbr::mesh_preprocess_types::{IndirectParametersMetadata, MeshInput, PreprocessWorkItem} #import bevy_pbr::mesh_types::{ Mesh, MESH_FLAGS_AABB_BASED_VISIBILITY_RANGE_BIT, MESH_FLAGS_NO_FRUSTUM_CULLING_BIT, MESH_FLAGS_VISIBILITY_RANGE_INDEX_BITS @@ -84,11 +82,8 @@ struct Immediates { #ifdef INDIRECT // The array of indirect parameters for drawcalls. -@group(0) @binding(7) var indirect_parameters_cpu_metadata: - array; - -@group(0) @binding(8) var indirect_parameters_gpu_metadata: - array; +@group(0) @binding(7) var indirect_parameters_metadata: + array; #endif #ifdef FRUSTUM_CULLING @@ -182,7 +177,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // building shader can access it. 
#ifndef LATE_PHASE if (instance_index == 0u) || (work_items[instance_index - 1].output_or_indirect_parameters_index != indirect_parameters_index) { - indirect_parameters_gpu_metadata[indirect_parameters_index].mesh_index = input_index; + indirect_parameters_metadata[indirect_parameters_index].mesh_index = input_index; } #endif // LATE_PHASE @@ -362,20 +357,20 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { #ifdef INDIRECT #ifdef LATE_PHASE let batch_output_index = atomicLoad( - &indirect_parameters_gpu_metadata[indirect_parameters_index].early_instance_count + &indirect_parameters_metadata[indirect_parameters_index].early_instance_count ) + atomicAdd( - &indirect_parameters_gpu_metadata[indirect_parameters_index].late_instance_count, + &indirect_parameters_metadata[indirect_parameters_index].late_instance_count, 1u ); #else // LATE_PHASE let batch_output_index = atomicAdd( - &indirect_parameters_gpu_metadata[indirect_parameters_index].early_instance_count, + &indirect_parameters_metadata[indirect_parameters_index].early_instance_count, 1u ); #endif // LATE_PHASE let mesh_output_index = - indirect_parameters_cpu_metadata[indirect_parameters_index].base_output_index + + indirect_parameters_metadata[indirect_parameters_index].base_output_index + batch_output_index; #endif // INDIRECT diff --git a/crates/bevy_pbr/src/render/unpack_bins.wgsl b/crates/bevy_pbr/src/render/unpack_bins.wgsl index 4161cc2f0e7d8..ba9077a3e1af9 100644 --- a/crates/bevy_pbr/src/render/unpack_bins.wgsl +++ b/crates/bevy_pbr/src/render/unpack_bins.wgsl @@ -8,7 +8,7 @@ // the position of the command in the indirect parameters buffer that will draw // that entity. -#import bevy_pbr::mesh_preprocess_types::PreprocessWorkItem +#import bevy_pbr::mesh_preprocess_types::{BinMetadata, PreprocessWorkItem} // Information needed to unpack bins belonging to a single batch set. 
struct BinUnpackingMetadata { @@ -53,10 +53,16 @@ struct BinnedMeshInstance { // The output list of `PreprocessWorkItem`s. @group(0) @binding(2) var preprocess_work_items: array; -// A mapping from each `bin_index` to the index of the GPU indirect parameters -// for this bin, relative to the start of the indirect parameters for this batch +// The bin metadata, which contains the index of the GPU indirect parameters for +// this bin, relative to the start of the indirect parameters for this batch // set. -@group(0) @binding(3) var bin_index_to_indirect_parameters_offset: array; +// +// This is indexed by the bin metadata index below. +@group(0) @binding(3) var bin_metadata: array; + +// A mapping from the bin index to the metadata index within the `bin_metadata` +// buffer. +@group(0) @binding(4) var bin_index_to_bin_metadata_index: array; @compute @workgroup_size(64) @@ -73,7 +79,8 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // Look up the indirect parameters index for this bin, relative to the first // indirect parameters offset for this batch set. - let indirect_parameters_offset = bin_index_to_indirect_parameters_offset[bin_index]; + let bin_metadata_index = bin_index_to_bin_metadata_index[bin_index]; + let indirect_parameters_offset = bin_metadata[bin_metadata_index].indirect_parameters_offset; // Determine the location we should write the work item to. 
let output_index = bin_unpacking_metadata.base_output_work_item_index + global_id; diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index a0b80f43dc147..4742a66024be1 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -72,7 +72,7 @@ impl Plugin for BatchingPlugin { }) .init_gpu_resource::() .allow_ambiguous_resource::() - .init_gpu_resource::() + .init_gpu_resource::() .add_systems( Render, write_indirect_parameters_buffers.in_set(RenderSystems::PrepareResourcesFlush), @@ -762,10 +762,10 @@ pub struct PreprocessWorkItem { pub input_index: u32, /// In direct mode, the index of the mesh uniform; in indirect mode, the - /// index of the [`IndirectParametersGpuMetadata`]. + /// index of the [`IndirectParametersMetadata`]. /// /// In indirect mode, this is the index of the - /// [`IndirectParametersGpuMetadata`] in the + /// [`IndirectParametersMetadata`] in the /// `IndirectParametersBuffers::indexed_metadata` or /// `IndirectParametersBuffers::non_indexed_metadata`. pub output_or_indirect_parameters_index: u32, @@ -807,13 +807,30 @@ pub struct IndirectParametersNonIndexed { pub first_instance: u32, } -/// A structure, initialized on CPU and read on GPU, that contains metadata -/// about each batch. +impl MeshClassIndirectParameters for IndirectParametersIndexed { + fn debug_label() -> &'static str { + "indexed" + } +} + +impl MeshClassIndirectParameters for IndirectParametersNonIndexed { + fn debug_label() -> &'static str { + "non-indexed" + } +} + +/// A structure, written and read on GPU, that records how many instances of +/// each mesh are actually to be drawn. +/// +/// The GPU mesh preprocessing shader increments the +/// [`Self::early_instance_count`] and [`Self::late_instance_count`] as it +/// determines that meshes are visible. 
The indirect parameter building shader +/// reads this metadata in order to construct the indirect draw parameters. /// /// Each batch will have one instance of this structure. #[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)] #[repr(C)] -pub struct IndirectParametersCpuMetadata { +pub struct IndirectParametersMetadata { /// The index of the first instance of this mesh in the array of /// `MeshUniform`s. /// @@ -828,38 +845,25 @@ pub struct IndirectParametersCpuMetadata { /// /// A *batch set* is a set of meshes that may be multi-drawn together. /// Multiple batches (and therefore multiple instances of - /// [`IndirectParametersGpuMetadata`] structures) can be part of the same - /// batch set. + /// [`IndirectParametersMetadata`] structures) can be part of the same batch + /// set. pub batch_set_index: u32, -} -/// A structure, written and read on GPU, that records how many instances of -/// each mesh are actually to be drawn. -/// -/// The GPU mesh preprocessing shader increments the -/// [`Self::early_instance_count`] and [`Self::late_instance_count`] as it -/// determines that meshes are visible. The indirect parameter building shader -/// reads this metadata in order to construct the indirect draw parameters. -/// -/// Each batch will have one instance of this structure. -#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)] -#[repr(C)] -pub struct IndirectParametersGpuMetadata { /// The index of the first mesh in this batch in the array of /// `MeshInputUniform`s. pub mesh_index: u32, /// The number of instances that were judged visible last frame. /// - /// The CPU sets this value to 0, and the GPU mesh preprocessing shader - /// increments it as it culls mesh instances. + /// The uniform allocation pass sets this value to 0, and the mesh + /// preprocessing shader increments it as it culls mesh instances. 
pub early_instance_count: u32, /// The number of instances that have been judged potentially visible this /// frame that weren't in the last frame's potentially visible set. /// - /// The CPU sets this value to 0, and the GPU mesh preprocessing shader - /// increments it as it culls mesh instances. + /// The uniform allocation pass sets this value to 0, and the mesh + /// preprocessing shader increments it as it culls mesh instances. pub late_instance_count: u32, } @@ -899,7 +903,7 @@ pub struct IndirectBatchSet { /// (`multi_draw_indirect`, `multi_draw_indirect_count`) use to draw the scene. /// /// In addition to the indirect draw buffers themselves, this structure contains -/// the buffers that store [`IndirectParametersGpuMetadata`], which are the +/// the buffers that store [`IndirectParametersMetadata`], which are the /// structures that culling writes to so that the indirect parameter building /// pass can determine how many meshes are actually to be drawn. /// @@ -953,21 +957,95 @@ impl Default for GpuBinUnpackingMetadata { } } +/// Information about each bin in a batch set. +/// +/// This is maintained by the CPU and cached for bins that don't change from +/// frame to frame. +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct GpuBinMetadata { + /// The index of the indirect parameters for this bin, relative to the first + /// indirect parameter index for the batch set. + /// + /// That is, the final indirect parameters index for this bin is + /// `first_indirect_parameters_index` in the `UniformAllocationMetadata` + /// plus this value. + pub indirect_parameters_offset: u32, + + /// The index of the bin that this metadata corresponds to. + /// + /// The GPU doesn't use this, but the CPU does in order to perform the + /// reverse mapping from bin metadata index back to the bin. We could store + /// this in a non-GPU-accessible buffer, but I figured the extra complexity + /// wasn't worth it. 
+ pub bin_index: u32, + + /// The number of mesh instances in this bin. + pub instance_count: u32, +} + +/// Information needed to allocate `MeshUniform`s on the GPU. +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct GpuUniformAllocationMetadata { + /// The index of this batch set in the `IndirectBatchSet` array. + /// + /// We write this into the `indirect_parameters_metadata`. + pub batch_set_index: u32, + + /// The number of bins (a.k.a. draws, a.k.a. batches) in this batch set. + pub bin_count: u32, + + /// The index of the first set of indirect parameters for this batch set. + /// + /// This is also the index of the first `IndirectParametersMetadata`, as + /// that's a parallel array with the indirect parameters. + pub first_indirect_parameters_index: u32, + + /// The index of the first `MeshUniform` slot for this batch set. + pub first_output_mesh_uniform_index: u32, + + /// Padding. + pub pad: [u32; 60], +} + /// CPU-side information needed to construct the bind groups and issue the -/// dispatch for the `unpack_bins` shader, for a single batch set. -pub struct BinUnpackingJob { +/// dispatch for the `unpack_bins` and `allocate_uniforms` shaders, for a single +/// batch set. +/// +/// Because those two shaders are always invoked together, we combine the +/// information together for efficiency's sake. +pub struct SceneUnpackingJob { /// The GPU buffer of `GpuRenderBinnedMeshInstance`s corresponding to the /// mesh instances that this batch set contains. + /// + /// This is used in the `unpack_bins` shader. pub render_binned_mesh_instance_buffer: Buffer, - /// The GPU buffer that maps each bin index to the index of the indirect - /// drawing parameters for that bin, relative to the first such indirect - /// drawing parameters for this batch set. 
- pub bin_index_to_indirect_parameters_offset_buffer: Buffer, + /// The GPU buffer that stores various metadata for each bin, including the + /// indirect parameters offset and the instance count. + /// + /// This is used in both the `allocate_uniforms` and `unpack_bins` shaders. + pub bin_metadata_buffer: Buffer, + /// A temporary GPU buffer that stores the mesh uniform index of the last + /// instance plus one for each workgroup (i.e. for each 256-bin chunk). + /// + /// This is accumulated in the second stage of the `allocate_uniforms` + /// shader and written out in the third. + pub fan_buffer: Buffer, + /// A GPU buffer that maps the stable index of each bin to the index of the + /// metadata in the [`Self::bin_metadata_buffer`]. + pub bin_index_to_bin_metadata_index_buffer: Buffer, /// The index of this batch set's [`GpuBinUnpackingMetadata`] in the - /// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer. + /// [`SceneUnpackingBuffers::bin_unpacking_metadata`] buffer. pub bin_unpacking_metadata_index: BinUnpackingMetadataIndex, + /// The index of this batch set's [`GpuUniformAllocationMetadata`] in the + /// [`SceneUnpackingBuffers::uniform_allocation_metadata`] buffer. + pub uniform_allocation_metadata_index: UniformAllocationMetadataIndex, /// The total number of mesh instances in this batch set. pub mesh_instance_count: u32, + /// The total number of bins (i.e. draws, i.e. separate meshes) in this + /// batch set. + pub bin_count: u32, } /// The buffers containing all the information that indirect draw commands use @@ -1123,21 +1201,27 @@ impl UntypedPhaseIndirectParametersBuffers { } /// A resource, part of the render world, that holds all GPU buffers used for -/// the bin unpacking shader. +/// the bin unpacking and uniform allocation shaders. 
#[derive(Resource)] -pub struct BinUnpackingBuffers { +pub struct SceneUnpackingBuffers { + /// A buffer containing all the uniforms needed to run the uniform + /// allocation compute shader for each batch set. + pub uniform_allocation_metadata: RawBufferVec, /// A buffer containing all the uniforms needed to run the bin unpacking /// compute shader for each batch set. pub bin_unpacking_metadata: RawBufferVec, /// Per-view-phase buffers for the bin unpacking shader. - pub view_phase_buffers: HashMap, + pub view_phase_buffers: HashMap, } -impl Default for BinUnpackingBuffers { +impl Default for SceneUnpackingBuffers { fn default() -> Self { + let mut uniform_allocation_metadata = RawBufferVec::new(BufferUsages::UNIFORM); + uniform_allocation_metadata.set_label(Some("uniform allocation metadata buffer")); let mut bin_unpacking_metadata = RawBufferVec::new(BufferUsages::UNIFORM); bin_unpacking_metadata.set_label(Some("bin unpacking metadata buffer")); - BinUnpackingBuffers { + SceneUnpackingBuffers { + uniform_allocation_metadata, bin_unpacking_metadata, view_phase_buffers: HashMap::default(), } @@ -1147,18 +1231,18 @@ impl Default for BinUnpackingBuffers { /// GPU buffers for the bin unpacking shader that are specific to each phase of /// each view. #[derive(Default)] -pub struct ViewPhaseBinUnpackingBuffers { +pub struct ViewPhaseSceneUnpackingBuffers { /// Metadata that describes each unpacking job, specific to indexed meshes. - pub indexed_unpacking_jobs: Vec, + pub indexed_unpacking_jobs: Vec, /// Metadata that describes each unpacking job, specific to non-indexed /// meshes. - pub non_indexed_unpacking_jobs: Vec, + pub non_indexed_unpacking_jobs: Vec, } /// A key used to look up the bin unpacking buffers for a specific phase of a /// specific view. #[derive(Clone, Copy, PartialEq, Eq, Hash)] -pub struct BinUnpackingBuffersKey { +pub struct SceneUnpackingBuffersKey { /// The ID of the phase. pub phase: TypeId, /// The entity ID of the view. 
@@ -1166,19 +1250,33 @@ pub struct BinUnpackingBuffersKey { } /// The index of the metadata corresponding to one bin unpacking job in the -/// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer. +/// [`SceneUnpackingBuffers::bin_unpacking_metadata`] buffer. #[derive(Clone, Copy, Deref, DerefMut)] pub struct BinUnpackingMetadataIndex(pub NonMaxU32); impl BinUnpackingMetadataIndex { /// Returns the byte offset within the - /// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer corresponding to + /// [`SceneUnpackingBuffers::bin_unpacking_metadata`] buffer corresponding to /// this index. pub fn uniform_offset(&self) -> u32 { self.get() * size_of::() as u32 } } +/// The index of the metadata corresponding to one uniform allocation job in the +/// [`SceneUnpackingBuffers::uniform_allocation_metadata`] buffer. +#[derive(Clone, Copy, Deref, DerefMut)] +pub struct UniformAllocationMetadataIndex(pub NonMaxU32); + +impl UniformAllocationMetadataIndex { + /// Returns the byte offset within the + /// [`SceneUnpackingBuffers::uniform_allocation_metadata`] buffer + /// corresponding to this index. + pub fn uniform_offset(&self) -> u32 { + self.get() * size_of::() as u32 + } +} + /// The buffers containing all the information that indirect draw commands use /// to draw the scene, for a single mesh class (indexed or non-indexed), for a /// single phase. @@ -1193,21 +1291,13 @@ where /// it to perform the draws. indirect_draw_parameters: UninitBufferVec, - /// The GPU buffer that holds the data used to construct indirect draw - /// parameters for meshes. - /// - /// The GPU mesh preprocessing shader writes to this buffer, and the - /// indirect parameters building shader reads this buffer to construct the - /// indirect draw parameters. - cpu_metadata: RawBufferVec, - /// The GPU buffer that holds data built by the GPU used to construct /// indirect draw parameters for meshes. 
/// /// The GPU mesh preprocessing shader writes to this buffer, and the /// indirect parameters building shader reads this buffer to construct the /// indirect draw parameters. - gpu_metadata: UninitBufferVec, + metadata: PartialBufferVec, /// The GPU buffer that holds the number of indirect draw commands for each /// phase of each view, for meshes. @@ -1218,9 +1308,15 @@ where batch_sets: RawBufferVec, } +/// GPU-side indirect draw parameters for either indexed or non-indexed meshes. +pub trait MeshClassIndirectParameters: Clone + ShaderSize + WriteInto { + /// Either the string "indexed" or "non-indexed". + fn debug_label() -> &'static str; +} + impl MeshClassIndirectParametersBuffers where - IP: Clone + ShaderSize + WriteInto, + IP: MeshClassIndirectParameters, { fn new( allow_copies_from_indirect_parameter_buffers: bool, @@ -1232,8 +1328,10 @@ where MeshClassIndirectParametersBuffers { indirect_draw_parameters: UninitBufferVec::new(indirect_parameter_buffer_usages), - cpu_metadata: RawBufferVec::new(BufferUsages::STORAGE), - gpu_metadata: UninitBufferVec::new(BufferUsages::STORAGE), + metadata: PartialBufferVec::new( + BufferUsages::STORAGE, + format!("{} indirect parameters metadata buffer", IP::debug_label()), + ), batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages), } } @@ -1249,16 +1347,6 @@ where self.indirect_draw_parameters.buffer() } - /// Returns the GPU buffer that holds the CPU-constructed data used to - /// construct indirect draw parameters for meshes. - /// - /// The CPU writes to this buffer, and the indirect parameters building - /// shader reads this buffer to construct the indirect draw parameters. - #[inline] - pub fn cpu_metadata_buffer(&self) -> Option<&Buffer> { - self.cpu_metadata.buffer() - } - /// Returns the GPU buffer that holds the GPU-constructed data used to /// construct indirect draw parameters for meshes. 
/// @@ -1266,8 +1354,8 @@ where /// indirect parameters building shader reads this buffer to construct the /// indirect draw parameters. #[inline] - pub fn gpu_metadata_buffer(&self) -> Option<&Buffer> { - self.gpu_metadata.buffer() + pub fn metadata_buffer(&self) -> Option<&Buffer> { + self.metadata.buffer() } /// Returns the GPU buffer that holds the number of indirect draw commands @@ -1283,24 +1371,20 @@ where /// Reserves space for `count` new batches. /// - /// This allocates in the [`Self::cpu_metadata`], [`Self::gpu_metadata`], - /// and [`Self::indirect_draw_parameters`] buffers. + /// This allocates in the [`Self::metadata`] and + /// [`Self::indirect_draw_parameters`] buffers. fn allocate(&mut self, count: u32) -> u32 { let length = self.indirect_draw_parameters.len(); - self.cpu_metadata.reserve_internal(count as usize); - self.gpu_metadata.add_multiple(count as usize); + self.metadata.push_multiple_init(count as usize); for _ in 0..count { self.indirect_draw_parameters.add(); - self.cpu_metadata - .push(IndirectParametersCpuMetadata::default()); } length as u32 } - /// Sets the [`IndirectParametersCpuMetadata`] for the mesh at the given - /// index. - pub fn set(&mut self, index: u32, value: IndirectParametersCpuMetadata) { - self.cpu_metadata.set(index, value); + /// Sets the [`IndirectParametersMetadata`] for the mesh at the given index. + pub fn set(&mut self, index: u32, value: IndirectParametersMetadata) { + self.metadata.set(index as usize, value); } /// Returns the number of batches corresponding to meshes that are currently @@ -1313,8 +1397,7 @@ where /// Clears out all the buffers in preparation for a new frame. pub fn clear(&mut self) { self.indirect_draw_parameters.clear(); - self.cpu_metadata.clear(); - self.gpu_metadata.clear(); + self.metadata.clear(); self.batch_sets.clear(); } } @@ -2037,6 +2120,10 @@ pub fn batch_and_prepare_binned_render_phase( as u32, // Unused. first_work_item_index: 0, + // Unused. 
+ first_indirect_parameters_index: 0, + // Unused. + first_output_mesh_uniform_index: 0, }); } } @@ -2154,15 +2241,20 @@ where &mut self, batch_set: &RenderMultidrawableBatchSet, data_buffer: &mut UninitBufferVec, - indexed_work_item_buffer: &mut PartialBufferVec, + work_item_buffer: &mut PartialBufferVec, mesh_class_buffers: &mut MeshClassIndirectParametersBuffers, batch_sets: &mut Vec>, ) where IP: Clone + ShaderSize + WriteInto, { + // Note that this function is O(1) and doesn't have any loops over the + // meshes or mesh instances in this batch set. This is very important + // for proper GPU-driven rendering, as we want to have no overhead on + // the CPU for meshes that didn't change from the last frame. + let current_indexed_batch_set_index = self.batch_set_index; let current_output_index = data_buffer.len() as u32; - let first_work_item_index = indexed_work_item_buffer.len() as u32; + let first_work_item_index = work_item_buffer.len() as u32; let indirect_parameters_base = self.indirect_parameters_index; @@ -2180,56 +2272,25 @@ where .representative_entity() .unwrap_or(MainEntity::from(Entity::PLACEHOLDER)); - // Calculate where the mesh uniform (not the mesh input uniform) should - // go for each mesh instance in our bins. This entails performing a - // prefix sum on the number of elements in each bin. First, initialize - // each base output index to zero. - // - // TODO: Eventually, this should be done on GPU with a prefix sum. We - // don't want any per-bin work to be done on CPU for bins that didn't - // change since the last frame. - let cpu_metadata_offset = mesh_class_buffers.cpu_metadata.len() as u32; - for _ in 0..batch_set.bin_count() { - mesh_class_buffers - .cpu_metadata - .push(IndirectParametersCpuMetadata { - // We fill this in later. - base_output_index: 0, - batch_set_index: self.batch_set_index, - }); - } - - // Next, traverse each bin and allocate the position of each mesh - // uniform in it. 
Additionally, reserve space for the mesh instances in - // the buffers. - for bin_index in batch_set.bin_key_to_bin_index.values() { - let bin = batch_set.bin(*bin_index).expect("Bin not present"); + // Calculate where the indirect parameters metadata should go for this + // batch set. The uniform allocation shader will create the actual + // metadata. + let bin_count = batch_set.bin_count(); + let first_metadata_index = + mesh_class_buffers.metadata.push_multiple_uninit(bin_count) as u32; - // Allocate the indirect parameters. - let indirect_parameters_offset = *batch_set - .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .get(bin_index.0) - .unwrap(); - mesh_class_buffers.cpu_metadata.values_mut() - [cpu_metadata_offset as usize + indirect_parameters_offset as usize] - .base_output_index = data_buffer.len() as u32; - - // Reserve space for the appropriate number of entities in the work - // item buffer and data buffer. Also, advance the output index and - // work item count. - let bin_entity_count = bin.entity_to_binned_mesh_instance_index.len(); - indexed_work_item_buffer.push_multiple_uninit(bin_entity_count); - data_buffer.add_multiple(bin_entity_count); - self.work_item_count += bin_entity_count; - } + // Next, reserve space for the mesh uniforms and work items that this + // batch set will need. + let first_output_mesh_uniform_index = + data_buffer.add_multiple(batch_set.instance_count as usize) as u32; + work_item_buffer.push_multiple_uninit(batch_set.instance_count as usize); + self.work_item_count += batch_set.instance_count as usize; // Reserve space for the bins in this batch set in the GPU buffers. 
- let bin_count = batch_set.bin_count(); - mesh_class_buffers.gpu_metadata.add_multiple(bin_count); - mesh_class_buffers + let first_indirect_parameters_index = mesh_class_buffers .indirect_draw_parameters - .add_multiple(bin_count); + .add_multiple(bin_count) as u32; + debug_assert_eq!(first_metadata_index, first_indirect_parameters_index); // Write the information the GPU will need about this batch set. mesh_class_buffers.batch_sets.push(IndirectBatchSet { @@ -2254,6 +2315,8 @@ where batch_count: self.indirect_parameters_index - indirect_parameters_base, index: current_indexed_batch_set_index, first_work_item_index, + first_indirect_parameters_index, + first_output_mesh_uniform_index, }); } } @@ -2323,7 +2386,7 @@ pub fn write_batched_instance_buffers( render_queue: Res, gpu_array_buffer: ResMut>, pipeline_cache: Res, - mut bin_unpacking_buffers: ResMut, + mut bin_unpacking_buffers: ResMut, mut sparse_buffer_update_jobs: ResMut, mut sparse_buffer_update_bind_groups: ResMut, sparse_buffer_update_pipelines: Res, @@ -2339,6 +2402,8 @@ pub fn write_batched_instance_buffers( let render_device = &*render_device; let render_queue = &*render_queue; + let bin_unpacking_buffers = &mut *bin_unpacking_buffers; + ComputeTaskPool::get().scope(|scope| { scope.spawn(async { let _span = bevy_log::info_span!("write_current_input_buffers").entered(); @@ -2401,6 +2466,17 @@ pub fn write_batched_instance_buffers( }); } } + + scope.spawn(async { + bin_unpacking_buffers + .bin_unpacking_metadata + .write_buffer(render_device, render_queue); + }); + scope.spawn(async { + bin_unpacking_buffers + .uniform_allocation_metadata + .write_buffer(render_device, render_queue); + }); }); // Create the resources necessary to perform sparse uploads of the current @@ -2412,10 +2488,6 @@ pub fn write_batched_instance_buffers( &mut sparse_buffer_update_bind_groups, &sparse_buffer_update_pipelines, ); - - bin_unpacking_buffers - .bin_unpacking_metadata - .write_buffer(render_device, render_queue); } 
/// Writes the bin data for each render phase to the GPU. @@ -2425,7 +2497,7 @@ pub fn write_batched_instance_buffers( pub fn write_binned_instance_buffers( mut views: Query<&ExtractedView>, mut view_binned_render_phases: ResMut>, - bin_unpacking_buffers: ResMut, + bin_unpacking_buffers: ResMut, render_device: Res, render_queue: Res, ) where @@ -2461,7 +2533,7 @@ pub fn write_binned_instance_buffers( // combination. let view_phase_bin_unpacking_buffers = bin_unpacking_buffers .view_phase_buffers - .entry(BinUnpackingBuffersKey { + .entry(SceneUnpackingBuffersKey { phase: phase_type_id, view: extracted_view.retained_view_entity, }) @@ -2485,7 +2557,7 @@ pub fn write_binned_instance_buffers( // We use the *representative entity* as the key for the later loop to // find the `BatchSetBinUnpackingMetadata`, because it's a unique value // that can be fetched from the `BinnedRenderPhaseBatchSet`. - let mut representative_entity_to_batch_set_bin_unpacking_metadata = + let mut representative_entity_to_batch_set_scene_unpacking_metadata = MainEntityHashMap::default(); for batch_set in batch_sets { @@ -2498,11 +2570,14 @@ pub fn write_binned_instance_buffers( { // Record the batch set bin unpacking metadata for later passes // to use. 
- representative_entity_to_batch_set_bin_unpacking_metadata.insert( + representative_entity_to_batch_set_scene_unpacking_metadata.insert( main_entity, - BatchSetBinUnpackingMetadata { + BatchSetSceneUnpackingMetadata { base_output_work_item_index: batch_set.first_work_item_index, base_indirect_parameters_index: indirect_parameters_range.start, + batch_set_index: batch_set.index, + first_indirect_parameters_index: batch_set.first_indirect_parameters_index, + first_output_mesh_uniform_index: batch_set.first_output_mesh_uniform_index, }, ); } @@ -2517,8 +2592,8 @@ pub fn write_binned_instance_buffers( let Some(representative_entity) = batch_set.representative_entity() else { continue; }; - let Some(bin_unpacking_metadata) = - representative_entity_to_batch_set_bin_unpacking_metadata + let Some(scene_unpacking_metadata) = + representative_entity_to_batch_set_scene_unpacking_metadata .get(&representative_entity) else { continue; @@ -2532,20 +2607,32 @@ pub fn write_binned_instance_buffers( .write_buffer(&render_device, &render_queue); batch_set .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer + .bin_metadata_buffer + .write_buffer(&render_device, &render_queue); + batch_set + .gpu_buffers + .fan_buffer + .write_buffer(&render_device); + batch_set + .gpu_buffers + .bin_index_to_bin_metadata_index_buffer .write_buffer(&render_device, &render_queue); let ( Some(render_bin_entry_buffer), - Some(bin_index_to_indirect_parameters_offset_buffer), + Some(bin_metadata_buffer), + Some(fan_buffer), + Some(bin_index_to_bin_metadata_index_buffer), ) = ( batch_set .gpu_buffers .render_binned_mesh_instance_buffer .buffer(), + batch_set.gpu_buffers.bin_metadata_buffer.buffer(), + batch_set.gpu_buffers.fan_buffer.buffer(), batch_set .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer + .bin_index_to_bin_metadata_index_buffer .buffer(), ) else { @@ -2556,33 +2643,59 @@ pub fn write_binned_instance_buffers( .gpu_buffers .render_binned_mesh_instance_buffer .len() as 
u32; + let bin_count = batch_set.bin_count() as u32; // Build up the `GpuBinUnpackingMetadata` for this batch set. let gpu_bin_unpacking_metadata_index = bin_unpacking_buffers .bin_unpacking_metadata .push(GpuBinUnpackingMetadata { - base_output_work_item_index: bin_unpacking_metadata.base_output_work_item_index, - base_indirect_parameters_index: bin_unpacking_metadata + base_output_work_item_index: scene_unpacking_metadata + .base_output_work_item_index, + base_indirect_parameters_index: scene_unpacking_metadata .base_indirect_parameters_index, binned_mesh_instance_count, pad: [0; _], }); - let Some(gpu_bin_unpacking_metadata_index) = - NonMaxU32::new(gpu_bin_unpacking_metadata_index as u32) + // Build up the `GpuUniformAllocationMetadata` for this batch set. + let gpu_uniform_allocation_metadata_index = bin_unpacking_buffers + .uniform_allocation_metadata + .push(GpuUniformAllocationMetadata { + batch_set_index: scene_unpacking_metadata.batch_set_index, + bin_count, + first_indirect_parameters_index: scene_unpacking_metadata + .first_indirect_parameters_index, + first_output_mesh_uniform_index: scene_unpacking_metadata + .first_output_mesh_uniform_index, + pad: [0; _], + }); + + let ( + Some(gpu_bin_unpacking_metadata_index), + Some(gpu_uniform_allocation_metadata_index), + ) = ( + NonMaxU32::new(gpu_bin_unpacking_metadata_index as u32), + NonMaxU32::new(gpu_uniform_allocation_metadata_index as u32), + ) else { continue; }; // Create the [`SceneUnpackingJob`].
- let job = BinUnpackingJob { + let job = SceneUnpackingJob { render_binned_mesh_instance_buffer: render_bin_entry_buffer.clone(), - bin_index_to_indirect_parameters_offset_buffer: - bin_index_to_indirect_parameters_offset_buffer.clone(), + bin_metadata_buffer: bin_metadata_buffer.clone(), + fan_buffer: fan_buffer.clone(), + bin_index_to_bin_metadata_index_buffer: bin_index_to_bin_metadata_index_buffer + .clone(), bin_unpacking_metadata_index: BinUnpackingMetadataIndex( gpu_bin_unpacking_metadata_index, ), + uniform_allocation_metadata_index: UniformAllocationMetadataIndex( + gpu_uniform_allocation_metadata_index, + ), mesh_instance_count: binned_mesh_instance_count, + bin_count, }; if batch_set_key.indexed() { @@ -2606,20 +2719,28 @@ pub fn write_binned_instance_buffers( }); } -/// Clears out the [`BinUnpackingBuffers`] in preparation for a new frame. -pub fn clear_bin_unpacking_buffers(mut bin_unpacking_buffers: ResMut) { - bin_unpacking_buffers.bin_unpacking_metadata.clear(); +/// Clears out the [`SceneUnpackingBuffers`] in preparation for a new frame. +pub fn clear_scene_unpacking_buffers(mut scene_unpacking_buffers: ResMut) { + scene_unpacking_buffers.bin_unpacking_metadata.clear(); + scene_unpacking_buffers.uniform_allocation_metadata.clear(); } -/// CPU-side metadata needed to drive the bin unpacking compute shader for a -/// single batch set. -struct BatchSetBinUnpackingMetadata { +/// CPU-side metadata needed to drive the uniform allocation and bin unpacking +/// compute shaders for a single batch set. +struct BatchSetSceneUnpackingMetadata { /// The index of the first [`PreprocessWorkItem`] that the compute shader /// dispatch is to write to. base_output_work_item_index: u32, /// The index of the first GPU indirect parameters command for the batch /// set. base_indirect_parameters_index: u32, + /// The index of the batch set in the `indirect_batch_sets` array. 
+ batch_set_index: u32, + /// The index of the indirect parameters for the first bin in the indirect + /// parameters buffer. + first_indirect_parameters_index: u32, + /// The index of the first `MeshUniform` in the mesh uniforms buffer. + first_output_mesh_uniform_index: u32, } pub fn clear_indirect_parameters_buffers( @@ -2655,35 +2776,20 @@ pub fn write_indirect_parameters_buffers( }); scope.spawn(async { - let _span = bevy_log::info_span!("indexed_cpu_metadata").entered(); + let _span = bevy_log::info_span!("indexed_metadata").entered(); phase_indirect_parameters_buffers .indexed - .cpu_metadata + .metadata .write_buffer(render_device, render_queue); }); scope.spawn(async { - let _span = bevy_log::info_span!("non_indexed_cpu_metadata").entered(); + let _span = bevy_log::info_span!("non_indexed_metadata").entered(); phase_indirect_parameters_buffers .non_indexed - .cpu_metadata + .metadata .write_buffer(render_device, render_queue); }); - scope.spawn(async { - let _span = bevy_log::info_span!("non_indexed_gpu_metadata").entered(); - phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata - .write_buffer(render_device); - }); - scope.spawn(async { - let _span = bevy_log::info_span!("indexed_gpu_metadata").entered(); - phase_indirect_parameters_buffers - .indexed - .gpu_metadata - .write_buffer(render_device); - }); - scope.spawn(async { let _span = bevy_log::info_span!("indexed_batch_sets").entered(); phase_indirect_parameters_buffers diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index d0d00367d5053..f8f36daf5eec0 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -164,12 +164,12 @@ pub trait GetFullBatchData: GetBatchData { query_item: MainEntity, ) -> Option; - /// Writes the [`gpu_preprocessing::IndirectParametersGpuMetadata`] - /// necessary to draw this batch into the given metadata buffer at the given - /// index. 
+ /// Writes the [`gpu_preprocessing::IndirectParametersMetadata`] necessary + /// to draw this batch into the given metadata buffer at the given index. /// /// This is only used if GPU culling is enabled (which requires GPU - /// preprocessing). + /// preprocessing), and only for phase items that don't support GPU uniform + /// allocation yet. /// /// * `indexed` is true if the mesh is indexed or false if it's non-indexed. /// diff --git a/crates/bevy_render/src/occlusion_culling/mesh_preprocess_types.wgsl b/crates/bevy_render/src/occlusion_culling/mesh_preprocess_types.wgsl index abfa658904f49..59b688a6ffb57 100644 --- a/crates/bevy_render/src/occlusion_culling/mesh_preprocess_types.wgsl +++ b/crates/bevy_render/src/occlusion_culling/mesh_preprocess_types.wgsl @@ -51,18 +51,50 @@ struct IndirectParametersNonIndexed { first_instance: u32, } -struct IndirectParametersCpuMetadata { +// Information needed to construct indirect draw parameters for a single draw. +// +// Note that this is per-*draw* (i.e. per-mesh), not per-mesh-instance or +// per-batch-set. A single multi-draw indirect call can perform multiple draws. +// +// Typically, the uniform allocation and mesh preprocessing phases fill in this +// structure. However, parts of it may be filled in on the CPU for objects that +// aren't multidrawn. +struct IndirectParametersMetadata { + // The index of the first `MeshUniform` for this draw in the mesh uniform + // buffer. + // + // `MeshUniform`s for all instances are stored consecutively. + // + // This is filled in in the `allocate_uniforms` shader, or on the CPU when + // multidraw isn't in use. base_output_index: u32, + + // The index of this batch set in the `IndirectBatchSet` array. + // + // This is filled in in the `allocate_uniforms` shader, or on the CPU when + // multidraw isn't in use. batch_set_index: u32, -} -struct IndirectParametersGpuMetadata { + // The index of the mesh in the `MeshInput` buffer. 
+ // + // The mesh preprocessing shader fills this in. mesh_index: u32, + #ifdef WRITE_INDIRECT_PARAMETERS_METADATA + // The number of instances that were visible last frame (if occlusion + // culling is in use) or that were visible at all (if occlusion culling + // isn't in use). early_instance_count: atomic, + // The number of instances that were visible this frame if occlusion culling + // is in use. late_instance_count: atomic, #else // WRITE_INDIRECT_PARAMETERS_METADATA + // The number of instances that were visible last frame (if occlusion + // culling is in use) or that were visible at all (if occlusion culling + // isn't in use). early_instance_count: u32, + // The number of instances that were visible this frame if occlusion culling + // is in use. late_instance_count: u32, #endif // WRITE_INDIRECT_PARAMETERS_METADATA } @@ -81,4 +113,29 @@ struct PreprocessWorkItem { // indirect mode, the index of the `IndirectParameters` in // `indirect_parameters` that we write to. output_or_indirect_parameters_index: u32, -} \ No newline at end of file +} + +// Information about each bin in a batch set. +// +// This is maintained by the CPU and cached for bins that don't change from +// frame to frame. +struct BinMetadata { + // The index of the indirect parameters for this bin, relative to the first + // indirect parameter index for the batch set. + // + // That is, the final indirect parameters index for this bin is + // `first_indirect_parameters_index` in the `UniformAllocationMetadata` plus + // this value. + indirect_parameters_offset: u32, + + // The index of the bin that this metadata corresponds to. + // + // The GPU doesn't use this, but the CPU does in order to perform the + // reverse mapping from bin metadata index back to the bin. We could store + // this in a non-GPU-accessible buffer, but I figured the extra complexity + // wasn't worth it. + bin_index: u32, + + // The number of mesh instances in this bin. 
+ instance_count: u32, +}; diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index 0d819dab54f0a..b422762ceb307 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -44,10 +44,10 @@ pub use rangefinder::*; use wgpu::{BufferUsages, Features}; use crate::batching::gpu_preprocessing::{ - GpuPreprocessingMode, GpuPreprocessingSupport, PhaseBatchedInstanceBuffers, + GpuBinMetadata, GpuPreprocessingMode, GpuPreprocessingSupport, PhaseBatchedInstanceBuffers, PhaseIndirectParametersBuffers, }; -use crate::render_resource::RawBufferVec; +use crate::render_resource::{RawBufferVec, UninitBufferVec}; use crate::renderer::RenderDevice; use crate::sync_world::{MainEntity, MainEntityHashMap}; use crate::view::{ExtractedView, RetainedViewEntity}; @@ -78,7 +78,6 @@ use core::{ hash::Hash, iter, marker::PhantomData, - mem, ops::{Range, RangeBounds}, }; use smallvec::SmallVec; @@ -207,8 +206,7 @@ impl RenderMultidrawableBin { /// The index of a mesh instance in the /// [`RenderMultidrawableBatchSetGpuBuffers::render_binned_mesh_instance_buffer`] -/// and [`RenderMultidrawableBatchSet::render_binned_mesh_instances_cpu`] -/// arrays. +/// array. /// /// These two arrays are parallel and always have the same length. /// @@ -218,6 +216,9 @@ impl RenderMultidrawableBin { #[derive(Clone, Copy, Debug, Deref, DerefMut)] pub(crate) struct RenderBinnedMeshInstanceIndex(pub(crate) u32); +#[derive(Clone, Copy, Debug, Deref, DerefMut)] +pub(crate) struct RenderBinMetadataIndex(pub(crate) u32); + /// The GPU buffers that go along with [`RenderMultidrawableBatchSet`]. /// /// The bin unpacking shader uses these in order to produce @@ -230,31 +231,20 @@ pub struct RenderMultidrawableBatchSetGpuBuffers { /// (`RenderBinnedMeshInstanceIndex`) to its input uniform index /// ([`InputUniformIndex`]) and bin index (`RenderBinIndex`). 
pub render_binned_mesh_instance_buffer: RawBufferVec, - /// A mapping from each `RenderBinnedMeshInstanceIndex` to the offset of its - /// indirect draw parameters. - pub bin_index_to_indirect_parameters_offset_buffer: RawBufferVec, -} -/// Information about each binned mesh instance that the -/// [`RenderMultidrawableBatchSet`] keeps on CPU. -#[derive(Clone, Copy)] -pub(crate) struct CpuRenderBinnedMeshInstance { - /// The entity associated with this mesh instance. - pub(crate) main_entity: MainEntity, + /// A mapping from each `RenderBinMetadataIndex` to the offset of its indirect draw + /// parameters. + pub bin_metadata_buffer: RawBufferVec, - /// The index of the bin that the entity is in. + /// A mapping from each `RenderBinIndex` to the associated bin metadata + /// index. /// - /// Note that bin indices are stable from frame to frame. - bin_index: RenderBinIndex, -} + /// This array isn't necessarily tightly packed (i.e. it can have holes). + pub bin_index_to_bin_metadata_index_buffer: RawBufferVec, -impl Default for CpuRenderBinnedMeshInstance { - fn default() -> Self { - CpuRenderBinnedMeshInstance { - main_entity: MainEntity::from(Entity::PLACEHOLDER), - bin_index: RenderBinIndex::default(), - } - } + /// A temporary buffer used to store intermediate sums during the uniform + /// allocation phase. 
+ pub fan_buffer: UninitBufferVec, } impl RenderMultidrawableBatchSetGpuBuffers { @@ -262,14 +252,19 @@ impl RenderMultidrawableBatchSetGpuBuffers { fn new() -> RenderMultidrawableBatchSetGpuBuffers { let mut render_bin_entry_buffer = RawBufferVec::new(BufferUsages::STORAGE); render_bin_entry_buffer.set_label(Some("render bin entry buffer")); - let mut bin_index_to_indirect_parameters_offset_buffer = - RawBufferVec::new(BufferUsages::STORAGE); - bin_index_to_indirect_parameters_offset_buffer - .set_label(Some("bin index to indirect parameters offset buffer")); + let mut bin_metadata_buffer = RawBufferVec::new(BufferUsages::STORAGE); + bin_metadata_buffer.set_label(Some("bin metadata buffer")); + let mut bin_index_to_bin_metadata_index_buffer = RawBufferVec::new(BufferUsages::STORAGE); + bin_index_to_bin_metadata_index_buffer + .set_label(Some("bin-index-to-bin-metadata-index buffer")); + let mut fan_buffer = UninitBufferVec::new(BufferUsages::STORAGE); + fan_buffer.set_label(Some("fan buffer")); RenderMultidrawableBatchSetGpuBuffers { render_binned_mesh_instance_buffer: render_bin_entry_buffer, - bin_index_to_indirect_parameters_offset_buffer, + bin_metadata_buffer, + bin_index_to_bin_metadata_index_buffer, + fan_buffer, } } @@ -277,7 +272,6 @@ impl RenderMultidrawableBatchSetGpuBuffers { fn insert( &mut self, bin: &mut RenderMultidrawableBin, - cpu_binned_mesh_instance_buffer: &mut Vec, main_entity: MainEntity, input_uniform_index: InputUniformIndex, bin_index: RenderBinIndex, @@ -300,28 +294,21 @@ impl RenderMultidrawableBatchSetGpuBuffers { .push(GpuRenderBinnedMeshInstance::default()) as u32, ); - cpu_binned_mesh_instance_buffer.push(CpuRenderBinnedMeshInstance::default()); vacant_entry.insert(render_bin_buffer_index); render_bin_buffer_index } }; - // Place the entry in the instance buffer at the proper spot. Also, save - // the entity and bin index in the CPU-side array. + // Place the entry in the instance buffer at the proper spot. 
self.render_binned_mesh_instance_buffer.values_mut() [render_binned_mesh_instance_buffer_index.0 as usize] = gpu_render_bin_entry; - cpu_binned_mesh_instance_buffer[render_binned_mesh_instance_buffer_index.0 as usize] = - CpuRenderBinnedMeshInstance { - main_entity, - bin_index, - }; - // The GPU-side `render_binned_mesh_instance_buffer` and the CPU-side - // `cpu_binned_mesh_instance_buffer` are parallel arrays and must have - // the same length, so assert that in debug mode. + let bin_metadata_index = + self.bin_index_to_bin_metadata_index_buffer.values()[bin_index.0 as usize]; + self.bin_metadata_buffer.values_mut()[bin_metadata_index as usize].instance_count += 1; debug_assert_eq!( - self.render_binned_mesh_instance_buffer.len(), - cpu_binned_mesh_instance_buffer.len() + self.bin_metadata_buffer.values()[bin_metadata_index as usize].instance_count as usize, + bin.entity_to_binned_mesh_instance_index.len() ); } @@ -339,9 +326,9 @@ impl RenderMultidrawableBatchSetGpuBuffers { fn remove( &mut self, bin: &mut RenderMultidrawableBin, - cpu_binned_mesh_instance_buffer: &mut Vec, + bin_index: RenderBinIndex, entity_to_remove: MainEntity, - ) -> Option<(RenderBinnedMeshInstanceIndex, CpuRenderBinnedMeshInstance)> { + ) -> Option<(RenderBinnedMeshInstanceIndex, GpuRenderBinnedMeshInstance)> { // Remove the entity from the `entity_to_binned_mesh_instance_index` // map. 
let old_index = bin @@ -349,29 +336,29 @@ impl RenderMultidrawableBatchSetGpuBuffers { .remove(&entity_to_remove) .expect("Entity not in bin"); + let bin_metadata_index = + self.bin_index_to_bin_metadata_index_buffer.values()[bin_index.0 as usize]; + self.bin_metadata_buffer.values_mut()[bin_metadata_index as usize].instance_count -= 1; + debug_assert_eq!( + self.bin_metadata_buffer.values()[bin_metadata_index as usize].instance_count as usize, + bin.entity_to_binned_mesh_instance_index.len() + ); + // Remove the entity from the reverse // `render_binned_mesh_instance_buffer` list, as well // as the parallel `render_binned_mesh_instance_buffer`. Because binned // mesh instance indices must be contiguous, this requires use of // `swap_remove`. - cpu_binned_mesh_instance_buffer.swap_remove(old_index.0 as usize); self.render_binned_mesh_instance_buffer .swap_remove(old_index.0 as usize); - // Both `render_binned_mesh_instance_buffer` and - // `cpu_binned_mesh_instance_buffer` must be parallel arrays, so assert - // that they have the same length. - debug_assert_eq!( - cpu_binned_mesh_instance_buffer.len(), - self.render_binned_mesh_instance_buffer.len() - ); - // If an entity was displaced (i.e. has a new binned mesh instance index // now), then return that to the caller so that they can perform // whatever bookkeeping is necessary. - cpu_binned_mesh_instance_buffer + self.render_binned_mesh_instance_buffer + .values() .get(old_index.0 as usize) - .map(|entity_indices| (old_index, *entity_indices)) + .map(|gpu_render_binned_mesh_instance| (old_index, *gpu_render_binned_mesh_instance)) } } @@ -383,6 +370,9 @@ impl RenderMultidrawableBatchSetGpuBuffers { #[repr(transparent)] pub(crate) struct RenderBinIndex(pub(crate) u32); +/// The number of threads per workgroup in the `allocate_uniforms` shader. +pub const UNIFORM_ALLOCATION_WORKGROUP_SIZE: u32 = 256; + /// A collection of mesh instances that can be drawn together, sorted into bins. 
/// /// This data structure stores a list of entity indices corresponding to mesh @@ -393,50 +383,61 @@ pub(crate) struct RenderBinIndex(pub(crate) u32); /// of the data structure is as follows: /// /// ```text -/// ┌─ -/// │ ─────┬──────────────┬───── -/// │ │ Mesh Inst. 2 │ -/// │ Binned Mesh ... ├──────────────┤ ... -/// │ Instances │ Entity 8 │ -/// │ ─────┴───┬──────────┴───── -/// │ │ -/// │ │ ┌───────────────────────────────┐ -/// │ │ │ │ -/// │ ▼ ▼ │ -/// │ ┌───────┬───────┬───────┬───── │ -/// │ Bins │ Bin 0 │ Bin 1 │ Bin 2 │ ... │ -/// │ └───────┴───┬───┴───────┴───── │ -/// │ │ │ -/// CPU │ │ │ -/// │ Entity-to- │ ┌──────────┬──────────┬───── │ -/// │ Binned-Mesh- └─►│ Entity 3 │ Entity 8 │ ... │ -/// │ Instance- └──────────┴──────┬───┴───── │ -/// │ Index │ │ -/// │ │ │ -/// │ │ │ -/// │ │ │ -/// │ │ │ -/// │ Indirect- ┌───────┬───────┬───────┬───── │ │ -/// │ Parameters- │ IPO 0 │ IPO 1 │ IPO 2 │ ... │ │ -/// │ Offset-to- └───────┴───────┴───────┴───── │ │ -/// │ Bin-Index ▲ │ │ -/// │ │ │ │ -/// └─ │ │ │ -/// ┌───────┘ │ │ -/// ┌─ │ │ │ -/// │ │ │ │ -/// │ Bin-to- ▼ │ │ -/// │ Indirect- ┌───────┬───────┬───────┬───── │ │ -/// │ Parameters- │ Bin 0 │ Bin 1 │ Bin 2 │ ... │ │ -/// │ Offset └───────┴───────┴───────┴───── │ │ -/// GPU │ Buffer │ │ -/// │ │ │ -/// │ ▼ │ -/// │ Binned Mesh ─────┬──────────────┬──────────────┬───── │ -/// │ Instance ... │ Mesh Inst. 1 │ Mesh Inst. 2 │ ... │ -/// │ Buffer ─────┴──────────────┴───────────┬──┴───── │ -/// │ │ │ -/// └─ └────────────┘ +/// Bin Key to +/// Bin Index +/// +/// │ ... │ +/// ├───────────┤ +/// │ Bin Key A │ Bins +/// ├───────────┤ ┌────────────┐ +/// │ Bin Key B ├───┬─────────────────────────────────►│ Bin 1 │◄──────────────────┐ +/// ├───────────┤ │ └─┬──────────┤ │ +/// │ Bin Key C │ │ Mesh Input │ Entity 3 │ │ +/// ├───────────┤ │ Uniform to ├──────────┤ │ +/// │ ... │ │ Entity ┌───────►│ Entity 7 ├────────┐ │ +/// │ │ ├──────────┤ │ │ +/// │ │ ... 
│ │ │ Entity 9 │ │ │ +/// │ ├──────────┤ │ ┌─┴──────────┤ │ │ +/// │ │ Entity 4 │ │ │ Bin 2 │ │ │ +/// │ ├──────────┤ │ └─┬──────────┤ │ │ +/// │ ┌──►│ Entity 7 ├────────┘ │ ... │ │ │ +/// │ │ ├──────────┤ │ │ +/// │ │ │ Entity 1 │ │ │ +/// │ │ ├──────────┤ │ │ +/// │ │ │ Entity 3 │ │ │ +/// │ │ ├──────────┤ │ │ +/// │ │ │ ... │ │ │ +/// CPU ▲ │ │ │ │ +/// ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄│┄┄┄│┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄│┄┄┄┄┄┄┄┄┄┄│┄ +/// GPU ▼ │ │ │ │ +/// │ │ Binned Mesh │ │ +/// │ │ Instances │ │ +/// │ │ │ │ +/// │ │ │ ... │ │ │ +/// │ │ ┌─┴─────────────────┤ │ │ +/// │ │ │ Mesh Instance 4 │◄──┘ │ +/// │ │ └─┬─────────────────┤ │ +/// │ └──────────────────────────────┤ Input Uniform 5 │ │ +/// │ ├─────────────────┤ │ +/// │ │ Bin 1 ├──────────────┤ +/// │ ┌─┴─────────────────┤ │ +/// │ Bin Index to Bin │ ... │ │ +/// │ Metadata Index │ +/// │ │ +/// │ │ ... │ │ +/// │ ├──────────────┤ Bin Metadata │ +/// │ │ Bin Metadata │ │ +/// │ │ Index 2 │ │ ... │ │ +/// │ ├──────────────┤ ┌─┴────────────────────────┤ │ +/// └──────────►│ Bin Metadata ├──────►│ Bin 1 ├─────┘ +/// │ Index 9 │ └─┬────────────────────────┤ +/// ├──────────────┤ │ Indirect Params Offset │ +/// │ Bin Metadata │ ├────────────────────────┤ +/// │ Index 4 │ │ Instance Count │ +/// ├──────────────┤ ┌─┴────────────────────────┤ +/// │ ... │ │ Bin 12 │ +/// └─┬────────────────────────┤ +/// │ ... │ /// ``` pub struct RenderMultidrawableBatchSet where @@ -454,9 +455,7 @@ where /// This list isn't tightly packed. bins: Vec>, - /// A list of unused [`RenderBinIndex`]es waiting to be reused. - /// - /// Each [`RenderBinIndex`] in this list corresponds to an empty bin. + /// A list of free `RenderBinIndex` values. bin_free_list: Vec, /// A mapping from the indirect parameters offset to the index of each bin. @@ -466,8 +465,12 @@ where /// set. indirect_parameters_offset_to_bin_index: Vec, - /// Information about each binned mesh instance kept on CPU. 
- pub(crate) render_binned_mesh_instances_cpu: Vec, + /// A reverse mapping from the index of the `MeshInputUniform` back to the + /// associated [`MainEntity`]. + mesh_input_uniform_index_to_entity: Vec, + + /// The total number of mesh instances in the batch. + pub(crate) instance_count: u32, } impl RenderMultidrawableBatchSet @@ -483,7 +486,8 @@ where bins: vec![], bin_free_list: vec![], indirect_parameters_offset_to_bin_index: vec![], - render_binned_mesh_instances_cpu: vec![], + mesh_input_uniform_index_to_entity: vec![], + instance_count: 0, } } @@ -513,6 +517,16 @@ where main_entity: MainEntity, input_uniform_index: InputUniformIndex, ) { + if (input_uniform_index.0 as usize) >= self.mesh_input_uniform_index_to_entity.len() { + self.mesh_input_uniform_index_to_entity + .extend(iter::repeat_n( + MainEntity::from(Entity::PLACEHOLDER), + input_uniform_index.0 as usize - self.mesh_input_uniform_index_to_entity.len() + + 1, + )); + } + self.mesh_input_uniform_index_to_entity[input_uniform_index.0 as usize] = main_entity; + let bin_index; match self.bin_key_to_bin_index.entry(bin_key) { Entry::Occupied(occupied_entry) => { @@ -535,19 +549,23 @@ where vacant_entry.insert(bin_index); // Grab an indirect parameters offset. - self.allocate_indirect_parameters(bin_index); + self.allocate_bin_metadata(bin_index); + + // Add space to the fan buffer if necessary. + if bin_index.0.div_ceil(UNIFORM_ALLOCATION_WORKGROUP_SIZE) + == self.gpu_buffers.fan_buffer.len() as u32 + { + self.gpu_buffers.fan_buffer.add(); + } } } + self.instance_count += 1; + // Update the GPU buffers. let bin = self.bins[bin_index.0 as usize].as_mut().unwrap(); - self.gpu_buffers.insert( - bin, - &mut self.render_binned_mesh_instances_cpu, - main_entity, - input_uniform_index, - bin_index, - ); + self.gpu_buffers + .insert(bin, main_entity, input_uniform_index, bin_index); } /// Removes the given entity from the bin with the given key. 
@@ -561,47 +579,53 @@ where .expect("Bin key not present"); let bin = self.bins[bin_index.0 as usize].as_mut().unwrap(); - let maybe_displaced_entity_indices = - self.gpu_buffers - .remove(bin, &mut self.render_binned_mesh_instances_cpu, main_entity); - if let Some((old_render_bin_buffer_index, displaced_entity_indices)) = - maybe_displaced_entity_indices + let maybe_displaced_binned_mesh_instance = + self.gpu_buffers.remove(bin, bin_index, main_entity); + if let Some((old_binned_mesh_instance_index, displaced_binned_mesh_instance)) = + maybe_displaced_binned_mesh_instance { - self.bins[displaced_entity_indices.bin_index.0 as usize] + let displaced_entity = self.mesh_input_uniform_index_to_entity + [displaced_binned_mesh_instance.input_uniform_index as usize]; + self.bins[displaced_binned_mesh_instance.bin_index as usize] .as_mut() .expect("Bin not present") .entity_to_binned_mesh_instance_index - .insert( - displaced_entity_indices.main_entity, - old_render_bin_buffer_index, - ); + .insert(displaced_entity, old_binned_mesh_instance_index); } + self.instance_count -= 1; + self.remove_bin_if_empty(bin_key, bin_index); } /// Allocates an indirect parameters slot for a new bin. - fn allocate_indirect_parameters(&mut self, bin_index: RenderBinIndex) { + fn allocate_bin_metadata(&mut self, bin_index: RenderBinIndex) { // Indirect parameters must be tightly packed, so we always add one to // the end of the list. Record the bin index for the new indirect // parameters offset. let indirect_parameters_offset = self.indirect_parameters_offset_to_bin_index.len() as u32; self.indirect_parameters_offset_to_bin_index.push(bin_index); - // Update the reverse mapping from bin index to indirect parameters offset. 
- if bin_index.0 as usize + let bin_metadata_index = + RenderBinMetadataIndex(self.gpu_buffers.bin_metadata_buffer.push(GpuBinMetadata { + indirect_parameters_offset, + instance_count: 0, + bin_index: bin_index.0, + }) as u32); + + if bin_index.0 == self .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .len() + .bin_index_to_bin_metadata_index_buffer + .len() as u32 { self.gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .push(indirect_parameters_offset); + .bin_index_to_bin_metadata_index_buffer + .push(bin_metadata_index.0); } else { self.gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .values_mut()[bin_index.0 as usize] = indirect_parameters_offset; + .bin_index_to_bin_metadata_index_buffer + .set(bin_index.0, bin_metadata_index.0); } } @@ -618,19 +642,38 @@ where self.bin_free_list.push(bin_index); self.bins[bin_index.0 as usize] = None; + // Remove the bin index from the metadata list. + let old_bin_metadata_index = RenderBinMetadataIndex( + self.gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .values()[bin_index.0 as usize], + ); + self.gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .values_mut()[bin_index.0 as usize] = u32::MAX; + + // Remove the metadata. + let old_bin_metadata = self + .gpu_buffers + .bin_metadata_buffer + .swap_remove(old_bin_metadata_index.0 as usize); + debug_assert_eq!(old_bin_metadata.bin_index, bin_index.0); + if let Some(displaced_metadata) = self + .gpu_buffers + .bin_metadata_buffer + .get(old_bin_metadata_index.0) + { + self.gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .set(displaced_metadata.bin_index, old_bin_metadata_index.0); + } + // Remove the indirect parameters offset corresponding to the bin. Note // that indirect parameters must be tightly packed. Thus we must use // `swap_remove`. 
- let indirect_parameters_offset = mem::replace( - &mut self - .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .values_mut()[bin_index.0 as usize], - u32::MAX, - ); let removed_bin_index = self .indirect_parameters_offset_to_bin_index - .swap_remove(indirect_parameters_offset as usize); + .swap_remove(old_bin_metadata.indirect_parameters_offset as usize); debug_assert_eq!(bin_index, removed_bin_index); // `swap_remove` may have changed the indirect parameter index of some @@ -641,18 +684,24 @@ where // reflect the new offset of that displaced bin. if let Some(displaced_bin_index) = self .indirect_parameters_offset_to_bin_index - .get(indirect_parameters_offset as usize) + .get(old_bin_metadata.indirect_parameters_offset as usize) { - self.gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .set(displaced_bin_index.0, indirect_parameters_offset); + let displaced_bin_metadata_index = self + .gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .values()[displaced_bin_index.0 as usize]; + self.gpu_buffers.bin_metadata_buffer.values_mut() + [displaced_bin_metadata_index as usize] + .indirect_parameters_offset = old_bin_metadata.indirect_parameters_offset; } } + /// Returns true if all bins are empty: i.e. there are no mesh instances. fn is_empty(&self) -> bool { self.bin_free_list.len() == self.bins.len() } + /// Returns the number of bins (i.e. draws) in this batch set. pub(crate) fn bin_count(&self) -> usize { self.bin_key_to_bin_index.len() } @@ -773,6 +822,8 @@ pub struct BinnedRenderPhaseBatchSet { /// The index of the first preprocessing work item for this batch set in the /// preprocessing work item buffer. pub(crate) first_work_item_index: u32, + pub(crate) first_indirect_parameters_index: u32, + pub(crate) first_output_mesh_uniform_index: u32, } impl BinnedRenderPhaseBatchSets { @@ -937,7 +988,7 @@ where /// This field is ignored if GPU preprocessing isn't in use, such as (currently) /// in the case of 2D meshes. 
In that case, it can be safely set to /// [`core::default::Default::default`]. -#[derive(Clone, Copy, PartialEq, Default, Deref, DerefMut, Debug, Pod, Zeroable)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, Default, Deref, DerefMut, Debug, Pod, Zeroable)] #[repr(transparent)] pub struct InputUniformIndex(pub u32); @@ -2231,6 +2282,7 @@ impl RenderBin { #[cfg(test)] mod tests { + use bevy_platform::collections::HashMap; use proptest_derive::Arbitrary; use crate::render_phase::GpuRenderBinnedMeshInstance; @@ -2365,6 +2417,8 @@ mod tests { bin_index_to_entities: Vec, /// A mapping from each entity ID to the binned mesh instance data. entity_to_binned_mesh_instance: MainEntityHashMap, + /// A mapping from each input uniform index back to the entity. + input_uniform_index_to_entity: HashMap, } impl ExpectedMultidrawableBatchSet { @@ -2383,6 +2437,8 @@ mod tests { }, ); self.bin_index_to_entities[bin_index.0 as usize].insert(entity); + self.input_uniform_index_to_entity + .insert(input_uniform_index, entity); } /// Removes an entity from the control structure and returns its instance. @@ -2391,6 +2447,10 @@ mod tests { self.entity_to_binned_mesh_instance.remove(&entity).unwrap(); self.bin_index_to_entities[render_binned_mesh_instance.bin_index as usize] .remove(&entity); + self.input_uniform_index_to_entity + .remove(&InputUniformIndex( + render_binned_mesh_instance.input_uniform_index, + )); render_binned_mesh_instance } } @@ -2415,6 +2475,7 @@ mod tests { let mut expected = ExpectedMultidrawableBatchSet { bin_index_to_entities: vec![MainEntityHashSet::default(); 1024], entity_to_binned_mesh_instance: MainEntityHashMap::default(), + input_uniform_index_to_entity: HashMap::default(), }; // Process each operation, skipping invalid ones. @@ -2430,10 +2491,15 @@ mod tests { )); let input_uniform_index = InputUniformIndex(input_uniform_index); - // Skip this operation if it's trying to add an entity that's already binned. 
+ // Skip this operation if it's trying to add an + // entity that's already binned or duplicating + // an input uniform index. if expected .entity_to_binned_mesh_instance .contains_key(&entity) + || expected + .input_uniform_index_to_entity + .contains_key(&input_uniform_index) { continue; } @@ -2504,7 +2570,9 @@ mod tests { verify_render_binned_mesh_instance_buffer(batch_set, &expected.bin_index_to_entities); // Verify that no indirect parameter offsets overlap. - verify_indirect_parameters_offsets(batch_set); + verify_bin_metadata(batch_set); + + verify_instance_count(batch_set); } /// Verifies that every entity is present in the multidrawable batch @@ -2572,13 +2640,12 @@ mod tests { .iter() .enumerate() { - let binned_mesh_instance_cpu = - &batch_set.render_binned_mesh_instances_cpu[render_bin_buffer_index]; + let mapped_entity = batch_set.mesh_input_uniform_index_to_entity + [gpu_render_binned_mesh_instance.input_uniform_index as usize]; // Make sure that the `GpuRenderBinnedMeshInstance::bin_index` // matches the `CpuRenderBinnedMeshInstance::bin_index`. let gpu_render_bin_index = gpu_render_binned_mesh_instance.bin_index; - assert_eq!(gpu_render_bin_index, *binned_mesh_instance_cpu.bin_index); let render_bin = batch_set.bins[gpu_render_bin_index as usize] .as_ref() @@ -2587,7 +2654,7 @@ mod tests { // Make sure that the entity in the // `RenderMultidrawableBin::entity_to_binned_mesh_instance_index` // table matches the entity in the - // `CpuRenderBinnedMeshInstance`. + // `mesh_input_uniform_index_to_entity`. let Some(entity) = render_bin .entity_to_binned_mesh_instance_index .iter() @@ -2604,7 +2671,7 @@ mod tests { render_bin_buffer_index, gpu_render_bin_index ); }; - assert_eq!(binned_mesh_instance_cpu.main_entity, *entity); + assert_eq!(mapped_entity, *entity); // Make sure that the bin with the appropriate bin key should // actually contain the entity. 
@@ -2629,40 +2696,71 @@ mod tests { } } - fn verify_indirect_parameters_offsets( - batch_set: &RenderMultidrawableBatchSet, - ) { - for (render_bin_index, indirect_parameters_offset) in batch_set - .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .values() - .iter() - .enumerate() - { - if *indirect_parameters_offset == u32::MAX { + /// Ensures that the `BinMetadata` and + /// indirect-parameters-offset-to-bin-index maps are correct. + fn verify_bin_metadata(batch_set: &RenderMultidrawableBatchSet) { + for bin_metadata in batch_set.gpu_buffers.bin_metadata_buffer.values().iter() { + if bin_metadata.indirect_parameters_offset == u32::MAX { continue; } assert_eq!( batch_set.indirect_parameters_offset_to_bin_index - [*indirect_parameters_offset as usize], - RenderBinIndex(render_bin_index as u32) + [bin_metadata.indirect_parameters_offset as usize], + RenderBinIndex(bin_metadata.bin_index) ); } + for (indirect_parameters_offset, render_bin_index) in batch_set .indirect_parameters_offset_to_bin_index .iter() .enumerate() { assert!(batch_set.bins[render_bin_index.0 as usize].is_some()); + + let gpu_metadata_index = *batch_set + .gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .get(render_bin_index.0) + .unwrap(); + + let gpu_metadata = batch_set + .gpu_buffers + .bin_metadata_buffer + .get(gpu_metadata_index) + .unwrap(); + + // Verify that the bidirectional indirect parameters offset ↔ + // bin index mapping is correct. assert_eq!( - *batch_set - .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .get(render_bin_index.0) - .unwrap(), + gpu_metadata.indirect_parameters_offset, indirect_parameters_offset as u32 ); + // Verify that the bin indices match up. + assert_eq!(gpu_metadata.bin_index, render_bin_index.0); + // Verify that the instance count for that bin matches up. 
+ assert_eq!( + gpu_metadata.instance_count, + batch_set.bins[render_bin_index.0 as usize] + .as_ref() + .unwrap() + .entity_to_binned_mesh_instance_index + .len() as u32 + ); + } + } + + /// Ensures that the recorded `instance_count` matches the actual number + /// of instances. + fn verify_instance_count(batch_set: &RenderMultidrawableBatchSet) { + let mut total_instance_count = 0; + for bin in batch_set + .bins + .iter() + .filter_map(|maybe_bin| maybe_bin.as_ref()) + { + total_instance_count += bin.entity_to_binned_mesh_instance_index.len() as u32; } + assert_eq!(batch_set.instance_count, total_instance_count); } } } diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index ed9e1a39db3a3..dcb2d734b2ee6 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -952,6 +952,24 @@ where self.uninit_element_count += count; first_index } + + /// Sets the value at the given index to the given value. + pub fn set(&mut self, index: usize, value: T) { + self.values[index] = value; + } +} + +impl PartialBufferVec +where + T: NoUninit + Default, +{ + /// Pushes `count` copies of `T::default` to the array. 
+ pub fn push_multiple_init(&mut self, count: usize) -> usize { + debug_assert_eq!(self.uninit_element_count, 0); + let index = self.values.len(); + self.values.extend(iter::repeat_n(T::default(), count)); + index + } } /// Error returned when `write_buffer_range` fails diff --git a/crates/bevy_sprite_render/src/mesh2d/mesh.rs b/crates/bevy_sprite_render/src/mesh2d/mesh.rs index 7d1e5bbfdeecf..0ba52828959a9 100644 --- a/crates/bevy_sprite_render/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite_render/src/mesh2d/mesh.rs @@ -28,7 +28,7 @@ use bevy_render::prelude::Msaa; use bevy_render::RenderSystems::PrepareAssets; use bevy_render::{ batching::{ - gpu_preprocessing::IndirectParametersCpuMetadata, + gpu_preprocessing::IndirectParametersMetadata, no_gpu_preprocessing::{ self, batch_and_prepare_binned_render_phase, batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, BatchedInstanceBuffer, @@ -427,12 +427,16 @@ impl GetFullBatchData for Mesh2dPipeline { // Note that `IndirectParameters` covers both of these structures, even // though they actually have distinct layouts. See the comment above that // type for more information. - let indirect_parameters = IndirectParametersCpuMetadata { + let indirect_parameters = IndirectParametersMetadata { base_output_index, batch_set_index: match batch_set_index { None => !0, Some(batch_set_index) => u32::from(batch_set_index), }, + // These fields are unused in the 2D pipeline. 
+ mesh_index: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed { diff --git a/examples/shader_advanced/custom_render_phase.rs b/examples/shader_advanced/custom_render_phase.rs index 49a5ec1e31fd9..f3605e4bcf36e 100644 --- a/examples/shader_advanced/custom_render_phase.rs +++ b/examples/shader_advanced/custom_render_phase.rs @@ -34,7 +34,7 @@ use bevy::{ render::{ batching::{ gpu_preprocessing::{ - batch_and_prepare_sorted_render_phase, IndirectParametersCpuMetadata, + batch_and_prepare_sorted_render_phase, IndirectParametersMetadata, UntypedPhaseIndirectParametersBuffers, }, GetBatchData, GetFullBatchData, @@ -464,12 +464,16 @@ impl GetFullBatchData for StencilPipeline { // Note that `IndirectParameters` covers both of these structures, even // though they actually have distinct layouts. See the comment above that // type for more information. - let indirect_parameters = IndirectParametersCpuMetadata { + let indirect_parameters = IndirectParametersMetadata { base_output_index, batch_set_index: match batch_set_index { None => !0, Some(batch_set_index) => u32::from(batch_set_index), }, + // These fields are filled in by the GPU: + mesh_index: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed {