diff --git a/crates/bevy_pbr/src/render/allocate_uniforms.wgsl b/crates/bevy_pbr/src/render/allocate_uniforms.wgsl new file mode 100644 index 0000000000000..775926f070a46 --- /dev/null +++ b/crates/bevy_pbr/src/render/allocate_uniforms.wgsl @@ -0,0 +1,223 @@ +// A compute shader that allocates `MeshUniform`s. +// +// This shader runs before mesh preprocessing in order to determine the +// positions of `MeshUniform`s. Unlike `MeshInputUniform`s, which are scattered +// throughout the buffer, `MeshUniform`s are indexed by instance ID, and so we +// must place instances of the same mesh together in the buffer. One dispatch +// call corresponds to one batch set (i.e. one multidraw operation), and one +// thread corresponds to one bin (a.k.a. draw, a.k.a. batch). +// +// Essentially, the goal of this shader is to perform a prefix sum, using the +// "scan-then-fan" approach. It has three phases: +// +// 1. *Local scan*: Perform a [Hillis-Steele scan] on each chunk of draws, where +// the size of each chunk (i.e. the number of draws) is equal to the workgroup +// size (256). Write the total size for this chunk to the fan buffer. +// +// 2. *Global scan*: Do a Hillis-Steele scan on the fan buffer. Now we know the +// running total for each chunk. +// +// 3. *Fan*: Copy the running total for each chunk to every element of that +// chunk. +// +// Note that, for batch sets (i.e. multidraw indirect calls) that have fewer +// than 256 batches in them, we only need step (1). This is the common case. +// +// [Hillis-Steele scan]: https://en.wikipedia.org/wiki/Prefix_sum#Algorithm_1:_Shorter_span,_more_parallel + +#import bevy_pbr::mesh_preprocess_types::{BinMetadata, IndirectParametersMetadata} + +// Information needed to allocate `MeshUniform`s. +struct UniformAllocationMetadata { + // The index of this batch set in the `IndirectBatchSet` array. + // + // We write this into the `indirect_parameters_metadata`. + batch_set_index: u32, + + // The number of bins (a.k.a. 
draws, a.k.a. batches) in this batch set. + bin_count: u32, + + // The index of the first set of indirect parameters for this batch set. + // + // This is also the index of the first `IndirectParametersMetadata`, as + // that's a parallel array with the indirect parameters. + first_indirect_parameters_index: u32, + + // The index of the first `MeshUniform` slot for this batch set. + first_output_mesh_uniform_index: u32, + + // Padding. + pad: array, 15u>, +}; + +// The number of threads in a workgroup. +const WORKGROUP_SIZE: u32 = 256u; + +// Information needed to allocate `MeshUniform`s. +@group(0) @binding(0) var allocate_uniforms_metadata: UniformAllocationMetadata; + +// Information for each bin, including the indirect parameters offset and the +// instance count. +@group(0) @binding(1) var bin_metadata: array; + +// The array of indirect parameters metadata that we fill out, one for each +// batch. +@group(0) @binding(2) var indirect_parameters_metadata: + array; + +// A temporary buffer that stores the mesh uniform index of the last instance +// plus one for each workgroup (i.e. for each 256-bin chunk). +// +// This is accumulated in the second stage and written out in the third. +@group(0) @binding(3) var fan_buffer: array; + +// Scratch memory that stores the prefix sum for every element in our chunk. +var output_offsets: array; + +// The first step of the prefix sum. This computes the prefix sum for each +// 256-element chunk. +// +// Note that this will be the *only* step in the operation if the total number +// of bins in this batch set is 256 or fewer. Thus we must fill in the indirect +// parameters metadata for each batch here, as we can't guarantee that the +// following two steps will be run at all. 
+@compute @workgroup_size(256, 1, 1) +fn allocate_local_scan( + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) group_id: vec3, + @builtin(global_invocation_id) global_id: vec3 +) { + let bin_count = allocate_uniforms_metadata.bin_count; + + let block_start = group_id.x * WORKGROUP_SIZE; + let block_end = min(block_start + WORKGROUP_SIZE, bin_count); + + // If this is the first workgroup, take the first output index from the + // metadata into account. But if this is the second chunk or beyond, don't + // do that, as the second and third phases will add it in and we don't want + // to double-count it. + if (group_id.x == 0u) { + output_offsets[local_id.x] = allocate_uniforms_metadata.first_output_mesh_uniform_index; + } else { + output_offsets[local_id.x] = 0u; + } + workgroupBarrier(); + + // We're doing an inclusive sum, so put the instance count in the *next* bin. + if (global_id.x < block_end && local_id.x < WORKGROUP_SIZE - 1u) { + output_offsets[local_id.x + 1] = bin_metadata[global_id.x].instance_count; + } + workgroupBarrier(); + + // Prefix sum within our workgroup. + hillis_steele_scan(local_id.x); + + // Now write the indirect parameters metadata for this batch. We fill in the + // `base_output_index` with the value of the prefix sum (which might be + // incomplete if this isn't the first chunk). We also populate a few + // bookkeeping fields for later rendering passes to use. + if (global_id.x < block_end) { + let indirect_parameters_offset = + allocate_uniforms_metadata.first_indirect_parameters_index + + bin_metadata[global_id.x].indirect_parameters_offset; + indirect_parameters_metadata[indirect_parameters_offset].base_output_index = + output_offsets[local_id.x]; + indirect_parameters_metadata[indirect_parameters_offset].batch_set_index = + allocate_uniforms_metadata.batch_set_index; + // These parameters get filled in later. Initialize them to zero for now. 
+ // This is required in the case of the early/late instance counts + // because the mesh preprocessing shader will atomically increment them. + indirect_parameters_metadata[indirect_parameters_offset].mesh_index = 0u; + indirect_parameters_metadata[indirect_parameters_offset].early_instance_count = 0u; + indirect_parameters_metadata[indirect_parameters_offset].late_instance_count = 0u; + } + + // If this is the last element in the workgroup, put the total number of + // instances (plus the first output mesh uniform index if we're the first + // workgroup) in the fan buffer in preparation for the next phase. + if (local_id.x == WORKGROUP_SIZE - 1u) { + fan_buffer[group_id.x] = output_offsets[WORKGROUP_SIZE - 1u] + + bin_metadata[global_id.x].instance_count; + } +} + +// The second step of the prefix sum. +// +// This step takes the intermediate fan values computed in the previous step +// (i.e. the sum going out of each chunk) and performs one or more Hillis-Steele +// scans in order to compute the fan value going into each chunk. +// +// This step is omitted if there are 256 or fewer total draws. +@compute @workgroup_size(256, 1, 1) +fn allocate_global_scan(@builtin(local_invocation_id) local_id: vec3) { + var sum = 0u; + let chunk_count = div_ceil(allocate_uniforms_metadata.bin_count, WORKGROUP_SIZE); + + // Do a sequential loop over each block of 256 chunks. Because each + // iteration of this loop covers 64K meshes, the fact that it's sequential + // isn't going to be a problem in practice. + for (var block_start = 0u; block_start < chunk_count; block_start += WORKGROUP_SIZE) { + // Set up the Hillis-Steele scan. + let block_end = min(block_start + WORKGROUP_SIZE, chunk_count); + let global_id = block_start + local_id.x; + if (global_id < block_end) { + output_offsets[local_id.x] = sum + fan_buffer[global_id]; + } + workgroupBarrier(); + + // Perform the scan. + hillis_steele_scan(local_id.x); + + // Write the value back. 
+ if (global_id < block_end) { + fan_buffer[global_id] = output_offsets[local_id.x]; + } + + // Save the sum coming out of this block for the next one. + sum = output_offsets[WORKGROUP_SIZE - 1u]; + } +} + +// The third step of the prefix sum. +// +// We take the summed fan value computed in the previous step and add it in to +// each value of each chunk beyond the first. We dispatch one fewer workgroup +// here than in step (1), because there's nothing to do for the first chunk. +// +// This step is omitted if there are 256 or fewer total draws. +@compute @workgroup_size(256, 1, 1) +fn allocate_fan( + @builtin(workgroup_id) group_id: vec3, + @builtin(global_invocation_id) global_id: vec3 +) { + let id = global_id.x + WORKGROUP_SIZE; + let bin_count = allocate_uniforms_metadata.bin_count; + if (id >= bin_count) { + return; + } + + let fan_value = fan_buffer[group_id.x]; + let indirect_parameters_offset = + allocate_uniforms_metadata.first_indirect_parameters_index + + bin_metadata[id].indirect_parameters_offset; + indirect_parameters_metadata[indirect_parameters_offset].base_output_index += fan_value; +} + +// Calculates a running exclusive sum. +// https://en.wikipedia.org/wiki/Prefix_sum#Algorithm_1:_Shorter_span,_more_parallel +fn hillis_steele_scan(local_id: u32) { + for (var offset = 1u; offset < WORKGROUP_SIZE; offset *= 2u) { + var term = 0u; + if (local_id >= offset) { + term = output_offsets[local_id - offset]; + } + workgroupBarrier(); + output_offsets[local_id] += term; + workgroupBarrier(); + } +} + +// Divides unsigned integer a by b, rounding up. 
+fn div_ceil(a: u32, b: u32) -> u32 { + return (a + b - 1u) / b; +} diff --git a/crates/bevy_pbr/src/render/build_indirect_params.wgsl b/crates/bevy_pbr/src/render/build_indirect_params.wgsl index 5ca6d4c0ccfff..1f0050a464e50 100644 --- a/crates/bevy_pbr/src/render/build_indirect_params.wgsl +++ b/crates/bevy_pbr/src/render/build_indirect_params.wgsl @@ -12,8 +12,7 @@ IndirectBatchSet, IndirectParametersIndexed, IndirectParametersNonIndexed, - IndirectParametersCpuMetadata, - IndirectParametersGpuMetadata, + IndirectParametersMetadata, MeshInput } @@ -23,11 +22,8 @@ // Data that we use to generate the indirect parameters. // // The `mesh_preprocess.wgsl` shader emits these. -@group(0) @binding(1) var indirect_parameters_cpu_metadata: - array; - -@group(0) @binding(2) var indirect_parameters_gpu_metadata: - array; +@group(0) @binding(1) var indirect_parameters_metadata: + array; // Information about each batch set. // @@ -56,21 +52,21 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // Figure out our instance index (i.e. batch index). If this thread doesn't // correspond to any index, bail. let instance_index = global_invocation_id.x; - if (instance_index >= arrayLength(&indirect_parameters_cpu_metadata)) { + if (instance_index >= arrayLength(&indirect_parameters_metadata)) { return; } // Unpack the metadata for this batch. 
- let base_output_index = indirect_parameters_cpu_metadata[instance_index].base_output_index; - let batch_set_index = indirect_parameters_cpu_metadata[instance_index].batch_set_index; - let mesh_index = indirect_parameters_gpu_metadata[instance_index].mesh_index; + let base_output_index = indirect_parameters_metadata[instance_index].base_output_index; + let batch_set_index = indirect_parameters_metadata[instance_index].batch_set_index; + let mesh_index = indirect_parameters_metadata[instance_index].mesh_index; // If we aren't using `multi_draw_indirect_count`, we have a 1:1 fixed // assignment of batches to slots in the indirect parameters buffer, so we // can just use the instance index as the index of our indirect parameters. let early_instance_count = - indirect_parameters_gpu_metadata[instance_index].early_instance_count; - let late_instance_count = indirect_parameters_gpu_metadata[instance_index].late_instance_count; + indirect_parameters_metadata[instance_index].early_instance_count; + let late_instance_count = indirect_parameters_metadata[instance_index].late_instance_count; // If in the early phase, we draw only the early meshes. If in the late // phase, we draw only the late meshes. 
If in the main phase, draw all the diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 259678cc8cca4..1eb935780574e 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -35,18 +35,19 @@ use bevy_math::Vec4; use bevy_platform::collections::HashMap; use bevy_render::{ batching::gpu_preprocessing::{ - clear_bin_unpacking_buffers, BatchedInstanceBuffers, BinUnpackingBuffers, - BinUnpackingBuffersKey, BinUnpackingJob, BinUnpackingMetadataIndex, - GpuBinUnpackingMetadata, GpuOcclusionCullingWorkItemBuffers, GpuPreprocessingMode, - GpuPreprocessingSupport, IndirectBatchSet, IndirectParametersBuffers, - IndirectParametersCpuMetadata, IndirectParametersGpuMetadata, IndirectParametersIndexed, - IndirectParametersNonIndexed, LatePreprocessWorkItemIndirectParameters, PreprocessWorkItem, - PreprocessWorkItemBuffers, UntypedPhaseBatchedInstanceBuffers, + clear_scene_unpacking_buffers, BatchedInstanceBuffers, BinUnpackingMetadataIndex, + GpuBinMetadata, GpuBinUnpackingMetadata, GpuOcclusionCullingWorkItemBuffers, + GpuPreprocessingMode, GpuPreprocessingSupport, GpuUniformAllocationMetadata, + IndirectBatchSet, IndirectParametersBuffers, IndirectParametersIndexed, + IndirectParametersMetadata, IndirectParametersNonIndexed, + LatePreprocessWorkItemIndirectParameters, PreprocessWorkItem, PreprocessWorkItemBuffers, + SceneUnpackingBuffers, SceneUnpackingBuffersKey, SceneUnpackingJob, + UniformAllocationMetadataIndex, UntypedPhaseBatchedInstanceBuffers, UntypedPhaseIndirectParametersBuffers, }, diagnostic::RecordDiagnostics as _, occlusion_culling::OcclusionCulling, - render_phase::GpuRenderBinnedMeshInstance, + render_phase::{GpuRenderBinnedMeshInstance, UNIFORM_ALLOCATION_WORKGROUP_SIZE}, render_resource::{ binding_types::{storage_buffer, storage_buffer_read_only, texture_2d, uniform_buffer}, BindGroup, BindGroupEntries, BindGroupLayoutDescriptor, 
BindGroupLayoutEntries, @@ -130,6 +131,8 @@ pub struct PreprocessPipelines { pub main_phase: PreprocessPhasePipelines, /// Compute shader pipelines for the bin unpacking step. pub bin_unpacking: BinUnpackingPipeline, + /// Compute shader pipelines for the uniform allocation step. + pub uniform_allocation: UniformAllocationPipelines, } /// Compute shader pipelines for a specific phase: early, late, or main. @@ -206,6 +209,63 @@ pub struct BinUnpackingPipeline { pub pipeline_id: Option, } +/// Pipelines for the `allocate_uniforms` compute shader. +/// +/// This shader has three steps, so we have three pipelines. +/// +/// Although the `Handle` is the same among these three pipelines, they +/// have to be separate so that the `SpecializedComputePipeline` implementation +/// on each sub-pipeline can access it. +#[derive(Clone)] +pub struct UniformAllocationPipelines { + /// The pipeline for step 1: local scan. + pub local_scan: UniformAllocationLocalScanPipeline, + /// The pipeline for step 2: global scan. + pub global_scan: UniformAllocationGlobalScanPipeline, + /// The pipeline for step 3: fan. + pub fan: UniformAllocationFanPipeline, +} + +/// The pipeline for the first step of the `allocate_uniforms` shader. +#[derive(Clone)] +pub struct UniformAllocationLocalScanPipeline { + /// The bind group layout, shared among all the uniform allocation + /// pipelines. + pub bind_group_layout: BindGroupLayoutDescriptor, + /// The shader, also shared among all uniform allocation pipelines. + pub shader: Handle, + /// The pipeline ID for the first step of the `allocate_uniforms` shader. + pub pipeline_id_local_scan: Option, +} + +/// The pipeline for the second step of the `allocate_uniforms` shader. +/// +/// This step is skipped if the number of bins in the batch set is 256 or fewer. +#[derive(Clone)] +pub struct UniformAllocationGlobalScanPipeline { + /// The bind group layout, shared among all the uniform allocation + /// pipelines. 
+ pub bind_group_layout: BindGroupLayoutDescriptor, + /// The shader, also shared among all uniform allocation pipelines. + pub shader: Handle, + /// The pipeline ID for the second step of the `allocate_uniforms` shader. + pub pipeline_id_global_scan: Option, +} + +/// The pipeline for the third step of the `allocate_uniforms` shader. +/// +/// This step is skipped if the number of bins in the batch set is 256 or fewer. +#[derive(Clone)] +pub struct UniformAllocationFanPipeline { + /// The bind group layout, shared among all the uniform allocation + /// pipelines. + pub bind_group_layout: BindGroupLayoutDescriptor, + /// The shader, also shared among all uniform allocation pipelines. + pub shader: Handle, + /// The pipeline ID for the third step of the `allocate_uniforms` shader. + pub pipeline_id_fan: Option, +} + bitflags! { /// Specifies variants of the mesh preprocessing shader. #[derive(Clone, Copy, PartialEq, Eq, Hash)] @@ -351,16 +411,16 @@ pub struct PhaseBuildIndirectParametersBindGroups { /// mesh indexed-ness. #[derive(Clone, Resource, Default, Deref, DerefMut)] pub struct BinUnpackingBindGroups( - pub HashMap, + pub HashMap, ); /// The bind groups for the `unpack_bins` shader for a single (view, phase) /// combination. #[derive(Clone)] pub struct ViewPhaseBinUnpackingBindGroups { - /// The bind group for the indexed meshes. + /// The bind groups for the indexed meshes, one for each batch set. indexed: Vec, - /// The bind group for the non-indexed meshes. + /// The bind groups for the non-indexed meshes, one for each batch set. non_indexed: Vec, } @@ -369,7 +429,7 @@ pub struct ViewPhaseBinUnpackingBindGroups { #[derive(Clone)] pub struct ViewPhaseBinUnpackingBindGroup { /// The index of the metadata in the - /// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer. + /// [`SceneUnpackingBuffers::bin_unpacking_metadata`] buffer. pub metadata_index: BinUnpackingMetadataIndex, /// The actual shader bind group. 
pub bind_group: BindGroup, @@ -378,6 +438,39 @@ pub struct ViewPhaseBinUnpackingBindGroup { pub mesh_instance_count: u32, } +/// A resource, part of the render world, that stores all the bind groups for +/// the uniform allocation shader. +/// +/// There will be one such bind group for each combination of view, phase, and +/// mesh indexed-ness. +#[derive(Clone, Resource, Default, Deref, DerefMut)] +pub struct UniformAllocationBindGroups( + pub HashMap, +); + +/// The bind groups for the `allocate_uniforms` shader for a single (view, +/// phase) combination. +#[derive(Clone)] +pub struct ViewPhaseUniformAllocationBindGroups { + /// The bind groups for the indexed meshes, one for each batch set. + indexed: Vec, + /// The bind groups for the non-indexed meshes, one for each batch set. + non_indexed: Vec, +} + +/// The bind group for the `allocate_uniforms` shader for a single combination +/// of view, phase, and mesh indexed-ness. +#[derive(Clone)] +pub struct ViewPhaseUniformAllocationBindGroup { + /// The index of the metadata in the + /// [`SceneUnpackingBuffers::uniform_allocation_metadata`] buffer. + pub metadata_index: UniformAllocationMetadataIndex, + /// The actual shader bind group. + pub bind_group: BindGroup, + /// The total number of bins in this batch set. 
+ pub bin_count: u32, +} + /// Stops the `GpuPreprocessNode` attempting to generate the buffer for this view /// useful to avoid duplicating effort if the bind group is shared between views #[derive(Component, Default)] @@ -389,6 +482,7 @@ impl Plugin for GpuMeshPreprocessPlugin { embedded_asset!(app, "reset_indirect_batch_sets.wgsl"); embedded_asset!(app, "build_indirect_params.wgsl"); embedded_asset!(app, "unpack_bins.wgsl"); + embedded_asset!(app, "allocate_uniforms.wgsl"); } fn finish(&self, app: &mut App) { @@ -405,15 +499,19 @@ impl Plugin for GpuMeshPreprocessPlugin { render_app .init_gpu_resource::() + .init_gpu_resource::() .init_gpu_resource::() .init_gpu_resource::>() .init_gpu_resource::>() .init_gpu_resource::>() .init_gpu_resource::>() + .init_gpu_resource::>() + .init_gpu_resource::>() + .init_gpu_resource::>() .add_systems( Render, ( - clear_bin_unpacking_buffers.in_set(RenderSystems::PrepareResources), + clear_scene_unpacking_buffers.in_set(RenderSystems::PrepareResources), prepare_preprocess_pipelines.in_set(RenderSystems::Prepare), prepare_preprocess_bind_groups .run_if(resource_exists::>, - mut ctx: RenderContext, +/// A rendering system that invokes a compute shader for each batch set in order +/// to determine where `MeshUniform`s should be placed. +/// +/// This shader exists because a single batch set could contain many meshes. By +/// performing this on the GPU, we avoid having to traverse every visible mesh +/// on the CPU every frame. 
+pub fn allocate_uniforms( + current_view: ViewQuery, Without>, + view_query: Query<&ExtractedView, Without>, + batched_instance_buffers: Res>, + pipeline_cache: Res, + preprocess_pipelines: Res, + uniform_allocation_bind_groups: Res, + mut render_context: RenderContext, ) { - let Some(indirect_parameters_buffers) = indirect_parameters_buffers else { - return; - }; + let diagnostics = render_context.diagnostic_recorder(); + let diagnostics = diagnostics.as_deref(); - // Clear out each indexed and non-indexed GPU-side buffer. - for phase_indirect_parameters_buffers in indirect_parameters_buffers.values() { - if let Some(indexed_gpu_metadata_buffer) = phase_indirect_parameters_buffers - .indexed - .gpu_metadata_buffer() - { - ctx.command_encoder().clear_buffer( - indexed_gpu_metadata_buffer, - 0, - Some( - phase_indirect_parameters_buffers.indexed.batch_count() as u64 - * size_of::() as u64, - ), - ); - } + let command_encoder = render_context.command_encoder(); + let mut compute_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { + label: Some("uniform allocation"), + timestamp_writes: None, + }); - if let Some(non_indexed_gpu_metadata_buffer) = phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata_buffer() - { - ctx.command_encoder().clear_buffer( - non_indexed_gpu_metadata_buffer, - 0, - Some( - phase_indirect_parameters_buffers.non_indexed.batch_count() as u64 - * size_of::() as u64, - ), - ); + let pass_span = diagnostics.pass_span(&mut compute_pass, "uniform_allocation"); + + // Gather up all views. + let view_entity = current_view.entity(); + let shadow_cascade_views = current_view.into_inner(); + let all_views = gather_shadow_cascades_for_view(view_entity, shadow_cascade_views); + + // Don't run if the shaders haven't been compiled yet. 
+ if let ( + Some(uniform_allocation_local_scan_pipeline_id), + Some(uniform_allocation_global_scan_pipeline_id), + Some(uniform_allocation_fan_pipeline_id), + ) = ( + preprocess_pipelines + .uniform_allocation + .local_scan + .pipeline_id_local_scan, + preprocess_pipelines + .uniform_allocation + .global_scan + .pipeline_id_global_scan, + preprocess_pipelines.uniform_allocation.fan.pipeline_id_fan, + ) && let ( + Some(uniform_allocation_local_scan_pipeline), + Some(uniform_allocation_global_scan_pipeline), + Some(uniform_allocation_fan_pipeline), + ) = ( + pipeline_cache.get_compute_pipeline(uniform_allocation_local_scan_pipeline_id), + pipeline_cache.get_compute_pipeline(uniform_allocation_global_scan_pipeline_id), + pipeline_cache.get_compute_pipeline(uniform_allocation_fan_pipeline_id), + ) { + // Loop over each view… + for view_entity in all_views { + let Ok(view) = view_query.get(view_entity) else { + continue; + }; + + // …and each phase within each view. + for phase_type_id in batched_instance_buffers.phase_instance_buffers.keys() { + let uniform_allocation_buffers_key = SceneUnpackingBuffersKey { + phase: *phase_type_id, + view: view.retained_view_entity, + }; + + // Fetch the bind groups for this (view, phase) combination. + let Some(phase_uniform_allocation_bind_groups) = + uniform_allocation_bind_groups.get(&uniform_allocation_buffers_key) + else { + continue; + }; + + // Invoke the shader for all batch sets corresponding to indexed + // meshes and then for all batch sets corresponding to + // non-indexed meshes. + for uniform_allocation_bind_group in phase_uniform_allocation_bind_groups + .indexed + .iter() + .chain(phase_uniform_allocation_bind_groups.non_indexed.iter()) + { + // Invoke the local scan (step 1). 
+ compute_pass.set_pipeline(uniform_allocation_local_scan_pipeline); + compute_pass.set_bind_group(0, &uniform_allocation_bind_group.bind_group, &[]); + let local_scan_workgroup_count = uniform_allocation_bind_group + .bin_count + .div_ceil(UNIFORM_ALLOCATION_WORKGROUP_SIZE); + if local_scan_workgroup_count > 0 { + compute_pass.dispatch_workgroups(local_scan_workgroup_count, 1, 1); + } + + // If there are 256 or fewer draws in this batch, we're + // done. Otherwise, perform the other two steps. + if local_scan_workgroup_count > 1 { + // Invoke the global scan (step 2). + compute_pass.set_pipeline(uniform_allocation_global_scan_pipeline); + compute_pass.dispatch_workgroups(1, 1, 1); + + // Perform the fan operation (step 3). + compute_pass.set_pipeline(uniform_allocation_fan_pipeline); + let fan_workgroup_count = local_scan_workgroup_count - 1; + compute_pass.dispatch_workgroups(fan_workgroup_count, 1, 1); + } + } + } } } + + pass_span.end(&mut compute_pass); } /// A rendering system that invokes a compute shader for each batch set in order @@ -555,14 +726,14 @@ pub fn unpack_bins( // …and each phase within each view. for phase_type_id in batched_instance_buffers.phase_instance_buffers.keys() { - let bin_unpacking_buffers_key = BinUnpackingBuffersKey { + let scene_unpacking_buffers_key = SceneUnpackingBuffersKey { phase: *phase_type_id, view: view.retained_view_entity, }; // Fetch the bind groups for this (view, phase) combination. let Some(phase_bin_unpacking_bind_groups) = - bin_unpacking_bind_groups.get(&bin_unpacking_buffers_key) + bin_unpacking_bind_groups.get(&scene_unpacking_buffers_key) else { continue; }; @@ -1273,6 +1444,8 @@ impl FromWorld for PreprocessPipelines { .extend_sequential((storage_buffer::(false),)); let bin_unpacking_bind_group_layout_entries = bin_unpacking_bind_group_layout_entries(); + let uniform_allocation_bind_group_layout_entries = + uniform_allocation_bind_group_layout_entries(); // Create the bind group layouts. 
let direct_bind_group_layout = BindGroupLayoutDescriptor::new( @@ -1307,6 +1480,10 @@ impl FromWorld for PreprocessPipelines { "bin unpacking bind group layout", &bin_unpacking_bind_group_layout_entries, ); + let uniform_allocation_bind_group_layout = BindGroupLayoutDescriptor::new( + "uniform allocation bind group layout", + &uniform_allocation_bind_group_layout_entries, + ); let preprocess_shader = load_embedded_asset!(world, "mesh_preprocess.wgsl"); let reset_indirect_batch_sets_shader = @@ -1314,6 +1491,7 @@ impl FromWorld for PreprocessPipelines { let build_indirect_params_shader = load_embedded_asset!(world, "build_indirect_params.wgsl"); let bin_unpacking_shader = load_embedded_asset!(world, "unpack_bins.wgsl"); + let uniform_allocation_shader = load_embedded_asset!(world, "allocate_uniforms.wgsl"); let preprocess_phase_pipelines = PreprocessPhasePipelines { reset_indirect_batch_sets: ResetIndirectBatchSetsPipeline { @@ -1374,6 +1552,23 @@ impl FromWorld for PreprocessPipelines { shader: bin_unpacking_shader, pipeline_id: None, }, + uniform_allocation: UniformAllocationPipelines { + local_scan: UniformAllocationLocalScanPipeline { + bind_group_layout: uniform_allocation_bind_group_layout.clone(), + shader: uniform_allocation_shader.clone(), + pipeline_id_local_scan: None, + }, + global_scan: UniformAllocationGlobalScanPipeline { + bind_group_layout: uniform_allocation_bind_group_layout.clone(), + shader: uniform_allocation_shader.clone(), + pipeline_id_global_scan: None, + }, + fan: UniformAllocationFanPipeline { + bind_group_layout: uniform_allocation_bind_group_layout.clone(), + shader: uniform_allocation_shader.clone(), + pipeline_id_fan: None, + }, + }, } } } @@ -1405,15 +1600,17 @@ fn build_indirect_params_bind_group_layout_entries() -> DynamicBindGroupLayoutEn DynamicBindGroupLayoutEntries::new_with_indices( ShaderStages::COMPUTE, ( + // @group(0) @binding(0) var current_input: + // array; (0, storage_buffer_read_only::(false)), + // @group(0) 
@binding(1) var indirect_parameters_metadata: + // array; ( 1, - storage_buffer_read_only::(false), - ), - ( - 2, - storage_buffer_read_only::(false), + storage_buffer_read_only::(false), ), + // @group(0) @binding(3) var + // indirect_batch_sets: array; (3, storage_buffer::(false)), ), ) @@ -1425,17 +1622,11 @@ fn gpu_culling_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { // GPU culling bind group parameters are a superset of those in the CPU // culling (direct) shader. preprocess_direct_bind_group_layout_entries().extend_with_indices(( - // `indirect_parameters_cpu_metadata` + // @group(0) @binding(7) var indirect_parameters_metadata: + // array; ( 7, - storage_buffer_read_only::( - /* has_dynamic_offset= */ false, - ), - ), - // `indirect_parameters_gpu_metadata` - ( - 8, - storage_buffer::(/* has_dynamic_offset= */ false), + storage_buffer::(/* has_dynamic_offset= */ false), ), // `mesh_culling_data` ( @@ -1465,7 +1656,7 @@ fn gpu_occlusion_culling_bind_group_layout_entries() -> DynamicBindGroupLayoutEn /// Creates and returns bind group layout entries for the GPU bin unpacking /// shader (`unpack_bins`). -fn bin_unpacking_bind_group_layout_entries() -> BindGroupLayoutEntries<4> { +fn bin_unpacking_bind_group_layout_entries() -> BindGroupLayoutEntries<5> { BindGroupLayoutEntries::sequential( ShaderStages::COMPUTE, ( @@ -1478,13 +1669,37 @@ fn bin_unpacking_bind_group_layout_entries() -> BindGroupLayoutEntries<4> { // @group(0) @binding(2) var // preprocess_work_items: array; storage_buffer::(false), - // @group(0) @binding(3) var - // bin_index_to_indirect_parameters_offset: array; + // @group(0) @binding(3) var bin_metadata: + // array; + storage_buffer_read_only::(false), + // @group(0) @binding(4) var + // bin_index_to_bin_metadata_index: array; storage_buffer_read_only::(false), ), ) } +/// Creates and returns bind group layout entries for the GPU uniform allocation +/// shader (`allocate_uniforms`). 
+fn uniform_allocation_bind_group_layout_entries() -> BindGroupLayoutEntries<4> { + BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + // @group(0) @binding(0) var allocate_uniforms_metadata: + // AllocateUniformsMetadata; + uniform_buffer::(false), + // @group(0) @binding(1) var bin_metadata: array; + storage_buffer_read_only::(false), + // @group(0) @binding(2) var + // indirect_parameters_metadata: array; + storage_buffer::(false), + // @group(0) @binding(3) var fan_buffer: + // array; + storage_buffer::(false), + ), + ) +} + /// A system that specializes the pipelines relating to mesh preprocessing if /// necessary. /// @@ -1505,6 +1720,15 @@ pub fn prepare_preprocess_pipelines( mut specialized_bin_unpacking_pipelines: ResMut< SpecializedComputePipelines, >, + mut specialized_uniform_allocation_local_scan_pipelines: ResMut< + SpecializedComputePipelines, + >, + mut specialized_uniform_allocation_global_scan_pipelines: ResMut< + SpecializedComputePipelines, + >, + mut specialized_uniform_allocation_fan_pipelines: ResMut< + SpecializedComputePipelines, + >, preprocess_pipelines: ResMut, gpu_preprocessing_support: Res, ) { @@ -1617,6 +1841,14 @@ pub fn prepare_preprocess_pipelines( preprocess_pipelines .bin_unpacking .prepare(&pipeline_cache, &mut specialized_bin_unpacking_pipelines); + + // Prepare the uniform allocation compute pipeline. 
+ preprocess_pipelines.uniform_allocation.prepare( + &pipeline_cache, + &mut specialized_uniform_allocation_local_scan_pipelines, + &mut specialized_uniform_allocation_global_scan_pipelines, + &mut specialized_uniform_allocation_fan_pipelines, + ); } impl PreprocessPipeline { @@ -1714,6 +1946,51 @@ impl SpecializedComputePipeline for BinUnpackingPipeline { } } +impl SpecializedComputePipeline for UniformAllocationLocalScanPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("uniform allocation, local scan".into()), + layout: vec![self.bind_group_layout.clone()], + shader: self.shader.clone(), + shader_defs: vec![], + entry_point: Some("allocate_local_scan".into()), + ..Default::default() + } + } +} + +impl SpecializedComputePipeline for UniformAllocationGlobalScanPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("uniform allocation, global scan".into()), + layout: vec![self.bind_group_layout.clone()], + shader: self.shader.clone(), + shader_defs: vec![], + entry_point: Some("allocate_global_scan".into()), + ..Default::default() + } + } +} + +impl SpecializedComputePipeline for UniformAllocationFanPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("uniform allocation, fan".into()), + layout: vec![self.bind_group_layout.clone()], + shader: self.shader.clone(), + shader_defs: vec![], + entry_point: Some("allocate_fan".into()), + ..Default::default() + } + } +} + impl ResetIndirectBatchSetsPipeline { fn prepare( &mut self, @@ -1761,6 +2038,46 @@ impl BinUnpackingPipeline { } } +impl UniformAllocationPipelines { + /// Specializes all three pipelines that use the uniform allocation shader. 
+ fn prepare( + &mut self, + pipeline_cache: &PipelineCache, + uniform_allocation_local_scan_pipelines: &mut SpecializedComputePipelines< + UniformAllocationLocalScanPipeline, + >, + uniform_allocation_global_scan_pipelines: &mut SpecializedComputePipelines< + UniformAllocationGlobalScanPipeline, + >, + uniform_allocation_fan_pipelines: &mut SpecializedComputePipelines< + UniformAllocationFanPipeline, + >, + ) { + if self.local_scan.pipeline_id_local_scan.is_none() { + self.local_scan.pipeline_id_local_scan = + Some(uniform_allocation_local_scan_pipelines.specialize( + pipeline_cache, + &self.local_scan, + (), + )); + } + + if self.global_scan.pipeline_id_global_scan.is_none() { + self.global_scan.pipeline_id_global_scan = + Some(uniform_allocation_global_scan_pipelines.specialize( + pipeline_cache, + &self.global_scan, + (), + )); + } + + if self.fan.pipeline_id_fan.is_none() { + self.fan.pipeline_id_fan = + Some(uniform_allocation_fan_pipelines.specialize(pipeline_cache, &self.fan, ())); + } + } +} + /// A system that attaches buffers to bind groups for the variants of the /// compute shaders relating to mesh preprocessing. #[expect( @@ -1775,13 +2092,14 @@ pub fn prepare_preprocess_bind_groups( pipeline_cache: Res, batched_instance_buffers: Res>, indirect_parameters_buffers: Res, - bin_unpacking_buffers: Res, + scene_unpacking_buffers: Res, mesh_culling_data_buffer: Res, visibility_ranges: Res, view_uniforms: Res, previous_view_uniforms: Res, pipelines: Res, mut bin_unpacking_bind_groups: ResMut, + mut uniform_allocation_bind_groups: ResMut, ) { // Grab the `BatchedInstanceBuffers`. let BatchedInstanceBuffers { @@ -1912,7 +2230,7 @@ pub fn prepare_preprocess_bind_groups( } // Create the bind groups we'll need for each dispatch of the bin unpacking - // (`unpack_bins`) shader. + // (`unpack_bins`) and uniform allocation (`allocate_uniforms`) shaders. 
for (_, view) in &views { create_bin_unpacking_bind_groups( &mut bin_unpacking_bind_groups, @@ -1921,7 +2239,16 @@ pub fn prepare_preprocess_bind_groups( &pipelines, &indirect_parameters_buffers, phase_instance_buffers, - &bin_unpacking_buffers, + &scene_unpacking_buffers, + &view.retained_view_entity, + ); + create_uniform_allocation_bind_groups( + &mut uniform_allocation_bind_groups, + &render_device, + &pipeline_cache, + &pipelines, + &indirect_parameters_buffers, + &scene_unpacking_buffers, &view.retained_view_entity, ); } @@ -2074,17 +2401,13 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .indexed - .gpu_metadata_buffer(), + .metadata_buffer(), indexed_work_item_buffer.buffer(), late_indexed_work_item_buffer.buffer(), self.late_indexed_indirect_parameters_buffer.buffer(), ) { ( - Some(indexed_cpu_metadata_buffer), - Some(indexed_gpu_metadata_buffer), + Some(indexed_metadata_buffer), Some(indexed_work_item_gpu_buffer), Some(late_indexed_work_item_gpu_buffer), Some(late_indexed_indirect_parameters_buffer), @@ -2108,8 +2431,14 @@ impl<'a> PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) @binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2118,13 +2447,26 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: indexed_work_item_buffer_size, }), ), + // @group(0) @binding(6) var + // output: array; (6, self.data_buffer.as_entire_binding()), - (7, indexed_cpu_metadata_buffer.as_entire_binding()), - (8, indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, 
indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), + // @group(0) @binding(11) var depth_pyramid: + // texture_2d; (11, &view_depth_pyramid.all_mips), + // @group(0) @binding(2) var + // previous_view_uniforms: PreviousViewUniforms; ( 2, BufferBinding { @@ -2133,6 +2475,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: NonZeroU64::new(size_of::() as u64), }, ), + // @group(0) @binding(12) var + // late_preprocess_work_items: + // array; ( 12, BufferBinding { @@ -2141,6 +2486,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: indexed_work_item_buffer_size, }, ), + // @group(0) @binding(13) var + // late_preprocess_work_item_indirect_parameters: + // array; ( 13, BufferBinding { @@ -2176,17 +2524,13 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .non_indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata_buffer(), + .metadata_buffer(), non_indexed_work_item_buffer.buffer(), late_non_indexed_work_item_buffer.buffer(), self.late_non_indexed_indirect_parameters_buffer.buffer(), ) { ( - Some(non_indexed_cpu_metadata_buffer), - Some(non_indexed_gpu_metadata_buffer), + Some(non_indexed_metadata_buffer), Some(non_indexed_work_item_gpu_buffer), Some(late_non_indexed_work_item_buffer), Some(late_non_indexed_indirect_parameters_buffer), @@ -2210,8 +2554,14 @@ impl<'a> PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) 
@binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2221,12 +2571,23 @@ impl<'a> PreprocessBindGroupBuilder<'a> { }), ), (6, self.data_buffer.as_entire_binding()), - (7, non_indexed_cpu_metadata_buffer.as_entire_binding()), - (8, non_indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, non_indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), + // @group(0) @binding(11) var depth_pyramid: + // texture_2d; (11, &view_depth_pyramid.all_mips), + // @group(0) @binding(2) var + // previous_view_uniforms: PreviousViewUniforms; ( 2, BufferBinding { @@ -2235,6 +2596,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: NonZeroU64::new(size_of::() as u64), }, ), + // @group(0) @binding(12) var + // late_preprocess_work_items: + // array; ( 12, BufferBinding { @@ -2243,6 +2607,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: non_indexed_work_item_buffer_size, }, ), + // @group(0) @binding(13) var + // late_preprocess_work_item_indirect_parameters: + // array; ( 13, BufferBinding { @@ -2277,16 +2644,12 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .indexed - .gpu_metadata_buffer(), + .metadata_buffer(), late_indexed_work_item_buffer.buffer(), self.late_indexed_indirect_parameters_buffer.buffer(), ) { ( - Some(indexed_cpu_metadata_buffer), - Some(indexed_gpu_metadata_buffer), + Some(indexed_metadata_buffer), Some(late_indexed_work_item_gpu_buffer), Some(late_indexed_indirect_parameters_buffer), ) => { @@ -2309,8 +2672,14 @@ impl<'a> 
PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) @binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2319,13 +2688,26 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: late_indexed_work_item_buffer_size, }), ), + // @group(0) @binding(6) var + // output: array; (6, self.data_buffer.as_entire_binding()), - (7, indexed_cpu_metadata_buffer.as_entire_binding()), - (8, indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), + // @group(0) @binding(11) var depth_pyramid: + // texture_2d; (11, &view_depth_pyramid.all_mips), + // @group(0) @binding(2) var + // previous_view_uniforms: PreviousViewUniforms; ( 2, BufferBinding { @@ -2334,6 +2716,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: NonZeroU64::new(size_of::() as u64), }, ), + // @group(0) @binding(13) var + // late_preprocess_work_item_indirect_parameters: + // array; ( 13, BufferBinding { @@ -2368,16 +2753,12 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .non_indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata_buffer(), + .metadata_buffer(), late_non_indexed_work_item_buffer.buffer(), self.late_non_indexed_indirect_parameters_buffer.buffer(), ) { ( - Some(non_indexed_cpu_metadata_buffer), 
- Some(non_indexed_gpu_metadata_buffer), + Some(non_indexed_metadata_buffer), Some(non_indexed_work_item_gpu_buffer), Some(late_non_indexed_indirect_parameters_buffer), ) => { @@ -2400,8 +2781,14 @@ impl<'a> PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) @binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2410,13 +2797,26 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: non_indexed_work_item_buffer_size, }), ), + // @group(0) @binding(6) var + // output: array; (6, self.data_buffer.as_entire_binding()), - (7, non_indexed_cpu_metadata_buffer.as_entire_binding()), - (8, non_indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, non_indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), + // @group(0) @binding(11) var depth_pyramid: + // texture_2d; (11, &view_depth_pyramid.all_mips), + // @group(0) @binding(2) var + // previous_view_uniforms: PreviousViewUniforms; ( 2, BufferBinding { @@ -2425,6 +2825,9 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: NonZeroU64::new(size_of::() as u64), }, ), + // @group(0) @binding(13) var + // late_preprocess_work_item_indirect_parameters: + // array; ( 13, BufferBinding { @@ -2472,17 +2875,10 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .indexed - .cpu_metadata_buffer(), - 
self.phase_indirect_parameters_buffers - .indexed - .gpu_metadata_buffer(), + .metadata_buffer(), indexed_work_item_buffer.buffer(), ) { - ( - Some(indexed_cpu_metadata_buffer), - Some(indexed_gpu_metadata_buffer), - Some(indexed_work_item_gpu_buffer), - ) => { + (Some(indexed_metadata_buffer), Some(indexed_work_item_gpu_buffer)) => { // Don't use `as_entire_binding()` here; the shader reads the array // length and the underlying buffer may be longer than the actual size // of the vector. @@ -2513,8 +2909,7 @@ impl<'a> PreprocessBindGroupBuilder<'a> { }), ), (6, self.data_buffer.as_entire_binding()), - (7, indexed_cpu_metadata_buffer.as_entire_binding()), - (8, indexed_gpu_metadata_buffer.as_entire_binding()), + (7, indexed_metadata_buffer.as_entire_binding()), (9, mesh_culling_data_buffer.as_entire_binding()), (10, visibility_range_binding.clone()), (0, view_uniforms_binding.clone()), @@ -2539,17 +2934,10 @@ impl<'a> PreprocessBindGroupBuilder<'a> { match ( self.phase_indirect_parameters_buffers .non_indexed - .cpu_metadata_buffer(), - self.phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata_buffer(), + .metadata_buffer(), non_indexed_work_item_buffer.buffer(), ) { - ( - Some(non_indexed_cpu_metadata_buffer), - Some(non_indexed_gpu_metadata_buffer), - Some(non_indexed_work_item_gpu_buffer), - ) => { + (Some(non_indexed_metadata_buffer), Some(non_indexed_work_item_gpu_buffer)) => { // Don't use `as_entire_binding()` here; the shader reads the array // length and the underlying buffer may be longer than the actual size // of the vector. 
@@ -2569,8 +2957,14 @@ impl<'a> PreprocessBindGroupBuilder<'a> { .bind_group_layout, ), &BindGroupEntries::with_indices(( + // @group(0) @binding(3) var current_input: + // array; (3, self.current_input_buffer.as_entire_binding()), + // @group(0) @binding(4) var + // previous_input: array; (4, self.previous_input_buffer.as_entire_binding()), + // @group(0) @binding(5) var work_items: + // array; ( 5, BindingResource::Buffer(BufferBinding { @@ -2579,11 +2973,20 @@ impl<'a> PreprocessBindGroupBuilder<'a> { size: non_indexed_work_item_buffer_size, }), ), + // @group(0) @binding(6) var + // output: array; (6, self.data_buffer.as_entire_binding()), - (7, non_indexed_cpu_metadata_buffer.as_entire_binding()), - (8, non_indexed_gpu_metadata_buffer.as_entire_binding()), + // @group(0) @binding(7) var + // indirect_parameters_metadata: + // array; + (7, non_indexed_metadata_buffer.as_entire_binding()), + // @group(0) @binding(9) var + // mesh_culling_data: array; (9, mesh_culling_data_buffer.as_entire_binding()), + // @group(0) @binding(10) var + // visibility_ranges: array>; (10, visibility_range_binding.clone()), + // @group(0) @binding(0) var view: View; (0, view_uniforms_binding.clone()), )), ), @@ -2658,18 +3061,12 @@ fn create_build_indirect_parameters_bind_groups( }, build_indexed_indirect: match ( - phase_indirect_parameters_buffer - .indexed - .cpu_metadata_buffer(), - phase_indirect_parameters_buffer - .indexed - .gpu_metadata_buffer(), + phase_indirect_parameters_buffer.indexed.metadata_buffer(), phase_indirect_parameters_buffer.indexed.data_buffer(), phase_indirect_parameters_buffer.indexed.batch_sets_buffer(), ) { ( - Some(indexed_indirect_parameters_cpu_metadata_buffer), - Some(indexed_indirect_parameters_gpu_metadata_buffer), + Some(indexed_indirect_parameters_metadata_buffer), Some(indexed_indirect_parameters_data_buffer), Some(indexed_batch_sets_buffer), ) => Some( @@ -2682,30 +3079,39 @@ fn create_build_indirect_parameters_bind_groups( 
.gpu_frustum_culling_build_indexed_indirect_params .bind_group_layout, ), - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), + &BindGroupEntries::with_indices(( + // @group(0) @binding(0) var + // current_input: array; + (0, current_input_buffer.as_entire_binding()), + // @group(0) @binding(1) var + // indirect_parameters_metadata: + // array; + // // Don't use `as_entire_binding` here; the shader reads // the length and `RawBufferVec` overallocates. - BufferBinding { - buffer: indexed_indirect_parameters_cpu_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - phase_indirect_parameters_buffer.indexed.batch_count() - as u64 - * size_of::() as u64, - ), - }, - BufferBinding { - buffer: indexed_indirect_parameters_gpu_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - phase_indirect_parameters_buffer.indexed.batch_count() - as u64 - * size_of::() as u64, - ), - }, - indexed_batch_sets_buffer.as_entire_binding(), - indexed_indirect_parameters_data_buffer.as_entire_binding(), + ( + 1, + BufferBinding { + buffer: indexed_indirect_parameters_metadata_buffer, + offset: 0, + size: NonZeroU64::new( + phase_indirect_parameters_buffer.indexed.batch_count() + as u64 + * size_of::() as u64, + ), + }, + ), + // @group(0) @binding(3) var indirect_batch_sets: + // array; + (3, indexed_batch_sets_buffer.as_entire_binding()), + // @group(0) @binding(4) var indirect_parameters: + // array; + ( + 4, + indexed_indirect_parameters_data_buffer.as_entire_binding(), + ), )), ), ), @@ -2715,18 +3121,14 @@ fn create_build_indirect_parameters_bind_groups( build_non_indexed_indirect: match ( phase_indirect_parameters_buffer .non_indexed - .cpu_metadata_buffer(), - phase_indirect_parameters_buffer - .non_indexed - .gpu_metadata_buffer(), + .metadata_buffer(), phase_indirect_parameters_buffer.non_indexed.data_buffer(), phase_indirect_parameters_buffer .non_indexed .batch_sets_buffer(), ) { ( - Some(non_indexed_indirect_parameters_cpu_metadata_buffer), - 
Some(non_indexed_indirect_parameters_gpu_metadata_buffer), + Some(non_indexed_indirect_parameters_metadata_buffer), Some(non_indexed_indirect_parameters_data_buffer), Some(non_indexed_batch_sets_buffer), ) => Some( @@ -2739,30 +3141,41 @@ fn create_build_indirect_parameters_bind_groups( .gpu_frustum_culling_build_non_indexed_indirect_params .bind_group_layout, ), - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), + &BindGroupEntries::with_indices(( + // @group(0) @binding(0) var + // current_input: array; + (0, current_input_buffer.as_entire_binding()), + // @group(0) @binding(1) var + // indirect_parameters_metadata: + // array; + // // Don't use `as_entire_binding` here; the shader reads // the length and `RawBufferVec` overallocates. - BufferBinding { - buffer: non_indexed_indirect_parameters_cpu_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - phase_indirect_parameters_buffer.non_indexed.batch_count() - as u64 - * size_of::() as u64, - ), - }, - BufferBinding { - buffer: non_indexed_indirect_parameters_gpu_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - phase_indirect_parameters_buffer.non_indexed.batch_count() - as u64 - * size_of::() as u64, - ), - }, - non_indexed_batch_sets_buffer.as_entire_binding(), - non_indexed_indirect_parameters_data_buffer.as_entire_binding(), + ( + 1, + BufferBinding { + buffer: non_indexed_indirect_parameters_metadata_buffer, + offset: 0, + size: NonZeroU64::new( + phase_indirect_parameters_buffer + .non_indexed + .batch_count() + as u64 + * size_of::() as u64, + ), + }, + ), + // @group(0) @binding(3) var indirect_batch_sets: + // array; + (3, non_indexed_batch_sets_buffer.as_entire_binding()), + // @group(0) @binding(4) var indirect_parameters: + // array; + ( + 4, + non_indexed_indirect_parameters_data_buffer.as_entire_binding(), + ), )), ), ), @@ -2784,10 +3197,11 @@ fn create_bin_unpacking_bind_groups( preprocess_pipelines: &PreprocessPipelines, indirect_parameters_buffers: 
&IndirectParametersBuffers, phase_instance_buffers: &TypeIdMap>, - bin_unpacking_buffers: &BinUnpackingBuffers, + scene_unpacking_buffers: &SceneUnpackingBuffers, view_entity: &RetainedViewEntity, ) { - let Some(bin_unpacking_metadata_buffer) = bin_unpacking_buffers.bin_unpacking_metadata.buffer() + let Some(bin_unpacking_metadata_buffer) = + scene_unpacking_buffers.bin_unpacking_metadata.buffer() else { return; }; @@ -2804,13 +3218,12 @@ fn create_bin_unpacking_bind_groups( else { continue; }; - let Some(view_phase_bin_unpacking_buffers) = - bin_unpacking_buffers - .view_phase_buffers - .get(&BinUnpackingBuffersKey { - phase: *phase_type_id, - view: *view_entity, - }) + let Some(view_phase_bin_unpacking_buffers) = scene_unpacking_buffers + .view_phase_buffers + .get(&SceneUnpackingBuffersKey { + phase: *phase_type_id, + view: *view_entity, + }) else { continue; }; @@ -2829,7 +3242,7 @@ fn create_bin_unpacking_bind_groups( // Create the actual bind groups. bin_unpacking_bind_groups.insert( - BinUnpackingBuffersKey { + SceneUnpackingBuffersKey { phase: *phase_type_id, view: *view_entity, }, @@ -2881,7 +3294,7 @@ fn create_bin_unpacking_bind_group( render_device: &RenderDevice, preprocess_pipelines: &PreprocessPipelines, pipeline_cache: &PipelineCache, - job: &BinUnpackingJob, + job: &SceneUnpackingJob, bin_unpacking_metadata_buffer: &Buffer, work_item_buffer: &Buffer, indexed: bool, @@ -2911,10 +3324,12 @@ fn create_bin_unpacking_bind_group( // read_write> preprocess_work_items: // array; work_item_buffer.as_entire_binding(), - // @group(0) @binding(3) var - // bin_index_to_indirect_parameters_offset: - // array; - job.bin_index_to_indirect_parameters_offset_buffer + // @group(0) @binding(3) var bin_metadata: + // array; + job.bin_metadata_buffer.as_entire_binding(), + // @group(0) @binding(4) var + // bin_index_to_bin_metadata_index: array; + job.bin_index_to_bin_metadata_index_buffer .as_entire_binding(), )), ); @@ -2925,6 +3340,139 @@ fn 
create_bin_unpacking_bind_group( } } +/// Creates all bind groups needed to run the `allocate_uniforms` shader for all +/// the phases for a single view. +fn create_uniform_allocation_bind_groups( + uniform_allocation_bind_groups: &mut UniformAllocationBindGroups, + render_device: &RenderDevice, + pipeline_cache: &PipelineCache, + preprocess_pipelines: &PreprocessPipelines, + indirect_parameters_buffers: &IndirectParametersBuffers, + scene_unpacking_buffers: &SceneUnpackingBuffers, + view_entity: &RetainedViewEntity, +) { + let Some(uniform_allocation_metadata_buffer) = + scene_unpacking_buffers.uniform_allocation_metadata.buffer() + else { + return; + }; + + for (phase_type_id, phase_indirect_parameters_buffers) in indirect_parameters_buffers.iter() { + let Some(view_phase_bin_unpacking_buffers) = scene_unpacking_buffers + .view_phase_buffers + .get(&SceneUnpackingBuffersKey { + phase: *phase_type_id, + view: *view_entity, + }) + else { + continue; + }; + + // Create the actual bind groups. 
+ uniform_allocation_bind_groups.insert( + SceneUnpackingBuffersKey { + phase: *phase_type_id, + view: *view_entity, + }, + ViewPhaseUniformAllocationBindGroups { + indexed: match phase_indirect_parameters_buffers.indexed.metadata_buffer() { + None => vec![], + Some(indexed_indirect_parameters_metadata_buffer) => { + view_phase_bin_unpacking_buffers + .indexed_unpacking_jobs + .iter() + .map(|job| { + create_uniform_allocation_bind_group( + render_device, + preprocess_pipelines, + pipeline_cache, + job, + uniform_allocation_metadata_buffer, + indexed_indirect_parameters_metadata_buffer, + true, + ) + }) + .collect() + } + }, + non_indexed: match phase_indirect_parameters_buffers + .non_indexed + .metadata_buffer() + { + None => vec![], + Some(non_indexed_indirect_parameters_metadata_buffer) => { + view_phase_bin_unpacking_buffers + .non_indexed_unpacking_jobs + .iter() + .map(|job| { + create_uniform_allocation_bind_group( + render_device, + preprocess_pipelines, + pipeline_cache, + job, + uniform_allocation_metadata_buffer, + non_indexed_indirect_parameters_metadata_buffer, + false, + ) + }) + .collect() + } + }, + }, + ); + } +} + +/// Creates a bind group for the uniform allocation shader for a single (view, +/// phase, mesh indexed-ness) combination. +fn create_uniform_allocation_bind_group( + render_device: &RenderDevice, + preprocess_pipelines: &PreprocessPipelines, + pipeline_cache: &PipelineCache, + job: &SceneUnpackingJob, + uniform_allocation_metadata_buffer: &Buffer, + indirect_parameters_metadata_buffer: &Buffer, + indexed: bool, +) -> ViewPhaseUniformAllocationBindGroup { + let bind_group = render_device.create_bind_group( + if indexed { + "uniform allocation indexed bind group" + } else { + "uniform allocation non-indexed bind group" + }, + &pipeline_cache.get_bind_group_layout( + // All the pipelines' bind group layouts should be identical. 
+ &preprocess_pipelines + .uniform_allocation + .local_scan + .bind_group_layout, + ), + &BindGroupEntries::sequential(( + // @group(0) @binding(0) var allocate_uniforms_metadata: + // AllocateUniformsMetadata; + BindingResource::Buffer(BufferBinding { + buffer: uniform_allocation_metadata_buffer, + offset: job.uniform_allocation_metadata_index.uniform_offset() as u64, + size: NonZeroU64::new(size_of::() as u64), + }), + // @group(0) @binding(1) var bin_metadata: + // array; + job.bin_metadata_buffer.as_entire_binding(), + // @group(0) @binding(2) var + // indirect_parameters_metadata: array; + indirect_parameters_metadata_buffer.as_entire_binding(), + // @group(0) @binding(3) var fan_buffer: + // array; + job.fan_buffer.as_entire_binding(), + )), + ); + ViewPhaseUniformAllocationBindGroup { + metadata_index: job.uniform_allocation_metadata_index, + bind_group, + bin_count: job.bin_count, + } +} + /// Writes the information needed to do GPU mesh culling to the GPU. pub fn write_mesh_culling_data_buffer( render_device: Res, diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index f644b4f5bdb16..06139ca452615 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -49,7 +49,7 @@ use bevy_render::{ batching::{ gpu_preprocessing::{ self, GpuPreprocessingSupport, IndirectBatchSet, IndirectParametersBuffers, - IndirectParametersCpuMetadata, IndirectParametersIndexed, IndirectParametersNonIndexed, + IndirectParametersIndexed, IndirectParametersMetadata, IndirectParametersNonIndexed, InstanceInputUniformBuffer, UntypedPhaseIndirectParametersBuffers, }, no_gpu_preprocessing, GetBatchData, GetFullBatchData, NoAutomaticBatching, @@ -3005,12 +3005,16 @@ impl GetFullBatchData for MeshPipeline { phase_indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers, indirect_parameters_offset: u32, ) { - let indirect_parameters = IndirectParametersCpuMetadata { + let indirect_parameters = 
IndirectParametersMetadata { base_output_index, batch_set_index: match batch_set_index { Some(batch_set_index) => u32::from(batch_set_index), None => !0, }, + // These fields are filled in by the GPU: + mesh_index: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed { diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index 496cd6b6f84ab..ea939a92353ce 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -14,9 +14,7 @@ // are known as *early mesh preprocessing* and *late mesh preprocessing* // respectively. -#import bevy_pbr::mesh_preprocess_types::{ - IndirectParametersCpuMetadata, IndirectParametersGpuMetadata, MeshInput, PreprocessWorkItem -} +#import bevy_pbr::mesh_preprocess_types::{IndirectParametersMetadata, MeshInput, PreprocessWorkItem} #import bevy_pbr::mesh_types::{ Mesh, MESH_FLAGS_AABB_BASED_VISIBILITY_RANGE_BIT, MESH_FLAGS_NO_FRUSTUM_CULLING_BIT, MESH_FLAGS_VISIBILITY_RANGE_INDEX_BITS @@ -84,11 +82,8 @@ struct Immediates { #ifdef INDIRECT // The array of indirect parameters for drawcalls. -@group(0) @binding(7) var indirect_parameters_cpu_metadata: - array; - -@group(0) @binding(8) var indirect_parameters_gpu_metadata: - array; +@group(0) @binding(7) var indirect_parameters_metadata: + array; #endif #ifdef FRUSTUM_CULLING @@ -182,7 +177,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // building shader can access it. 
#ifndef LATE_PHASE if (instance_index == 0u) || (work_items[instance_index - 1].output_or_indirect_parameters_index != indirect_parameters_index) { - indirect_parameters_gpu_metadata[indirect_parameters_index].mesh_index = input_index; + indirect_parameters_metadata[indirect_parameters_index].mesh_index = input_index; } #endif // LATE_PHASE @@ -362,20 +357,20 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { #ifdef INDIRECT #ifdef LATE_PHASE let batch_output_index = atomicLoad( - &indirect_parameters_gpu_metadata[indirect_parameters_index].early_instance_count + &indirect_parameters_metadata[indirect_parameters_index].early_instance_count ) + atomicAdd( - &indirect_parameters_gpu_metadata[indirect_parameters_index].late_instance_count, + &indirect_parameters_metadata[indirect_parameters_index].late_instance_count, 1u ); #else // LATE_PHASE let batch_output_index = atomicAdd( - &indirect_parameters_gpu_metadata[indirect_parameters_index].early_instance_count, + &indirect_parameters_metadata[indirect_parameters_index].early_instance_count, 1u ); #endif // LATE_PHASE let mesh_output_index = - indirect_parameters_cpu_metadata[indirect_parameters_index].base_output_index + + indirect_parameters_metadata[indirect_parameters_index].base_output_index + batch_output_index; #endif // INDIRECT diff --git a/crates/bevy_pbr/src/render/unpack_bins.wgsl b/crates/bevy_pbr/src/render/unpack_bins.wgsl index 4161cc2f0e7d8..ba9077a3e1af9 100644 --- a/crates/bevy_pbr/src/render/unpack_bins.wgsl +++ b/crates/bevy_pbr/src/render/unpack_bins.wgsl @@ -8,7 +8,7 @@ // the position of the command in the indirect parameters buffer that will draw // that entity. -#import bevy_pbr::mesh_preprocess_types::PreprocessWorkItem +#import bevy_pbr::mesh_preprocess_types::{BinMetadata, PreprocessWorkItem} // Information needed to unpack bins belonging to a single batch set. 
struct BinUnpackingMetadata { @@ -53,10 +53,16 @@ struct BinnedMeshInstance { // The output list of `PreprocessWorkItem`s. @group(0) @binding(2) var preprocess_work_items: array; -// A mapping from each `bin_index` to the index of the GPU indirect parameters -// for this bin, relative to the start of the indirect parameters for this batch +// The bin metadata, which contains the index of the GPU indirect parameters for +// this bin, relative to the start of the indirect parameters for this batch // set. -@group(0) @binding(3) var bin_index_to_indirect_parameters_offset: array; +// +// This is indexed by the bin metadata index below. +@group(0) @binding(3) var bin_metadata: array; + +// A mapping from the bin index to the metadata index within the `bin_metadata` +// buffer. +@group(0) @binding(4) var bin_index_to_bin_metadata_index: array; @compute @workgroup_size(64) @@ -73,7 +79,8 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // Look up the indirect parameters index for this bin, relative to the first // indirect parameters offset for this batch set. - let indirect_parameters_offset = bin_index_to_indirect_parameters_offset[bin_index]; + let bin_metadata_index = bin_index_to_bin_metadata_index[bin_index]; + let indirect_parameters_offset = bin_metadata[bin_metadata_index].indirect_parameters_offset; // Determine the location we should write the work item to. 
let output_index = bin_unpacking_metadata.base_output_work_item_index + global_id; diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index a0b80f43dc147..4742a66024be1 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -72,7 +72,7 @@ impl Plugin for BatchingPlugin { }) .init_gpu_resource::() .allow_ambiguous_resource::() - .init_gpu_resource::() + .init_gpu_resource::() .add_systems( Render, write_indirect_parameters_buffers.in_set(RenderSystems::PrepareResourcesFlush), @@ -762,10 +762,10 @@ pub struct PreprocessWorkItem { pub input_index: u32, /// In direct mode, the index of the mesh uniform; in indirect mode, the - /// index of the [`IndirectParametersGpuMetadata`]. + /// index of the [`IndirectParametersMetadata`]. /// /// In indirect mode, this is the index of the - /// [`IndirectParametersGpuMetadata`] in the + /// [`IndirectParametersMetadata`] in the /// `IndirectParametersBuffers::indexed_metadata` or /// `IndirectParametersBuffers::non_indexed_metadata`. pub output_or_indirect_parameters_index: u32, @@ -807,13 +807,30 @@ pub struct IndirectParametersNonIndexed { pub first_instance: u32, } -/// A structure, initialized on CPU and read on GPU, that contains metadata -/// about each batch. +impl MeshClassIndirectParameters for IndirectParametersIndexed { + fn debug_label() -> &'static str { + "indexed" + } +} + +impl MeshClassIndirectParameters for IndirectParametersNonIndexed { + fn debug_label() -> &'static str { + "non-indexed" + } +} + +/// A structure, written and read on GPU, that records how many instances of +/// each mesh are actually to be drawn. +/// +/// The GPU mesh preprocessing shader increments the +/// [`Self::early_instance_count`] and [`Self::late_instance_count`] as it +/// determines that meshes are visible. 
The indirect parameter building shader +/// reads this metadata in order to construct the indirect draw parameters. /// /// Each batch will have one instance of this structure. #[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)] #[repr(C)] -pub struct IndirectParametersCpuMetadata { +pub struct IndirectParametersMetadata { /// The index of the first instance of this mesh in the array of /// `MeshUniform`s. /// @@ -828,38 +845,25 @@ pub struct IndirectParametersCpuMetadata { /// /// A *batch set* is a set of meshes that may be multi-drawn together. /// Multiple batches (and therefore multiple instances of - /// [`IndirectParametersGpuMetadata`] structures) can be part of the same - /// batch set. + /// [`IndirectParametersMetadata`] structures) can be part of the same batch + /// set. pub batch_set_index: u32, -} -/// A structure, written and read on GPU, that records how many instances of -/// each mesh are actually to be drawn. -/// -/// The GPU mesh preprocessing shader increments the -/// [`Self::early_instance_count`] and [`Self::late_instance_count`] as it -/// determines that meshes are visible. The indirect parameter building shader -/// reads this metadata in order to construct the indirect draw parameters. -/// -/// Each batch will have one instance of this structure. -#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)] -#[repr(C)] -pub struct IndirectParametersGpuMetadata { /// The index of the first mesh in this batch in the array of /// `MeshInputUniform`s. pub mesh_index: u32, /// The number of instances that were judged visible last frame. /// - /// The CPU sets this value to 0, and the GPU mesh preprocessing shader - /// increments it as it culls mesh instances. + /// The uniform allocation pass sets this value to 0, and the mesh + /// preprocessing shader increments it as it culls mesh instances. 
pub early_instance_count: u32, /// The number of instances that have been judged potentially visible this /// frame that weren't in the last frame's potentially visible set. /// - /// The CPU sets this value to 0, and the GPU mesh preprocessing shader - /// increments it as it culls mesh instances. + /// The uniform allocation pass sets this value to 0, and the mesh + /// preprocessing shader increments it as it culls mesh instances. pub late_instance_count: u32, } @@ -899,7 +903,7 @@ pub struct IndirectBatchSet { /// (`multi_draw_indirect`, `multi_draw_indirect_count`) use to draw the scene. /// /// In addition to the indirect draw buffers themselves, this structure contains -/// the buffers that store [`IndirectParametersGpuMetadata`], which are the +/// the buffers that store [`IndirectParametersMetadata`], which are the /// structures that culling writes to so that the indirect parameter building /// pass can determine how many meshes are actually to be drawn. /// @@ -953,21 +957,95 @@ impl Default for GpuBinUnpackingMetadata { } } +/// Information about each bin in a batch set. +/// +/// This is maintained by the CPU and cached for bins that don't change from +/// frame to frame. +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct GpuBinMetadata { + /// The index of the indirect parameters for this bin, relative to the first + /// indirect parameter index for the batch set. + /// + /// That is, the final indirect parameters index for this bin is + /// `first_indirect_parameters_index` in the `UniformAllocationMetadata` + /// plus this value. + pub indirect_parameters_offset: u32, + + /// The index of the bin that this metadata corresponds to. + /// + /// The GPU doesn't use this, but the CPU does in order to perform the + /// reverse mapping from bin metadata index back to the bin. We could store + /// this in a non-GPU-accessible buffer, but I figured the extra complexity + /// wasn't worth it. 
+ pub bin_index: u32, + + /// The number of mesh instances in this bin. + pub instance_count: u32, +} + +/// Information needed to allocate `MeshUniform`s on the GPU. +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct GpuUniformAllocationMetadata { + /// The index of this batch set in the `IndirectBatchSet` array. + /// + /// We write this into the `indirect_parameters_metadata`. + pub batch_set_index: u32, + + /// The number of bins (a.k.a. draws, a.k.a. batches) in this batch set. + pub bin_count: u32, + + /// The index of the first set of indirect parameters for this batch set. + /// + /// This is also the index of the first `IndirectParametersMetadata`, as + /// that's a parallel array with the indirect parameters. + pub first_indirect_parameters_index: u32, + + /// The index of the first `MeshUniform` slot for this batch set. + pub first_output_mesh_uniform_index: u32, + + /// Padding. + pub pad: [u32; 60], +} + /// CPU-side information needed to construct the bind groups and issue the -/// dispatch for the `unpack_bins` shader, for a single batch set. -pub struct BinUnpackingJob { +/// dispatch for the `unpack_bins` and `allocate_uniforms` shaders, for a single +/// batch set. +/// +/// Because those two shaders are always invoked together, we combine the +/// information together for efficiency's sake. +pub struct SceneUnpackingJob { /// The GPU buffer of `GpuRenderBinnedMeshInstance`s corresponding to the /// mesh instances that this batch set contains. + /// + /// This is used in the `unpack_bins` shader. pub render_binned_mesh_instance_buffer: Buffer, - /// The GPU buffer that maps each bin index to the index of the indirect - /// drawing parameters for that bin, relative to the first such indirect - /// drawing parameters for this batch set. 
- pub bin_index_to_indirect_parameters_offset_buffer: Buffer, + /// The GPU buffer that stores various metadata for each bin, including the + /// indirect parameters offset and the instance count. + /// + /// This is used in both the `allocate_uniforms` and `unpack_bins` shaders. + pub bin_metadata_buffer: Buffer, + /// A temporary GPU buffer that stores the mesh uniform index of the last + /// instance plus one for each workgroup (i.e. for each 256-bin chunk). + /// + /// This is accumulated in the second stage of the `allocate_uniforms` + /// shader and written out in the third. + pub fan_buffer: Buffer, + /// A GPU buffer that maps the stable index of each bin to the index of the + /// metadata in the [`Self::bin_metadata_buffer`]. + pub bin_index_to_bin_metadata_index_buffer: Buffer, /// The index of this batch set's [`GpuBinUnpackingMetadata`] in the - /// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer. + /// [`SceneUnpackingBuffers::bin_unpacking_metadata`] buffer. pub bin_unpacking_metadata_index: BinUnpackingMetadataIndex, + /// The index of this batch set's [`GpuUniformAllocationMetadata`] in the + /// [`SceneUnpackingBuffers::uniform_allocation_metadata`] buffer. + pub uniform_allocation_metadata_index: UniformAllocationMetadataIndex, /// The total number of mesh instances in this batch set. pub mesh_instance_count: u32, + /// The total number of bins (i.e. draws, i.e. separate meshes) in this + /// batch set. + pub bin_count: u32, } /// The buffers containing all the information that indirect draw commands use @@ -1123,21 +1201,27 @@ impl UntypedPhaseIndirectParametersBuffers { } /// A resource, part of the render world, that holds all GPU buffers used for -/// the bin unpacking shader. +/// the bin unpacking and uniform allocation shaders. 
#[derive(Resource)] -pub struct BinUnpackingBuffers { +pub struct SceneUnpackingBuffers { + /// A buffer containing all the uniforms needed to run the uniform + /// allocation compute shader for each batch set. + pub uniform_allocation_metadata: RawBufferVec, /// A buffer containing all the uniforms needed to run the bin unpacking /// compute shader for each batch set. pub bin_unpacking_metadata: RawBufferVec, /// Per-view-phase buffers for the bin unpacking shader. - pub view_phase_buffers: HashMap, + pub view_phase_buffers: HashMap, } -impl Default for BinUnpackingBuffers { +impl Default for SceneUnpackingBuffers { fn default() -> Self { + let mut uniform_allocation_metadata = RawBufferVec::new(BufferUsages::UNIFORM); + uniform_allocation_metadata.set_label(Some("uniform allocation metadata buffer")); let mut bin_unpacking_metadata = RawBufferVec::new(BufferUsages::UNIFORM); bin_unpacking_metadata.set_label(Some("bin unpacking metadata buffer")); - BinUnpackingBuffers { + SceneUnpackingBuffers { + uniform_allocation_metadata, bin_unpacking_metadata, view_phase_buffers: HashMap::default(), } @@ -1147,18 +1231,18 @@ impl Default for BinUnpackingBuffers { /// GPU buffers for the bin unpacking shader that are specific to each phase of /// each view. #[derive(Default)] -pub struct ViewPhaseBinUnpackingBuffers { +pub struct ViewPhaseSceneUnpackingBuffers { /// Metadata that describes each unpacking job, specific to indexed meshes. - pub indexed_unpacking_jobs: Vec, + pub indexed_unpacking_jobs: Vec, /// Metadata that describes each unpacking job, specific to non-indexed /// meshes. - pub non_indexed_unpacking_jobs: Vec, + pub non_indexed_unpacking_jobs: Vec, } /// A key used to look up the bin unpacking buffers for a specific phase of a /// specific view. #[derive(Clone, Copy, PartialEq, Eq, Hash)] -pub struct BinUnpackingBuffersKey { +pub struct SceneUnpackingBuffersKey { /// The ID of the phase. pub phase: TypeId, /// The entity ID of the view. 
@@ -1166,19 +1250,33 @@ pub struct BinUnpackingBuffersKey { } /// The index of the metadata corresponding to one bin unpacking job in the -/// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer. +/// [`SceneUnpackingBuffers::bin_unpacking_metadata`] buffer. #[derive(Clone, Copy, Deref, DerefMut)] pub struct BinUnpackingMetadataIndex(pub NonMaxU32); impl BinUnpackingMetadataIndex { /// Returns the byte offset within the - /// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer corresponding to + /// [`SceneUnpackingBuffers::bin_unpacking_metadata`] buffer corresponding to /// this index. pub fn uniform_offset(&self) -> u32 { self.get() * size_of::() as u32 } } +/// The index of the metadata corresponding to one uniform allocation job in the +/// [`SceneUnpackingBuffers::uniform_allocation_metadata`] buffer. +#[derive(Clone, Copy, Deref, DerefMut)] +pub struct UniformAllocationMetadataIndex(pub NonMaxU32); + +impl UniformAllocationMetadataIndex { + /// Returns the byte offset within the + /// [`SceneUnpackingBuffers::uniform_allocation_metadata`] buffer + /// corresponding to this index. + pub fn uniform_offset(&self) -> u32 { + self.get() * size_of::() as u32 + } +} + /// The buffers containing all the information that indirect draw commands use /// to draw the scene, for a single mesh class (indexed or non-indexed), for a /// single phase. @@ -1193,21 +1291,13 @@ where /// it to perform the draws. indirect_draw_parameters: UninitBufferVec, - /// The GPU buffer that holds the data used to construct indirect draw - /// parameters for meshes. - /// - /// The GPU mesh preprocessing shader writes to this buffer, and the - /// indirect parameters building shader reads this buffer to construct the - /// indirect draw parameters. - cpu_metadata: RawBufferVec, - /// The GPU buffer that holds data built by the GPU used to construct /// indirect draw parameters for meshes. 
/// /// The GPU mesh preprocessing shader writes to this buffer, and the /// indirect parameters building shader reads this buffer to construct the /// indirect draw parameters. - gpu_metadata: UninitBufferVec, + metadata: PartialBufferVec, /// The GPU buffer that holds the number of indirect draw commands for each /// phase of each view, for meshes. @@ -1218,9 +1308,15 @@ where batch_sets: RawBufferVec, } +/// GPU-side indirect draw parameters for either indexed or non-indexed meshes. +pub trait MeshClassIndirectParameters: Clone + ShaderSize + WriteInto { + /// Either the string "indexed" or "non-indexed". + fn debug_label() -> &'static str; +} + impl MeshClassIndirectParametersBuffers where - IP: Clone + ShaderSize + WriteInto, + IP: MeshClassIndirectParameters, { fn new( allow_copies_from_indirect_parameter_buffers: bool, @@ -1232,8 +1328,10 @@ where MeshClassIndirectParametersBuffers { indirect_draw_parameters: UninitBufferVec::new(indirect_parameter_buffer_usages), - cpu_metadata: RawBufferVec::new(BufferUsages::STORAGE), - gpu_metadata: UninitBufferVec::new(BufferUsages::STORAGE), + metadata: PartialBufferVec::new( + BufferUsages::STORAGE, + format!("{} indirect parameters metadata buffer", IP::debug_label()), + ), batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages), } } @@ -1249,16 +1347,6 @@ where self.indirect_draw_parameters.buffer() } - /// Returns the GPU buffer that holds the CPU-constructed data used to - /// construct indirect draw parameters for meshes. - /// - /// The CPU writes to this buffer, and the indirect parameters building - /// shader reads this buffer to construct the indirect draw parameters. - #[inline] - pub fn cpu_metadata_buffer(&self) -> Option<&Buffer> { - self.cpu_metadata.buffer() - } - /// Returns the GPU buffer that holds the GPU-constructed data used to /// construct indirect draw parameters for meshes. 
/// @@ -1266,8 +1354,8 @@ where /// indirect parameters building shader reads this buffer to construct the /// indirect draw parameters. #[inline] - pub fn gpu_metadata_buffer(&self) -> Option<&Buffer> { - self.gpu_metadata.buffer() + pub fn metadata_buffer(&self) -> Option<&Buffer> { + self.metadata.buffer() } /// Returns the GPU buffer that holds the number of indirect draw commands @@ -1283,24 +1371,20 @@ where /// Reserves space for `count` new batches. /// - /// This allocates in the [`Self::cpu_metadata`], [`Self::gpu_metadata`], - /// and [`Self::indirect_draw_parameters`] buffers. + /// This allocates in the [`Self::metadata`] and + /// [`Self::indirect_draw_parameters`] buffers. fn allocate(&mut self, count: u32) -> u32 { let length = self.indirect_draw_parameters.len(); - self.cpu_metadata.reserve_internal(count as usize); - self.gpu_metadata.add_multiple(count as usize); + self.metadata.push_multiple_init(count as usize); for _ in 0..count { self.indirect_draw_parameters.add(); - self.cpu_metadata - .push(IndirectParametersCpuMetadata::default()); } length as u32 } - /// Sets the [`IndirectParametersCpuMetadata`] for the mesh at the given - /// index. - pub fn set(&mut self, index: u32, value: IndirectParametersCpuMetadata) { - self.cpu_metadata.set(index, value); + /// Sets the [`IndirectParametersMetadata`] for the mesh at the given index. + pub fn set(&mut self, index: u32, value: IndirectParametersMetadata) { + self.metadata.set(index as usize, value); } /// Returns the number of batches corresponding to meshes that are currently @@ -1313,8 +1397,7 @@ where /// Clears out all the buffers in preparation for a new frame. pub fn clear(&mut self) { self.indirect_draw_parameters.clear(); - self.cpu_metadata.clear(); - self.gpu_metadata.clear(); + self.metadata.clear(); self.batch_sets.clear(); } } @@ -2037,6 +2120,10 @@ pub fn batch_and_prepare_binned_render_phase( as u32, // Unused. first_work_item_index: 0, + // Unused. 
+ first_indirect_parameters_index: 0, + // Unused. + first_output_mesh_uniform_index: 0, }); } } @@ -2154,15 +2241,20 @@ where &mut self, batch_set: &RenderMultidrawableBatchSet, data_buffer: &mut UninitBufferVec, - indexed_work_item_buffer: &mut PartialBufferVec, + work_item_buffer: &mut PartialBufferVec, mesh_class_buffers: &mut MeshClassIndirectParametersBuffers, batch_sets: &mut Vec>, ) where IP: Clone + ShaderSize + WriteInto, { + // Note that this function is O(1) and doesn't have any loops over the + // meshes or mesh instances in this batch set. This is very important + // for proper GPU-driven rendering, as we want to have no overhead on + // the CPU for meshes that didn't change from the last frame. + let current_indexed_batch_set_index = self.batch_set_index; let current_output_index = data_buffer.len() as u32; - let first_work_item_index = indexed_work_item_buffer.len() as u32; + let first_work_item_index = work_item_buffer.len() as u32; let indirect_parameters_base = self.indirect_parameters_index; @@ -2180,56 +2272,25 @@ where .representative_entity() .unwrap_or(MainEntity::from(Entity::PLACEHOLDER)); - // Calculate where the mesh uniform (not the mesh input uniform) should - // go for each mesh instance in our bins. This entails performing a - // prefix sum on the number of elements in each bin. First, initialize - // each base output index to zero. - // - // TODO: Eventually, this should be done on GPU with a prefix sum. We - // don't want any per-bin work to be done on CPU for bins that didn't - // change since the last frame. - let cpu_metadata_offset = mesh_class_buffers.cpu_metadata.len() as u32; - for _ in 0..batch_set.bin_count() { - mesh_class_buffers - .cpu_metadata - .push(IndirectParametersCpuMetadata { - // We fill this in later. - base_output_index: 0, - batch_set_index: self.batch_set_index, - }); - } - - // Next, traverse each bin and allocate the position of each mesh - // uniform in it. 
Additionally, reserve space for the mesh instances in - // the buffers. - for bin_index in batch_set.bin_key_to_bin_index.values() { - let bin = batch_set.bin(*bin_index).expect("Bin not present"); + // Calculate where the indirect parameters metadata should go for this + // batch set. The uniform allocation shader will create the actual + // metadata. + let bin_count = batch_set.bin_count(); + let first_metadata_index = + mesh_class_buffers.metadata.push_multiple_uninit(bin_count) as u32; - // Allocate the indirect parameters. - let indirect_parameters_offset = *batch_set - .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .get(bin_index.0) - .unwrap(); - mesh_class_buffers.cpu_metadata.values_mut() - [cpu_metadata_offset as usize + indirect_parameters_offset as usize] - .base_output_index = data_buffer.len() as u32; - - // Reserve space for the appropriate number of entities in the work - // item buffer and data buffer. Also, advance the output index and - // work item count. - let bin_entity_count = bin.entity_to_binned_mesh_instance_index.len(); - indexed_work_item_buffer.push_multiple_uninit(bin_entity_count); - data_buffer.add_multiple(bin_entity_count); - self.work_item_count += bin_entity_count; - } + // Next, reserve space for the mesh uniforms and work items that this + // batch set will need. + let first_output_mesh_uniform_index = + data_buffer.add_multiple(batch_set.instance_count as usize) as u32; + work_item_buffer.push_multiple_uninit(batch_set.instance_count as usize); + self.work_item_count += batch_set.instance_count as usize; // Reserve space for the bins in this batch set in the GPU buffers. 
- let bin_count = batch_set.bin_count(); - mesh_class_buffers.gpu_metadata.add_multiple(bin_count); - mesh_class_buffers + let first_indirect_parameters_index = mesh_class_buffers .indirect_draw_parameters - .add_multiple(bin_count); + .add_multiple(bin_count) as u32; + debug_assert_eq!(first_metadata_index, first_indirect_parameters_index); // Write the information the GPU will need about this batch set. mesh_class_buffers.batch_sets.push(IndirectBatchSet { @@ -2254,6 +2315,8 @@ where batch_count: self.indirect_parameters_index - indirect_parameters_base, index: current_indexed_batch_set_index, first_work_item_index, + first_indirect_parameters_index, + first_output_mesh_uniform_index, }); } } @@ -2323,7 +2386,7 @@ pub fn write_batched_instance_buffers( render_queue: Res, gpu_array_buffer: ResMut>, pipeline_cache: Res, - mut bin_unpacking_buffers: ResMut, + mut bin_unpacking_buffers: ResMut, mut sparse_buffer_update_jobs: ResMut, mut sparse_buffer_update_bind_groups: ResMut, sparse_buffer_update_pipelines: Res, @@ -2339,6 +2402,8 @@ pub fn write_batched_instance_buffers( let render_device = &*render_device; let render_queue = &*render_queue; + let bin_unpacking_buffers = &mut *bin_unpacking_buffers; + ComputeTaskPool::get().scope(|scope| { scope.spawn(async { let _span = bevy_log::info_span!("write_current_input_buffers").entered(); @@ -2401,6 +2466,17 @@ pub fn write_batched_instance_buffers( }); } } + + scope.spawn(async { + bin_unpacking_buffers + .bin_unpacking_metadata + .write_buffer(render_device, render_queue); + }); + scope.spawn(async { + bin_unpacking_buffers + .uniform_allocation_metadata + .write_buffer(render_device, render_queue); + }); }); // Create the resources necessary to perform sparse uploads of the current @@ -2412,10 +2488,6 @@ pub fn write_batched_instance_buffers( &mut sparse_buffer_update_bind_groups, &sparse_buffer_update_pipelines, ); - - bin_unpacking_buffers - .bin_unpacking_metadata - .write_buffer(render_device, render_queue); } 
/// Writes the bin data for each render phase to the GPU. @@ -2425,7 +2497,7 @@ pub fn write_batched_instance_buffers( pub fn write_binned_instance_buffers( mut views: Query<&ExtractedView>, mut view_binned_render_phases: ResMut>, - bin_unpacking_buffers: ResMut, + bin_unpacking_buffers: ResMut, render_device: Res, render_queue: Res, ) where @@ -2461,7 +2533,7 @@ pub fn write_binned_instance_buffers( // combination. let view_phase_bin_unpacking_buffers = bin_unpacking_buffers .view_phase_buffers - .entry(BinUnpackingBuffersKey { + .entry(SceneUnpackingBuffersKey { phase: phase_type_id, view: extracted_view.retained_view_entity, }) @@ -2485,7 +2557,7 @@ pub fn write_binned_instance_buffers( // We use the *representative entity* as the key for the later loop to // find the `BatchSetBinUnpackingMetadata`, because it's a unique value // that can be fetched from the `BinnedRenderPhaseBatchSet`. - let mut representative_entity_to_batch_set_bin_unpacking_metadata = + let mut representative_entity_to_batch_set_scene_unpacking_metadata = MainEntityHashMap::default(); for batch_set in batch_sets { @@ -2498,11 +2570,14 @@ pub fn write_binned_instance_buffers( { // Record the batch set bin unpacking metadata for later passes // to use. 
- representative_entity_to_batch_set_bin_unpacking_metadata.insert( + representative_entity_to_batch_set_scene_unpacking_metadata.insert( main_entity, - BatchSetBinUnpackingMetadata { + BatchSetSceneUnpackingMetadata { base_output_work_item_index: batch_set.first_work_item_index, base_indirect_parameters_index: indirect_parameters_range.start, + batch_set_index: batch_set.index, + first_indirect_parameters_index: batch_set.first_indirect_parameters_index, + first_output_mesh_uniform_index: batch_set.first_output_mesh_uniform_index, }, ); } @@ -2517,8 +2592,8 @@ pub fn write_binned_instance_buffers( let Some(representative_entity) = batch_set.representative_entity() else { continue; }; - let Some(bin_unpacking_metadata) = - representative_entity_to_batch_set_bin_unpacking_metadata + let Some(scene_unpacking_metadata) = + representative_entity_to_batch_set_scene_unpacking_metadata .get(&representative_entity) else { continue; @@ -2532,20 +2607,32 @@ pub fn write_binned_instance_buffers( .write_buffer(&render_device, &render_queue); batch_set .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer + .bin_metadata_buffer + .write_buffer(&render_device, &render_queue); + batch_set + .gpu_buffers + .fan_buffer + .write_buffer(&render_device); + batch_set + .gpu_buffers + .bin_index_to_bin_metadata_index_buffer .write_buffer(&render_device, &render_queue); let ( Some(render_bin_entry_buffer), - Some(bin_index_to_indirect_parameters_offset_buffer), + Some(bin_metadata_buffer), + Some(fan_buffer), + Some(bin_index_to_bin_metadata_index_buffer), ) = ( batch_set .gpu_buffers .render_binned_mesh_instance_buffer .buffer(), + batch_set.gpu_buffers.bin_metadata_buffer.buffer(), + batch_set.gpu_buffers.fan_buffer.buffer(), batch_set .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer + .bin_index_to_bin_metadata_index_buffer .buffer(), ) else { @@ -2556,33 +2643,59 @@ pub fn write_binned_instance_buffers( .gpu_buffers .render_binned_mesh_instance_buffer .len() as 
u32; + let bin_count = batch_set.bin_count() as u32; // Build up the `GpuBinUnpackingMetadata` for this batch set. let gpu_bin_unpacking_metadata_index = bin_unpacking_buffers .bin_unpacking_metadata .push(GpuBinUnpackingMetadata { - base_output_work_item_index: bin_unpacking_metadata.base_output_work_item_index, - base_indirect_parameters_index: bin_unpacking_metadata + base_output_work_item_index: scene_unpacking_metadata + .base_output_work_item_index, + base_indirect_parameters_index: scene_unpacking_metadata .base_indirect_parameters_index, binned_mesh_instance_count, pad: [0; _], }); - let Some(gpu_bin_unpacking_metadata_index) = - NonMaxU32::new(gpu_bin_unpacking_metadata_index as u32) + // Build up the `GpuUniformAllocationMetadata` for this batch set. + let gpu_uniform_allocation_metadata_index = bin_unpacking_buffers + .uniform_allocation_metadata + .push(GpuUniformAllocationMetadata { + batch_set_index: scene_unpacking_metadata.batch_set_index, + bin_count, + first_indirect_parameters_index: scene_unpacking_metadata + .first_indirect_parameters_index, + first_output_mesh_uniform_index: scene_unpacking_metadata + .first_output_mesh_uniform_index, + pad: [0; _], + }); + + let ( + Some(gpu_bin_unpacking_metadata_index), + Some(gpu_uniform_allocation_metadata_index), + ) = ( + NonMaxU32::new(gpu_bin_unpacking_metadata_index as u32), + NonMaxU32::new(gpu_uniform_allocation_metadata_index as u32), + ) else { continue; }; // Create the [`SceneUnpackingJob`].
- let job = BinUnpackingJob { + let job = SceneUnpackingJob { render_binned_mesh_instance_buffer: render_bin_entry_buffer.clone(), - bin_index_to_indirect_parameters_offset_buffer: - bin_index_to_indirect_parameters_offset_buffer.clone(), + bin_metadata_buffer: bin_metadata_buffer.clone(), + fan_buffer: fan_buffer.clone(), + bin_index_to_bin_metadata_index_buffer: bin_index_to_bin_metadata_index_buffer + .clone(), bin_unpacking_metadata_index: BinUnpackingMetadataIndex( gpu_bin_unpacking_metadata_index, ), + uniform_allocation_metadata_index: UniformAllocationMetadataIndex( + gpu_uniform_allocation_metadata_index, + ), mesh_instance_count: binned_mesh_instance_count, + bin_count, }; if batch_set_key.indexed() { @@ -2606,20 +2719,28 @@ pub fn write_binned_instance_buffers( }); } -/// Clears out the [`BinUnpackingBuffers`] in preparation for a new frame. -pub fn clear_bin_unpacking_buffers(mut bin_unpacking_buffers: ResMut) { - bin_unpacking_buffers.bin_unpacking_metadata.clear(); +/// Clears out the [`SceneUnpackingBuffers`] in preparation for a new frame. +pub fn clear_scene_unpacking_buffers(mut scene_unpacking_buffers: ResMut) { + scene_unpacking_buffers.bin_unpacking_metadata.clear(); + scene_unpacking_buffers.uniform_allocation_metadata.clear(); } -/// CPU-side metadata needed to drive the bin unpacking compute shader for a -/// single batch set. -struct BatchSetBinUnpackingMetadata { +/// CPU-side metadata needed to drive the uniform allocation and bin unpacking +/// compute shaders for a single batch set. +struct BatchSetSceneUnpackingMetadata { /// The index of the first [`PreprocessWorkItem`] that the compute shader /// dispatch is to write to. base_output_work_item_index: u32, /// The index of the first GPU indirect parameters command for the batch /// set. base_indirect_parameters_index: u32, + /// The index of the batch set in the `indirect_batch_sets` array. 
+ batch_set_index: u32, + /// The index of the indirect parameters for the first bin in the indirect + /// parameters buffer. + first_indirect_parameters_index: u32, + /// The index of the first `MeshUniform` in the mesh uniforms buffer. + first_output_mesh_uniform_index: u32, } pub fn clear_indirect_parameters_buffers( @@ -2655,35 +2776,20 @@ pub fn write_indirect_parameters_buffers( }); scope.spawn(async { - let _span = bevy_log::info_span!("indexed_cpu_metadata").entered(); + let _span = bevy_log::info_span!("indexed_metadata").entered(); phase_indirect_parameters_buffers .indexed - .cpu_metadata + .metadata .write_buffer(render_device, render_queue); }); scope.spawn(async { - let _span = bevy_log::info_span!("non_indexed_cpu_metadata").entered(); + let _span = bevy_log::info_span!("non_indexed_metadata").entered(); phase_indirect_parameters_buffers .non_indexed - .cpu_metadata + .metadata .write_buffer(render_device, render_queue); }); - scope.spawn(async { - let _span = bevy_log::info_span!("non_indexed_gpu_metadata").entered(); - phase_indirect_parameters_buffers - .non_indexed - .gpu_metadata - .write_buffer(render_device); - }); - scope.spawn(async { - let _span = bevy_log::info_span!("indexed_gpu_metadata").entered(); - phase_indirect_parameters_buffers - .indexed - .gpu_metadata - .write_buffer(render_device); - }); - scope.spawn(async { let _span = bevy_log::info_span!("indexed_batch_sets").entered(); phase_indirect_parameters_buffers diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index d0d00367d5053..f8f36daf5eec0 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -164,12 +164,12 @@ pub trait GetFullBatchData: GetBatchData { query_item: MainEntity, ) -> Option; - /// Writes the [`gpu_preprocessing::IndirectParametersGpuMetadata`] - /// necessary to draw this batch into the given metadata buffer at the given - /// index. 
+ /// Writes the [`gpu_preprocessing::IndirectParametersMetadata`] necessary + /// to draw this batch into the given metadata buffer at the given index. /// /// This is only used if GPU culling is enabled (which requires GPU - /// preprocessing). + /// preprocessing), and only for phase items that don't support GPU uniform + /// allocation yet. /// /// * `indexed` is true if the mesh is indexed or false if it's non-indexed. /// diff --git a/crates/bevy_render/src/occlusion_culling/mesh_preprocess_types.wgsl b/crates/bevy_render/src/occlusion_culling/mesh_preprocess_types.wgsl index abfa658904f49..59b688a6ffb57 100644 --- a/crates/bevy_render/src/occlusion_culling/mesh_preprocess_types.wgsl +++ b/crates/bevy_render/src/occlusion_culling/mesh_preprocess_types.wgsl @@ -51,18 +51,50 @@ struct IndirectParametersNonIndexed { first_instance: u32, } -struct IndirectParametersCpuMetadata { +// Information needed to construct indirect draw parameters for a single draw. +// +// Note that this is per-*draw* (i.e. per-mesh), not per-mesh-instance or +// per-batch-set. A single multi-draw indirect call can perform multiple draws. +// +// Typically, the uniform allocation and mesh preprocessing phases fill in this +// structure. However, parts of it may be filled in on the CPU for objects that +// aren't multidrawn. +struct IndirectParametersMetadata { + // The index of the first `MeshUniform` for this draw in the mesh uniform + // buffer. + // + // `MeshUniform`s for all instances are stored consecutively. + // + // This is filled in in the `allocate_uniforms` shader, or on the CPU when + // multidraw isn't in use. base_output_index: u32, + + // The index of this batch set in the `IndirectBatchSet` array. + // + // This is filled in in the `allocate_uniforms` shader, or on the CPU when + // multidraw isn't in use. batch_set_index: u32, -} -struct IndirectParametersGpuMetadata { + // The index of the mesh in the `MeshInput` buffer. 
+ // + // The mesh preprocessing shader fills this in. mesh_index: u32, + #ifdef WRITE_INDIRECT_PARAMETERS_METADATA + // The number of instances that were visible last frame (if occlusion + // culling is in use) or that were visible at all (if occlusion culling + // isn't in use). early_instance_count: atomic, + // The number of instances that were visible this frame if occlusion culling + // is in use. late_instance_count: atomic, #else // WRITE_INDIRECT_PARAMETERS_METADATA + // The number of instances that were visible last frame (if occlusion + // culling is in use) or that were visible at all (if occlusion culling + // isn't in use). early_instance_count: u32, + // The number of instances that were visible this frame if occlusion culling + // is in use. late_instance_count: u32, #endif // WRITE_INDIRECT_PARAMETERS_METADATA } @@ -81,4 +113,29 @@ struct PreprocessWorkItem { // indirect mode, the index of the `IndirectParameters` in // `indirect_parameters` that we write to. output_or_indirect_parameters_index: u32, -} \ No newline at end of file +} + +// Information about each bin in a batch set. +// +// This is maintained by the CPU and cached for bins that don't change from +// frame to frame. +struct BinMetadata { + // The index of the indirect parameters for this bin, relative to the first + // indirect parameter index for the batch set. + // + // That is, the final indirect parameters index for this bin is + // `first_indirect_parameters_index` in the `UniformAllocationMetadata` plus + // this value. + indirect_parameters_offset: u32, + + // The index of the bin that this metadata corresponds to. + // + // The GPU doesn't use this, but the CPU does in order to perform the + // reverse mapping from bin metadata index back to the bin. We could store + // this in a non-GPU-accessible buffer, but I figured the extra complexity + // wasn't worth it. + bin_index: u32, + + // The number of mesh instances in this bin. 
+ instance_count: u32, +}; diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index 0d819dab54f0a..b422762ceb307 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -44,10 +44,10 @@ pub use rangefinder::*; use wgpu::{BufferUsages, Features}; use crate::batching::gpu_preprocessing::{ - GpuPreprocessingMode, GpuPreprocessingSupport, PhaseBatchedInstanceBuffers, + GpuBinMetadata, GpuPreprocessingMode, GpuPreprocessingSupport, PhaseBatchedInstanceBuffers, PhaseIndirectParametersBuffers, }; -use crate::render_resource::RawBufferVec; +use crate::render_resource::{RawBufferVec, UninitBufferVec}; use crate::renderer::RenderDevice; use crate::sync_world::{MainEntity, MainEntityHashMap}; use crate::view::{ExtractedView, RetainedViewEntity}; @@ -78,7 +78,6 @@ use core::{ hash::Hash, iter, marker::PhantomData, - mem, ops::{Range, RangeBounds}, }; use smallvec::SmallVec; @@ -207,8 +206,7 @@ impl RenderMultidrawableBin { /// The index of a mesh instance in the /// [`RenderMultidrawableBatchSetGpuBuffers::render_binned_mesh_instance_buffer`] -/// and [`RenderMultidrawableBatchSet::render_binned_mesh_instances_cpu`] -/// arrays. +/// array. /// /// These two arrays are parallel and always have the same length. /// @@ -218,6 +216,9 @@ impl RenderMultidrawableBin { #[derive(Clone, Copy, Debug, Deref, DerefMut)] pub(crate) struct RenderBinnedMeshInstanceIndex(pub(crate) u32); +#[derive(Clone, Copy, Debug, Deref, DerefMut)] +pub(crate) struct RenderBinMetadataIndex(pub(crate) u32); + /// The GPU buffers that go along with [`RenderMultidrawableBatchSet`]. /// /// The bin unpacking shader uses these in order to produce @@ -230,31 +231,20 @@ pub struct RenderMultidrawableBatchSetGpuBuffers { /// (`RenderBinnedMeshInstanceIndex`) to its input uniform index /// ([`InputUniformIndex`]) and bin index (`RenderBinIndex`). 
pub render_binned_mesh_instance_buffer: RawBufferVec, - /// A mapping from each `RenderBinnedMeshInstanceIndex` to the offset of its - /// indirect draw parameters. - pub bin_index_to_indirect_parameters_offset_buffer: RawBufferVec, -} -/// Information about each binned mesh instance that the -/// [`RenderMultidrawableBatchSet`] keeps on CPU. -#[derive(Clone, Copy)] -pub(crate) struct CpuRenderBinnedMeshInstance { - /// The entity associated with this mesh instance. - pub(crate) main_entity: MainEntity, + /// A mapping from each `RenderBinMetadataIndex` to the offset of its indirect draw + /// parameters. + pub bin_metadata_buffer: RawBufferVec, - /// The index of the bin that the entity is in. + /// A mapping from each `RenderBinIndex` to the associated bin metadata + /// index. /// - /// Note that bin indices are stable from frame to frame. - bin_index: RenderBinIndex, -} + /// This array isn't necessarily tightly packed (i.e. it can have holes). + pub bin_index_to_bin_metadata_index_buffer: RawBufferVec, -impl Default for CpuRenderBinnedMeshInstance { - fn default() -> Self { - CpuRenderBinnedMeshInstance { - main_entity: MainEntity::from(Entity::PLACEHOLDER), - bin_index: RenderBinIndex::default(), - } - } + /// A temporary buffer used to store intermediate sums during the uniform + /// allocation phase. 
+ pub fan_buffer: UninitBufferVec, } impl RenderMultidrawableBatchSetGpuBuffers { @@ -262,14 +252,19 @@ impl RenderMultidrawableBatchSetGpuBuffers { fn new() -> RenderMultidrawableBatchSetGpuBuffers { let mut render_bin_entry_buffer = RawBufferVec::new(BufferUsages::STORAGE); render_bin_entry_buffer.set_label(Some("render bin entry buffer")); - let mut bin_index_to_indirect_parameters_offset_buffer = - RawBufferVec::new(BufferUsages::STORAGE); - bin_index_to_indirect_parameters_offset_buffer - .set_label(Some("bin index to indirect parameters offset buffer")); + let mut bin_metadata_buffer = RawBufferVec::new(BufferUsages::STORAGE); + bin_metadata_buffer.set_label(Some("bin metadata buffer")); + let mut bin_index_to_bin_metadata_index_buffer = RawBufferVec::new(BufferUsages::STORAGE); + bin_index_to_bin_metadata_index_buffer + .set_label(Some("bin-index-to-bin-metadata-index buffer")); + let mut fan_buffer = UninitBufferVec::new(BufferUsages::STORAGE); + fan_buffer.set_label(Some("fan buffer")); RenderMultidrawableBatchSetGpuBuffers { render_binned_mesh_instance_buffer: render_bin_entry_buffer, - bin_index_to_indirect_parameters_offset_buffer, + bin_metadata_buffer, + bin_index_to_bin_metadata_index_buffer, + fan_buffer, } } @@ -277,7 +272,6 @@ impl RenderMultidrawableBatchSetGpuBuffers { fn insert( &mut self, bin: &mut RenderMultidrawableBin, - cpu_binned_mesh_instance_buffer: &mut Vec, main_entity: MainEntity, input_uniform_index: InputUniformIndex, bin_index: RenderBinIndex, @@ -300,28 +294,21 @@ impl RenderMultidrawableBatchSetGpuBuffers { .push(GpuRenderBinnedMeshInstance::default()) as u32, ); - cpu_binned_mesh_instance_buffer.push(CpuRenderBinnedMeshInstance::default()); vacant_entry.insert(render_bin_buffer_index); render_bin_buffer_index } }; - // Place the entry in the instance buffer at the proper spot. Also, save - // the entity and bin index in the CPU-side array. + // Place the entry in the instance buffer at the proper spot. 
self.render_binned_mesh_instance_buffer.values_mut() [render_binned_mesh_instance_buffer_index.0 as usize] = gpu_render_bin_entry; - cpu_binned_mesh_instance_buffer[render_binned_mesh_instance_buffer_index.0 as usize] = - CpuRenderBinnedMeshInstance { - main_entity, - bin_index, - }; - // The GPU-side `render_binned_mesh_instance_buffer` and the CPU-side - // `cpu_binned_mesh_instance_buffer` are parallel arrays and must have - // the same length, so assert that in debug mode. + let bin_metadata_index = + self.bin_index_to_bin_metadata_index_buffer.values()[bin_index.0 as usize]; + self.bin_metadata_buffer.values_mut()[bin_metadata_index as usize].instance_count += 1; debug_assert_eq!( - self.render_binned_mesh_instance_buffer.len(), - cpu_binned_mesh_instance_buffer.len() + self.bin_metadata_buffer.values()[bin_metadata_index as usize].instance_count as usize, + bin.entity_to_binned_mesh_instance_index.len() ); } @@ -339,9 +326,9 @@ impl RenderMultidrawableBatchSetGpuBuffers { fn remove( &mut self, bin: &mut RenderMultidrawableBin, - cpu_binned_mesh_instance_buffer: &mut Vec, + bin_index: RenderBinIndex, entity_to_remove: MainEntity, - ) -> Option<(RenderBinnedMeshInstanceIndex, CpuRenderBinnedMeshInstance)> { + ) -> Option<(RenderBinnedMeshInstanceIndex, GpuRenderBinnedMeshInstance)> { // Remove the entity from the `entity_to_binned_mesh_instance_index` // map. 
let old_index = bin @@ -349,29 +336,29 @@ impl RenderMultidrawableBatchSetGpuBuffers { .remove(&entity_to_remove) .expect("Entity not in bin"); + let bin_metadata_index = + self.bin_index_to_bin_metadata_index_buffer.values()[bin_index.0 as usize]; + self.bin_metadata_buffer.values_mut()[bin_metadata_index as usize].instance_count -= 1; + debug_assert_eq!( + self.bin_metadata_buffer.values()[bin_metadata_index as usize].instance_count as usize, + bin.entity_to_binned_mesh_instance_index.len() + ); + // Remove the entity from the reverse // `render_binned_mesh_instance_buffer` list, as well // as the parallel `render_binned_mesh_instance_buffer`. Because binned // mesh instance indices must be contiguous, this requires use of // `swap_remove`. - cpu_binned_mesh_instance_buffer.swap_remove(old_index.0 as usize); self.render_binned_mesh_instance_buffer .swap_remove(old_index.0 as usize); - // Both `render_binned_mesh_instance_buffer` and - // `cpu_binned_mesh_instance_buffer` must be parallel arrays, so assert - // that they have the same length. - debug_assert_eq!( - cpu_binned_mesh_instance_buffer.len(), - self.render_binned_mesh_instance_buffer.len() - ); - // If an entity was displaced (i.e. has a new binned mesh instance index // now), then return that to the caller so that they can perform // whatever bookkeeping is necessary. - cpu_binned_mesh_instance_buffer + self.render_binned_mesh_instance_buffer + .values() .get(old_index.0 as usize) - .map(|entity_indices| (old_index, *entity_indices)) + .map(|gpu_render_binned_mesh_instance| (old_index, *gpu_render_binned_mesh_instance)) } } @@ -383,6 +370,9 @@ impl RenderMultidrawableBatchSetGpuBuffers { #[repr(transparent)] pub(crate) struct RenderBinIndex(pub(crate) u32); +/// The number of threads per workgroup in the `allocate_uniforms` shader. +pub const UNIFORM_ALLOCATION_WORKGROUP_SIZE: u32 = 256; + /// A collection of mesh instances that can be drawn together, sorted into bins. 
/// /// This data structure stores a list of entity indices corresponding to mesh @@ -393,50 +383,61 @@ pub(crate) struct RenderBinIndex(pub(crate) u32); /// of the data structure is as follows: /// /// ```text -/// ┌─ -/// │ ─────┬──────────────┬───── -/// │ │ Mesh Inst. 2 │ -/// │ Binned Mesh ... ├──────────────┤ ... -/// │ Instances │ Entity 8 │ -/// │ ─────┴───┬──────────┴───── -/// │ │ -/// │ │ ┌───────────────────────────────┐ -/// │ │ │ │ -/// │ ▼ ▼ │ -/// │ ┌───────┬───────┬───────┬───── │ -/// │ Bins │ Bin 0 │ Bin 1 │ Bin 2 │ ... │ -/// │ └───────┴───┬───┴───────┴───── │ -/// │ │ │ -/// CPU │ │ │ -/// │ Entity-to- │ ┌──────────┬──────────┬───── │ -/// │ Binned-Mesh- └─►│ Entity 3 │ Entity 8 │ ... │ -/// │ Instance- └──────────┴──────┬───┴───── │ -/// │ Index │ │ -/// │ │ │ -/// │ │ │ -/// │ │ │ -/// │ │ │ -/// │ Indirect- ┌───────┬───────┬───────┬───── │ │ -/// │ Parameters- │ IPO 0 │ IPO 1 │ IPO 2 │ ... │ │ -/// │ Offset-to- └───────┴───────┴───────┴───── │ │ -/// │ Bin-Index ▲ │ │ -/// │ │ │ │ -/// └─ │ │ │ -/// ┌───────┘ │ │ -/// ┌─ │ │ │ -/// │ │ │ │ -/// │ Bin-to- ▼ │ │ -/// │ Indirect- ┌───────┬───────┬───────┬───── │ │ -/// │ Parameters- │ Bin 0 │ Bin 1 │ Bin 2 │ ... │ │ -/// │ Offset └───────┴───────┴───────┴───── │ │ -/// GPU │ Buffer │ │ -/// │ │ │ -/// │ ▼ │ -/// │ Binned Mesh ─────┬──────────────┬──────────────┬───── │ -/// │ Instance ... │ Mesh Inst. 1 │ Mesh Inst. 2 │ ... │ -/// │ Buffer ─────┴──────────────┴───────────┬──┴───── │ -/// │ │ │ -/// └─ └────────────┘ +/// Bin Key to +/// Bin Index +/// +/// │ ... │ +/// ├───────────┤ +/// │ Bin Key A │ Bins +/// ├───────────┤ ┌────────────┐ +/// │ Bin Key B ├───┬─────────────────────────────────►│ Bin 1 │◄──────────────────┐ +/// ├───────────┤ │ └─┬──────────┤ │ +/// │ Bin Key C │ │ Mesh Input │ Entity 3 │ │ +/// ├───────────┤ │ Uniform to ├──────────┤ │ +/// │ ... │ │ Entity ┌───────►│ Entity 7 ├────────┐ │ +/// │ │ ├──────────┤ │ │ +/// │ │ ... 
│ │ │ Entity 9 │ │ │ +/// │ ├──────────┤ │ ┌─┴──────────┤ │ │ +/// │ │ Entity 4 │ │ │ Bin 2 │ │ │ +/// │ ├──────────┤ │ └─┬──────────┤ │ │ +/// │ ┌──►│ Entity 7 ├────────┘ │ ... │ │ │ +/// │ │ ├──────────┤ │ │ +/// │ │ │ Entity 1 │ │ │ +/// │ │ ├──────────┤ │ │ +/// │ │ │ Entity 3 │ │ │ +/// │ │ ├──────────┤ │ │ +/// │ │ │ ... │ │ │ +/// CPU ▲ │ │ │ │ +/// ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄│┄┄┄│┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄│┄┄┄┄┄┄┄┄┄┄│┄ +/// GPU ▼ │ │ │ │ +/// │ │ Binned Mesh │ │ +/// │ │ Instances │ │ +/// │ │ │ │ +/// │ │ │ ... │ │ │ +/// │ │ ┌─┴─────────────────┤ │ │ +/// │ │ │ Mesh Instance 4 │◄──┘ │ +/// │ │ └─┬─────────────────┤ │ +/// │ └──────────────────────────────┤ Input Uniform 5 │ │ +/// │ ├─────────────────┤ │ +/// │ │ Bin 1 ├──────────────┤ +/// │ ┌─┴─────────────────┤ │ +/// │ Bin Index to Bin │ ... │ │ +/// │ Metadata Index │ +/// │ │ +/// │ │ ... │ │ +/// │ ├──────────────┤ Bin Metadata │ +/// │ │ Bin Metadata │ │ +/// │ │ Index 2 │ │ ... │ │ +/// │ ├──────────────┤ ┌─┴────────────────────────┤ │ +/// └──────────►│ Bin Metadata ├──────►│ Bin 1 ├─────┘ +/// │ Index 9 │ └─┬────────────────────────┤ +/// ├──────────────┤ │ Indirect Params Offset │ +/// │ Bin Metadata │ ├────────────────────────┤ +/// │ Index 4 │ │ Instance Count │ +/// ├──────────────┤ ┌─┴────────────────────────┤ +/// │ ... │ │ Bin 12 │ +/// └─┬────────────────────────┤ +/// │ ... │ /// ``` pub struct RenderMultidrawableBatchSet where @@ -454,9 +455,7 @@ where /// This list isn't tightly packed. bins: Vec>, - /// A list of unused [`RenderBinIndex`]es waiting to be reused. - /// - /// Each [`RenderBinIndex`] in this list corresponds to an empty bin. + /// A list of free `RenderBinIndex` values. bin_free_list: Vec, /// A mapping from the indirect parameters offset to the index of each bin. @@ -466,8 +465,12 @@ where /// set. indirect_parameters_offset_to_bin_index: Vec, - /// Information about each binned mesh instance kept on CPU. 
- pub(crate) render_binned_mesh_instances_cpu: Vec, + /// A reverse mapping from the index of the `MeshInputUniform` back to the + /// associated [`MainEntity`]. + mesh_input_uniform_index_to_entity: Vec, + + /// The total number of mesh instances in the batch. + pub(crate) instance_count: u32, } impl RenderMultidrawableBatchSet @@ -483,7 +486,8 @@ where bins: vec![], bin_free_list: vec![], indirect_parameters_offset_to_bin_index: vec![], - render_binned_mesh_instances_cpu: vec![], + mesh_input_uniform_index_to_entity: vec![], + instance_count: 0, } } @@ -513,6 +517,16 @@ where main_entity: MainEntity, input_uniform_index: InputUniformIndex, ) { + if (input_uniform_index.0 as usize) >= self.mesh_input_uniform_index_to_entity.len() { + self.mesh_input_uniform_index_to_entity + .extend(iter::repeat_n( + MainEntity::from(Entity::PLACEHOLDER), + input_uniform_index.0 as usize - self.mesh_input_uniform_index_to_entity.len() + + 1, + )); + } + self.mesh_input_uniform_index_to_entity[input_uniform_index.0 as usize] = main_entity; + let bin_index; match self.bin_key_to_bin_index.entry(bin_key) { Entry::Occupied(occupied_entry) => { @@ -535,19 +549,23 @@ where vacant_entry.insert(bin_index); // Grab an indirect parameters offset. - self.allocate_indirect_parameters(bin_index); + self.allocate_bin_metadata(bin_index); + + // Add space to the fan buffer if necessary. + if bin_index.0.div_ceil(UNIFORM_ALLOCATION_WORKGROUP_SIZE) + == self.gpu_buffers.fan_buffer.len() as u32 + { + self.gpu_buffers.fan_buffer.add(); + } } } + self.instance_count += 1; + // Update the GPU buffers. let bin = self.bins[bin_index.0 as usize].as_mut().unwrap(); - self.gpu_buffers.insert( - bin, - &mut self.render_binned_mesh_instances_cpu, - main_entity, - input_uniform_index, - bin_index, - ); + self.gpu_buffers + .insert(bin, main_entity, input_uniform_index, bin_index); } /// Removes the given entity from the bin with the given key. 
@@ -561,47 +579,53 @@ where .expect("Bin key not present"); let bin = self.bins[bin_index.0 as usize].as_mut().unwrap(); - let maybe_displaced_entity_indices = - self.gpu_buffers - .remove(bin, &mut self.render_binned_mesh_instances_cpu, main_entity); - if let Some((old_render_bin_buffer_index, displaced_entity_indices)) = - maybe_displaced_entity_indices + let maybe_displaced_binned_mesh_instance = + self.gpu_buffers.remove(bin, bin_index, main_entity); + if let Some((old_binned_mesh_instance_index, displaced_binned_mesh_instance)) = + maybe_displaced_binned_mesh_instance { - self.bins[displaced_entity_indices.bin_index.0 as usize] + let displaced_entity = self.mesh_input_uniform_index_to_entity + [displaced_binned_mesh_instance.input_uniform_index as usize]; + self.bins[displaced_binned_mesh_instance.bin_index as usize] .as_mut() .expect("Bin not present") .entity_to_binned_mesh_instance_index - .insert( - displaced_entity_indices.main_entity, - old_render_bin_buffer_index, - ); + .insert(displaced_entity, old_binned_mesh_instance_index); } + self.instance_count -= 1; + self.remove_bin_if_empty(bin_key, bin_index); } /// Allocates an indirect parameters slot for a new bin. - fn allocate_indirect_parameters(&mut self, bin_index: RenderBinIndex) { + fn allocate_bin_metadata(&mut self, bin_index: RenderBinIndex) { // Indirect parameters must be tightly packed, so we always add one to // the end of the list. Record the bin index for the new indirect // parameters offset. let indirect_parameters_offset = self.indirect_parameters_offset_to_bin_index.len() as u32; self.indirect_parameters_offset_to_bin_index.push(bin_index); - // Update the reverse mapping from bin index to indirect parameters offset. 
- if bin_index.0 as usize + let bin_metadata_index = + RenderBinMetadataIndex(self.gpu_buffers.bin_metadata_buffer.push(GpuBinMetadata { + indirect_parameters_offset, + instance_count: 0, + bin_index: bin_index.0, + }) as u32); + + if bin_index.0 == self .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .len() + .bin_index_to_bin_metadata_index_buffer + .len() as u32 { self.gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .push(indirect_parameters_offset); + .bin_index_to_bin_metadata_index_buffer + .push(bin_metadata_index.0); } else { self.gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .values_mut()[bin_index.0 as usize] = indirect_parameters_offset; + .bin_index_to_bin_metadata_index_buffer + .set(bin_index.0, bin_metadata_index.0); } } @@ -618,19 +642,38 @@ where self.bin_free_list.push(bin_index); self.bins[bin_index.0 as usize] = None; + // Remove the bin index from the metadata list. + let old_bin_metadata_index = RenderBinMetadataIndex( + self.gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .values()[bin_index.0 as usize], + ); + self.gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .values_mut()[bin_index.0 as usize] = u32::MAX; + + // Remove the metadata. + let old_bin_metadata = self + .gpu_buffers + .bin_metadata_buffer + .swap_remove(old_bin_metadata_index.0 as usize); + debug_assert_eq!(old_bin_metadata.bin_index, bin_index.0); + if let Some(displaced_metadata) = self + .gpu_buffers + .bin_metadata_buffer + .get(old_bin_metadata_index.0) + { + self.gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .set(displaced_metadata.bin_index, old_bin_metadata_index.0); + } + // Remove the indirect parameters offset corresponding to the bin. Note // that indirect parameters must be tightly packed. Thus we must use // `swap_remove`. 
- let indirect_parameters_offset = mem::replace( - &mut self - .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .values_mut()[bin_index.0 as usize], - u32::MAX, - ); let removed_bin_index = self .indirect_parameters_offset_to_bin_index - .swap_remove(indirect_parameters_offset as usize); + .swap_remove(old_bin_metadata.indirect_parameters_offset as usize); debug_assert_eq!(bin_index, removed_bin_index); // `swap_remove` may have changed the indirect parameter index of some @@ -641,18 +684,24 @@ where // reflect the new offset of that displaced bin. if let Some(displaced_bin_index) = self .indirect_parameters_offset_to_bin_index - .get(indirect_parameters_offset as usize) + .get(old_bin_metadata.indirect_parameters_offset as usize) { - self.gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .set(displaced_bin_index.0, indirect_parameters_offset); + let displaced_bin_metadata_index = self + .gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .values()[displaced_bin_index.0 as usize]; + self.gpu_buffers.bin_metadata_buffer.values_mut() + [displaced_bin_metadata_index as usize] + .indirect_parameters_offset = old_bin_metadata.indirect_parameters_offset; } } + /// Returns true if all bins are empty: i.e. there are no mesh instances. fn is_empty(&self) -> bool { self.bin_free_list.len() == self.bins.len() } + /// Returns the number of bins (i.e. draws) in this batch set. pub(crate) fn bin_count(&self) -> usize { self.bin_key_to_bin_index.len() } @@ -773,6 +822,8 @@ pub struct BinnedRenderPhaseBatchSet { /// The index of the first preprocessing work item for this batch set in the /// preprocessing work item buffer. pub(crate) first_work_item_index: u32, + pub(crate) first_indirect_parameters_index: u32, + pub(crate) first_output_mesh_uniform_index: u32, } impl BinnedRenderPhaseBatchSets { @@ -937,7 +988,7 @@ where /// This field is ignored if GPU preprocessing isn't in use, such as (currently) /// in the case of 2D meshes. 
In that case, it can be safely set to /// [`core::default::Default::default`]. -#[derive(Clone, Copy, PartialEq, Default, Deref, DerefMut, Debug, Pod, Zeroable)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, Default, Deref, DerefMut, Debug, Pod, Zeroable)] #[repr(transparent)] pub struct InputUniformIndex(pub u32); @@ -2231,6 +2282,7 @@ impl RenderBin { #[cfg(test)] mod tests { + use bevy_platform::collections::HashMap; use proptest_derive::Arbitrary; use crate::render_phase::GpuRenderBinnedMeshInstance; @@ -2365,6 +2417,8 @@ mod tests { bin_index_to_entities: Vec, /// A mapping from each entity ID to the binned mesh instance data. entity_to_binned_mesh_instance: MainEntityHashMap, + /// A mapping from each input uniform index back to the entity. + input_uniform_index_to_entity: HashMap, } impl ExpectedMultidrawableBatchSet { @@ -2383,6 +2437,8 @@ mod tests { }, ); self.bin_index_to_entities[bin_index.0 as usize].insert(entity); + self.input_uniform_index_to_entity + .insert(input_uniform_index, entity); } /// Removes an entity from the control structure and returns its instance. @@ -2391,6 +2447,10 @@ mod tests { self.entity_to_binned_mesh_instance.remove(&entity).unwrap(); self.bin_index_to_entities[render_binned_mesh_instance.bin_index as usize] .remove(&entity); + self.input_uniform_index_to_entity + .remove(&InputUniformIndex( + render_binned_mesh_instance.input_uniform_index, + )); render_binned_mesh_instance } } @@ -2415,6 +2475,7 @@ mod tests { let mut expected = ExpectedMultidrawableBatchSet { bin_index_to_entities: vec![MainEntityHashSet::default(); 1024], entity_to_binned_mesh_instance: MainEntityHashMap::default(), + input_uniform_index_to_entity: HashMap::default(), }; // Process each operation, skipping invalid ones. @@ -2430,10 +2491,15 @@ mod tests { )); let input_uniform_index = InputUniformIndex(input_uniform_index); - // Skip this operation if it's trying to add an entity that's already binned. 
+ // Skip this operation if it's trying to add an + // entity that's already binned or duplicating + // an input uniform index. if expected .entity_to_binned_mesh_instance .contains_key(&entity) + || expected + .input_uniform_index_to_entity + .contains_key(&input_uniform_index) { continue; } @@ -2504,7 +2570,9 @@ mod tests { verify_render_binned_mesh_instance_buffer(batch_set, &expected.bin_index_to_entities); // Verify that no indirect parameter offsets overlap. - verify_indirect_parameters_offsets(batch_set); + verify_bin_metadata(batch_set); + + verify_instance_count(batch_set); } /// Verifies that every entity is present in the multidrawable batch @@ -2572,13 +2640,12 @@ mod tests { .iter() .enumerate() { - let binned_mesh_instance_cpu = - &batch_set.render_binned_mesh_instances_cpu[render_bin_buffer_index]; + let mapped_entity = batch_set.mesh_input_uniform_index_to_entity + [gpu_render_binned_mesh_instance.input_uniform_index as usize]; // Make sure that the `GpuRenderBinnedMeshInstance::bin_index` // matches the `CpuRenderBinnedMeshInstance::bin_index`. let gpu_render_bin_index = gpu_render_binned_mesh_instance.bin_index; - assert_eq!(gpu_render_bin_index, *binned_mesh_instance_cpu.bin_index); let render_bin = batch_set.bins[gpu_render_bin_index as usize] .as_ref() @@ -2587,7 +2654,7 @@ mod tests { // Make sure that the entity in the // `RenderMultidrawableBin::entity_to_binned_mesh_instance_index` // table matches the entity in the - // `CpuRenderBinnedMeshInstance`. + // `mesh_input_uniform_index_to_entity`. let Some(entity) = render_bin .entity_to_binned_mesh_instance_index .iter() @@ -2604,7 +2671,7 @@ mod tests { render_bin_buffer_index, gpu_render_bin_index ); }; - assert_eq!(binned_mesh_instance_cpu.main_entity, *entity); + assert_eq!(mapped_entity, *entity); // Make sure that the bin with the appropriate bin key should // actually contain the entity. 
@@ -2629,40 +2696,71 @@ mod tests { } } - fn verify_indirect_parameters_offsets( - batch_set: &RenderMultidrawableBatchSet, - ) { - for (render_bin_index, indirect_parameters_offset) in batch_set - .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .values() - .iter() - .enumerate() - { - if *indirect_parameters_offset == u32::MAX { + /// Ensures that the `BinMetadata` and + /// indirect-parameters-offset-to-bin-index maps are correct. + fn verify_bin_metadata(batch_set: &RenderMultidrawableBatchSet) { + for bin_metadata in batch_set.gpu_buffers.bin_metadata_buffer.values().iter() { + if bin_metadata.indirect_parameters_offset == u32::MAX { continue; } assert_eq!( batch_set.indirect_parameters_offset_to_bin_index - [*indirect_parameters_offset as usize], - RenderBinIndex(render_bin_index as u32) + [bin_metadata.indirect_parameters_offset as usize], + RenderBinIndex(bin_metadata.bin_index) ); } + for (indirect_parameters_offset, render_bin_index) in batch_set .indirect_parameters_offset_to_bin_index .iter() .enumerate() { assert!(batch_set.bins[render_bin_index.0 as usize].is_some()); + + let gpu_metadata_index = *batch_set + .gpu_buffers + .bin_index_to_bin_metadata_index_buffer + .get(render_bin_index.0) + .unwrap(); + + let gpu_metadata = batch_set + .gpu_buffers + .bin_metadata_buffer + .get(gpu_metadata_index) + .unwrap(); + + // Verify that the bidirectional indirect parameters offset ↔ + // bin index mapping is correct. assert_eq!( - *batch_set - .gpu_buffers - .bin_index_to_indirect_parameters_offset_buffer - .get(render_bin_index.0) - .unwrap(), + gpu_metadata.indirect_parameters_offset, indirect_parameters_offset as u32 ); + // Verify that the bin indices match up. + assert_eq!(gpu_metadata.bin_index, render_bin_index.0); + // Verify that the instance count for that bin matches up. 
+ assert_eq!( + gpu_metadata.instance_count, + batch_set.bins[render_bin_index.0 as usize] + .as_ref() + .unwrap() + .entity_to_binned_mesh_instance_index + .len() as u32 + ); + } + } + + /// Ensures that the recorded `instance_count` matches the actual number + /// of instances. + fn verify_instance_count(batch_set: &RenderMultidrawableBatchSet) { + let mut total_instance_count = 0; + for bin in batch_set + .bins + .iter() + .filter_map(|maybe_bin| maybe_bin.as_ref()) + { + total_instance_count += bin.entity_to_binned_mesh_instance_index.len() as u32; } + assert_eq!(batch_set.instance_count, total_instance_count); } } } diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index ed9e1a39db3a3..dcb2d734b2ee6 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -952,6 +952,24 @@ where self.uninit_element_count += count; first_index } + + /// Sets the value at the given index to the given value. + pub fn set(&mut self, index: usize, value: T) { + self.values[index] = value; + } +} + +impl PartialBufferVec +where + T: NoUninit + Default, +{ + /// Pushes `count` copies of `T::default` to the array. 
+ pub fn push_multiple_init(&mut self, count: usize) -> usize { + debug_assert_eq!(self.uninit_element_count, 0); + let index = self.values.len(); + self.values.extend(iter::repeat_n(T::default(), count)); + index + } } /// Error returned when `write_buffer_range` fails diff --git a/crates/bevy_sprite_render/src/mesh2d/mesh.rs b/crates/bevy_sprite_render/src/mesh2d/mesh.rs index 7d1e5bbfdeecf..0ba52828959a9 100644 --- a/crates/bevy_sprite_render/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite_render/src/mesh2d/mesh.rs @@ -28,7 +28,7 @@ use bevy_render::prelude::Msaa; use bevy_render::RenderSystems::PrepareAssets; use bevy_render::{ batching::{ - gpu_preprocessing::IndirectParametersCpuMetadata, + gpu_preprocessing::IndirectParametersMetadata, no_gpu_preprocessing::{ self, batch_and_prepare_binned_render_phase, batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, BatchedInstanceBuffer, @@ -427,12 +427,16 @@ impl GetFullBatchData for Mesh2dPipeline { // Note that `IndirectParameters` covers both of these structures, even // though they actually have distinct layouts. See the comment above that // type for more information. - let indirect_parameters = IndirectParametersCpuMetadata { + let indirect_parameters = IndirectParametersMetadata { base_output_index, batch_set_index: match batch_set_index { None => !0, Some(batch_set_index) => u32::from(batch_set_index), }, + // These fields are unused in the 2D pipeline. 
+ mesh_index: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed { diff --git a/examples/shader_advanced/custom_render_phase.rs b/examples/shader_advanced/custom_render_phase.rs index 49a5ec1e31fd9..f3605e4bcf36e 100644 --- a/examples/shader_advanced/custom_render_phase.rs +++ b/examples/shader_advanced/custom_render_phase.rs @@ -34,7 +34,7 @@ use bevy::{ render::{ batching::{ gpu_preprocessing::{ - batch_and_prepare_sorted_render_phase, IndirectParametersCpuMetadata, + batch_and_prepare_sorted_render_phase, IndirectParametersMetadata, UntypedPhaseIndirectParametersBuffers, }, GetBatchData, GetFullBatchData, @@ -464,12 +464,16 @@ impl GetFullBatchData for StencilPipeline { // Note that `IndirectParameters` covers both of these structures, even // though they actually have distinct layouts. See the comment above that // type for more information. - let indirect_parameters = IndirectParametersCpuMetadata { + let indirect_parameters = IndirectParametersMetadata { base_output_index, batch_set_index: match batch_set_index { None => !0, Some(batch_set_index) => u32::from(batch_set_index), }, + // These fields are filled in by the GPU: + mesh_index: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed {