Skip to content

Commit

Permalink
Meshlet remove per-cluster data upload (#13125)
Browse files Browse the repository at this point in the history
# Objective

- Per-cluster (instance of a meshlet) data upload is ridiculously
expensive in both CPU and GPU time (8 bytes per cluster, millions of
clusters, you very quickly run into PCIE bandwidth maximums, and lots of
CPU-side copies and malloc).
- We need to be uploading only per-instance/entity data. Anything else
needs to be done on the GPU.

## Solution

- Per instance, upload:
- `meshlet_instance_meshlet_counts_prefix_sum` - An exclusive prefix sum
over the count of how many clusters each instance has.
- `meshlet_instance_meshlet_slice_starts` - The starting index of the
meshlets for each instance within the `meshlets` buffer.
- A new `fill_cluster_buffers` pass once at the start of the frame has a
thread per cluster, and finds its instance ID and meshlet ID via a
binary search of `meshlet_instance_meshlet_counts_prefix_sum` to find
what instance it belongs to, and then uses that plus
`meshlet_instance_meshlet_slice_starts` to find what number meshlet
within the instance it is. The shader then writes out the per-cluster
instance/meshlet ID buffers for later passes to quickly read from.
- I've gone from 45 -> 180 FPS in my stress test scene, and saved
~30ms/frame of overall CPU/GPU time.
  • Loading branch information
JMS55 authored May 4, 2024
1 parent ec418aa commit 77ebabc
Show file tree
Hide file tree
Showing 10 changed files with 305 additions and 102 deletions.
16 changes: 8 additions & 8 deletions crates/bevy_pbr/src/meshlet/cull_meshlets.wgsl
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#import bevy_pbr::meshlet_bindings::{
meshlet_thread_meshlet_ids,
meshlet_cluster_meshlet_ids,
meshlet_bounding_spheres,
meshlet_thread_instance_ids,
meshlet_cluster_instance_ids,
meshlet_instance_uniforms,
meshlet_second_pass_candidates,
depth_pyramid,
view,
previous_view,
should_cull_instance,
meshlet_is_second_pass_candidate,
cluster_is_second_pass_candidate,
meshlets,
draw_indirect_args,
draw_triangle_buffer,
Expand All @@ -21,29 +21,29 @@
/// the instance, frustum, and LOD tests in the first pass, but were not visible last frame according to the occlusion culling.

@compute
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1 instanced meshlet per thread
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1 cluster per thread
fn cull_meshlets(
@builtin(workgroup_id) workgroup_id: vec3<u32>,
@builtin(num_workgroups) num_workgroups: vec3<u32>,
@builtin(local_invocation_id) local_invocation_id: vec3<u32>,
) {
// Calculate the cluster ID for this thread
let cluster_id = local_invocation_id.x + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u));
if cluster_id >= arrayLength(&meshlet_thread_meshlet_ids) { return; }
if cluster_id >= arrayLength(&meshlet_cluster_meshlet_ids) { return; }

#ifdef MESHLET_SECOND_CULLING_PASS
if !meshlet_is_second_pass_candidate(cluster_id) { return; }
if !cluster_is_second_pass_candidate(cluster_id) { return; }
#endif

// Check for instance culling
let instance_id = meshlet_thread_instance_ids[cluster_id];
let instance_id = meshlet_cluster_instance_ids[cluster_id];
#ifdef MESHLET_FIRST_CULLING_PASS
if should_cull_instance(instance_id) { return; }
#endif

// Calculate world-space culling bounding sphere for the cluster
let instance_uniform = meshlet_instance_uniforms[instance_id];
let meshlet_id = meshlet_thread_meshlet_ids[cluster_id];
let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id];
let model = affine3_to_square(instance_uniform.model);
let model_scale = max(length(model[0]), max(length(model[1]), length(model[2])));
let bounding_spheres = meshlet_bounding_spheres[meshlet_id];
Expand Down
42 changes: 42 additions & 0 deletions crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#import bevy_pbr::meshlet_bindings::{
cluster_count,
meshlet_instance_meshlet_counts_prefix_sum,
meshlet_instance_meshlet_slice_starts,
meshlet_cluster_instance_ids,
meshlet_cluster_meshlet_ids,
}

@compute
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1 cluster per thread
fn fill_cluster_buffers(
@builtin(workgroup_id) workgroup_id: vec3<u32>,
@builtin(num_workgroups) num_workgroups: vec3<u32>,
@builtin(local_invocation_id) local_invocation_id: vec3<u32>
) {
// Calculate the cluster ID for this thread
let cluster_id = local_invocation_id.x + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u));
if cluster_id >= cluster_count { return; }

// Binary search to find the instance this cluster belongs to
var left = 0u;
var right = arrayLength(&meshlet_instance_meshlet_counts_prefix_sum) - 1u;
while left <= right {
let mid = (left + right) / 2u;
if meshlet_instance_meshlet_counts_prefix_sum[mid] <= cluster_id {
left = mid + 1u;
} else {
right = mid - 1u;
}
}
let instance_id = right;

// Find the meshlet ID for this cluster within the instance's MeshletMesh
let meshlet_id_local = cluster_id - meshlet_instance_meshlet_counts_prefix_sum[instance_id];

// Find the overall meshlet ID in the global meshlet buffer
let meshlet_id = meshlet_id_local + meshlet_instance_meshlet_slice_starts[instance_id];

// Write results to buffers
meshlet_cluster_instance_ids[cluster_id] = instance_id;
meshlet_cluster_meshlet_ids[cluster_id] = meshlet_id;
}
Loading

0 comments on commit 77ebabc

Please sign in to comment.