Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More triangles/vertices per meshlet #15023

Merged
merged 9 commits into from
Sep 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -1075,7 +1075,7 @@ setup = [
"curl",
"-o",
"assets/models/bunny.meshlet_mesh",
"https://raw.githubusercontent.com/JMS55/bevy_meshlet_asset/b6c712cfc87c65de419f856845401aba336a7bcd/bunny.meshlet_mesh",
"https://raw.githubusercontent.com/JMS55/bevy_meshlet_asset/e3da1533b4c69fb967f233c817e9b0921134d317/bunny.meshlet_mesh",
],
]

Expand Down
45 changes: 15 additions & 30 deletions crates/bevy_pbr/src/meshlet/from_mesh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use bevy_utils::HashMap;
use itertools::Itertools;
use meshopt::{
build_meshlets, compute_cluster_bounds, compute_meshlet_bounds, ffi::meshopt_Bounds, simplify,
simplify_scale, Meshlets, SimplifyOptions, VertexDataAdapter,
Meshlets, SimplifyOptions, VertexDataAdapter,
};
use metis::Graph;
use smallvec::SmallVec;
Expand Down Expand Up @@ -49,11 +49,9 @@ impl MeshletMesh {
},
})
.collect::<Vec<_>>();
let mesh_scale = simplify_scale(&vertices);

// Build further LODs
let mut simplification_queue = 0..meshlets.len();
let mut lod_level = 1;
while simplification_queue.len() > 1 {
// For each meshlet build a list of connected meshlets (meshlets that share a triangle edge)
let connected_meshlets_per_meshlet =
Expand All @@ -70,19 +68,14 @@ impl MeshletMesh {

for group_meshlets in groups.into_iter().filter(|group| group.len() > 1) {
// Simplify the group to ~50% triangle count
let Some((simplified_group_indices, mut group_error)) = simplify_meshlet_groups(
&group_meshlets,
&meshlets,
&vertices,
lod_level,
mesh_scale,
) else {
let Some((simplified_group_indices, mut group_error)) =
simplify_meshlet_group(&group_meshlets, &meshlets, &vertices)
else {
continue;
};

// Add the maximum child error to the parent error to make parent error cumulative from LOD 0
// (we're currently building the parent from its children)
group_error += group_meshlets.iter().fold(0.0f32, |acc, meshlet_id| {
// Force parent error to be >= child error (we're currently building the parent from its children)
group_error = group_meshlets.iter().fold(group_error, |acc, meshlet_id| {
acc.max(bounding_spheres[*meshlet_id].self_lod.radius)
});

Expand All @@ -99,7 +92,7 @@ impl MeshletMesh {
}

// Build new meshlets using the simplified group
let new_meshlets_count = split_simplified_groups_into_new_meshlets(
let new_meshlets_count = split_simplified_group_into_new_meshlets(
&simplified_group_indices,
&vertices,
&mut meshlets,
Expand All @@ -125,7 +118,6 @@ impl MeshletMesh {
}

simplification_queue = next_lod_start..meshlets.len();
lod_level += 1;
}

// Convert meshopt_Meshlet data to a custom format
Expand Down Expand Up @@ -172,7 +164,7 @@ fn validate_input_mesh(mesh: &Mesh) -> Result<Cow<'_, [u32]>, MeshToMeshletMeshC
}

fn compute_meshlets(indices: &[u32], vertices: &VertexDataAdapter) -> Meshlets {
build_meshlets(indices, vertices, 64, 64, 0.0)
build_meshlets(indices, vertices, 255, 128, 0.0) // Meshoptimizer won't currently let us do 256 vertices
}

fn find_connected_meshlets(
Expand Down Expand Up @@ -252,7 +244,7 @@ fn group_meshlets(
xadj.push(adjncy.len() as i32);

let mut group_per_meshlet = vec![0; simplification_queue.len()];
let partition_count = simplification_queue.len().div_ceil(4);
let partition_count = simplification_queue.len().div_ceil(4); // TODO: Nanite uses groups of 8-32, probably based on some kind of heuristic
Graph::new(1, partition_count as i32, &xadj, &adjncy)
.unwrap()
.set_adjwgt(&adjwgt)
Expand All @@ -267,12 +259,10 @@ fn group_meshlets(
groups
}

fn simplify_meshlet_groups(
fn simplify_meshlet_group(
group_meshlets: &[usize],
meshlets: &Meshlets,
vertices: &VertexDataAdapter<'_>,
lod_level: u32,
mesh_scale: f32,
) -> Option<(Vec<u32>, f32)> {
// Build a new index buffer into the mesh vertex data by combining all meshlet data in the group
let mut group_indices = Vec::new();
Expand All @@ -283,25 +273,20 @@ fn simplify_meshlet_groups(
}
}

// Allow more deformation for high LOD levels (1% at LOD 1, 10% at LOD 20+)
let t = (lod_level - 1) as f32 / 19.0;
let target_error_relative = 0.1 * t + 0.01 * (1.0 - t);
let target_error = target_error_relative * mesh_scale;

// Simplify the group to ~50% triangle count
// TODO: Simplify using vertex attributes
let mut error = 0.0;
let simplified_group_indices = simplify(
&group_indices,
vertices,
group_indices.len() / 2,
target_error,
SimplifyOptions::LockBorder | SimplifyOptions::Sparse | SimplifyOptions::ErrorAbsolute,
f32::MAX,
Copy link

@Scthe Scthe Sep 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After this change, is there a scenario where both of the following happen:

  1. simplification fails to reach target triangle count,
  2. there are still leftover vertices that are not shared with other meshlets?

For Nanite simplification, all vertices mentioned in point 2 could probably be removed. While this might introduce artefacts, the resulting error metric should cause the more detailed meshlets to be shown in such cases.

I suspect that to produce an optimal DAG ("Batched Multi Triangulation" section 3.2) you should always remove as many triangles as possible to reach 256 triangles (2 meshlets). The only constraint is Nanite's immutable shared vertices between meshlets. Any problems caused by simplification are shoved into the error metric.

See #14998 (comment) for context.

SimplifyOptions::LockBorder | SimplifyOptions::Sparse | SimplifyOptions::ErrorAbsolute, // TODO: Specify manual vertex locks instead of meshopt's overly-strict locks
Some(&mut error),
);

// Check if we were able to simplify to at least 65% triangle count
if simplified_group_indices.len() as f32 / group_indices.len() as f32 > 0.65 {
// Check if we were able to simplify at least a little (95% of the original triangle count)
if simplified_group_indices.len() as f32 / group_indices.len() as f32 > 0.95 {
return None;
}

Expand All @@ -311,7 +296,7 @@ fn simplify_meshlet_groups(
Some((simplified_group_indices, error))
}

fn split_simplified_groups_into_new_meshlets(
fn split_simplified_group_into_new_meshlets(
simplified_group_indices: &[u32],
vertices: &VertexDataAdapter<'_>,
meshlets: &mut Meshlets,
Expand Down
2 changes: 1 addition & 1 deletion crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ fn resolve_material_depth(in: FullscreenVertexOutput) -> @builtin(frag_depth) f3
let depth = visibility >> 32u;
if depth == 0lu { return 0.0; }

let cluster_id = u32(visibility) >> 6u;
let cluster_id = u32(visibility) >> 7u;
let instance_id = meshlet_cluster_instance_ids[cluster_id];
let material_id = meshlet_instance_material_ids[instance_id];
return f32(material_id) / 65535.0;
Expand Down
6 changes: 3 additions & 3 deletions crates/bevy_pbr/src/meshlet/resource_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pub struct ResourceManager {
impl ResourceManager {
pub fn new(cluster_buffer_slots: u32, render_device: &RenderDevice) -> Self {
let needs_dispatch_remap =
cluster_buffer_slots < render_device.limits().max_compute_workgroups_per_dimension;
cluster_buffer_slots > render_device.limits().max_compute_workgroups_per_dimension;

Self {
visibility_buffer_raster_clusters: render_device.create_buffer(&BufferDescriptor {
Expand Down Expand Up @@ -472,7 +472,7 @@ pub fn prepare_meshlet_per_frame_resources(
.create_buffer_with_data(&BufferInitDescriptor {
label: Some("meshlet_visibility_buffer_hardware_raster_indirect_args_first"),
contents: DrawIndirectArgs {
vertex_count: 64 * 3,
vertex_count: 128 * 3,
instance_count: 0,
first_vertex: 0,
first_instance: 0,
Expand All @@ -484,7 +484,7 @@ pub fn prepare_meshlet_per_frame_resources(
.create_buffer_with_data(&BufferInitDescriptor {
label: Some("visibility_buffer_hardware_raster_indirect_args_second"),
contents: DrawIndirectArgs {
vertex_count: 64 * 3,
vertex_count: 128 * 3,
instance_count: 0,
first_vertex: 0,
first_instance: 0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ fn vertex(@builtin(instance_index) instance_index: u32, @builtin(vertex_index) v
return VertexOutput(
clip_position,
#ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT
(cluster_id << 6u) | triangle_id,
(cluster_id << 7u) | triangle_id,
#endif
#ifdef DEPTH_CLAMP_ORTHO
unclamped_clip_depth,
Expand All @@ -83,7 +83,7 @@ fn fragment(vertex_output: VertexOutput) {

fn dummy_vertex() -> VertexOutput {
return VertexOutput(
vec4(0.0),
vec4(divide(0.0, 0.0)), // NaN vertex position
#ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT
0u,
#endif
Expand All @@ -92,3 +92,8 @@ fn dummy_vertex() -> VertexOutput {
#endif
);
}

// Returns the quotient `a / b`.
//
// Naga doesn't allow divide by zero literals, but passing the operands through
// this function lets us work around it. dummy_vertex() calls `divide(0.0, 0.0)`
// to obtain a NaN vertex position.
fn divide(a: f32, b: f32) -> f32 {
return a / b;
}
4 changes: 2 additions & 2 deletions crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,11 @@ struct VertexOutput {
fn resolve_vertex_output(frag_coord: vec4<f32>) -> VertexOutput {
let frag_coord_1d = u32(frag_coord.y) * u32(view.viewport.z) + u32(frag_coord.x);
let packed_ids = u32(meshlet_visibility_buffer[frag_coord_1d]); // TODO: Might be faster to load the correct u32 directly
let cluster_id = packed_ids >> 6u;
let cluster_id = packed_ids >> 7u;
let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id];
let meshlet = meshlets[meshlet_id];

let triangle_id = extractBits(packed_ids, 0u, 6u);
let triangle_id = extractBits(packed_ids, 0u, 7u);
let index_ids = meshlet.start_index_id + (triangle_id * 3u) + vec3(0u, 1u, 2u);
let indices = meshlet.start_vertex_id + vec3(get_meshlet_index(index_ids.x), get_meshlet_index(index_ids.y), get_meshlet_index(index_ids.z));
let vertex_ids = vec3(meshlet_vertex_ids[indices.x], meshlet_vertex_ids[indices.y], meshlet_vertex_ids[indices.z]);
Expand Down
44 changes: 23 additions & 21 deletions crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@

// TODO: Subpixel precision and top-left rule

var<workgroup> viewport_vertices: array<vec3f, 64>;
var<workgroup> viewport_vertices: array<vec3f, 255>;

@compute
@workgroup_size(64, 1, 1) // 64 threads per workgroup, 1 vertex/triangle per thread, 1 cluster per workgroup
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1-2 vertices per thread, 1 triangle per thread, 1 cluster per workgroup
fn rasterize_cluster(
@builtin(workgroup_id) workgroup_id: vec3<u32>,
@builtin(local_invocation_index) local_invocation_index: u32,
Expand All @@ -44,28 +44,30 @@ fn rasterize_cluster(
let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id];
let meshlet = meshlets[meshlet_id];

// Load and project 1 vertex per thread
let vertex_id = local_invocation_index;
if vertex_id < meshlet.vertex_count {
let meshlet_vertex_id = meshlet_vertex_ids[meshlet.start_vertex_id + vertex_id];
let vertex = unpack_meshlet_vertex(meshlet_vertex_data[meshlet_vertex_id]);

// Project vertex to viewport space
let instance_id = meshlet_cluster_instance_ids[cluster_id];
let instance_uniform = meshlet_instance_uniforms[instance_id];
let world_from_local = affine3_to_square(instance_uniform.world_from_local);
let world_position = mesh_position_local_to_world(world_from_local, vec4(vertex.position, 1.0));
var clip_position = view.clip_from_world * vec4(world_position.xyz, 1.0);
var ndc_position = clip_position.xyz / clip_position.w;
let instance_id = meshlet_cluster_instance_ids[cluster_id];
let instance_uniform = meshlet_instance_uniforms[instance_id];
let world_from_local = affine3_to_square(instance_uniform.world_from_local);

// Load and project 1 vertex per thread, and then again if there are more than 128 vertices in the meshlet
for (var i = 0u; i <= 128u; i += 128u) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: up to you, but it feels like i < 255u would be clearer; in general, this loop is

for (i = 0; i < max_vertices; i += workgroup_size)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, not quite the way I thought of it. My thought process was "we want to do this twice, with the second time using an offset of 128".

let vertex_id = local_invocation_index + i;
if vertex_id < meshlet.vertex_count {
let meshlet_vertex_id = meshlet_vertex_ids[meshlet.start_vertex_id + vertex_id];
let vertex = unpack_meshlet_vertex(meshlet_vertex_data[meshlet_vertex_id]);

// Project vertex to viewport space
let world_position = mesh_position_local_to_world(world_from_local, vec4(vertex.position, 1.0));
let clip_position = view.clip_from_world * vec4(world_position.xyz, 1.0);
var ndc_position = clip_position.xyz / clip_position.w;
#ifdef DEPTH_CLAMP_ORTHO
ndc_position.z = 1.0 / clip_position.z;
ndc_position.z = 1.0 / clip_position.z;
#endif
let viewport_position_xy = ndc_to_uv(ndc_position.xy) * view.viewport.zw;
let viewport_position_xy = ndc_to_uv(ndc_position.xy) * view.viewport.zw;

// Write vertex to workgroup shared memory
viewport_vertices[vertex_id] = vec3(viewport_position_xy, ndc_position.z);
// Write vertex to workgroup shared memory
viewport_vertices[vertex_id] = vec3(viewport_position_xy, ndc_position.z);
}
}

workgroupBarrier();

// Load 1 triangle's worth of vertex data per thread
Expand All @@ -76,7 +78,7 @@ fn rasterize_cluster(
let vertex_0 = viewport_vertices[vertex_ids[2]];
let vertex_1 = viewport_vertices[vertex_ids[1]];
let vertex_2 = viewport_vertices[vertex_ids[0]];
let packed_ids = (cluster_id << 6u) | triangle_id;
let packed_ids = (cluster_id << 7u) | triangle_id;

// Compute triangle bounding box
let min_x = u32(min3(vertex_0.x, vertex_1.x, vertex_2.x));
Expand Down
2 changes: 1 addition & 1 deletion examples/3d/meshlet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use camera_controller::{CameraController, CameraControllerPlugin};
use std::{f32::consts::PI, path::Path, process::ExitCode};

const ASSET_URL: &str =
"https://raw.githubusercontent.com/JMS55/bevy_meshlet_asset/10bb5471c7beedfe63ad1cf269599c92b0f10aa2/bunny.meshlet_mesh";
"https://raw.githubusercontent.com/JMS55/bevy_meshlet_asset/e3da1533b4c69fb967f233c817e9b0921134d317/bunny.meshlet_mesh";

fn main() -> ExitCode {
if !Path::new("./assets/models/bunny.meshlet_mesh").exists() {
Expand Down