From a1a81e572186c31d6e08aa23e4f2f62db7e2d3de Mon Sep 17 00:00:00 2001 From: James Liu Date: Sun, 1 Oct 2023 02:44:03 -0700 Subject: [PATCH] Parallelize extract_meshes (#9966) # Objective `extract_meshes` can easily be one of the most expensive operations in the blocking extract schedule for 3D apps. It also has no fundamentally serialized parts and can easily be run across multiple threads. Let's speed it up by parallelizing it! ## Solution Use the `ThreadLocal<Cell<Vec<T>>>` approach utilized by #7348 in conjunction with `Query::par_iter` to build a set of thread-local queues, and collect them after going wide. ## Performance Using `cargo run --profile stress-test --features trace_tracy --example many_cubes`. Yellow is this PR. Red is main. `extract_meshes`: ![image](https://github.com/bevyengine/bevy/assets/3137680/9d45aa2e-3cfa-4fad-9c08-53498b51a73b) An average reduction from 1.2ms to 770us is seen, a 41.6% improvement. Note: this is still not including #9950's changes, so this may actually result in even faster speedups once that's merged in. 
--- crates/bevy_pbr/Cargo.toml | 1 + crates/bevy_pbr/src/render/mesh.rs | 91 +++++++++++++++++------------- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/crates/bevy_pbr/Cargo.toml b/crates/bevy_pbr/Cargo.toml index 5ff50b66d6644..f35774f038cb8 100644 --- a/crates/bevy_pbr/Cargo.toml +++ b/crates/bevy_pbr/Cargo.toml @@ -33,3 +33,4 @@ bytemuck = { version = "1", features = ["derive"] } naga_oil = "0.8" radsort = "0.1" smallvec = "1.6" +thread_local = "1.0" diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 583475bf9a2e2..151bdcf31679a 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -45,6 +45,8 @@ use bevy_render::{ }; use bevy_transform::components::GlobalTransform; use bevy_utils::{tracing::error, EntityHashMap, HashMap, Hashed}; +use std::cell::Cell; +use thread_local::ThreadLocal; use crate::render::{ morph::{ @@ -246,6 +248,7 @@ pub fn extract_meshes( mut commands: Commands, mut previous_len: Local<usize>, mut render_mesh_instances: ResMut<RenderMeshInstances>, + mut thread_local_queues: Local<ThreadLocal<Cell<Vec<(Entity, RenderMeshInstance)>>>>, meshes_query: Extract< Query<( Entity, @@ -259,50 +262,58 @@ pub fn extract_meshes( )>, >, ) { + meshes_query.par_iter().for_each( + |( + entity, + view_visibility, + transform, + previous_transform, + handle, + not_receiver, + not_caster, + no_automatic_batching, + )| { + if !view_visibility.get() { + return; + } + let transform = transform.affine(); + let previous_transform = previous_transform.map(|t| t.0).unwrap_or(transform); + let mut flags = if not_receiver.is_some() { + MeshFlags::empty() + } else { + MeshFlags::SHADOW_RECEIVER + }; + if transform.matrix3.determinant().is_sign_positive() { + flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; + } + let transforms = MeshTransforms { + transform: (&transform).into(), + previous_transform: (&previous_transform).into(), + flags: flags.bits(), + }; + let tls = thread_local_queues.get_or_default(); + let mut queue = tls.take(); + queue.push(( 
entity, + RenderMeshInstance { + mesh_asset_id: handle.id(), + transforms, + shadow_caster: not_caster.is_none(), + material_bind_group_id: MaterialBindGroupId::default(), + automatic_batching: !no_automatic_batching, + }, + )); + tls.set(queue); + }, + ); + render_mesh_instances.clear(); let mut entities = Vec::with_capacity(*previous_len); - - let visible_meshes = meshes_query.iter().filter(|(_, vis, ..)| vis.get()); - - for ( - entity, - _, - transform, - previous_transform, - handle, - not_receiver, - not_caster, - no_automatic_batching, - ) in visible_meshes - { - let transform = transform.affine(); - let previous_transform = previous_transform.map(|t| t.0).unwrap_or(transform); - let mut flags = if not_receiver.is_some() { - MeshFlags::empty() - } else { - MeshFlags::SHADOW_RECEIVER - }; - if transform.matrix3.determinant().is_sign_positive() { - flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; - } - let transforms = MeshTransforms { - transform: (&transform).into(), - previous_transform: (&previous_transform).into(), - flags: flags.bits(), - }; + for queue in thread_local_queues.iter_mut() { // FIXME: Remove this - it is just a workaround to enable rendering to work as // render commands require an entity to exist at the moment. - entities.push((entity, Mesh3d)); - render_mesh_instances.insert( - entity, - RenderMeshInstance { - mesh_asset_id: handle.id(), - transforms, - shadow_caster: not_caster.is_none(), - material_bind_group_id: MaterialBindGroupId::default(), - automatic_batching: !no_automatic_batching, - }, - ); + entities.extend(queue.get_mut().iter().map(|(e, _)| (*e, Mesh3d))); + render_mesh_instances.extend(queue.get_mut().drain(..)); } *previous_len = entities.len(); commands.insert_or_spawn_batch(entities);