From 7eadc1d46780c58ced9bf8ad6d1c99afe5589c34 Mon Sep 17 00:00:00 2001
From: vero <rodol@rivalrebels.com>
Date: Fri, 4 Oct 2024 17:24:44 -0400
Subject: [PATCH] Zero Copy Mesh (#15569)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Objective

- Another step towards #15558

## Solution

- Instead of allocating a `Vec` and then having wgpu copy it into a
staging buffer, write directly into the staging buffer.
- Gets rid of another hidden copy, in `pad_to_alignment`.

Future work:
- Why is there a gcd implementation in here? It's subpar (use
`binary_gcd`) and it's in the hot path, run twice for every mesh, every
frame I think. Make it better and put it in `bevy_math`.
- Zero-copy custom mesh API to avoid having to write out a `Mesh` from a
custom representation.

## Testing

- `lighting` and `many_cubes` run fine (and slightly faster, though I
haven't benchmarked).

---

## Showcase

- look ma... no copies

at least when RenderAssetUsage is GPU only :3

---------

Co-authored-by: Alice Cecile <alice.i.cecile@gmail.com>
Co-authored-by: Kristoffer Søholm <k.soeholm@gmail.com>
---
 crates/bevy_render/src/mesh/allocator.rs | 79 ++++++++++++------------
 crates/bevy_render/src/mesh/mesh/mod.rs  | 28 +++++++--
 2 files changed, 61 insertions(+), 46 deletions(-)
diff --git a/crates/bevy_render/src/mesh/allocator.rs b/crates/bevy_render/src/mesh/allocator.rs
index c9d36c5855b51..3493f7dcf1722 100644
--- a/crates/bevy_render/src/mesh/allocator.rs
+++ b/crates/bevy_render/src/mesh/allocator.rs
@@ -1,9 +1,8 @@
 //! Manages mesh vertex and index buffers.
 
-use alloc::{borrow::Cow, vec::Vec};
+use alloc::vec::Vec;
 use core::{
     fmt::{self, Display, Formatter},
-    iter,
     ops::Range,
 };
 
@@ -21,8 +20,8 @@ use bevy_utils::{
 };
 use offset_allocator::{Allocation, Allocator};
 use wgpu::{
-    util::BufferInitDescriptor, BufferDescriptor, BufferUsages, CommandEncoderDescriptor,
-    DownlevelFlags, COPY_BUFFER_ALIGNMENT,
+    BufferDescriptor, BufferSize, BufferUsages, CommandEncoderDescriptor, DownlevelFlags,
+    COPY_BUFFER_ALIGNMENT,
 };
 
 use crate::{
@@ -427,7 +426,7 @@ impl MeshAllocator {
             if self.general_vertex_slabs_supported {
                 self.allocate(
                     mesh_id,
-                    mesh.get_vertex_size() * mesh.count_vertices() as u64,
+                    mesh.get_vertex_buffer_size() as u64,
                     vertex_element_layout,
                     &mut slabs_to_grow,
                     mesh_allocator_settings,
@@ -474,12 +473,12 @@ impl MeshAllocator {
         let Some(&slab_id) = self.mesh_id_to_vertex_slab.get(mesh_id) else {
             return;
         };
-        let vertex_data = mesh.create_packed_vertex_buffer_data();
 
         // Call the generic function.
         self.copy_element_data(
             mesh_id,
-            &vertex_data,
+            mesh.get_vertex_buffer_size(),
+            |slice| mesh.write_packed_vertex_buffer_data(slice),
             BufferUsages::VERTEX,
             slab_id,
             render_device,
@@ -506,7 +505,8 @@ impl MeshAllocator {
         // Call the generic function.
         self.copy_element_data(
             mesh_id,
-            index_data,
+            index_data.len(),
+            |slice| slice.copy_from_slice(index_data),
             BufferUsages::INDEX,
             slab_id,
             render_device,
@@ -519,7 +519,8 @@ impl MeshAllocator {
     fn copy_element_data(
         &mut self,
         mesh_id: &AssetId<Mesh>,
-        data: &[u8],
+        len: usize,
+        fill_data: impl Fn(&mut [u8]),
         buffer_usages: BufferUsages,
         slab_id: SlabId,
         render_device: &RenderDevice,
@@ -540,12 +541,18 @@ impl MeshAllocator {
 
                 let slot_size = general_slab.element_layout.slot_size();
 
-                // Write the data in.
-                render_queue.write_buffer(
-                    buffer,
-                    allocated_range.allocation.offset as u64 * slot_size,
-                    &pad_to_alignment(data, slot_size as usize),
-                );
+                // round up size to a multiple of the slot size to satisfy wgpu alignment requirements
+                if let Some(size) = BufferSize::new((len as u64).next_multiple_of(slot_size)) {
+                    // Write the data in.
+                    if let Some(mut buffer) = render_queue.write_buffer_with(
+                        buffer,
+                        allocated_range.allocation.offset as u64 * slot_size,
+                        size,
+                    ) {
+                        let slice = &mut buffer.as_mut()[..len];
+                        fill_data(slice);
+                    }
+                }
 
                 // Mark the allocation as resident.
                 general_slab
@@ -557,17 +564,22 @@ impl MeshAllocator {
                 debug_assert!(large_object_slab.buffer.is_none());
 
                 // Create the buffer and its data in one go.
-                large_object_slab.buffer = Some(render_device.create_buffer_with_data(
-                    &BufferInitDescriptor {
-                        label: Some(&format!(
-                            "large mesh slab {} ({}buffer)",
-                            slab_id,
-                            buffer_usages_to_str(buffer_usages)
-                        )),
-                        contents: data,
-                        usage: buffer_usages | BufferUsages::COPY_DST,
-                    },
-                ));
+                let buffer = render_device.create_buffer(&BufferDescriptor {
+                    label: Some(&format!(
+                        "large mesh slab {} ({}buffer)",
+                        slab_id,
+                        buffer_usages_to_str(buffer_usages)
+                    )),
+                    size: len as u64,
+                    usage: buffer_usages | BufferUsages::COPY_DST,
+                    mapped_at_creation: true,
+                });
+                {
+                    let slice = &mut buffer.slice(..).get_mapped_range_mut()[..len];
+                    fill_data(slice);
+                }
+                buffer.unmap();
+                large_object_slab.buffer = Some(buffer);
             }
         }
     }
@@ -1000,21 +1012,6 @@ fn gcd(mut a: u64, mut b: u64) -> u64 {
     a
 }
 
-/// Ensures that the size of a buffer is a multiple of the given alignment by
-/// padding it with zeroes if necessary.
-///
-/// If the buffer already has the required size, then this function doesn't
-/// allocate. Otherwise, it copies the buffer into a new one and writes the
-/// appropriate number of zeroes to the end.
-fn pad_to_alignment(buffer: &[u8], align: usize) -> Cow<[u8]> {
-    if buffer.len() % align == 0 {
-        return Cow::Borrowed(buffer);
-    }
-    let mut buffer = buffer.to_vec();
-    buffer.extend(iter::repeat(0).take(align - buffer.len() % align));
-    Cow::Owned(buffer)
-}
-
 /// Returns a string describing the given buffer usages.
 fn buffer_usages_to_str(buffer_usages: BufferUsages) -> &'static str {
     if buffer_usages.contains(BufferUsages::VERTEX) {
diff --git a/crates/bevy_render/src/mesh/mesh/mod.rs b/crates/bevy_render/src/mesh/mesh/mod.rs
index 1f40cb8b41a4c..6e6fd3f072f0c 100644
--- a/crates/bevy_render/src/mesh/mesh/mod.rs
+++ b/crates/bevy_render/src/mesh/mesh/mod.rs
@@ -385,6 +385,13 @@ impl Mesh {
             .sum()
     }
 
+    /// Returns the size required for the vertex buffer in bytes.
+    pub fn get_vertex_buffer_size(&self) -> usize {
+        let vertex_size = self.get_vertex_size() as usize;
+        let vertex_count = self.count_vertices();
+        vertex_count * vertex_size
+    }
+
     /// Computes and returns the index data of the mesh as bytes.
     /// This is used to transform the index data into a GPU friendly format.
     pub fn get_index_buffer_bytes(&self) -> Option<&[u8]> {
@@ -458,10 +465,24 @@ impl Mesh {
     ///
     /// If the vertex attributes have different lengths, they are all truncated to
     /// the length of the smallest.
+    ///
+    /// This is a convenience method which allocates a Vec.
+    /// Prefer pre-allocating and using [`Mesh::write_packed_vertex_buffer_data`] when possible.
     pub fn create_packed_vertex_buffer_data(&self) -> Vec<u8> {
+        let mut attributes_interleaved_buffer = vec![0; self.get_vertex_buffer_size()];
+        self.write_packed_vertex_buffer_data(&mut attributes_interleaved_buffer);
+        attributes_interleaved_buffer
+    }
+
+    /// Computes and writes the vertex data of the mesh into a mutable byte slice.
+    /// The attributes are located in the order of their [`MeshVertexAttribute::id`].
+    /// This is used to transform the vertex data into a GPU friendly format.
+    ///
+    /// If the vertex attributes have different lengths, they are all truncated to
+    /// the length of the smallest.
+    pub fn write_packed_vertex_buffer_data(&self, slice: &mut [u8]) {
         let vertex_size = self.get_vertex_size() as usize;
         let vertex_count = self.count_vertices();
-        let mut attributes_interleaved_buffer = vec![0; vertex_count * vertex_size];
         // bundle into interleaved buffers
         let mut attribute_offset = 0;
         for attribute_data in self.attributes.values() {
@@ -473,14 +494,11 @@ impl Mesh {
                 .enumerate()
             {
                 let offset = vertex_index * vertex_size + attribute_offset;
-                attributes_interleaved_buffer[offset..offset + attribute_size]
-                    .copy_from_slice(attribute_bytes);
+                slice[offset..offset + attribute_size].copy_from_slice(attribute_bytes);
             }
 
             attribute_offset += attribute_size;
         }
-
-        attributes_interleaved_buffer
     }
 
     /// Duplicates the vertex attributes so that no vertices are shared.