diff --git a/build.bash b/build.bash index 5888eb7f..948aa6ae 100755 --- a/build.bash +++ b/build.bash @@ -21,7 +21,7 @@ case $1 in ;; serve) shift - simple-http-server target/generated -c wasm,html,js -i + simple-http-server target/generated -c wasm,html,js -i --nocache ;; ci) cargo fmt diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 3c4fbbce..584f3d74 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -74,6 +74,7 @@ wgpu-profiler = "0.16.0" console_log = "1" console_error_panic_hook = "0.1" js-sys = "0.3" +gloo-console = "0.3" web-sys = "0.3.67" wasm-bindgen = "0.2.83" wasm-bindgen-futures = "0.4" diff --git a/examples/src/cube_no_framework/mod.rs b/examples/src/cube_no_framework/mod.rs index e764e898..d036ce98 100644 --- a/examples/src/cube_no_framework/mod.rs +++ b/examples/src/cube_no_framework/mod.rs @@ -109,13 +109,8 @@ pub fn main() { let base_rendergraph = rend3_routine::base::BaseRenderGraph::new(&renderer, &spp); let mut data_core = renderer.data_core.lock(); - let pbr_routine = rend3_routine::pbr::PbrRoutine::new( - &renderer, - &mut data_core, - &spp, - &base_rendergraph.interfaces, - &base_rendergraph.gpu_culler.culling_buffer_map_handle, - ); + let pbr_routine = + rend3_routine::pbr::PbrRoutine::new(&renderer, &mut data_core, &spp, &base_rendergraph.interfaces); drop(data_core); let tonemapping_routine = rend3_routine::tonemapping::TonemappingRoutine::new( &renderer, diff --git a/examples/src/lib.rs b/examples/src/lib.rs index cba9e27c..9f28a0d3 100644 --- a/examples/src/lib.rs +++ b/examples/src/lib.rs @@ -13,7 +13,9 @@ mod static_gltf; mod textured_quad; #[cfg(target_arch = "wasm32")] -use log::info as println; +use gloo_console::info as output; +#[cfg(not(target_arch = "wasm32"))] +use std::println as output; #[cfg(test)] mod tests; @@ -35,10 +37,10 @@ const EXAMPLES: &[ExampleDesc] = &[ ]; fn print_examples() { - println!("Usage: cargo run \n"); - println!("Available examples:"); + output!("Usage: cargo run \n"); + output!("Available examples:"); for example in EXAMPLES { - println!(" {}", example.name); + output!(" {}", example.name); } } @@ -50,7 +52,7 @@ pub fn main_with_name(example_name: Option) { }; let Some(example) = EXAMPLES.iter().find(|example| example.name == example_name) else { - println!("Unknown example: {}\n", example_name); + output!("Unknown example: {}\n", example_name); print_examples(); return; }; diff --git a/examples/src/scene_viewer/bistro.png b/examples/src/scene_viewer/bistro.png index 9ed44c9b..b8d952fe 100644 Binary files a/examples/src/scene_viewer/bistro.png and b/examples/src/scene_viewer/bistro.png differ diff --git a/examples/src/tests.rs b/examples/src/tests.rs index 14ab8675..207673d4 100644 --- a/examples/src/tests.rs +++ b/examples/src/tests.rs @@ -34,7 +34,6 @@ pub async fn test_app, T: 'static>(mut config: TestConfiguration) - &mut data_core, &spp, &base_rendergraph.interfaces, - &base_rendergraph.gpu_culler.culling_buffer_map_handle, )), skybox: Mutex::new(rend3_routine::skybox::SkyboxRoutine::new(&renderer, &spp, &base_rendergraph.interfaces)), tonemapping: Mutex::new(rend3_routine::tonemapping::TonemappingRoutine::new( diff --git a/rend3-framework/src/lib.rs b/rend3-framework/src/lib.rs index dcc12303..8567b7b0 100644 --- a/rend3-framework/src/lib.rs +++ b/rend3-framework/src/lib.rs @@ -231,7 +231,6 @@ pub async fn async_start + 'static, T: 'static>(mut app: A, window_bui &mut data_core, &spp, &base_rendergraph.interfaces, - &base_rendergraph.gpu_culler.culling_buffer_map_handle, )), skybox: Mutex::new(rend3_routine::skybox::SkyboxRoutine::new(&renderer, &spp, &base_rendergraph.interfaces)), tonemapping: Mutex::new(rend3_routine::tonemapping::TonemappingRoutine::new( diff --git a/rend3-routine/shaders/src/cull.wgsl b/rend3-routine/shaders/src/cull.wgsl deleted file mode 100644 index 295090e7..00000000 --- a/rend3-routine/shaders/src/cull.wgsl +++ /dev/null @@ -1,390 +0,0 @@ -{{include "rend3-routine/structures.wgsl"}} -{{include "rend3-routine/structures_object.wgsl"}} - -@group(0) @binding(0) -var vertex_buffer: array; -@group(0) @binding(1) -var object_buffer: array; - -fn vertex_fetch( - object_invocation: u32, - object_info: ptr, -) -> Triangle { - let index_0_index = object_invocation * 3u + 0u; - let index_1_index = object_invocation * 3u + 1u; - let index_2_index = object_invocation * 3u + 2u; - - let object = object_buffer[(*object_info).object_id]; - - let index0 = vertex_buffer[object.first_index + index_0_index]; - let index1 = vertex_buffer[object.first_index + index_1_index]; - let index2 = vertex_buffer[object.first_index + index_2_index]; - - let position_start_offset = object.vertex_attribute_start_offsets[{{position_attribute_offset}}]; - let model_position0 = extract_attribute_vec3_f32(position_start_offset, index0); - let model_position1 = extract_attribute_vec3_f32(position_start_offset, index1); - let model_position2 = extract_attribute_vec3_f32(position_start_offset, index2); - - return Triangle( - TriangleVertices(model_position0, model_position1, model_position2), - TriangleIndices(index0, index1, index2) - ); -} - -@group(0) @binding(2) -var culling_job: BatchData; - -struct DrawCallBuffer { - /// We always put the buffer that needs to be present in the next frame first. - predicted_object_offset: u32, - residual_object_offset: u32, - calls: array, -} - -@group(0) @binding(3) -var draw_calls: DrawCallBuffer; - -fn init_draw_calls(global_invocation: u32, region_id: u32) { - // Init the inheritable draw call - let predicted_object_draw_index = draw_calls.predicted_object_offset + region_id; - draw_calls.calls[predicted_object_draw_index].vertex_offset = 0; - draw_calls.calls[predicted_object_draw_index].instance_count = 1u; - draw_calls.calls[predicted_object_draw_index].base_instance = 0u; - draw_calls.calls[predicted_object_draw_index].base_index = global_invocation * 3u; - - // Init the residual objects draw call - let residual_object_draw_index = draw_calls.residual_object_offset + region_id; - draw_calls.calls[residual_object_draw_index].vertex_offset = 0; - draw_calls.calls[residual_object_draw_index].instance_count = 1u; - draw_calls.calls[residual_object_draw_index].base_instance = 0u; - draw_calls.calls[residual_object_draw_index].base_index = global_invocation * 3u; -} - -fn add_predicted_triangle_to_draw_call(region_id: u32) -> u32 { - let output_region_index = atomicAdd(&draw_calls.calls[draw_calls.predicted_object_offset + region_id].vertex_count, 3u); - let output_region_triangle = output_region_index / 3u; - return output_region_triangle; -} - -fn add_residual_triangle_to_draw_call(region_id: u32) -> u32 { - let output_region_index = atomicAdd(&draw_calls.calls[draw_calls.residual_object_offset + region_id].vertex_count, 3u); - let output_region_triangle = output_region_index / 3u; - return output_region_triangle; -} - -struct OutputIndexBuffer { - /// We always put the buffer that needs to be present in the next frame first. - predicted_object_offset: u32, - residual_object_offset: u32, - indices: array, -} -@group(0) @binding(4) -var output_indices : OutputIndexBuffer; - -fn write_predicted_atomic_triangle( - batch_object_index: u32, - object_info: ptr, - indices: TriangleIndices, -) { - let region_invocation = add_predicted_triangle_to_draw_call((*object_info).region_id); - let batch_invocation = region_invocation + (*object_info).region_base_invocation; - let global_invocation = batch_invocation + culling_job.batch_base_invocation; - - let packed_indices = pack_batch_indices(batch_object_index, indices); - - let predicted_object_indices_index = output_indices.predicted_object_offset + global_invocation * 3u; - output_indices.indices[predicted_object_indices_index] = packed_indices[0]; - output_indices.indices[predicted_object_indices_index + 1u] = packed_indices[1]; - output_indices.indices[predicted_object_indices_index + 2u] = packed_indices[2]; -} - -fn write_residual_atomic_triangle( - batch_object_index: u32, - object_info: ptr, - indices: TriangleIndices, -) { - let region_invocation = add_residual_triangle_to_draw_call((*object_info).region_id); - let batch_invocation = region_invocation + (*object_info).region_base_invocation; - let global_invocation = batch_invocation + culling_job.batch_base_invocation; - - let packed_indices = pack_batch_indices(batch_object_index, indices); - - let residual_object_indices_index = output_indices.residual_object_offset + global_invocation * 3u; - output_indices.indices[residual_object_indices_index] = packed_indices[0]; - output_indices.indices[residual_object_indices_index + 1u] = packed_indices[1]; - output_indices.indices[residual_object_indices_index + 2u] = packed_indices[2]; -} - -fn write_residual_nonatomic_triangle( - invocation: u32, - batch_object_index: u32, - object_info: ptr, - indices: TriangleIndices, -) { - add_residual_triangle_to_draw_call((*object_info).region_id); - - let packed_indices = pack_batch_indices(batch_object_index, indices); - - let residual_object_indices_index = output_indices.residual_object_offset + invocation * 3u; - output_indices.indices[residual_object_indices_index] = packed_indices[0]; - output_indices.indices[residual_object_indices_index + 1u] = packed_indices[1]; - output_indices.indices[residual_object_indices_index + 2u] = packed_indices[2]; -} - -fn write_invalid_residual_nonatomic_triangle(invocation: u32, object_info: ptr) { - add_residual_triangle_to_draw_call((*object_info).region_id); - - let residual_object_indices_index = output_indices.residual_object_offset + invocation * 3u; - output_indices.indices[residual_object_indices_index] = INVALID_VERTEX; - output_indices.indices[residual_object_indices_index + 1u] = INVALID_VERTEX; - output_indices.indices[residual_object_indices_index + 2u] = INVALID_VERTEX; -} - -struct CullingResults { - /// We always put the buffer that needs to be present in the next frame first. - output_offset: u32, - input_offset: u32, - bits: array, -} -@group(0) @binding(5) -var culling_results: CullingResults; - -fn get_previous_culling_result(object_info: ptr, object_invocation: u32) -> bool { - if (*object_info).previous_global_invocation == 0xFFFFFFFFu { - return false; - } - - let previous_global_invocation = object_invocation + (*object_info).previous_global_invocation; - let bitmask = culling_results.bits[culling_results.input_offset + (previous_global_invocation / 32u)]; - return ((bitmask >> (previous_global_invocation % 32u)) & 0x1u) == 0x1u; -} - -@group(0) @binding(6) -var per_camera_uniform: PerCameraUniform; - -fn is_shadow_pass() -> bool { - return per_camera_uniform.shadow_index != 0xFFFFFFFFu; -} - -@group(0) @binding(7) -var hirearchical_z_buffer: texture_depth_2d; -@group(0) @binding(8) -var nearest_sampler: sampler; - -{{include "rend3/vertex_attributes.wgsl"}} - -struct ObjectSearchResult { - range: ObjectCullingInformation, - index_within_region: u32, -} - -fn find_object_info(wid: u32) -> ObjectSearchResult { - let target_invocation = wid * 256u; - // pulled directly from https://doc.rust-lang.org/src/core/slice/mod.rs.html#2412-2438 - - var size = culling_job.total_objects; - var left = 0u; - var right = size; - var object_info: ObjectCullingInformation; - while left < right { - let mid = left + size / 2u; - - let probe = culling_job.object_culling_information[mid]; - - if probe.invocation_end <= target_invocation { - left = mid + 1u; - } else if probe.invocation_start > target_invocation { - right = mid; - } else { - return ObjectSearchResult(probe, mid); - } - - size = right - left; - } - - // This is unreachable, but required for the compiler to be happy - return ObjectSearchResult(object_info, 0xFFFFFFFFu); -} - -// 256 workgroup size / 32 bits -var workgroup_culling_results: array, 8>; - -fn clear_culling_results(lid: u32) { - if lid == 0u { - atomicStore(&workgroup_culling_results[0], 0u); - atomicStore(&workgroup_culling_results[1], 0u); - atomicStore(&workgroup_culling_results[2], 0u); - atomicStore(&workgroup_culling_results[3], 0u); - atomicStore(&workgroup_culling_results[4], 0u); - atomicStore(&workgroup_culling_results[5], 0u); - atomicStore(&workgroup_culling_results[6], 0u); - atomicStore(&workgroup_culling_results[7], 0u); - } -} - -fn compute_culling_results(lid: u32, passed_culling: bool) { - atomicOr(&workgroup_culling_results[lid / 32u], u32(passed_culling) << (lid % 32u)); -} - -fn save_culling_results(global_invocation: u32, lid: u32) { - if lid == 0u { - let culling_results_index = culling_results.output_offset + (global_invocation / 32u); - culling_results.bits[culling_results_index + 0u] = atomicLoad(&workgroup_culling_results[0]); - culling_results.bits[culling_results_index + 1u] = atomicLoad(&workgroup_culling_results[1]); - culling_results.bits[culling_results_index + 2u] = atomicLoad(&workgroup_culling_results[2]); - culling_results.bits[culling_results_index + 3u] = atomicLoad(&workgroup_culling_results[3]); - culling_results.bits[culling_results_index + 4u] = atomicLoad(&workgroup_culling_results[4]); - culling_results.bits[culling_results_index + 5u] = atomicLoad(&workgroup_culling_results[5]); - culling_results.bits[culling_results_index + 6u] = atomicLoad(&workgroup_culling_results[6]); - culling_results.bits[culling_results_index + 7u] = atomicLoad(&workgroup_culling_results[7]); - } -} - -fn textureSampleMin(texture: texture_depth_2d, uv: vec2, mipmap: f32) -> f32 { - let int_mipmap = i32(mipmap); - let mip_resolution = vec2(textureDimensions(texture, int_mipmap).xy); - - let pixel_coords = uv * mip_resolution - 0.5; - - let low = vec2(max(floor(pixel_coords), vec2(0.0))); - let high = vec2(min(ceil(pixel_coords), mip_resolution - 1.0)); - - let top_left = vec2(low.x, low.y); - let top_right = vec2(high.x, low.y); - let bottom_left = vec2(low.x, high.y); - let bottom_right = vec2(high.x, high.y); - - var minval = textureLoad(texture, top_left, int_mipmap); - minval = min(minval, textureLoad(texture, top_right, int_mipmap)); - minval = min(minval, textureLoad(texture, bottom_left, int_mipmap)); - minval = min(minval, textureLoad(texture, bottom_right, int_mipmap)); - return minval; -} - -fn execute_culling( - model_view_proj: mat4x4, - vertices: TriangleVertices, -) -> bool { - let position0 = model_view_proj * vec4(vertices[0], 1.0); - let position1 = model_view_proj * vec4(vertices[1], 1.0); - let position2 = model_view_proj * vec4(vertices[2], 1.0); - - let det = determinant(mat3x3(position0.xyw, position1.xyw, position2.xyw)); - - if (per_camera_uniform.flags & PCU_FLAGS_AREA_VISIBLE_MASK) == PCU_FLAGS_POSITIVE_AREA_VISIBLE && det <= 0.0 { - return false; - } - if (per_camera_uniform.flags & PCU_FLAGS_AREA_VISIBLE_MASK) == PCU_FLAGS_NEGATIVE_AREA_VISIBLE && det >= 0.0 { - return false; - } - - let ndc0 = position0.xyz / position0.w; - let ndc1 = position1.xyz / position1.w; - let ndc2 = position2.xyz / position2.w; - - let min_ndc_xy = min(ndc0.xy, min(ndc1.xy, ndc2.xy)); - let max_ndc_xy = max(ndc0.xy, max(ndc1.xy, ndc2.xy)); - - let half_res = per_camera_uniform.resolution / 2.0; - let min_screen_xy = (min_ndc_xy + 1.0) * half_res; - let max_screen_xy = (max_ndc_xy + 1.0) * half_res; - - if (per_camera_uniform.flags & PCU_FLAGS_MULTISAMPLE_MASK) == PCU_FLAGS_MULTISAMPLE_DISABLED { - let misses_pixel_center = any(round(min_screen_xy) == round(max_screen_xy)); - - if misses_pixel_center { - return false; - } - } - - // We skip hi-z calculation if we're doing a shadow pass - if per_camera_uniform.shadow_index != 0xFFFFFFFFu { - return true; - } - - var min_tex_coords = (min_ndc_xy + 1.0) / 2.0; - var max_tex_coords = (max_ndc_xy + 1.0) / 2.0; - min_tex_coords.y = 1.0 - min_tex_coords.y; - max_tex_coords.y = 1.0 - max_tex_coords.y; - - let uv = (max_tex_coords + min_tex_coords) / 2.0; - let edges = max_screen_xy - min_screen_xy; - - let longest_edge = max(edges.x, edges.y); - let mip = ceil(log2(max(longest_edge, 1.0))); - - let depth = max(max(ndc0.z, ndc1.z), ndc2.z); - let occlusion_depth = textureSampleMin(hirearchical_z_buffer, uv, mip); - - if depth < occlusion_depth { - return false; - } - - return true; -} - -@compute @workgroup_size(256) -fn cs_main( - @builtin(workgroup_id) wid: vec3, - @builtin(global_invocation_id) gid: vec3, - @builtin(local_invocation_id) lid: vec3, -) { - clear_culling_results(lid.x); - - let object_search_results = find_object_info(wid.x); - var object_info = object_search_results.range; - let batch_object_index = object_search_results.index_within_region; - let global_invocation = culling_job.batch_base_invocation + gid.x; - - // We need the workgroupBarrier to be in uniform control flow, so we can't return early here. - // - // If this is true, continue working on the other side of the barrier. - var write_culling_output = true; - if gid.x >= object_info.invocation_end { - if object_info.atomic_capable == 0u { - write_invalid_residual_nonatomic_triangle(global_invocation, &object_info); - } - write_culling_output = false; - } else { - let object_invocation = gid.x - object_info.invocation_start; - - // If the first invocation in the region, set the region's draw call - if object_info.local_region_id == 0u && object_invocation == 0u { - init_draw_calls(global_invocation, object_info.region_id); - } - - let triangle = vertex_fetch(object_invocation, &object_info); - - let model_view_proj = per_camera_uniform.objects[object_info.object_id].model_view_proj; - - let passes_culling = execute_culling(model_view_proj, triangle.vertices); - - if object_info.atomic_capable == 1u { - if passes_culling { - write_predicted_atomic_triangle(batch_object_index, &object_info, triangle.indices); - - if !is_shadow_pass() { - let previously_passed_culling = get_previous_culling_result(&object_info, object_invocation); - - if !previously_passed_culling { - write_residual_atomic_triangle(batch_object_index, &object_info, triangle.indices); - } - } - } - } else { - if passes_culling { - write_residual_nonatomic_triangle(global_invocation, batch_object_index, &object_info, triangle.indices); - } else { - write_invalid_residual_nonatomic_triangle(global_invocation, &object_info); - } - } - - compute_culling_results(lid.x, passes_culling); - } - - workgroupBarrier(); - - if write_culling_output { - save_culling_results(global_invocation, lid.x); - } -} diff --git a/rend3-routine/shaders/src/depth.wgsl b/rend3-routine/shaders/src/depth.wgsl index d7dab49f..d099cf5f 100644 --- a/rend3-routine/shaders/src/depth.wgsl +++ b/rend3-routine/shaders/src/depth.wgsl @@ -10,21 +10,19 @@ var uniforms: UniformData; @group(1) @binding(0) var object_buffer: array; @group(1) @binding(1) -var batch_data: BatchData; -@group(1) @binding(2) var vertex_buffer: array; -@group(1) @binding(3) +@group(1) @binding(2) var per_camera_uniform: PerCameraUniform; {{#if (eq profile "GpuDriven")}} -@group(1) @binding(4) +@group(1) @binding(3) var materials: array; @group(2) @binding(0) var textures: binding_array>; {{/if}} {{#if (eq profile "CpuDriven")}} -@group(1) @binding(4) +@group(1) @binding(3) var materials: array; @group(2) @binding(0) var albedo_tex: texture_2d; @@ -34,7 +32,6 @@ var albedo_tex: texture_2d; vertex_fetch object_buffer - batch_data position texture_coords_0 @@ -49,7 +46,7 @@ struct VertexOutput { } @vertex -fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { +fn vs_main(@builtin(instance_index) instance_index: u32, @builtin(vertex_index) vertex_index: u32) -> VertexOutput { // If the vertex index is our sentinel invalid value, return a degenerate triangle. // // This is used by the culling shader to discard triangles when the ordering of the @@ -59,7 +56,7 @@ fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { vs_out.position = vec4(0.0); return vs_out; } - let indices = unpack_vertex_index(vertex_index); + let indices = Indices(instance_index, vertex_index); let data = object_buffer[indices.object]; // If the object is disabled, return a degenerate triangle. @@ -73,7 +70,7 @@ fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { let vs_in = get_vertices(indices); - let model_view_proj = per_camera_uniform.objects[indices.object].model_view_proj; + let model_view_proj = per_camera_uniform.view_proj * object_buffer[indices.object].transform; let position_vec4 = vec4(vs_in.position, 1.0); diff --git a/rend3-routine/shaders/src/hi_z.wgsl b/rend3-routine/shaders/src/hi_z.wgsl deleted file mode 100644 index ecd1e176..00000000 --- a/rend3-routine/shaders/src/hi_z.wgsl +++ /dev/null @@ -1,33 +0,0 @@ -@group(0) @binding(0) -var source: texture_depth_2d; - -struct VertexOutput { - @builtin(position) position: vec4, - @location(0) @interpolate(flat) resolution: vec2, -} - -@vertex -fn vs_main(@builtin(vertex_index) id: u32) -> VertexOutput { - let resolution = vec2(textureDimensions(source)); - var output: VertexOutput; - output.position = vec4(f32(id / 2u) * 4.0 - 1.0, f32(id % 2u) * 4.0 - 1.0, 0.0, 1.0); - output.resolution = resolution; - return output; -} - -@fragment -fn fs_main(vout: VertexOutput) -> @builtin(frag_depth) f32 { - let this_tex_coord = vec2(vout.position.xy); - let previous_base_tex_coord = 2u * this_tex_coord; - - let count_odd = vout.resolution & vec2(1u); - - var nearest = 1.0; - for (var x = 0u; x < 2u + count_odd.x; x += 1u) { - for (var y = 0u; y < 2u + count_odd.y; y += 1u) { - nearest = min(nearest, textureLoad(source, previous_base_tex_coord + vec2(x, y), 0)); - } - } - - return nearest; -} \ No newline at end of file diff --git a/rend3-routine/shaders/src/opaque.wgsl b/rend3-routine/shaders/src/opaque.wgsl index e398d0fc..54a13ab4 100644 --- a/rend3-routine/shaders/src/opaque.wgsl +++ b/rend3-routine/shaders/src/opaque.wgsl @@ -24,21 +24,19 @@ var shadows: texture_depth_2d; @group(1) @binding(0) var object_buffer: array; @group(1) @binding(1) -var batch_data: BatchData; -@group(1) @binding(2) var vertex_buffer: array; -@group(1) @binding(3) +@group(1) @binding(2) var per_camera_uniform: PerCameraUniform; {{#if (eq profile "GpuDriven")}} -@group(1) @binding(4) +@group(1) @binding(3) var materials: array; @group(2) @binding(0) var textures: binding_array>; {{/if}} {{#if (eq profile "CpuDriven")}} -@group(1) @binding(4) +@group(1) @binding(3) var materials: array; @group(2) @binding(0) var albedo_tex: texture_2d; @@ -66,7 +64,6 @@ var ambient_occlusion_tex: texture_2d; vertex_fetch object_buffer - batch_data position normal @@ -89,7 +86,7 @@ struct VertexOutput { @vertex -fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { +fn vs_main(@builtin(instance_index) instance_index: u32, @builtin(vertex_index) vertex_index: u32) -> VertexOutput { // If the vertex index is our sentinel invalid value, return a degenerate triangle. // // This is used by the culling shader to discard triangles when the ordering of the @@ -99,7 +96,7 @@ fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { vs_out.position = vec4(0.0); return vs_out; } - let indices = unpack_vertex_index(vertex_index); + let indices = Indices(instance_index, vertex_index); let data = object_buffer[indices.object]; // If the object is disabled, return a degenerate triangle. @@ -113,8 +110,8 @@ fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { let vs_in = get_vertices(indices); - let model_view = per_camera_uniform.objects[indices.object].model_view; - let model_view_proj = per_camera_uniform.objects[indices.object].model_view_proj; + let model_view = per_camera_uniform.view * object_buffer[indices.object].transform; + let model_view_proj = per_camera_uniform.view_proj * object_buffer[indices.object].transform; let position_vec4 = vec4(vs_in.position, 1.0); let mv_mat3 = mat3x3(model_view[0].xyz, model_view[1].xyz, model_view[2].xyz); diff --git a/rend3-routine/shaders/src/resolve_depth_min.wgsl b/rend3-routine/shaders/src/resolve_depth_min.wgsl deleted file mode 100644 index 677ef722..00000000 --- a/rend3-routine/shaders/src/resolve_depth_min.wgsl +++ /dev/null @@ -1,27 +0,0 @@ -@group(0) @binding(0) -var source: texture_depth_multisampled_2d; - -const SAMPLES: i32 = {{SAMPLES}}; - -struct VertexOutput { - @builtin(position) position: vec4, -} - -@vertex -fn vs_main(@builtin(vertex_index) id: u32) -> VertexOutput { - let resolution = vec2(textureDimensions(source)); - var output: VertexOutput; - output.position = vec4(f32(id / 2u) * 4.0 - 1.0, f32(id % 2u) * 4.0 - 1.0, 0.0, 1.0); - return output; -} - -@fragment -fn fs_main(vout: VertexOutput) -> @builtin(frag_depth) f32 { - var nearest: f32 = 1.0; - - for (var sample = 0; sample < SAMPLES; sample += 1) { - nearest = min(nearest, textureLoad(source, vec2u(vout.position.xy), sample)); - } - - return nearest; -} diff --git a/rend3-routine/shaders/src/structures.wgsl b/rend3-routine/shaders/src/structures.wgsl index f10b05a2..7ca845c6 100644 --- a/rend3-routine/shaders/src/structures.wgsl +++ b/rend3-routine/shaders/src/structures.wgsl @@ -37,40 +37,15 @@ struct UniformData { resolution: vec2, } -struct PerCameraUniformObjectData { - // TODO: use less space - model_view: mat4x4, - // TODO: use less space - model_view_proj: mat4x4, -} - struct PerCameraUniform { // TODO: use less space view: mat4x4, // TODO: use less space view_proj: mat4x4, - // The index of which shadow caster we are rendering for. - // - // This will be u32::MAX if we're rendering for a camera, not a shadow map. - shadow_index: u32, frustum: Frustum, - resolution: vec2, - // Uses PCU_FLAGS_* constants - flags: u32, object_count: u32, - objects: array, } -// Area visible -const PCU_FLAGS_AREA_VISIBLE_MASK: u32 = 0x1u; -const PCU_FLAGS_NEGATIVE_AREA_VISIBLE: u32 = 0x0u; -const PCU_FLAGS_POSITIVE_AREA_VISIBLE: u32 = 0x1u; - -// Multisampled -const PCU_FLAGS_MULTISAMPLE_MASK: u32 = 0x2u; -const PCU_FLAGS_MULTISAMPLE_DISABLED: u32 = 0x0u; -const PCU_FLAGS_MULTISAMPLE_ENABLED: u32 = 0x2u; - struct DirectionalLight { /// View/Projection of directional light. Shadow rendering uses viewports /// so this always outputs [-1, 1] no matter where in the atlast the shadow is. diff --git a/rend3-routine/shaders/src/structures_object.wgsl b/rend3-routine/shaders/src/structures_object.wgsl index 2bb5b32c..4d6db525 100644 --- a/rend3-routine/shaders/src/structures_object.wgsl +++ b/rend3-routine/shaders/src/structures_object.wgsl @@ -10,21 +10,3 @@ struct Object { // 1 if enabled, 0 if disabled enabled: u32, } - -struct ObjectCullingInformation { - invocation_start: u32, - invocation_end: u32, - object_id: u32, - region_id: u32, - region_base_invocation: u32, - local_region_id: u32, - previous_global_invocation: u32, - atomic_capable: u32, -} - -struct BatchData { - total_objects: u32, - total_invocations: u32, - batch_base_invocation: u32, - object_culling_information: array, -} diff --git a/rend3-routine/shaders/src/uniform_prep.wgsl b/rend3-routine/shaders/src/uniform_prep.wgsl deleted file mode 100644 index 4812e94f..00000000 --- a/rend3-routine/shaders/src/uniform_prep.wgsl +++ /dev/null @@ -1,27 +0,0 @@ -{{include "rend3-routine/structures.wgsl"}} -{{include "rend3-routine/structures_object.wgsl"}} - -@group(0) @binding(0) -var object_buffer: array; -@group(0) @binding(1) -var per_camera_uniform: PerCameraUniform; - -@compute @workgroup_size(256) -fn cs_main( - @builtin(global_invocation_id) gid: vec3, -) { - let idx = gid.x; - - if idx >= per_camera_uniform.object_count { - return; - } - if object_buffer[idx].enabled == 0u { - return; - } - - let model_view = per_camera_uniform.view * object_buffer[idx].transform; - let model_view_proj = per_camera_uniform.view_proj * object_buffer[idx].transform; - - per_camera_uniform.objects[idx].model_view = model_view; - per_camera_uniform.objects[idx].model_view_proj = model_view_proj; -} diff --git a/rend3-routine/src/base.rs b/rend3-routine/src/base.rs index 304684f5..1b17861f 100644 --- a/rend3-routine/src/base.rs +++ b/rend3-routine/src/base.rs @@ -13,11 +13,10 @@ //! to, or muck with any of the data in there, you are free to, and the //! following routines will behave as you configure. -use std::{iter::zip, sync::Arc}; +use std::sync::Arc; use glam::{UVec2, Vec4}; use rend3::{ - format_sso, graph::{ self, DataHandle, InstructionEvaluationOutput, RenderGraph, RenderPassTargets, RenderTargetDescriptor, RenderTargetHandle, ViewportRect, @@ -25,14 +24,13 @@ use rend3::{ types::{SampleCount, TextureFormat, TextureUsages}, Renderer, ShaderPreProcessor, INTERNAL_SHADOW_DEPTH_FORMAT, }; -use wgpu::{BindGroup, Buffer}; +use wgpu::BindGroup; use crate::{ clear, common::{self, CameraSpecifier}, - culling, forward::{self, ForwardRoutineArgs}, - pbr, skinning, uniforms, + skinning, uniforms, }; #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -103,7 +101,6 @@ pub struct BaseRenderGraphSettings { pub struct BaseRenderGraph { pub interfaces: common::WholeFrameInterfaces, pub samplers: common::Samplers, - pub gpu_culler: culling::GpuCuller, pub gpu_skinner: skinning::GpuSkinner, } @@ -116,11 +113,10 @@ impl BaseRenderGraph { let samplers = common::Samplers::new(&renderer.device); // TODO: Support more materials - let gpu_culler = culling::GpuCuller::new::(renderer, spp); let gpu_skinner = skinning::GpuSkinner::new(&renderer.device, spp); - Self { interfaces, samplers, gpu_culler, gpu_skinner } + Self { interfaces, samplers, gpu_skinner } } /// Add this to the rendergraph. This is the function you should start @@ -144,32 +140,11 @@ impl BaseRenderGraph { // Perform compute based skinning. state.skinning(self); - // Upload the uniforms for the objects in the shadow pass. - state.shadow_object_uniform_upload(self); - // Perform culling for the objects in the shadow pass. - state.pbr_shadow_culling(self); - // Render all the shadows to the shadow map. state.pbr_shadow_rendering(); - // Upload the uniforms for the objects in the forward pass. - state.object_uniform_upload(self); - // Do the first pass, rendering the predicted triangles from last frame. - state.pbr_render_opaque_predicted_triangles(); - - // Create the hi-z buffer. - state.hi_z(); - - // Perform culling for the objects in the forward pass. - // - // The result of culling will be used to predict the visible triangles for - // the next frame. It will also render all the triangles that were visible - // but were not predicted last frame. - state.pbr_culling(self); - - // Do the second pass, rendering the residual triangles. - state.pbr_render_opaque_residual_triangles(); + state.pbr_render(); // Render the skybox. state.skybox(); @@ -194,10 +169,6 @@ pub struct BaseRenderGraphIntermediateState<'a, 'node> { pub inputs: BaseRenderGraphInputs<'a, 'node>, pub settings: BaseRenderGraphSettings, - pub pre_cull: DataHandle, - pub shadow_cull: Vec>>, - pub cull: DataHandle>, - pub shadow_uniform_bg: DataHandle, pub forward_uniform_bg: DataHandle, @@ -214,9 +185,6 @@ impl<'a, 'node> BaseRenderGraphIntermediateState<'a, 'node> { inputs: BaseRenderGraphInputs<'a, 'node>, settings: BaseRenderGraphSettings, ) -> Self { - // We need to know how many shadows we need to render - let shadow_count = inputs.eval_output.shadows.len(); - // Create global bind group information let shadow_uniform_bg = graph.add_data::(); let forward_uniform_bg = graph.add_data::(); @@ -265,19 +233,11 @@ impl<'a, 'node> BaseRenderGraphIntermediateState<'a, 'node> { let pre_skinning_buffers = graph.add_data::(); - let pre_cull = graph.add_data(); - let mut shadow_cull = Vec::with_capacity(shadow_count); - shadow_cull.resize_with(shadow_count, || graph.add_data()); - let cull = graph.add_data(); Self { graph, inputs, settings, - pre_cull, - shadow_cull, - cull, - shadow_uniform_bg, forward_uniform_bg, @@ -312,60 +272,13 @@ impl<'a, 'node> BaseRenderGraphIntermediateState<'a, 'node> { ); } - pub fn shadow_object_uniform_upload(&mut self, base: &'node BaseRenderGraph) { - for (shadow_index, shadow) in self.inputs.eval_output.shadows.iter().enumerate() { - base.gpu_culler.add_object_uniform_upload_to_graph::( - self.graph, - CameraSpecifier::Shadow(shadow_index as u32), - UVec2::splat(shadow.map.size), - SampleCount::One, - &format_sso!("Shadow Culling S{}", shadow_index), - ); - } - } - - /// Does all shadow culling for the PBR materials. - pub fn pbr_shadow_culling(&mut self, base: &'node BaseRenderGraph) { - for (shadow_index, &shadow_culled) in self.shadow_cull.iter().enumerate() { - base.gpu_culler.add_culling_to_graph::( - self.graph, - shadow_culled, - self.shadow, - CameraSpecifier::Shadow(shadow_index as u32), - &format_sso!("Shadow Culling S{}", shadow_index), - ); - } - } - pub fn skinning(&mut self, base: &'node BaseRenderGraph) { skinning::add_skinning_to_graph(self.graph, &base.gpu_skinner); } - pub fn object_uniform_upload(&mut self, base: &'node BaseRenderGraph) { - base.gpu_culler.add_object_uniform_upload_to_graph::( - self.graph, - CameraSpecifier::Viewport, - self.inputs.target.resolution, - self.inputs.target.samples, - "Uniform Bake", - ); - } - - /// Does all culling for the forward PBR materials. - pub fn pbr_culling(&mut self, base: &'node BaseRenderGraph) { - base.gpu_culler.add_culling_to_graph::( - self.graph, - self.cull, - self.depth.single_sample_mipped, - CameraSpecifier::Viewport, - "Primary Culling", - ); - } - /// Render all shadows for the PBR materials. pub fn pbr_shadow_rendering(&mut self) { - let iter = zip(&self.shadow_cull, &self.inputs.eval_output.shadows); - for (shadow_index, (shadow_cull, desc)) in iter.enumerate() { + for (shadow_index, desc) in self.inputs.eval_output.shadows.iter().enumerate() { let target = self.shadow.set_viewport(ViewportRect::new(desc.map.offset, UVec2::splat(desc.map.size))); let renderpass = graph::RenderPassTargets { targets: vec![], @@ -387,7 +300,6 @@ impl<'a, 'node> BaseRenderGraphIntermediateState<'a, 'node> { per_material_bgl: &self.inputs.routines.pbr.per_material, extra_bgs: None, }, - culling_source: forward::CullingSource::Residual(*shadow_cull), samples: SampleCount::One, renderpass: renderpass.clone(), }); @@ -408,39 +320,18 @@ impl<'a, 'node> BaseRenderGraphIntermediateState<'a, 'node> { } /// Render the PBR materials. - pub fn pbr_render_opaque_predicted_triangles(&mut self) { + pub fn pbr_render(&mut self) { let routines = [&self.inputs.routines.pbr.opaque_routine, &self.inputs.routines.pbr.cutout_routine]; for routine in routines { routine.add_forward_to_graph(ForwardRoutineArgs { graph: self.graph, - label: "PBR Forward Pass 1", + label: "PBR Forward Pass", camera: CameraSpecifier::Viewport, binding_data: forward::ForwardRoutineBindingData { whole_frame_uniform_bg: self.forward_uniform_bg, per_material_bgl: &self.inputs.routines.pbr.per_material, extra_bgs: None, }, - culling_source: forward::CullingSource::Predicted, - samples: self.inputs.target.samples, - renderpass: self.primary_renderpass.clone(), - }); - } - } - - /// Render the PBR materials. - pub fn pbr_render_opaque_residual_triangles(&mut self) { - let routines = [&self.inputs.routines.pbr.opaque_routine, &self.inputs.routines.pbr.cutout_routine]; - for routine in routines { - routine.add_forward_to_graph(ForwardRoutineArgs { - graph: self.graph, - label: "PBR Forward Pass 2", - camera: CameraSpecifier::Viewport, - binding_data: forward::ForwardRoutineBindingData { - whole_frame_uniform_bg: self.forward_uniform_bg, - per_material_bgl: &self.inputs.routines.pbr.per_material, - extra_bgs: None, - }, - culling_source: forward::CullingSource::Residual(self.cull), samples: self.inputs.target.samples, renderpass: self.primary_renderpass.clone(), }); @@ -458,16 +349,11 @@ impl<'a, 'node> BaseRenderGraphIntermediateState<'a, 'node> { per_material_bgl: &self.inputs.routines.pbr.per_material, extra_bgs: None, }, - culling_source: forward::CullingSource::Residual(self.cull), samples: self.inputs.target.samples, renderpass: self.primary_renderpass.clone(), }); } - pub fn hi_z(&mut self) { - self.inputs.routines.pbr.hi_z.add_hi_z_to_graph(self.graph, self.depth, self.inputs.target.resolution); - } - /// Tonemap onto the given render target. pub fn tonemapping(&mut self) { self.inputs.routines.tonemapping.add_to_graph( diff --git a/rend3-routine/src/common/interfaces.rs b/rend3-routine/src/common/interfaces.rs index 9361ce22..2efe84de 100644 --- a/rend3-routine/src/common/interfaces.rs +++ b/rend3-routine/src/common/interfaces.rs @@ -1,5 +1,6 @@ use std::{marker::PhantomData, mem, num::NonZeroU64}; +use encase::ShaderType; use glam::{Mat4, Vec3}; use rend3::{ managers::{DirectionalLightManager, PointLightManager}, @@ -10,7 +11,10 @@ use wgpu::{ BindGroupLayout, BindingType, BufferBindingType, Device, ShaderStages, TextureSampleType, TextureViewDimension, }; -use crate::{common::samplers::Samplers, uniforms::FrameUniforms}; +use crate::{ + common::samplers::Samplers, + uniforms::{FrameUniforms, PerCameraUniform}, +}; /// Interfaces which are used throughout the whole frame. /// @@ -99,16 +103,6 @@ impl PerMaterialArchetypeInterface { }, None, ) - // Batch data buffer - .append( - ShaderStages::VERTEX_FRAGMENT, - BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: true }, - has_dynamic_offset: true, - min_binding_size: None, - }, - None, - ) // Vertex buffer .append( ShaderStages::VERTEX_FRAGMENT, @@ -125,11 +119,11 @@ impl PerMaterialArchetypeInterface { BindingType::Buffer { ty: BufferBindingType::Storage { read_only: true }, has_dynamic_offset: false, - min_binding_size: None, + min_binding_size: Some(PerCameraUniform::min_size()), }, None, ) - // Mateiral data + // Material data .append( ShaderStages::VERTEX_FRAGMENT, BindingType::Buffer { diff --git a/rend3-routine/src/culling/batching.rs b/rend3-routine/src/culling/batching.rs deleted file mode 100644 index 9858dee5..00000000 --- a/rend3-routine/src/culling/batching.rs +++ /dev/null @@ -1,250 +0,0 @@ -use std::{cmp::Ordering, collections::HashMap}; - -use encase::ShaderType; -use ordered_float::OrderedFloat; -use rend3::{ - graph::NodeExecutionContext, - managers::{CameraState, TextureBindGroupIndex}, - types::{GraphDataHandle, Material, RawObjectHandle, SortingOrder, SortingReason}, - util::{math::round_up, typedefs::FastHashMap}, -}; - -use crate::common::CameraSpecifier; - -use super::{BATCH_SIZE, WORKGROUP_SIZE}; - -#[derive(Debug)] -pub struct ShaderBatchDatas { - pub(super) regions: Vec, - pub(super) jobs: Vec, -} - -#[derive(Debug)] -pub(super) struct JobSubRegion { - pub job_index: u32, - pub key: ShaderJobKey, -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub(super) struct ShaderJobKey { - pub material_key: u64, - pub bind_group_index: TextureBindGroupIndex, -} - -#[derive(Debug, Clone, Copy, Eq)] -pub(super) struct ShaderJobSortingKey { - pub job_key: ShaderJobKey, - pub distance: OrderedFloat, - pub sorting_reason: SortingReason, -} - -impl PartialEq for ShaderJobSortingKey { - fn eq(&self, other: &Self) -> bool { - self.cmp(other).is_eq() - } -} - -impl PartialOrd for ShaderJobSortingKey { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for ShaderJobSortingKey { - fn cmp(&self, other: &Self) -> Ordering { - // material key always first - match self.job_key.material_key.cmp(&other.job_key.material_key) { - Ordering::Equal => {} - ord => return ord, - } - match self.sorting_reason.cmp(&other.sorting_reason) { - Ordering::Equal => {} - ord => return ord, - } - // The above comparison means that both sides are equal - if self.sorting_reason == SortingReason::Requirement { - match self.distance.cmp(&other.distance) { - Ordering::Equal => {} - ord => return ord, - } - self.job_key.bind_group_index.cmp(&other.job_key.bind_group_index) - } else { - match self.job_key.bind_group_index.cmp(&other.job_key.bind_group_index) { - Ordering::Equal => {} - ord => return ord, - } - self.distance.cmp(&other.distance) - } - } -} - -#[derive(Debug, ShaderType)] -pub struct ShaderBatchData { - #[align(256)] - pub(super) total_objects: u32, - pub(super) total_invocations: u32, - pub(super) batch_base_invocation: u32, - pub(super) object_culling_information: [ShaderObjectCullingInformation; BATCH_SIZE], -} - -#[derive(Debug, Copy, Clone, Default, ShaderType)] -pub(super) struct ShaderObjectCullingInformation { - pub invocation_start: u32, - pub invocation_end: u32, - pub object_id: u32, - pub region_id: u32, - pub base_region_invocation: u32, - pub local_region_id: u32, - pub previous_global_invocation: u32, - pub atomic_capable: u32, -} - -/// Map containing the previous invocation of each object. -pub struct PerCameraPreviousInvocationsMap { - inner: FastHashMap>, -} -impl PerCameraPreviousInvocationsMap { - pub fn new() -> Self { - Self { inner: HashMap::default() } - } - - pub fn get_and_reset_camera(&mut self, camera: CameraSpecifier) -> FastHashMap { - self.inner.remove(&camera).unwrap_or_default() - } - - pub fn set_camera(&mut self, camera: CameraSpecifier, previous_invocations: FastHashMap) { - self.inner.insert(camera, previous_invocations); - } -} - -pub(super) fn batch_objects( - ctx: &mut NodeExecutionContext, - previous_invocation_map_handle: &GraphDataHandle, - camera: &CameraState, - camera_specifier: CameraSpecifier, -) -> ShaderBatchDatas { - profiling::scope!("Batch Objects"); - - let mut per_camera_previous_invocation_map = ctx.data_core.graph_storage.get_mut(previous_invocation_map_handle); - let previous_invocation_map = per_camera_previous_invocation_map.get_and_reset_camera(camera_specifier); - let mut current_invocation_map = FastHashMap::default(); - - let mut jobs = ShaderBatchDatas { jobs: Vec::new(), regions: Vec::new() }; - - let objects = match ctx.data_core.object_manager.enumerated_objects::() { - Some(o) => o, - None => return jobs, - }; - - let material_archetype = ctx.data_core.material_manager.archetype_view::(); - - let mut sorted_objects = Vec::with_capacity(objects.len()); - { - profiling::scope!("Sort Key Creation"); - for (handle, object) in objects { - // Frustum culling - if !camera.world_frustum().contains_sphere(object.inner.bounding_sphere) { - continue; - } - - let material = material_archetype.material(*object.material_handle); - let bind_group_index = material.bind_group_index.map_gpu(|_| TextureBindGroupIndex::DUMMY).into_common(); - - let material_key = material.inner.key(); - let sorting = material.inner.sorting(); - - let mut distance_sq = - ctx.data_core.viewport_camera_state.location().distance_squared(object.location.into()); - if sorting.order == SortingOrder::BackToFront { - distance_sq = -distance_sq; - } - sorted_objects.push(( - ShaderJobSortingKey { - job_key: ShaderJobKey { material_key, bind_group_index }, - distance: OrderedFloat(distance_sq), - sorting_reason: sorting.reason, - }, - handle, - object, - )) - } - } - - { - profiling::scope!("Sorting"); - sorted_objects.sort_unstable_by_key(|(k, _, _)| *k); - } - - if !sorted_objects.is_empty() { - profiling::scope!("Batch Data Creation"); - let mut current_region_idx = 0_u32; - let mut current_region_object_index = 0_u32; - let mut current_base_invocation = 0_u32; - let mut current_region_invocation = 0_u32; - let mut current_invocation = 0_u32; - let mut current_object_index = 0_u32; - let mut current_ranges = [ShaderObjectCullingInformation::default(); BATCH_SIZE]; - let mut current_key = sorted_objects.first().unwrap().0.job_key; - - let max_dispatch_count = ctx.renderer.limits.max_compute_workgroups_per_dimension; - - for (ShaderJobSortingKey { job_key: key, sorting_reason, .. }, handle, object) in sorted_objects { - let invocation_count = object.inner.index_count / 3; - - let key_difference = key != current_key; - let object_limit = current_object_index == BATCH_SIZE as u32; - let dispatch_limit = (current_invocation + invocation_count) >= max_dispatch_count * WORKGROUP_SIZE; - - if key_difference || object_limit || dispatch_limit { - jobs.regions.push(JobSubRegion { job_index: jobs.jobs.len() as u32, key: current_key }); - current_region_idx += 1; - current_key = key; - current_region_object_index = 0; - current_region_invocation = current_invocation; - } - if object_limit || dispatch_limit { - jobs.jobs.push(ShaderBatchData { - object_culling_information: current_ranges, - total_objects: current_object_index, - total_invocations: current_invocation, - batch_base_invocation: current_base_invocation, - }); - - current_base_invocation += current_invocation; - current_invocation = 0; - current_region_invocation = 0; - current_object_index = 0; - } - - let range = ShaderObjectCullingInformation { - invocation_start: current_invocation, - invocation_end: current_invocation + invocation_count, - region_id: current_region_idx, - object_id: handle.idx as u32, - base_region_invocation: current_region_invocation, - local_region_id: current_region_object_index, - previous_global_invocation: previous_invocation_map.get(&handle).copied().unwrap_or(u32::MAX), - atomic_capable: matches!(sorting_reason, SortingReason::Optimization) as u32, - }; - - current_invocation_map.insert(handle, current_invocation + current_base_invocation); - - current_ranges[current_object_index as usize] = range; - current_object_index += 1; - current_region_object_index += 1; - current_invocation += round_up(invocation_count, WORKGROUP_SIZE); - } - - jobs.regions.push(JobSubRegion { job_index: jobs.jobs.len() as u32, key: current_key }); - jobs.jobs.push(ShaderBatchData { - object_culling_information: current_ranges, - total_objects: current_object_index, - total_invocations: current_invocation, - batch_base_invocation: current_base_invocation, - }); - } - - per_camera_previous_invocation_map.set_camera(camera_specifier, current_invocation_map); - - jobs -} diff --git a/rend3-routine/src/culling/culler.rs b/rend3-routine/src/culling/culler.rs deleted file mode 100644 index 58fbce21..00000000 --- a/rend3-routine/src/culling/culler.rs +++ /dev/null @@ -1,714 +0,0 @@ -use std::{ - any::{type_name, TypeId}, - borrow::Cow, - collections::{hash_map::Entry, HashMap}, - num::NonZeroU64, - ops::Range, - sync::Arc, -}; - -use encase::{ShaderSize, ShaderType, StorageBuffer}; -use glam::{Mat4, UVec2, Vec2}; -use rend3::{ - format_sso, - graph::{DataHandle, DeclaredDependency, NodeExecutionContext, NodeResourceUsage, RenderGraph, RenderTargetHandle}, - managers::{CameraState, ShaderObject, TextureBindGroupIndex}, - types::{GraphDataHandle, Material, MaterialArray, SampleCount, VERTEX_ATTRIBUTE_POSITION}, - util::{frustum::Frustum, math::IntegerExt, typedefs::FastHashMap}, - Renderer, ShaderPreProcessor, ShaderVertexBufferConfig, -}; -use wgpu::{ - self, AddressMode, BindGroupDescriptor, BindGroupEntry, BindGroupLayout, BindGroupLayoutDescriptor, - BindGroupLayoutEntry, BindingResource, BindingType, Buffer, BufferBinding, BufferBindingType, BufferDescriptor, - BufferUsages, CommandEncoder, ComputePassDescriptor, ComputePipeline, ComputePipelineDescriptor, Device, - FilterMode, PipelineLayoutDescriptor, Queue, Sampler, SamplerBindingType, SamplerDescriptor, - ShaderModuleDescriptor, ShaderStages, TextureSampleType, TextureViewDimension, -}; - -use crate::{ - common::CameraSpecifier, - culling::{ - batching::{batch_objects, JobSubRegion, PerCameraPreviousInvocationsMap, ShaderBatchData, ShaderBatchDatas}, - suballoc::InputOutputBuffer, - WORKGROUP_SIZE, - }, -}; - -#[derive(Debug)] -pub struct DrawCallSet { - pub culling_data_buffer: Buffer, - pub per_camera_uniform: Arc, - pub draw_calls: Vec, - /// Range of draw calls in the draw call array corresponding to a given material key. - pub material_key_ranges: HashMap>, -} - -#[derive(Debug, Clone)] -pub struct DrawCall { - pub bind_group_index: TextureBindGroupIndex, - pub batch_index: u32, -} - -#[derive(Default)] -pub struct CullingBufferMap { - inner: FastHashMap, -} -impl CullingBufferMap { - pub fn get_buffers(&self, camera: CameraSpecifier) -> Option<&CullingBuffers> { - self.inner.get(&camera) - } - - fn get_or_resize_buffers( - &mut self, - queue: &Queue, - device: &Device, - encoder: &mut CommandEncoder, - camera: CameraSpecifier, - sizes: CullingBufferSizes, - ) -> &mut CullingBuffers { - match self.inner.entry(camera) { - Entry::Occupied(b) => { - let b = b.into_mut(); - - b.update_sizes(queue, device, encoder, sizes); - - b - } - Entry::Vacant(b) => b.insert(CullingBuffers::new(device, queue, sizes)), - } - } -} - -#[derive(Debug)] -struct CullingBufferSizes { - invocations: u64, - draw_calls: u64, -} - -#[derive(Debug)] -pub struct CullingBuffers { - pub index_buffer: InputOutputBuffer, - pub draw_call_buffer: InputOutputBuffer, - pub culling_results_buffer: InputOutputBuffer, -} - -impl CullingBuffers { - fn new(device: &Device, queue: &Queue, sizes: CullingBufferSizes) -> Self { - Self { - // One element per triangle/invocation - index_buffer: InputOutputBuffer::new(device, queue, sizes.invocations * 3, "Index Buffer", 4, 4, false), - draw_call_buffer: InputOutputBuffer::new(device, queue, sizes.draw_calls, "Draw Call Buffer", 20, 4, true), - culling_results_buffer: InputOutputBuffer::new( - device, - queue, - // 32 bits in a u32 - sizes.invocations.div_round_up(u32::BITS as _), - "Culling Results Buffer", - 4, - 4, - false, - ), - } - } - - fn update_sizes( - &mut self, - queue: &Queue, - device: &Device, - encoder: &mut CommandEncoder, - sizes: CullingBufferSizes, - ) { - self.index_buffer.swap(queue, device, encoder, sizes.invocations * 3); - self.draw_call_buffer.swap(queue, device, encoder, sizes.draw_calls); - self.culling_results_buffer.swap(queue, device, encoder, sizes.invocations.div_round_up(32)); - } -} - -#[derive(Debug, Copy, Clone)] -pub enum TriangleVisibility { - PositiveAreaVisible, - NegativeAreaVisible, -} - -impl TriangleVisibility { - fn from_winding_and_face(winding: wgpu::FrontFace, culling: wgpu::Face) -> Self { - match (winding, culling) { - (wgpu::FrontFace::Ccw, wgpu::Face::Back) => TriangleVisibility::PositiveAreaVisible, - (wgpu::FrontFace::Ccw, wgpu::Face::Front) => TriangleVisibility::NegativeAreaVisible, - (wgpu::FrontFace::Cw, wgpu::Face::Back) => TriangleVisibility::NegativeAreaVisible, - (wgpu::FrontFace::Cw, wgpu::Face::Front) => TriangleVisibility::PositiveAreaVisible, - } - } - - fn is_positive(self) -> bool { - match self { - TriangleVisibility::PositiveAreaVisible => true, - TriangleVisibility::NegativeAreaVisible => false, - } - } -} - -bitflags::bitflags! { - struct PerCameraUniformFlags: u32 { - const POSTIIVE_AREA_VISIBLE = 1 << 0; - const MULTISAMPLED = 1 << 1; - } -} - -#[derive(ShaderType)] -struct PerCameraUniform { - // TODO: use less space - view: Mat4, - // TODO: use less space - view_proj: Mat4, - // The index of which shadow caster we are rendering for. - // - // This will be u32::MAX if we're rendering for a camera, not a shadow map. - shadow_index: u32, - frustum: Frustum, - resolution: Vec2, - // Created from PerCameraUniformFlags - flags: u32, - object_count: u32, - #[size(runtime)] - objects: Vec, -} - -#[derive(ShaderType)] -struct PerCameraUniformObjectData { - // TODO: use less space - model_view: Mat4, - // TODO: use less space - model_view_proj: Mat4, -} - -pub struct GpuCuller { - prep_bgl: BindGroupLayout, - prep_pipeline: ComputePipeline, - culling_bgl: BindGroupLayout, - culling_pipeline: ComputePipeline, - sampler: Sampler, - winding: wgpu::FrontFace, - type_id: TypeId, - per_material_buffer_handle: GraphDataHandle>>, - pub culling_buffer_map_handle: GraphDataHandle, - previous_invocation_map_handle: GraphDataHandle, -} - -impl GpuCuller { - pub fn new(renderer: &Arc, spp: &ShaderPreProcessor) -> Self - where - M: Material, - { - let type_name = type_name::(); - - let prep_source = spp - .render_shader( - "rend3-routine/uniform_prep.wgsl", - &(), - Some(&ShaderVertexBufferConfig::from_material::()), - ) - .unwrap(); - - let prep_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { - label: Some(&format_sso!("UniformPrep {type_name} SM")), - source: wgpu::ShaderSource::Wgsl(Cow::Owned(prep_source)), - }); - - let prep_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { - label: Some(&format_sso!("UniformPrep {type_name} BGL")), - entries: &[ - // Object - BindGroupLayoutEntry { - binding: 0, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: true }, - has_dynamic_offset: false, - min_binding_size: Some(ShaderObject::::SHADER_SIZE), - }, - count: None, - }, - // Object - BindGroupLayoutEntry { - binding: 1, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: Some(PerCameraUniform::min_size()), - }, - count: None, - }, - ], - }); - - let prep_pll = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { - label: Some(&format_sso!("UniformPrep {type_name} PLL")), - bind_group_layouts: &[&prep_bgl], - push_constant_ranges: &[], - }); - - let prep_pipeline = renderer.device.create_compute_pipeline(&ComputePipelineDescriptor { - label: Some(&format_sso!("UniformPrep {type_name} PLL")), - layout: Some(&prep_pll), - module: &prep_sm, - entry_point: "cs_main", - }); - - let position_offset = M::supported_attributes() - .into_iter() - .enumerate() - .find_map(|(idx, a)| (*a == *VERTEX_ATTRIBUTE_POSITION).then_some(idx)) - .unwrap(); - - let culling_source = spp - .render_shader( - "rend3-routine/cull.wgsl", - &serde_json::json! {{ - "position_attribute_offset": position_offset, - }}, - Some(&ShaderVertexBufferConfig::from_material::()), - ) - .unwrap(); - - let culling_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} SM")), - source: wgpu::ShaderSource::Wgsl(Cow::Owned(culling_source)), - }); - - let culling_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} BGL")), - entries: &[ - // Vertex Buffer - BindGroupLayoutEntry { - binding: 0, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: true }, - has_dynamic_offset: false, - min_binding_size: NonZeroU64::new(4), - }, - count: None, - }, - // Object Buffer - BindGroupLayoutEntry { - binding: 1, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: true }, - has_dynamic_offset: false, - min_binding_size: Some(ShaderObject::::SHADER_SIZE), - }, - count: None, - }, - // Batch data - BindGroupLayoutEntry { - binding: 2, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: true }, - has_dynamic_offset: true, - min_binding_size: Some(ShaderBatchData::SHADER_SIZE), - }, - count: None, - }, - // Draw Calls - BindGroupLayoutEntry { - binding: 3, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: Some(NonZeroU64::new(20 + 8).unwrap()), - }, - count: None, - }, - // Index buffer - BindGroupLayoutEntry { - binding: 4, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: Some(NonZeroU64::new(4 + 8).unwrap()), - }, - count: None, - }, - // Culling Results - BindGroupLayoutEntry { - binding: 5, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: Some(NonZeroU64::new(4 + 8).unwrap()), - }, - count: None, - }, - // per camera uniforms - BindGroupLayoutEntry { - binding: 6, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Buffer { - ty: BufferBindingType::Storage { read_only: true }, - has_dynamic_offset: false, - min_binding_size: Some(PerCameraUniform::min_size()), - }, - count: None, - }, - // hirearchical z buffer - BindGroupLayoutEntry { - binding: 7, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Texture { - sample_type: TextureSampleType::Depth, - view_dimension: TextureViewDimension::D2, - multisampled: false, - }, - count: None, - }, - // hirearchical z buffer - BindGroupLayoutEntry { - binding: 8, - visibility: ShaderStages::COMPUTE, - ty: BindingType::Sampler(SamplerBindingType::NonFiltering), - count: None, - }, - ], - }); - - let culling_pll = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} PLL")), - bind_group_layouts: &[&culling_bgl], - push_constant_ranges: &[], - }); - - let culling_pipeline = renderer.device.create_compute_pipeline(&ComputePipelineDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} PLL")), - layout: Some(&culling_pll), - module: &culling_sm, - entry_point: "cs_main", - }); - - let sampler = renderer.device.create_sampler(&SamplerDescriptor { - label: Some("HiZ Sampler"), - address_mode_u: AddressMode::ClampToEdge, - address_mode_v: AddressMode::ClampToEdge, - address_mode_w: AddressMode::ClampToEdge, - mag_filter: FilterMode::Nearest, - min_filter: FilterMode::Nearest, - mipmap_filter: FilterMode::Nearest, - lod_min_clamp: 0.0, - lod_max_clamp: 100.0, - compare: None, - anisotropy_clamp: 1, - border_color: None, - }); - - let per_material_buffer_handle = renderer.add_graph_data(HashMap::default()); - let culling_buffer_map_handle = renderer.add_graph_data(CullingBufferMap::default()); - let previous_invocation_map_handle = renderer.add_graph_data(PerCameraPreviousInvocationsMap::new()); - - Self { - prep_bgl, - prep_pipeline, - culling_bgl, - culling_pipeline, - sampler, - winding: renderer.handedness.into(), - type_id: TypeId::of::(), - per_material_buffer_handle, - culling_buffer_map_handle, - previous_invocation_map_handle, - } - } - - pub fn object_uniform_upload( - &self, - ctx: &mut NodeExecutionContext, - camera: &CameraState, - camera_specifier: CameraSpecifier, - resolution: UVec2, - samples: SampleCount, - ) where - M: Material, - { - profiling::scope!("GpuCuller::object_uniform_upload"); - - assert_eq!(TypeId::of::(), self.type_id); - - let type_name = type_name::(); - - let encoder = ctx.encoder_or_pass.take_encoder(); - - // TODO: Isolate all this into a struct - let max_object_count = ctx.data_core.object_manager.buffer::().map(wgpu::Buffer::size).unwrap_or(0) - / ShaderObject::::SHADER_SIZE.get(); - - if max_object_count == 0 { - return; - } - - let per_map_buffer_size = ((max_object_count - 1) * PerCameraUniformObjectData::SHADER_SIZE.get()) - + PerCameraUniform::min_size().get(); - - let mut per_mat_buffer_map = ctx.data_core.graph_storage.get_mut(&self.per_material_buffer_handle); - - let new_per_mat_buffer = || { - Arc::new(ctx.renderer.device.create_buffer(&BufferDescriptor { - label: None, - size: per_map_buffer_size, - usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST, - mapped_at_creation: false, - })) - }; - let buffer = match per_mat_buffer_map.entry(camera_specifier) { - Entry::Occupied(o) => { - let r = o.into_mut(); - if r.size() != per_map_buffer_size { - *r = new_per_mat_buffer(); - } - r - } - Entry::Vacant(o) => o.insert(new_per_mat_buffer()), - }; - - let culling = match camera_specifier { - CameraSpecifier::Shadow(_) => wgpu::Face::Front, - CameraSpecifier::Viewport => wgpu::Face::Back, - }; - - { - // We don't write anything in the objects right now, as this will be filled in by the preparation compute shader - profiling::scope!("PerCameraUniform Data Upload"); - let per_camera_data = PerCameraUniform { - view: camera.view(), - view_proj: camera.view_proj(), - shadow_index: camera_specifier.to_shader_index(), - frustum: camera.world_frustum(), - resolution: resolution.as_vec2(), - flags: { - let mut flags = PerCameraUniformFlags::empty(); - flags.set( - PerCameraUniformFlags::POSTIIVE_AREA_VISIBLE, - TriangleVisibility::from_winding_and_face(self.winding, culling).is_positive(), - ); - flags.set(PerCameraUniformFlags::MULTISAMPLED, samples != SampleCount::One); - flags.bits() - }, - object_count: max_object_count as u32, - objects: Vec::new(), - }; - let mut buffer = ctx.renderer.queue.write_buffer_with(buffer, 0, per_camera_data.size()).unwrap(); - StorageBuffer::new(&mut *buffer).write(&per_camera_data).unwrap(); - } - - let Some(object_manager_buffer) = ctx.data_core.object_manager.buffer::() else { - return; - }; - let prep_bg = ctx.renderer.device.create_bind_group(&BindGroupDescriptor { - label: Some(&format_sso!("UniformPrep {type_name} BG")), - layout: &self.prep_bgl, - entries: &[ - BindGroupEntry { binding: 0, resource: object_manager_buffer.as_entire_binding() }, - BindGroupEntry { binding: 1, resource: buffer.as_entire_binding() }, - ], - }); - - profiling::scope!("Command Encoding"); - - let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} uniform bake")), - timestamp_writes: None, - }); - cpass.set_pipeline(&self.prep_pipeline); - cpass.set_bind_group(0, &prep_bg, &[]); - cpass.dispatch_workgroups((max_object_count as u32).div_round_up(WORKGROUP_SIZE), 1, 1); - drop(cpass); - } - - pub fn cull( - &self, - ctx: &mut NodeExecutionContext, - jobs: ShaderBatchDatas, - depth_handle: DeclaredDependency, - camera_specifier: CameraSpecifier, - ) -> DrawCallSet - where - M: Material, - { - profiling::scope!("GpuCuller::cull"); - - assert_eq!(TypeId::of::(), self.type_id); - - let type_name = type_name::(); - - let total_invocations: u32 = jobs - .jobs - .iter() - .map(|j: &ShaderBatchData| { - debug_assert_eq!(j.total_invocations % WORKGROUP_SIZE, 0); - j.total_invocations - }) - .sum(); - - let encoder = ctx.encoder_or_pass.take_encoder(); - - let mut culling_buffer_map = ctx.data_core.graph_storage.get_mut(&self.culling_buffer_map_handle); - let buffers = culling_buffer_map.get_or_resize_buffers( - &ctx.renderer.queue, - &ctx.renderer.device, - encoder, - camera_specifier, - CullingBufferSizes { invocations: total_invocations as u64, draw_calls: jobs.regions.len() as u64 }, - ); - - let per_camera_uniform = Arc::clone( - ctx.data_core - .graph_storage - .get_mut(&self.per_material_buffer_handle) - .get(&camera_specifier) - .unwrap_or_else(|| panic!("No per camera uniform for camera {:?}", camera_specifier)), - ); - - let culling_data_buffer = { - profiling::scope!("Culling Job Data Upload"); - - let culling_data_buffer = ctx.renderer.device.create_buffer(&wgpu::BufferDescriptor { - label: Some("Culling Data Buffer"), - size: jobs.jobs.size().get(), - usage: wgpu::BufferUsages::STORAGE, - mapped_at_creation: true, - }); - - let mut mapping = culling_data_buffer.slice(..).get_mapped_range_mut(); - StorageBuffer::new(&mut *mapping).write(&jobs.jobs).unwrap(); - drop(mapping); - culling_data_buffer.unmap(); - - culling_data_buffer - }; - - let hi_z_buffer = ctx.graph_data.get_render_target(depth_handle); - - let culling_bg = ctx.renderer.device.create_bind_group(&BindGroupDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} BG")), - layout: &self.culling_bgl, - entries: &[ - BindGroupEntry { binding: 0, resource: ctx.eval_output.mesh_buffer.as_entire_binding() }, - BindGroupEntry { - binding: 1, - resource: ctx.data_core.object_manager.buffer::().unwrap().as_entire_binding(), - }, - BindGroupEntry { - binding: 2, - resource: BindingResource::Buffer(BufferBinding { - buffer: &culling_data_buffer, - offset: 0, - size: Some(ShaderBatchData::SHADER_SIZE), - }), - }, - BindGroupEntry { binding: 3, resource: buffers.draw_call_buffer.as_entire_binding() }, - BindGroupEntry { binding: 4, resource: buffers.index_buffer.as_entire_binding() }, - BindGroupEntry { binding: 5, resource: buffers.culling_results_buffer.as_entire_binding() }, - BindGroupEntry { binding: 6, resource: per_camera_uniform.as_entire_binding() }, - BindGroupEntry { binding: 7, resource: BindingResource::TextureView(hi_z_buffer) }, - BindGroupEntry { binding: 8, resource: BindingResource::Sampler(&self.sampler) }, - ], - }); - - profiling::scope!("Command Encoding"); - let mut draw_calls = Vec::with_capacity(jobs.jobs.len()); - let mut material_key_ranges = HashMap::new(); - - let mut current_material_key_range_start = 0; - let mut current_material_key = jobs.regions.first().map(|k| k.key.material_key).unwrap_or(0); - for region in jobs.regions { - let region: JobSubRegion = region; - - if current_material_key != region.key.material_key { - let range_end = draw_calls.len(); - material_key_ranges.insert(current_material_key, current_material_key_range_start..range_end); - current_material_key = region.key.material_key; - current_material_key_range_start = range_end; - } - - draw_calls.push(DrawCall { bind_group_index: region.key.bind_group_index, batch_index: region.job_index }); - } - - material_key_ranges.insert(current_material_key, current_material_key_range_start..draw_calls.len()); - - encoder.clear_buffer(&buffers.draw_call_buffer, 8, None); - let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} Culling")), - timestamp_writes: None, - }); - - cpass.set_pipeline(&self.culling_pipeline); - for (idx, job) in jobs.jobs.iter().enumerate() { - // RA can't infer this - let job: &ShaderBatchData = job; - - cpass.set_bind_group(0, &culling_bg, &[idx as u32 * ShaderBatchData::SHADER_SIZE.get() as u32]); - cpass.dispatch_workgroups(job.total_invocations.div_round_up(WORKGROUP_SIZE), 1, 1); - } - drop(cpass); - - DrawCallSet { culling_data_buffer, per_camera_uniform, draw_calls, material_key_ranges } - } - - pub fn add_object_uniform_upload_to_graph<'node, M: Material>( - &'node self, - graph: &mut RenderGraph<'node>, - camera_specifier: CameraSpecifier, - resolution: UVec2, - samples: SampleCount, - name: &str, - ) { - let mut node = graph.add_node(name); - node.add_side_effect(); - - node.build(move |mut ctx| { - let camera = match camera_specifier { - CameraSpecifier::Shadow(i) => &ctx.eval_output.shadows[i as usize].camera, - CameraSpecifier::Viewport => &ctx.data_core.viewport_camera_state, - }; - - self.object_uniform_upload::(&mut ctx, camera, camera_specifier, resolution, samples); - }); - } - - pub fn add_culling_to_graph<'node, M: Material>( - &'node self, - graph: &mut RenderGraph<'node>, - draw_calls_hdl: DataHandle>, - depth_handle: RenderTargetHandle, - camera_specifier: CameraSpecifier, - name: &str, - ) { - let mut node = graph.add_node(name); - let output = node.add_data(draw_calls_hdl, NodeResourceUsage::Output); - let depth_handle = node.add_render_target( - depth_handle, - if camera_specifier.is_shadow() { NodeResourceUsage::Reference } else { NodeResourceUsage::Input }, - ); - - node.build(move |mut ctx| { - let camera = match camera_specifier { - CameraSpecifier::Shadow(i) => &ctx.eval_output.shadows[i as usize].camera, - CameraSpecifier::Viewport => &ctx.data_core.viewport_camera_state, - }; - - let jobs = batch_objects::(&mut ctx, &self.previous_invocation_map_handle, camera, camera_specifier); - - if jobs.jobs.is_empty() { - return; - } - - let draw_calls = self.cull::(&mut ctx, jobs, depth_handle, camera_specifier); - - ctx.graph_data.set_data(output, Some(Arc::new(draw_calls))); - }); - } -} diff --git a/rend3-routine/src/culling/mod.rs b/rend3-routine/src/culling/mod.rs deleted file mode 100644 index 3ff0b355..00000000 --- a/rend3-routine/src/culling/mod.rs +++ /dev/null @@ -1,10 +0,0 @@ -const BATCH_SIZE: usize = 256; -const WORKGROUP_SIZE: u32 = 256; - -mod batching; -mod culler; -mod suballoc; - -pub use batching::{ShaderBatchData, ShaderBatchDatas}; -pub use culler::{CullingBufferMap, DrawCall, DrawCallSet, GpuCuller}; -pub use suballoc::{InputOutputBuffer, InputOutputPartition}; diff --git a/rend3-routine/src/culling/suballoc.rs b/rend3-routine/src/culling/suballoc.rs deleted file mode 100644 index 6079dcda..00000000 --- a/rend3-routine/src/culling/suballoc.rs +++ /dev/null @@ -1,223 +0,0 @@ -use std::{ - ops::{Deref, Range}, - sync::Arc, -}; - -use encase::{internal::WriteInto, ShaderType, StorageBuffer}; -use rend3::util::{math::IntegerExt, typedefs::SsoString}; -use wgpu::CommandEncoder; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub enum InputOutputPartition { - Input, - Output, -} - -#[derive(Debug)] -pub struct InputOutputBuffer { - /// Label for the buffer - label: SsoString, - /// Current active buffer - buffer: Arc, - /// Amount of elements reserved in the buffer for data, not including the header. - capacity_elements: u64, - /// Size of output partition - output_partition_elements: u64, - /// Size of input partition - input_partition_elements: u64, - /// When false, output partition is comes first. - /// - /// When true, input partition comes first. - flipped: bool, - /// Clear on swap - /// - /// When true, the data in both partitions will be cleared when the buffer - /// is swapped. - clear_on_swap: bool, - /// The size of each element in the buffer. This allows the user to provide sizes in element counts only. - /// - /// Must be a multiple of `element_alignment`. - element_size: u64, - /// Size of the header, including padding. - padded_header_size: u64, -} - -impl Deref for InputOutputBuffer { - type Target = Arc; - - fn deref(&self) -> &Self::Target { - &self.buffer - } -} - -impl InputOutputBuffer { - const USAGES: wgpu::BufferUsages = wgpu::BufferUsages::STORAGE - .union(wgpu::BufferUsages::COPY_DST) - .union(wgpu::BufferUsages::COPY_SRC) - .union(wgpu::BufferUsages::INDEX) - .union(wgpu::BufferUsages::INDIRECT); - - // The size of the header, including padding - fn padded_header_size(element_alignment: u64) -> u64 { - const HEADER_SIZE: u64 = 8; - HEADER_SIZE.round_up(element_alignment) - } - - fn capacity_elements(input_partition_elements: u64, output_partition_elements: u64) -> u64 { - let max = input_partition_elements.max(output_partition_elements); - max.next_power_of_two() * 2 - } - - fn buffer_size(padded_header_size: u64, capacity_elements: u64, element_size: u64) -> u64 { - capacity_elements * element_size + padded_header_size - } - - pub fn new( - device: &wgpu::Device, - queue: &wgpu::Queue, - partition_elements: u64, - label: &str, - element_size: u64, - element_alignment: u64, - clear_on_swap: bool, - ) -> Self { - let element_size = element_size.round_up(element_alignment); - let capacity_elements = Self::capacity_elements(partition_elements, partition_elements); - let padded_header_size = Self::padded_header_size(element_alignment); - let buffer_length = Self::buffer_size(padded_header_size, capacity_elements, element_size); - - let buffer = Arc::new(device.create_buffer(&wgpu::BufferDescriptor { - label: Some(label), - size: buffer_length, - usage: Self::USAGES, - mapped_at_creation: false, - })); - - let this = Self { - label: SsoString::from(label), - buffer, - capacity_elements, - output_partition_elements: partition_elements, - input_partition_elements: partition_elements, - flipped: false, - clear_on_swap, - element_size, - padded_header_size, - }; - - this.write_headers(queue); - - this - } - - /// Returns the offset in bytes for a given element in the given partition - pub fn element_offset(&self, partition: InputOutputPartition, element: u64) -> u64 { - let partition_offset = match partition { - InputOutputPartition::Input => self.input_partition_offset(), - InputOutputPartition::Output => self.output_partition_offset(), - }; - self.padded_header_size + partition_offset + element * self.element_size - } - - pub fn partition_slice(&self, partition: InputOutputPartition) -> Range { - let partition_offset = match partition { - InputOutputPartition::Input => self.input_partition_offset(), - InputOutputPartition::Output => self.output_partition_offset(), - }; - let partition_elements = match partition { - InputOutputPartition::Input => self.input_partition_elements, - InputOutputPartition::Output => self.output_partition_elements, - }; - let partition_size = partition_elements * self.element_size; - let slice_start = self.padded_header_size + partition_offset; - let slice_end: u64 = slice_start + partition_size; - slice_start..slice_end - } - - pub fn write_to_output(&self, queue: &wgpu::Queue, data: &T) { - assert_eq!(data.size().get(), self.output_partition_elements * self.element_size); - let mut mapping = queue - .write_buffer_with(&self.buffer, self.element_offset(InputOutputPartition::Output, 0), data.size()) - .unwrap(); - StorageBuffer::new(&mut *mapping).write(data).unwrap(); - drop(mapping); - } - - /// Returns the offset in bytes to get to the start of the output partition, not including the header. - fn output_partition_offset(&self) -> u64 { - if self.flipped { - (self.capacity_elements * self.element_size) / 2 - } else { - 0 - } - } - - /// Returns the offset in bytes to get to the start of the input partition, not including the header. - fn input_partition_offset(&self) -> u64 { - if self.flipped { - 0 - } else { - (self.capacity_elements * self.element_size) / 2 - } - } - - pub fn swap( - &mut self, - queue: &wgpu::Queue, - device: &wgpu::Device, - encoder: &mut CommandEncoder, - new_partition_elements: u64, - ) { - // Offset of the output partition in the old buffer. - let old_output_partition_offset = self.output_partition_offset(); - - // The output of last frame is now the input of this frame. - self.input_partition_elements = self.output_partition_elements; - // The new output is of the given size. - self.output_partition_elements = new_partition_elements; - // We're now flipped. - self.flipped = !self.flipped; - - // Gather a new data capcity - let new_capacity_elements = - Self::capacity_elements(self.input_partition_elements, self.output_partition_elements); - - if new_capacity_elements != self.capacity_elements { - // Set the capacity reserved - self.capacity_elements = new_capacity_elements; - let new_buffer = device.create_buffer(&wgpu::BufferDescriptor { - label: Some(&self.label), - size: Self::buffer_size(self.padded_header_size, new_capacity_elements, self.element_size), - usage: Self::USAGES, - mapped_at_creation: false, - }); - if !self.clear_on_swap { - // We copy the old output partition to the input partition of the new buffer. - // - // Note that we call output_partition_offset before we change any internal parameters, - // as we need the old buffer offsets. - encoder.copy_buffer_to_buffer( - &self.buffer, - old_output_partition_offset + self.padded_header_size, - &new_buffer, - self.input_partition_offset() + self.padded_header_size, - self.input_partition_elements * self.element_size, - ); - } - // We now set the new buffer. - self.buffer = Arc::new(new_buffer); - } else if self.clear_on_swap { - encoder.clear_buffer(&self.buffer, self.padded_header_size, None); - } - - self.write_headers(queue) - } - - fn write_headers(&self, queue: &wgpu::Queue) { - let offsets = [ - (self.output_partition_offset() / self.element_size) as u32, - (self.input_partition_offset() / self.element_size) as u32, - ]; - queue.write_buffer(&self.buffer, 0, bytemuck::cast_slice(&offsets)); - } -} diff --git a/rend3-routine/src/forward.rs b/rend3-routine/src/forward.rs index e45c9003..3b5f4d6d 100644 --- a/rend3-routine/src/forward.rs +++ b/rend3-routine/src/forward.rs @@ -5,11 +5,11 @@ use std::{marker::PhantomData, sync::Arc}; use arrayvec::ArrayVec; -use encase::ShaderSize; +use encase::{ShaderSize, StorageBuffer}; use rend3::{ - graph::{self, DataHandle, NodeResourceUsage, RenderGraph, RenderPassTargets}, - types::{GraphDataHandle, Material, SampleCount}, - util::{bind_merge::BindGroupBuilder, typedefs::FastHashMap}, + graph::{DataHandle, NodeResourceUsage, RenderGraph, RenderPassTargets}, + types::{Material, SampleCount}, + util::bind_merge::BindGroupBuilder, ProfileData, Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, }; use serde::Serialize; @@ -20,10 +20,8 @@ use wgpu::{ VertexState, }; -use crate::{ - common::{CameraSpecifier, PerMaterialArchetypeInterface, WholeFrameInterfaces}, - culling::{self, CullingBufferMap, DrawCall, DrawCallSet, InputOutputPartition}, -}; +use crate::common::{CameraSpecifier, PerMaterialArchetypeInterface, WholeFrameInterfaces}; +use crate::uniforms::PerCameraUniform; #[derive(Serialize)] struct ForwardPreprocessingArguments { @@ -44,44 +42,6 @@ pub struct ShaderModulePair<'a> { pub fs_module: &'a ShaderModule, } -enum DeclaredCullingOutput { - Predicted, - Residual(graph::DeclaredDependency>>), -} - -impl DeclaredCullingOutput { - /// Returns `true` if the declared culling output is [`Residual`]. - /// - /// [`Residual`]: DeclaredCullingOutput::Residual - #[must_use] - fn is_residual(&self) -> bool { - matches!(self, Self::Residual(..)) - } -} - -pub enum CullingSource { - /// We are rendering the first pass with the predicted triangles from last frame. - /// - /// This is used on the first pass. - Predicted, - /// We are rendering the second pass with the residual triangles from this frame. - /// - /// This is used when we are rendering the residual triangles from this frame, as part of the second pass. - Residual(DataHandle>), -} - -impl CullingSource { - fn add_inner_data(&self, builder: &mut graph::RenderGraphNodeBuilder<'_, '_>) -> DeclaredCullingOutput { - match self { - CullingSource::Predicted => DeclaredCullingOutput::Predicted, - CullingSource::Residual(handle) => { - let handle = builder.add_data(*handle, NodeResourceUsage::Input); - DeclaredCullingOutput::Residual(handle) - } - } - } -} - pub struct ForwardRoutineCreateArgs<'a, M> { pub name: &'a str, @@ -96,8 +56,6 @@ pub struct ForwardRoutineCreateArgs<'a, M> { pub routine_type: RoutineType, pub shaders: ShaderModulePair<'a>, - pub culling_buffer_map_handle: GraphDataHandle, - pub extra_bgls: &'a [&'a BindGroupLayout], #[allow(clippy::type_complexity)] pub descriptor_callback: Option<&'a dyn Fn(&mut RenderPipelineDescriptor<'_>, &mut [Option])>, @@ -126,7 +84,6 @@ pub struct ForwardRoutineArgs<'a, 'node, M> { pub binding_data: ForwardRoutineBindingData<'node, M>, /// Source of culling information, determines which triangles are rendered this pass. - pub culling_source: CullingSource, pub samples: SampleCount, pub renderpass: RenderPassTargets, } @@ -136,8 +93,6 @@ pub struct ForwardRoutine { pipeline_s1: RenderPipeline, pipeline_s4: RenderPipeline, material_key: u64, - culling_buffer_map_handle: GraphDataHandle, - draw_call_set_cache_handle: GraphDataHandle>>, _phantom: PhantomData, } impl ForwardRoutine { @@ -182,8 +137,6 @@ impl ForwardRoutine { pipeline_s1: build_forward_pipeline_inner(&pll, &args, SampleCount::One), pipeline_s4: build_forward_pipeline_inner(&pll, &args, SampleCount::Four), material_key: args.material_key, - draw_call_set_cache_handle: args.renderer.add_graph_data(FastHashMap::default()), - culling_buffer_map_handle: args.culling_buffer_map_handle, _phantom: PhantomData, } } @@ -192,73 +145,49 @@ impl ForwardRoutine { pub fn add_forward_to_graph<'node>(&'node self, args: ForwardRoutineArgs<'_, 'node, M>) { let mut builder = args.graph.add_node(args.label); - builder.add_side_effect(); - - let rpass_handle = builder.add_renderpass(args.renderpass.clone(), NodeResourceUsage::Output); + let rpass_handle = builder.add_renderpass(args.renderpass.clone(), NodeResourceUsage::InputOutput); let whole_frame_uniform_handle = builder.add_data(args.binding_data.whole_frame_uniform_bg, NodeResourceUsage::Input); - let culling_output_handle = args.culling_source.add_inner_data(&mut builder); builder.build(move |mut ctx| { let rpass = ctx.encoder_or_pass.take_rpass(rpass_handle); let whole_frame_uniform_bg = ctx.graph_data.get_data(ctx.temps, whole_frame_uniform_handle).unwrap(); - // We need to store the draw call set in a cache so that next frame's predicted pass can use it. - let mut draw_call_set_cache = ctx.data_core.graph_storage.get_mut(&self.draw_call_set_cache_handle); - - let draw_call_set = match culling_output_handle { - // We are rendering the second pass with the residual triangles from this frame. - DeclaredCullingOutput::Residual(handle) => { - // If there is no draw call set for this camera in the cache, there isn't actually anything to render. - let Some(draw_call_set) = ctx.graph_data.get_data(ctx.temps, handle) else { - return; - }; - - // As we're in the residual, we need to store the draw call set for the next frame. - draw_call_set_cache.insert(args.camera, Arc::clone(draw_call_set)); - - draw_call_set - } - // We are rendering the first pass with the predicted triangles from last frame. - DeclaredCullingOutput::Predicted => { - // If there is no draw call set for this camera in the cache, that means we have yet to actually render anything, - // so either no objects yet exist, or we are in the first frame, so we can bail out. - let Some(draw_call_set) = draw_call_set_cache.get(&args.camera) else { - return; - }; - - draw_call_set - } + let Some(objects) = ctx.data_core.object_manager.enumerated_objects::() else { + return; }; - let residual = culling_output_handle.is_residual() && args.camera.is_viewport(); - let culling_buffer_storage = ctx.data_core.graph_storage.get(&self.culling_buffer_map_handle); + let archetype_view = ctx.data_core.material_manager.archetype_view::(); - // If there are no culling buffers in storage yet, we are in the first frame. We depend on culling - // to render anything, so just bail at this point. - let Some(culling_buffers) = culling_buffer_storage.get_buffers(args.camera) else { - return; + let camera = match args.camera { + CameraSpecifier::Viewport => &ctx.data_core.viewport_camera_state, + CameraSpecifier::Shadow(idx) => &ctx.eval_output.shadows[idx as usize].camera, }; - // We need to actually clone ownership of the underlying buffers and add them to renderpass temps, - // so we can use them in the renderpass. - let index_buffer = ctx.temps.add(Arc::clone(&culling_buffers.index_buffer)); - let draw_call_buffer = ctx.temps.add(Arc::clone(&culling_buffers.draw_call_buffer)); + let per_camera_uniform_values = PerCameraUniform { + view: camera.view(), + view_proj: camera.view_proj(), + frustum: camera.world_frustum(), + object_count: objects.len() as u32, + }; - // When we're rendering the residual data, we are post buffer flip. We want to be rendering using the - // "input" partition, as this is the partition that all same-frame data is in. - let partition = if residual { InputOutputPartition::Input } else { InputOutputPartition::Output }; + let per_camera_uniform_buffer = ctx.temps.add(ctx.renderer.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("Per Camera Uniform"), + size: PerCameraUniform::SHADER_SIZE.get(), + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: true, + })); + let mut mapping = per_camera_uniform_buffer.slice(..).get_mapped_range_mut(); + StorageBuffer::new(&mut *mapping).write(&per_camera_uniform_values).unwrap(); + drop(mapping); + per_camera_uniform_buffer.unmap(); let per_material_bg = ctx.temps.add( BindGroupBuilder::new() .append_buffer(ctx.data_core.object_manager.buffer::().unwrap()) - .append_buffer_with_size( - &draw_call_set.culling_data_buffer, - culling::ShaderBatchData::SHADER_SIZE.get(), - ) .append_buffer(&ctx.eval_output.mesh_buffer) - .append_buffer(&draw_call_set.per_camera_uniform) + .append_buffer(per_camera_uniform_buffer) .append_buffer(ctx.data_core.material_manager.archetype_view::().buffer()) .build(&ctx.renderer.device, Some("Per-Material BG"), &args.binding_data.per_material_bgl.bgl), ); @@ -267,10 +196,7 @@ impl ForwardRoutine { SampleCount::One => &self.pipeline_s1, SampleCount::Four => &self.pipeline_s4, }; - rpass.set_index_buffer( - index_buffer.slice(culling_buffers.index_buffer.partition_slice(partition)), - IndexFormat::Uint32, - ); + rpass.set_index_buffer(ctx.eval_output.mesh_buffer.slice(..), IndexFormat::Uint32); rpass.set_pipeline(pipeline); rpass.set_bind_group(0, whole_frame_uniform_bg, &[]); if let Some(v) = args.binding_data.extra_bgs { @@ -282,34 +208,23 @@ impl ForwardRoutine { rpass.set_bind_group(2, bg, &[]); } - // If there are no draw calls for this material, just bail. - let Some(range) = draw_call_set.material_key_ranges.get(&self.material_key) else { - return; - }; - - for (range_relative_idx, call) in draw_call_set.draw_calls[range.clone()].iter().enumerate() { - // Help RA out - let call: &DrawCall = call; - // Add the base of the range to the index to get the actual index - let idx = range_relative_idx + range.start; + for (idx, object) in objects.into_iter() { + let material = archetype_view.material(*object.material_handle); + if material.inner.key() != self.material_key { + continue; + } // If we're in cpu driven mode, we need to update the texture bind group. if ctx.renderer.profile.is_cpu_driven() { - rpass.set_bind_group( - 2, - ctx.data_core.material_manager.texture_bind_group(call.bind_group_index), - &[], - ); + let texture_bind_group = material.bind_group_index.into_cpu(); + rpass.set_bind_group(2, ctx.data_core.material_manager.texture_bind_group(texture_bind_group), &[]); } - rpass.set_bind_group( - 1, - per_material_bg, - &[call.batch_index * culling::ShaderBatchData::SHADER_SIZE.get() as u32], - ); - rpass.draw_indexed_indirect( - draw_call_buffer, - culling_buffers.draw_call_buffer.element_offset(partition, idx as u64), - ); + rpass.set_bind_group(1, per_material_bg, &[]); + rpass.draw_indexed( + object.inner.first_index..object.inner.first_index + object.inner.index_count, + 0, + idx.idx as u32..idx.idx as u32 + 1, + ) } }); } diff --git a/rend3-routine/src/hi_z.rs b/rend3-routine/src/hi_z.rs deleted file mode 100644 index e75c5f8f..00000000 --- a/rend3-routine/src/hi_z.rs +++ /dev/null @@ -1,235 +0,0 @@ -use std::borrow::Cow; - -use glam::UVec2; -use rend3::{ - graph::{ - DeclaredDependency, NodeExecutionContext, NodeResourceUsage, RenderGraph, RenderPassDepthTarget, - RenderPassHandle, RenderPassTargets, RenderTargetHandle, ViewportRect, - }, - Renderer, ShaderPreProcessor, -}; -use wgpu::{ - BindGroupDescriptor, BindGroupEntry, BindGroupLayout, BindGroupLayoutDescriptor, BindGroupLayoutEntry, - BindingResource, BindingType, CompareFunction, DepthBiasState, DepthStencilState, Extent3d, FragmentState, - MultisampleState, PipelineLayoutDescriptor, PrimitiveState, RenderPipeline, RenderPipelineDescriptor, - ShaderModuleDescriptor, ShaderStages, StencilState, TextureDimension, TextureFormat, TextureSampleType, - TextureViewDimension, VertexState, -}; - -use crate::base::DepthTargets; - -pub struct HiZRoutine { - multisampled_bgl: BindGroupLayout, - single_sampled_bgl: BindGroupLayout, - downscale_pipeline: RenderPipeline, - resolve_pipeline: RenderPipeline, -} - -impl HiZRoutine { - pub fn new(renderer: &Renderer, spp: &ShaderPreProcessor) -> Self { - let resolve_source = spp - .render_shader("rend3-routine/resolve_depth_min.wgsl", &serde_json::json!({"SAMPLES": 4}), None) - .unwrap(); - let downscale_source = spp.render_shader("rend3-routine/hi_z.wgsl", &(), None).unwrap(); - - let resolve_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { - label: Some("HiZ Resolver"), - source: wgpu::ShaderSource::Wgsl(Cow::Owned(resolve_source)), - }); - let downscale_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { - label: Some("HiZ Downscaler"), - source: wgpu::ShaderSource::Wgsl(Cow::Owned(downscale_source)), - }); - - let multisampled_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { - label: Some("Multi Sample HiZ Texture BGL"), - entries: &[BindGroupLayoutEntry { - binding: 0, - visibility: ShaderStages::VERTEX_FRAGMENT, - ty: BindingType::Texture { - sample_type: TextureSampleType::Depth, - view_dimension: TextureViewDimension::D2, - multisampled: true, - }, - count: None, - }], - }); - - let single_sampled_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { - label: Some("Single Sample HiZ Texture BGL"), - entries: &[BindGroupLayoutEntry { - binding: 0, - visibility: ShaderStages::VERTEX_FRAGMENT, - ty: BindingType::Texture { - sample_type: TextureSampleType::Depth, - view_dimension: TextureViewDimension::D2, - multisampled: false, - }, - count: None, - }], - }); - - let resolve_pipline_layout = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { - label: Some("HiZ Resolve PLL"), - bind_group_layouts: &[&multisampled_bgl], - push_constant_ranges: &[], - }); - - let downscale_pipline_layout = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { - label: Some("HiZ Downscale PLL"), - bind_group_layouts: &[&single_sampled_bgl], - push_constant_ranges: &[], - }); - - let resolve_pipeline = renderer.device.create_render_pipeline(&RenderPipelineDescriptor { - label: Some("HiZ Resolve Pipeline"), - layout: Some(&resolve_pipline_layout), - vertex: VertexState { module: &resolve_sm, entry_point: "vs_main", buffers: &[] }, - primitive: PrimitiveState::default(), - depth_stencil: Some(DepthStencilState { - format: TextureFormat::Depth32Float, - depth_write_enabled: true, - depth_compare: CompareFunction::Always, - stencil: StencilState::default(), - bias: DepthBiasState::default(), - }), - multisample: MultisampleState::default(), - fragment: Some(FragmentState { module: &resolve_sm, entry_point: "fs_main", targets: &[] }), - multiview: None, - }); - - let downscale_pipeline = renderer.device.create_render_pipeline(&RenderPipelineDescriptor { - label: Some("HiZ Downscale Pipeline"), - layout: Some(&downscale_pipline_layout), - vertex: VertexState { module: &downscale_sm, entry_point: "vs_main", buffers: &[] }, - primitive: PrimitiveState::default(), - depth_stencil: Some(DepthStencilState { - format: TextureFormat::Depth32Float, - depth_write_enabled: true, - depth_compare: CompareFunction::Always, - stencil: StencilState::default(), - bias: DepthBiasState::default(), - }), - multisample: MultisampleState::default(), - fragment: Some(FragmentState { module: &downscale_sm, entry_point: "fs_main", targets: &[] }), - multiview: None, - }); - - Self { single_sampled_bgl, downscale_pipeline, multisampled_bgl, resolve_pipeline } - } - - pub fn resolve<'pass>( - &'pass self, - mut ctx: NodeExecutionContext<'_, 'pass, '_>, - renderpass_handle: DeclaredDependency, - source_handle: DeclaredDependency, - ) { - let rpass = ctx.encoder_or_pass.take_rpass(renderpass_handle); - let source = ctx.graph_data.get_render_target(source_handle); - - let bind_group = ctx.temps.add(ctx.renderer.device.create_bind_group(&BindGroupDescriptor { - label: Some("HiZ Resolve BG"), - layout: &self.multisampled_bgl, - entries: &[BindGroupEntry { binding: 0, resource: BindingResource::TextureView(source) }], - })); - - rpass.set_pipeline(&self.resolve_pipeline); - rpass.set_bind_group(0, bind_group, &[]); - rpass.draw(0..3, 0..1); - } - - pub fn downscale<'pass>( - &'pass self, - mut ctx: NodeExecutionContext<'_, 'pass, '_>, - renderpass_handle: DeclaredDependency, - source_handle: DeclaredDependency, - ) { - let rpass = ctx.encoder_or_pass.take_rpass(renderpass_handle); - let source = ctx.graph_data.get_render_target(source_handle); - - let bind_group = ctx.temps.add(ctx.renderer.device.create_bind_group(&BindGroupDescriptor { - label: Some("HiZ Bind Group Layout"), - layout: &self.single_sampled_bgl, - entries: &[BindGroupEntry { binding: 0, resource: BindingResource::TextureView(source) }], - })); - - rpass.set_pipeline(&self.downscale_pipeline); - rpass.set_bind_group(0, bind_group, &[]); - rpass.draw(0..3, 0..1); - } - - pub fn add_hi_z_to_graph<'node>( - &'node self, - graph: &mut RenderGraph<'node>, - depth_targets: DepthTargets, - resolution: UVec2, - ) { - let extent = Extent3d { width: resolution.x, height: resolution.y, depth_or_array_layers: 1 }; - let mips = extent.max_mips(TextureDimension::D2) as u8; - - // First we need to downscale the depth buffer to a single sample texture - // if we are doing multisampling. - if let Some(multi_sample) = depth_targets.multi_sample { - let mut node = graph.add_node("HiZ Resolve"); - - let source = node.add_render_target(multi_sample, NodeResourceUsage::Output); - - let rpass_handle = node.add_renderpass( - RenderPassTargets { - targets: vec![], - depth_stencil: Some(RenderPassDepthTarget { - target: depth_targets.single_sample_mipped.set_mips(0..1), - depth_clear: Some(0.0), - stencil_clear: None, - }), - }, - NodeResourceUsage::InputOutput, - ); - - node.add_side_effect(); - - node.build(move |ctx| { - self.resolve(ctx, rpass_handle, source); - }); - } - - for dst_mip in 1..mips { - let src_mip = dst_mip - 1; - - let mut node = graph.add_node(&format!("HiZ Mip {src_mip} -> {dst_mip}")); - - let dst_extent = extent.mip_level_size(dst_mip as u32, TextureDimension::D2); - let src_extent = extent.mip_level_size(src_mip as u32, TextureDimension::D2); - - let dst_target = depth_targets - .single_sample_mipped - .set_mips(dst_mip..dst_mip + 1) - .set_viewport(ViewportRect::from_size(UVec2::new(dst_extent.width, dst_extent.height))); - let src_target = node.add_render_target( - depth_targets - .single_sample_mipped - .set_mips(src_mip..src_mip + 1) - .set_viewport(ViewportRect::from_size(UVec2::new(src_extent.width, src_extent.height))), - NodeResourceUsage::Input, - ); - - let rpass_handle = node.add_renderpass( - RenderPassTargets { - targets: vec![], - depth_stencil: Some(RenderPassDepthTarget { - target: dst_target, - depth_clear: Some(0.0), - stencil_clear: None, - }), - }, - NodeResourceUsage::InputOutput, - ); - - node.add_side_effect(); - - node.build(move |ctx| { - self.downscale(ctx, rpass_handle, src_target); - }); - } - } -} diff --git a/rend3-routine/src/lib.rs b/rend3-routine/src/lib.rs index 9f13361b..eba4350f 100644 --- a/rend3-routine/src/lib.rs +++ b/rend3-routine/src/lib.rs @@ -21,9 +21,7 @@ pub mod base; pub mod clear; pub mod common; -pub mod culling; pub mod forward; -pub mod hi_z; pub mod pbr; mod shaders; pub mod skinning; diff --git a/rend3-routine/src/pbr/routine.rs b/rend3-routine/src/pbr/routine.rs index 54b52f97..e6e8df83 100644 --- a/rend3-routine/src/pbr/routine.rs +++ b/rend3-routine/src/pbr/routine.rs @@ -1,16 +1,12 @@ use std::{borrow::Cow, sync::Arc}; -use rend3::{ - types::GraphDataHandle, Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, ShaderVertexBufferConfig, -}; +use rend3::{Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, ShaderVertexBufferConfig}; use serde::Serialize; use wgpu::{BlendState, ShaderModuleDescriptor, ShaderSource}; use crate::{ common::{PerMaterialArchetypeInterface, WholeFrameInterfaces}, - culling::CullingBufferMap, forward::{ForwardRoutine, ForwardRoutineCreateArgs, RoutineType, ShaderModulePair}, - hi_z::HiZRoutine, pbr::{PbrMaterial, TransparencyType}, }; @@ -27,7 +23,6 @@ pub struct PbrRoutine { pub opaque_routine: ForwardRoutine, pub cutout_routine: ForwardRoutine, pub blend_routine: ForwardRoutine, - pub hi_z: HiZRoutine, pub per_material: PerMaterialArchetypeInterface, } @@ -37,7 +32,6 @@ impl PbrRoutine { data_core: &mut RendererDataCore, spp: &ShaderPreProcessor, interfaces: &WholeFrameInterfaces, - culling_buffer_map_handle: &GraphDataHandle, ) -> Self { profiling::scope!("PbrRenderRoutine::new"); @@ -117,7 +111,6 @@ impl PbrRoutine { targets[0].as_mut().unwrap().blend = Some(BlendState::ALPHA_BLENDING) } }), - culling_buffer_map_handle: culling_buffer_map_handle.clone(), }) }; @@ -127,7 +120,6 @@ impl PbrRoutine { opaque_routine: inner(RoutineType::Forward, &pbr_forward, TransparencyType::Opaque), cutout_routine: inner(RoutineType::Forward, &pbr_cutout, TransparencyType::Cutout), blend_routine: inner(RoutineType::Forward, &pbr_forward, TransparencyType::Blend), - hi_z: HiZRoutine::new(renderer, spp), per_material, } } diff --git a/rend3-routine/src/uniforms.rs b/rend3-routine/src/uniforms.rs index b21f9d95..cd6ace6f 100644 --- a/rend3-routine/src/uniforms.rs +++ b/rend3-routine/src/uniforms.rs @@ -12,6 +12,16 @@ use wgpu::{BindGroup, BufferUsages}; use crate::common::{Samplers, WholeFrameInterfaces}; +#[derive(ShaderType)] +pub struct PerCameraUniform { + // TODO: use less space + pub view: Mat4, + // TODO: use less space + pub view_proj: Mat4, + pub frustum: Frustum, + pub object_count: u32, +} + /// Set of uniforms that are useful for the whole frame. #[derive(Debug, Copy, Clone, ShaderType)] pub struct FrameUniforms { diff --git a/rend3-test/src/runner.rs b/rend3-test/src/runner.rs index ad915976..d28bd3d2 100644 --- a/rend3-test/src/runner.rs +++ b/rend3-test/src/runner.rs @@ -83,13 +83,7 @@ impl TestRunnerBuilder { let base_rendergraph = BaseRenderGraph::new(&renderer, &spp); - let pbr = PbrRoutine::new( - &renderer, - &mut renderer.data_core.lock(), - &spp, - &base_rendergraph.interfaces, - &base_rendergraph.gpu_culler.culling_buffer_map_handle, - ); + let pbr = PbrRoutine::new(&renderer, &mut renderer.data_core.lock(), &spp, &base_rendergraph.interfaces); let tonemapping = TonemappingRoutine::new(&renderer, &spp, &base_rendergraph.interfaces, TextureFormat::Rgba8UnormSrgb); diff --git a/rend3/shaders/vertex_attributes.wgsl b/rend3/shaders/vertex_attributes.wgsl index baa5bf52..58cb2f1b 100644 --- a/rend3/shaders/vertex_attributes.wgsl +++ b/rend3/shaders/vertex_attributes.wgsl @@ -5,26 +5,8 @@ struct Indices { vertex: u32, } -struct BatchIndices { - /// Index _within_ the batch - local_object: u32, - /// Vertex index within the object - vertex: u32, -} - const INVALID_VERTEX: u32 = 0x00FFFFFFu; -fn unpack_batch_index(vertex_index: u32) -> BatchIndices { - return BatchIndices( - vertex_index >> 24u, - vertex_index & 0xFFFFFFu, - ); -} - -fn pack_batch_index(local_object: u32, index: u32) -> u32 { - return (local_object << 24u) | (index & 0xFFFFFFu); -} - alias TriangleVertices = array; alias TriangleIndices = array; struct Triangle { @@ -32,14 +14,6 @@ struct Triangle { indices: TriangleIndices, } -fn pack_batch_indices(local_object: u32, indices: TriangleIndices) -> TriangleIndices { - return TriangleIndices( - pack_batch_index(local_object, indices[0]), - pack_batch_index(local_object, indices[1]), - pack_batch_index(local_object, indices[2]), - ); -} - fn extract_attribute_vec2_f32(byte_base_offset: u32, vertex_index: u32) -> vec2 { let first_element_idx = byte_base_offset / 4u + vertex_index * 2u; return vec2( diff --git a/rend3/src/shader.rs b/rend3/src/shader.rs index 9c8d51d0..ae2905bb 100644 --- a/rend3/src/shader.rs +++ b/rend3/src/shader.rs @@ -214,21 +214,9 @@ impl<'a> HelperDef for ShaderVertexBufferHelper<'a> { Some(s) => s, _ => Err(RenderErrorReason::Other("Vertex buffer helper's first argument must be a string".to_string()))?, }; - let batch_buffer_value = h - .param(1) - .ok_or_else(|| { - RenderErrorReason::Other( - "Vertex buffer helper must have an argument pointing to the buffer of batch data".to_string(), - ) - })? - .relative_path(); - let batch_buffer = match batch_buffer_value { - Some(s) => s, - _ => Err(RenderErrorReason::Other("Vertex buffer helper's second argument must be a string".to_string()))?, - }; let template = self - .generate_template(h, object_buffer, batch_buffer) + .generate_template(h, object_buffer) .map_err(|_| RenderErrorReason::Other("Failed to writeln vertex template string".to_string()))?; out.write(&r.render_template(&template, ctx.data())?)?; @@ -238,30 +226,17 @@ impl<'a> HelperDef for ShaderVertexBufferHelper<'a> { } impl<'a> ShaderVertexBufferHelper<'a> { - fn generate_template( - &self, - h: &Helper, - object_buffer: &str, - batch_buffer: &str, - ) -> Result { + fn generate_template(&self, h: &Helper, object_buffer: &str) -> Result { let includes = r#"{{include "rend3/vertex_attributes.wgsl"}}"#; - let unpack_function = format!( - " - fn unpack_vertex_index(vertex_index: u32) -> Indices {{ - let batch_indices = unpack_batch_index(vertex_index); - let object_id = {batch_buffer}.object_culling_information[batch_indices.local_object].object_id; - - return Indices(object_id, batch_indices.vertex); - }}" - ); + let unpack_function = String::new(); let mut input_struct = String::new(); writeln!(input_struct, "struct VertexInput {{")?; let mut input_function = String::new(); writeln!(input_function, "fn get_vertices(indices: Indices) -> VertexInput {{")?; writeln!(input_function, " var verts: VertexInput;")?; - for requested_attribute in &h.params()[2..] { + for requested_attribute in &h.params()[1..] { let (attr_idx, spec) = self .config .specs