diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs index 1086d3b..02ac7cb 100644 --- a/piet-gpu-hal/src/backend.rs +++ b/piet-gpu-hal/src/backend.rs @@ -225,6 +225,12 @@ pub trait CmdBuf { /// Prepare the timestamps for reading. This isn't required on Vulkan but /// is required on (at least) DX12. unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {} + + /// Begin a labeled section for debugging and profiling purposes. + unsafe fn begin_debug_label(&mut self, _label: &str) {} + + /// End a section opened by `begin_debug_label`. + unsafe fn end_debug_label(&mut self) {} } /// A builder for descriptor sets with more complex layouts. diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index 8c5926a..cc09832 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -598,6 +598,16 @@ impl CmdBuf { self.cmd_buf().finish_timestamps(pool); } + /// Begin a labeled section for debugging and profiling purposes. + pub unsafe fn begin_debug_label(&mut self, label: &str) { + self.cmd_buf().begin_debug_label(label); + } + + /// End a section opened by `begin_debug_label`. + pub unsafe fn end_debug_label(&mut self) { + self.cmd_buf().end_debug_label(); + } + /// Make sure the resource lives until the command buffer completes. /// /// The submitted command buffer will hold this reference until the corresponding diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs index 4a54e96..af1702d 100644 --- a/piet-gpu-hal/src/mux.rs +++ b/piet-gpu-hal/src/mux.rs @@ -772,6 +772,24 @@ impl CmdBuf { CmdBuf::Mtl(c) => c.finish_timestamps(pool.mtl()), } } + + /// Begin a labeled section for debugging and profiling purposes. + pub unsafe fn begin_debug_label(&mut self, label: &str) { + mux_match! { self; + CmdBuf::Vk(c) => c.begin_debug_label(label), + CmdBuf::Dx12(c) => c.begin_debug_label(label), + CmdBuf::Mtl(c) => c.begin_debug_label(label), + } + } + + /// End a section opened by `begin_debug_label`. + pub unsafe fn end_debug_label(&mut self) { + mux_match! 
{ self; + CmdBuf::Vk(c) => c.end_debug_label(), + CmdBuf::Dx12(c) => c.end_debug_label(), + CmdBuf::Mtl(c) => c.end_debug_label(), + } + } } impl Buffer { diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index 924e2d6..8392899 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use ash::extensions::{ext::DebugUtils, khr}; use ash::{vk, Device, Entry, Instance}; +use ash::vk::DebugUtilsLabelEXT; use smallvec::SmallVec; @@ -23,7 +24,7 @@ pub struct VkInstance { entry: Entry, instance: Instance, vk_version: u32, - _dbg_loader: Option<DebugUtils>, + dbg_loader: Option<DebugUtils>, _dbg_callbk: Option<vk::DebugUtilsMessengerEXT>, } @@ -39,6 +40,7 @@ pub struct VkDevice { struct RawDevice { device: Device, + dbg_loader: Option<DebugUtils>, } pub struct VkSurface { @@ -202,7 +204,7 @@ impl VkInstance { None, )?; - let (_dbg_loader, _dbg_callbk) = if has_debug_ext { + let (dbg_loader, _dbg_callbk) = if has_debug_ext { let dbg_info = vk::DebugUtilsMessengerCreateInfoEXT::builder() .message_severity( vk::DebugUtilsMessageSeverityFlagsEXT::ERROR @@ -231,7 +233,7 @@ impl VkInstance { entry, instance, vk_version, - _dbg_loader, + dbg_loader, _dbg_callbk, }; @@ -317,7 +319,7 @@ impl VkInstance { let queue_index = 0; let queue = device.get_device_queue(qfi, queue_index); - let device = Arc::new(RawDevice { device }); + let device = Arc::new(RawDevice { device, dbg_loader: self.dbg_loader.clone() }); let props = self.instance.get_physical_device_properties(pdevice); let timestamp_period = props.limits.timestamp_period; @@ -1112,6 +1114,23 @@ impl crate::backend::CmdBuf for CmdBuf { query, ); } + + unsafe fn begin_debug_label(&mut self, label: &str) { + if let Some(utils) = &self.device.dbg_loader { + // A C string cannot hold an interior NUL. Truncate at the first one instead of + // panicking, so a bad label never aborts recording or unbalances begin/end. + let end = label.find('\0').unwrap_or(label.len()); + let label_cstr = CString::new(&label[..end]).expect("label was truncated at its first NUL"); + let label_ext = DebugUtilsLabelEXT::builder().label_name(&label_cstr).build(); + utils.cmd_begin_debug_utils_label(self.cmd_buf, &label_ext); + } + } + + unsafe fn end_debug_label(&mut self) { + if let Some(utils) = 
&self.device.dbg_loader { + utils.cmd_end_debug_utils_label(self.cmd_buf); + } + } } impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { diff --git a/piet-gpu/shader/.clang-format b/piet-gpu/shader/.clang-format new file mode 100644 index 0000000..9801ccd --- /dev/null +++ b/piet-gpu/shader/.clang-format @@ -0,0 +1,5 @@ +BasedOnStyle: LLVM +IndentWidth: 4 +ColumnLimit: 120 +AllowShortFunctionsOnASingleLine: None +SortIncludes: false diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp index e4140cd..0c698b1 100644 --- a/piet-gpu/shader/backdrop.comp +++ b/piet-gpu/shader/backdrop.comp @@ -57,10 +57,10 @@ void main() { if (element_ix < conf.n_elements) { AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref); switch (tag.tag) { - case Annotated_Image: - case Annotated_LinGradient: - case Annotated_BeginClip: - case Annotated_Color: + case Annotated_Image: + case Annotated_LinGradient: + case Annotated_BeginClip: + case Annotated_Color: if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) { break; } @@ -77,7 +77,8 @@ void main() { // long as it doesn't cross the left edge. row_count = 0; } - Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); + Alloc path_alloc = new_alloc( + path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); sh_row_alloc[th_ix] = path_alloc; } } diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index c2b81fd..a3a8ffd 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -75,13 +75,14 @@ void main() { // trying to keep divergence low. // Right now, it's just a bbox, but we'll get finer with // segments. 
- uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X; - uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y; + uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X; + uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1) / N_TILE_Y; x0 = clamp(x0, 0, int(width_in_bins)); x1 = clamp(x1, x0, int(width_in_bins)); y0 = clamp(y0, 0, int(height_in_bins)); y1 = clamp(y1, y0, int(height_in_bins)); - if (x0 == x1) y1 = y0; + if (x0 == x1) + y1 = y0; int x = x0, y = y0; uint my_slice = gl_LocalInvocationID.x / 32; uint my_mask = 1u << (gl_LocalInvocationID.x & 31); diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index e79908a..448caf2 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -118,3 +118,7 @@ build gen/draw_leaf.spv: glsl draw_leaf.comp | scene.h drawtag.h annotated.h set build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl build gen/draw_leaf.msl: msl gen/draw_leaf.spv + +build spv: phony gen/backdrop_lg.spv gen/backdrop.spv gen/bbox_clear.spv gen/binning.spv gen/coarse.spv gen/draw_leaf.spv gen/draw_reduce.spv gen/draw_root.spv gen/kernel4.spv gen/path_coarse.spv gen/pathseg.spv gen/pathtag_reduce.spv gen/pathtag_root.spv gen/tile_alloc.spv gen/transform_leaf.spv gen/transform_reduce.spv gen/transform_root.spv +build dxil: phony gen/backdrop.hlsl gen/backdrop_lg.hlsl gen/bbox_clear.hlsl gen/binning.hlsl gen/coarse.hlsl gen/draw_leaf.hlsl gen/draw_reduce.hlsl gen/draw_root.hlsl gen/kernel4.hlsl gen/path_coarse.hlsl gen/pathseg.hlsl gen/pathtag_reduce.hlsl gen/pathtag_root.hlsl gen/tile_alloc.hlsl gen/transform_leaf.hlsl gen/transform_reduce.hlsl gen/transform_root.hlsl +build msl: phony gen/backdrop_lg.msl gen/backdrop.msl gen/bbox_clear.msl gen/binning.msl gen/coarse.msl gen/draw_leaf.msl gen/draw_reduce.msl gen/draw_root.msl gen/kernel4.msl gen/path_coarse.msl gen/pathseg.msl gen/pathtag_reduce.msl 
gen/pathtag_root.msl gen/tile_alloc.msl gen/transform_leaf.msl gen/transform_reduce.msl gen/transform_root.msl diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 31a64e4..bf5f949 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -8,7 +8,8 @@ // Each workgroup operating on one bin by stream compacting // the elements corresponding to the bin. // -// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be encoded. +// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be +// encoded. #version 450 #extension GL_GOOGLE_include_directive : enable @@ -66,7 +67,7 @@ void write_tile_alloc(uint el_ix, Alloc a) { Alloc read_tile_alloc(uint el_ix, bool mem_ok) { // All memory. - return new_alloc(0, memory.length()*4, mem_ok); + return new_alloc(0, memory.length() * 4, mem_ok); } #endif @@ -111,7 +112,7 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float void main() { // Could use either linear or 2d layouts for both dispatch and // invocations within the workgroup. We'll use variables to abstract. 
- uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X; + uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X; uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x; uint partition_ix = 0; uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE; @@ -163,7 +164,7 @@ void main() { uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; count = read_mem(conf.bin_alloc, in_ix); uint offset = read_mem(conf.bin_alloc, in_ix + 1); - sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size, mem_ok); + sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, mem_ok); } // prefix sum of counts for (uint i = 0; i < LG_N_PART_READ; i++) { @@ -245,7 +246,8 @@ void main() { // base relative to bin uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size; sh_tile_base[th_ix] = base; - Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); + Alloc path_alloc = new_alloc(path.tiles.offset, + (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); write_tile_alloc(th_ix, path_alloc); break; default: @@ -284,7 +286,8 @@ void main() { if (tag == Annotated_BeginClip || tag == Annotated_EndClip) { include_tile = true; } else if (mem_ok) { - Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); + Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), + TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); // Include the path in the tile if // - the tile contains at least a segment (tile offset non-zero) // - the tile is completely covered (backdrop non-zero) @@ -329,8 +332,9 @@ void main() { if (clip_zero_depth == 0) { switch (tag.tag) { case Annotated_Color: - Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix] - + 
(sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), + TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref); if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; @@ -340,8 +344,9 @@ void main() { cmd_ref.offset += 4 + CmdColor_size; break; case Annotated_LinGradient: - tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix] - + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), + TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); AnnoLinGradient lin = Annotated_LinGradient_read(conf.anno_alloc, ref); if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; @@ -356,8 +361,9 @@ void main() { cmd_ref.offset += 4 + CmdLinGrad_size; break; case Annotated_Image: - tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix] - + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), + TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref); if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; @@ -367,8 +373,9 @@ void main() { cmd_ref.offset += 4 + CmdImage_size; break; case Annotated_BeginClip: - tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix] - + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), + TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); if (tile.tile.offset == 0 && tile.backdrop == 0) { clip_zero_depth = 
clip_depth + 1; } else if (tile.tile.offset == 0 && clip_depth < 32) { @@ -418,7 +425,8 @@ void main() { barrier(); rd_ix += N_TILE; - if (rd_ix >= ready_ix && partition_ix >= n_partitions) break; + if (rd_ix >= ready_ix && partition_ix >= n_partitions) + break; } if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) { Cmd_End_write(cmd_alloc, cmd_ref); diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp index 5de2652..c020847 100644 --- a/piet-gpu/shader/draw_leaf.comp +++ b/piet-gpu/shader/draw_leaf.comp @@ -3,7 +3,6 @@ // The leaf scan pass for draw tag scan implemented as a tree reduction. // This stage can be fused with its consumer but is separate now. - #version 450 #extension GL_GOOGLE_include_directive : enable @@ -62,7 +61,7 @@ void main() { barrier(); sh_scratch[gl_LocalInvocationID.x] = agg; } - + barrier(); Monoid row = tag_monoid_identity(); if (gl_WorkGroupID.x > 0) { diff --git a/piet-gpu/shader/draw_scan.comp b/piet-gpu/shader/draw_scan.comp index 2afc9ba..1c26c26 100644 --- a/piet-gpu/shader/draw_scan.comp +++ b/piet-gpu/shader/draw_scan.comp @@ -51,7 +51,7 @@ void main() { barrier(); sh_scratch[gl_LocalInvocationID.x] = agg; } - + barrier(); // This could be a semigroup instead of a monoid if we reworked the // conditional logic, but that might impact performance. 
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index 23353bc..9aba204 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -14,7 +14,7 @@ #define CHUNK_X 2 #define CHUNK_Y 4 -#define CHUNK CHUNK_X * CHUNK_Y +#define CHUNK (CHUNK_X * CHUNK_Y) #define CHUNK_DX (TILE_WIDTH_PX / CHUNK_X) #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y) layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in; @@ -39,16 +39,16 @@ layout(rgba8, set = 0, binding = 4) uniform restrict readonly image2D gradients; #define MAX_BLEND_STACK 128 mediump vec3 tosRGB(mediump vec3 rgb) { bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308)); - mediump vec3 below = vec3(12.92)*rgb; - mediump vec3 above = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055); + mediump vec3 below = vec3(12.92) * rgb; + mediump vec3 above = vec3(1.055) * pow(rgb, vec3(0.41666)) - vec3(0.055); return mix(below, above, cutoff); } mediump vec3 fromsRGB(mediump vec3 srgb) { // Formula from EXT_sRGB. bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045)); - mediump vec3 below = srgb/vec3(12.92); - mediump vec3 above = pow((srgb + vec3(0.055))/vec3(1.055), vec3(2.4)); + mediump vec3 below = srgb / vec3(12.92); + mediump vec3 above = pow((srgb + vec3(0.055)) / vec3(1.055), vec3(2.4)); return mix(below, above, cutoff); } @@ -86,7 +86,8 @@ void main() { Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC); CmdRef cmd_ref = CmdRef(cmd_alloc.offset); - uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); + uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, + gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); vec2 xy = vec2(xy_uint); mediump vec4 rgba[CHUNK]; uint blend_stack[MAX_BLEND_STACK][CHUNK]; @@ -108,7 +109,8 @@ void main() { // Calculate distance field from all the line segments in this tile. 
CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref); mediump float df[CHUNK]; - for (uint k = 0; k < CHUNK; k++) df[k] = 1e9; + for (uint k = 0; k < CHUNK; k++) + df[k] = 1e9; TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); do { TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref); @@ -128,7 +130,8 @@ void main() { break; case Cmd_Fill: CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref); - for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop); + for (uint k = 0; k < CHUNK; k++) + area[k] = float(fill.backdrop); tile_seg_ref = TileSegRef(fill.tile_ref); // Calculate coverage based on backdrop + coverage of each line segment do { diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index 1bd06f9..c6d3815 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -139,7 +139,8 @@ void main() { bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE; uint path_ix = cubic.path_ix; Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size)); - Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); + Alloc path_alloc = + new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); ivec4 bbox = ivec4(path.bbox); vec2 p0 = cubic.p0; qp0 = cubic.p0; @@ -206,8 +207,8 @@ void main() { TileSeg tile_seg; - int xray = int(floor(p0.x*SX)); - int last_xray = int(floor(p1.x*SX)); + int xray = int(floor(p0.x * SX)); + int last_xray = int(floor(p1.x * SX)); if (p0.y > p1.y) { int tmp = xray; xray = last_xray; @@ -231,7 +232,7 @@ void main() { if (y < y1 - 1) { float tile_y1 = float((y + 1) * TILE_HEIGHT_PX); float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy); - next_xray = int(floor(x_edge*SX)); + next_xray = int(floor(x_edge * SX)); } int min_xray = min(xray, next_xray); @@ -265,7 +266,7 @@ void main() { 
// kernel4 uses sign(vector.x) for the sign of the intersection backdrop. // Nudge zeroes towards the intended sign. if (tile_seg.vector.x == 0) { - tile_seg.vector.x = sign(p1.x - p0.x)*1e-9; + tile_seg.vector.x = sign(p1.x - p0.x) * 1e-9; } } if (x <= min_xray || max_xray < x) { diff --git a/piet-gpu/shader/pathseg.comp b/piet-gpu/shader/pathseg.comp index 12104eb..a2ea86e 100644 --- a/piet-gpu/shader/pathseg.comp +++ b/piet-gpu/shader/pathseg.comp @@ -46,8 +46,7 @@ Monoid combine_monoid(Monoid a, Monoid b) { if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) { c.bbox = a.bbox; } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 && - (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) - { + (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) { c.bbox.xy = min(a.bbox.xy, c.bbox.xy); c.bbox.zw = max(a.bbox.zw, c.bbox.zw); } @@ -246,7 +245,7 @@ void main() { } // sh_scratch is the partition-wide inclusive scan of the bbox monoid, // sampled at the end of the N_SEQ sub-partition. - + barrier(); uint path_ix = save_path_ix; uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 6; diff --git a/piet-gpu/shader/pathtag_scan.comp b/piet-gpu/shader/pathtag_scan.comp index 7c1e74b..798622e 100644 --- a/piet-gpu/shader/pathtag_scan.comp +++ b/piet-gpu/shader/pathtag_scan.comp @@ -51,7 +51,7 @@ void main() { barrier(); sh_scratch[gl_LocalInvocationID.x] = agg; } - + barrier(); // This could be a semigroup instead of a monoid if we reworked the // conditional logic, but that might impact performance. 
diff --git a/piet-gpu/shader/transform_leaf.comp b/piet-gpu/shader/transform_leaf.comp index c51dfe6..a5e4003 100644 --- a/piet-gpu/shader/transform_leaf.comp +++ b/piet-gpu/shader/transform_leaf.comp @@ -68,7 +68,7 @@ void main() { barrier(); sh_scratch[gl_LocalInvocationID.x] = agg; } - + barrier(); Monoid row = monoid_identity(); if (gl_WorkGroupID.x > 0) { diff --git a/piet-gpu/shader/transform_scan.comp b/piet-gpu/shader/transform_scan.comp index c4d6745..20b2a8a 100644 --- a/piet-gpu/shader/transform_scan.comp +++ b/piet-gpu/shader/transform_scan.comp @@ -66,7 +66,7 @@ void main() { barrier(); sh_scratch[gl_LocalInvocationID.x] = agg; } - + barrier(); // This could be a semigroup instead of a monoid if we reworked the // conditional logic, but that might impact performance. diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 3c1e27f..97e1f28 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -388,6 +388,7 @@ impl Renderer { cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General); cmd_buf.reset_query_pool(&query_pool); cmd_buf.write_timestamp(&query_pool, 0); + cmd_buf.begin_debug_label("Element bounding box calculation"); self.element_stage.record( cmd_buf, &self.element_code, @@ -397,43 +398,53 @@ impl Renderer { self.n_pathtag as u32, self.n_drawobj as u64, ); + cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); + cmd_buf.begin_debug_label("Tile allocation"); cmd_buf.dispatch( &self.tile_pipeline, &self.tile_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), (256, 1, 1), ); + cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 2); cmd_buf.memory_barrier(); + cmd_buf.begin_debug_label("Path flattening"); cmd_buf.dispatch( &self.path_pipeline, &self.path_ds, (((self.n_pathseg + 31) / 32) as u32, 1, 1), (32, 1, 1), ); + cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); + cmd_buf.begin_debug_label("Backdrop 
propagation"); cmd_buf.dispatch( &self.backdrop_pipeline, &self.backdrop_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), (256, self.backdrop_y, 1), ); + cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 4); // Note: this barrier is not needed as an actual dependency between // pipeline stages, but I am keeping it in so that timer queries are // easier to interpret. cmd_buf.memory_barrier(); + cmd_buf.begin_debug_label("Element binning"); cmd_buf.dispatch( &self.bin_pipeline, &self.bin_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), (256, 1, 1), ); + cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 5); cmd_buf.memory_barrier(); + cmd_buf.begin_debug_label("Coarse raster"); cmd_buf.dispatch( &self.coarse_pipeline, &self.coarse_ds, @@ -444,8 +455,10 @@ impl Renderer { ), (256, 1, 1), ); + cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 6); cmd_buf.memory_barrier(); + cmd_buf.begin_debug_label("Fine raster"); cmd_buf.dispatch( &self.k4_pipeline, &self.k4_ds, @@ -456,6 +469,7 @@ impl Renderer { ), (8, 4, 1), ); + cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 7); cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);