From 4b59525e1f7a0d0bb42538af4803faf6dde97f69 Mon Sep 17 00:00:00 2001 From: Elias Naur Date: Mon, 19 Apr 2021 18:56:57 +0200 Subject: [PATCH] use mediump precision for kernel4 colors and areas Improves kernel4 performance for a Gio scene from ~22ms to ~15ms. Updates #83 Signed-off-by: Elias Naur --- piet-gpu/shader/kernel4.comp | 55 ++++++++++++++++---------------- piet-gpu/shader/kernel4.spv | Bin 36080 -> 37584 bytes piet-gpu/shader/kernel4_idx.spv | Bin 36184 -> 37688 bytes 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index ea05228..c613b72 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -22,45 +22,45 @@ #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y) layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in; -layout(set = 0, binding = 1) readonly buffer ConfigBuf { +layout(set = 0, binding = 1) restrict readonly buffer ConfigBuf { Config conf; }; -layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image; +layout(rgba8, set = 0, binding = 2) uniform restrict writeonly image2D image; #ifdef ENABLE_IMAGE_INDICES -layout(rgba8, set = 0, binding = 3) uniform readonly image2D images[]; +layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D images[]; #else -layout(rgba8, set = 0, binding = 3) uniform readonly image2D images[1]; +layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D images[1]; #endif #include "ptcl.h" #include "tile.h" -vec3 tosRGB(vec3 rgb) { +mediump vec3 tosRGB(mediump vec3 rgb) { bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308)); - vec3 below = vec3(12.92)*rgb; - vec3 above = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055); + mediump vec3 below = vec3(12.92)*rgb; + mediump vec3 above = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055); return mix(below, above, cutoff); } -vec3 fromsRGB(vec3 srgb) { +mediump vec3 fromsRGB(mediump vec3 srgb) { // Formula from EXT_sRGB. bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045)); - vec3 below = srgb/vec3(12.92); - vec3 above = pow((srgb + vec3(0.055))/vec3(1.055), vec3(2.4)); + mediump vec3 below = srgb/vec3(12.92); + mediump vec3 above = pow((srgb + vec3(0.055))/vec3(1.055), vec3(2.4)); return mix(below, above, cutoff); } // unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color // space. -vec4 unpacksRGB(uint srgba) { - vec4 color = unpackUnorm4x8(srgba).wzyx; +mediump vec4 unpacksRGB(uint srgba) { + mediump vec4 color = unpackUnorm4x8(srgba).wzyx; return vec4(fromsRGB(color.rgb), color.a); } // packsRGB packs a color in the linear color space into its 8-bit sRGB equivalent. -uint packsRGB(vec4 rgba) { +uint packsRGB(mediump vec4 rgba) { rgba = vec4(tosRGB(rgba.rgb), rgba.a); return packUnorm4x8(rgba.wzyx); } @@ -69,14 +69,15 @@ uvec2 chunk_offset(uint i) { return uvec2(i % CHUNK_X * CHUNK_DX, i / CHUNK_X * CHUNK_DY); } -vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) { - vec4 rgba[CHUNK]; +mediump vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) { + mediump vec4 rgba[CHUNK]; for (uint i = 0; i < CHUNK; i++) { ivec2 uv = ivec2(xy + chunk_offset(i)) + cmd_img.offset; + mediump vec4 fg_rgba; #ifdef ENABLE_IMAGE_INDICES - vec4 fg_rgba = imageLoad(images[cmd_img.index], uv); + fg_rgba = imageLoad(images[cmd_img.index], uv); #else - vec4 fg_rgba = imageLoad(images[0], uv); + fg_rgba = imageLoad(images[0], uv); #endif fg_rgba.rgb = fromsRGB(fg_rgba.rgb); rgba[i] = fg_rgba; @@ -95,7 +96,7 @@ void main() { uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); vec2 xy = vec2(xy_uint); - vec4 rgba[CHUNK]; + mediump vec4 rgba[CHUNK]; for (uint i = 0; i < CHUNK; i++) { rgba[i] = vec4(0.0); // TODO: remove this debug image support when the actual image method is plumbed. @@ -112,7 +113,7 @@ void main() { #endif } - float area[CHUNK]; + mediump float area[CHUNK]; uint clip_depth = 0; bool mem_ok = mem_error == NO_ERROR; while (mem_ok) { @@ -124,7 +125,7 @@ void main() { case Cmd_Stroke: // Calculate distance field from all the line segments in this tile. CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref); - float df[CHUNK]; + mediump float df[CHUNK]; for (uint k = 0; k < CHUNK; k++) df[k] = 1e9; TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); do { @@ -190,18 +191,18 @@ void main() { break; case Cmd_Color: CmdColor color = Cmd_Color_read(cmd_alloc, cmd_ref); - vec4 fg = unpacksRGB(color.rgba_color); + mediump vec4 fg = unpacksRGB(color.rgba_color); for (uint k = 0; k < CHUNK; k++) { - vec4 fg_k = fg * area[k]; + mediump vec4 fg_k = fg * area[k]; rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k; } cmd_ref.offset += 4 + CmdColor_size; break; case Cmd_Image: CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref); - vec4 img[CHUNK] = fillImage(xy_uint, fill_img); + mediump vec4 img[CHUNK] = fillImage(xy_uint, fill_img); for (uint k = 0; k < CHUNK; k++) { - vec4 fg_k = img[k] * area[k]; + mediump vec4 fg_k = img[k] * area[k]; rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k; } cmd_ref.offset += 4 + CmdImage_size; @@ -212,7 +213,7 @@ void main() { for (uint k = 0; k < CHUNK; k++) { uvec2 offset = chunk_offset(k); uint srgb = packsRGB(vec4(rgba[k])); - float alpha = clamp(abs(area[k]), 0.0, 1.0); + mediump float alpha = clamp(abs(area[k]), 0.0, 1.0); write_mem(scratch_alloc, base_ix + 0 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX), srgb); write_mem(scratch_alloc, base_ix + 1 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX), floatBitsToUint(alpha)); rgba[k] = vec4(0.0); @@ -228,8 +229,8 @@ void main() { uvec2 offset = chunk_offset(k); uint srgb = read_mem(scratch_alloc, base_ix + 0 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX)); uint alpha = read_mem(scratch_alloc, base_ix + 1 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX)); - vec4 bg = unpacksRGB(srgb); - vec4 fg = rgba[k] * area[k] * uintBitsToFloat(alpha); + mediump vec4 bg = unpacksRGB(srgb); + mediump vec4 fg = rgba[k] * area[k] * uintBitsToFloat(alpha); rgba[k] = bg * (1.0 - fg.a) + fg; } cmd_ref.offset += 4; diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index 077640e7152e374d479a96b8a0dfdd281f26c3b5..3e8849421bbf8d7eff0607ce84765fa4ab90e61c 100644 GIT binary patch delta 1522 zcmYk++baZN9LDi6W3`2=yZgPFHReg>kc4hg|HSwpA;~ z pZrZ9(ydgSTfup?DEc}>$CpY<7SLRTCWE{SRZ{S0NMZ0r2_Do0GdfP#1-HPha2wnKcfwt;8}8Aq{I9+jK_A=?55R-)5IhWf;1PHf_R22f?8gv{ z!#>y#Pr%Ev`eQBiSmLL-u0>gKZ(~>x-hp@F5WEkEh1J}g-z1BgYV&G^`-9YOG8W>4 z=iw#z7%opXo^`-(xEt;fP89AYf^B#oK7hmU5gdU};8XYvR?gA6fFIyT_!)kIU*R|S z9sYnnb?b#2L+}NE!*YtTy=FKKPKPt#Ot=`HN)h|7HSzD5K`;vk;01UQUV>NPRd@|v pPuQ$YydgTO-&WF+DEu?+PImmWqKTo}$ZPlpzJ>3Fqe0u#0NMYzrUKZS0lhPV^la?JO0T`1V9{ZChA6AnFA|?SDvvnfFa4BaF3IG5A