Optimize clips

Optimize tiles with clip masks that are all-zero or all-one.

Part of #36
This commit is contained in:
Raph Levien 2020-11-27 08:42:21 -08:00
parent 4bbc7dee1d
commit 4138f8a516
3 changed files with 113 additions and 71 deletions

View file

@ -84,11 +84,22 @@ void main() {
// Coordinates of top left of bin, in tiles. // Coordinates of top left of bin, in tiles.
uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x; uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y; uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;
// Per-tile state
uint tile_x = gl_LocalInvocationID.x % N_TILE_X; uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = gl_LocalInvocationID.x / N_TILE_X; uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x; uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x;
CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC); CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
// The nesting depth of the clip stack
uint clip_depth = 0;
// State for the "clip zero" optimization. If it's nonzero, then we are
// currently in a clip for which the entire tile has an alpha of zero, and
// the value is the depth after the "begin clip" of that element.
uint clip_zero_depth = 0;
// State for the "clip one" optimization. If bit `i` is set, then the clip
// pushed at depth `i` has an alpha of one over the entire tile. Only depths
// below 32 fit in the mask; deeper clips are not eligible for this optimization.
uint clip_one_mask = 0;
// I'm sure we can figure out how to do this with at least one fewer register... // I'm sure we can figure out how to do this with at least one fewer register...
// Items up to rd_ix have been read from sh_elements // Items up to rd_ix have been read from sh_elements
@ -98,6 +109,7 @@ void main() {
// Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
uint part_start_ix = 0; uint part_start_ix = 0;
uint ready_ix = 0; uint ready_ix = 0;
while (true) { while (true) {
for (uint i = 0; i < N_SLICE; i++) { for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0; sh_bitmaps[i][th_ix] = 0;
@ -274,6 +286,7 @@ void main() {
ref = AnnotatedRef(element_ix * Annotated_size); ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref); tag = Annotated_tag(ref);
if (clip_zero_depth == 0) {
switch (tag) { switch (tag) {
case Annotated_Fill: case Annotated_Fill:
Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
@ -294,6 +307,11 @@ void main() {
case Annotated_BeginClip: case Annotated_BeginClip:
tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1;
} else if (tile.tile.offset == 0 && clip_depth < 32) {
clip_one_mask |= (1 << clip_depth);
} else {
alloc_cmd(cmd_ref, cmd_limit); alloc_cmd(cmd_ref, cmd_limit);
if (tile.tile.offset != 0) { if (tile.tile.offset != 0) {
CmdBeginClip cmd_begin_clip; CmdBeginClip cmd_begin_clip;
@ -306,11 +324,19 @@ void main() {
Cmd_BeginSolidClip_write(cmd_ref, CmdBeginSolidClip(alpha)); Cmd_BeginSolidClip_write(cmd_ref, CmdBeginSolidClip(alpha));
} }
cmd_ref.offset += Cmd_size; cmd_ref.offset += Cmd_size;
if (clip_depth < 32) {
clip_one_mask &= ~(1 << clip_depth);
}
}
clip_depth++;
break; break;
case Annotated_EndClip: case Annotated_EndClip:
clip_depth--;
if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
alloc_cmd(cmd_ref, cmd_limit); alloc_cmd(cmd_ref, cmd_limit);
Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0)); Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0));
cmd_ref.offset += Cmd_size; cmd_ref.offset += Cmd_size;
}
break; break;
case Annotated_FillMask: case Annotated_FillMask:
case Annotated_FillMaskInv: case Annotated_FillMaskInv:
@ -346,6 +372,20 @@ void main() {
cmd_ref.offset += Cmd_size; cmd_ref.offset += Cmd_size;
break; break;
} }
} else {
// In "clip zero" state, suppress all drawing
switch (tag) {
case Annotated_BeginClip:
clip_depth++;
break;
case Annotated_EndClip:
if (clip_depth == clip_zero_depth) {
clip_zero_depth = 0;
}
clip_depth--;
break;
}
}
} }
barrier(); barrier();

Binary file not shown.

View file

@ -99,9 +99,11 @@ fn render_cardioid(rc: &mut impl RenderContext) {
fn render_clip_test(rc: &mut impl RenderContext) { fn render_clip_test(rc: &mut impl RenderContext) {
const N: usize = 16; const N: usize = 16;
const X0: f64 = 50.0; const X0: f64 = 50.0;
const Y0: f64 = 50.0; const Y0: f64 = 450.0;
const X1: f64 = 100.0; // Note: if it gets much larger, it will exceed the 1MB scratch buffer.
const Y1: f64 = 100.0; // But this is a pretty demanding test.
const X1: f64 = 550.0;
const Y1: f64 = 950.0;
let step = 1.0 / ((N + 1) as f64); let step = 1.0 / ((N + 1) as f64);
for i in 0..N { for i in 0..N {
let t = ((i + 1) as f64) * step; let t = ((i + 1) as f64) * step;