Faster coarse raster

Store a lot more tile context in shared memory and do the work from
that.
This commit is contained in:
Raph Levien 2020-06-04 10:39:08 -07:00
parent e1aa9b2f5d
commit 877da4a98e
2 changed files with 50 additions and 65 deletions

View file

@ -51,6 +51,10 @@ shared uint sh_tile_width[N_TILE];
shared uint sh_tile_x0[N_TILE];
shared uint sh_tile_y0[N_TILE];
// These are set up so base + (tile_y * stride + tile_x) * Tile_size points to a
// Tile, where tile_x and tile_y are tile coordinates relative to the bin.
shared uint sh_tile_base[N_TILE];
shared uint sh_tile_stride[N_TILE];
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
@ -76,9 +80,12 @@ void main() {
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
uint th_ix = gl_LocalInvocationID.x;
uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
// Coordinates of top left of bin, in tiles.
uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;
uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x;
CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
@ -151,53 +158,48 @@ void main() {
// Read one element, compute coverage.
uint tag = Annotated_Nop;
uint element_ix;
AnnotatedRef ref;
float right_edge = 0.0;
if (th_ix + rd_ix < wr_ix) {
uint element_ix = sh_elements[th_ix];
element_ix = sh_elements[th_ix];
right_edge = sh_right_edge[th_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
}
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
uint tile_count;
switch (tag) {
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
xmin = fill.bbox.x;
xmax = fill.bbox.z;
ymin = fill.bbox.y;
ymax = fill.bbox.w;
// Because the only elements we're processing right now are
// paths, we can just use the element index as the path index.
// In future, when we're doing a bunch of stuff, the path index
// should probably be stored in the annotated element.
uint path_ix = element_ix;
Path path = Path_read(PathRef(path_ix * Path_size));
uint stride = path.bbox.z - path.bbox.x;
sh_tile_stride[th_ix] = stride;
int dx = int(path.bbox.x) - int(bin_tile_x);
int dy = int(path.bbox.y) - int(bin_tile_y);
int x0 = clamp(dx, 0, N_TILE_X);
int y0 = clamp(dy, 0, N_TILE_Y);
int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
sh_tile_width[th_ix] = uint(x1 - x0);
sh_tile_x0[th_ix] = x0;
sh_tile_y0[th_ix] = y0;
tile_count = uint(x1 - x0) * uint(y1 - y0);
// base relative to bin
uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
sh_tile_base[th_ix] = base;
break;
default:
ymin = 0;
ymax = 0;
tile_count = 0;
break;
}
// Draw the coverage area into the bitmasks. This uses an algorithm
// that computes the coverage of a span for given scanline.
// Compute bounding box in tiles and clip to this bin.
int x0 = int(floor((xmin - xy0.x) * SX));
int x1 = int(ceil((xmax - xy0.x) * SX));
int y0 = int(floor((ymin - xy0.y) * SY));
int y1 = int(ceil((ymax - xy0.y) * SY));
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
uint tile_count = uint((x1 - x0) * (y1 - y0));
sh_tile_width[th_ix] = uint(x1 - x0);
sh_tile_x0[th_ix] = uint(x0);
sh_tile_y0[th_ix] = uint(y0);
// Prefix sum of sh_tile_count
sh_tile_count[th_ix] = tile_count;
for (uint i = 0; i < LG_N_TILE; i++) {
@ -223,21 +225,13 @@ void main() {
uint width = sh_tile_width[el_ix];
uint x = sh_tile_x0[el_ix] + seq_ix % width;
uint y = sh_tile_y0[el_ix] + seq_ix / width;
uint tile_x = x + gl_WorkGroupID.x * N_TILE_X;
uint tile_y = y + gl_WorkGroupID.y * N_TILE_Y;
uint element_ix = sh_elements[el_ix];
Path path = Path_read(PathRef(element_ix * Path_size));
if (tile_x >= path.bbox.x && tile_x < path.bbox.z && tile_y >= path.bbox.y && tile_y < path.bbox.w) {
uint stride = path.bbox.z - path.bbox.x;
uint tile_subix = (tile_y - path.bbox.y) * stride + tile_x - path.bbox.x;
Tile tile = Tile_read(Tile_index(path.tiles, tile_subix));
Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
if (tile.tile.offset != 0) {
uint el_slice = el_ix / 32;
uint el_mask = 1 << (el_ix & 31);
atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
}
}
}
barrier();
@ -407,16 +401,8 @@ void main() {
break;
*/
case Annotated_Stroke:
// Because the only elements we're processing right now are
// paths, we can just use the element index as the path index.
// In future, when we're doing a bunch of stuff, the path index
// should probably be stored in the annotated element.
uint path_ix = element_ix;
Path path = Path_read(PathRef(path_ix * Path_size));
uint stride = path.bbox.z - path.bbox.x;
uint tile_subix = (tile_y - path.bbox.y) * stride + tile_x - path.bbox.x;
Tile tile = Tile_read(Tile_index(path.tiles, tile_subix));
if (tile.tile.offset != 0) {
Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.tile_ref = tile.tile.offset;
@ -425,7 +411,6 @@ void main() {
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
}
break;
}
}

Binary file not shown.