Start implementing fills
This should get the "right_edge" value for each segment plumbed through to the binning phase. It still needs to be plumbed to coarse raster and wired up there. Also consider this WIP, because none of this logic has been tested yet.
parent 0ed759814b
commit 03da52cff8
@@ -8,6 +8,7 @@ piet_gpu! {
             translate: [f32; 2],
             bbox: [f32; 4],
             linewidth: f32,
+            right_edge: f32,
             flags: u32,
         }
     }

@@ -11,24 +11,33 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
-layout(set = 0, binding = 1) buffer AllocBuf {
+// This is for scanning forward for right_edge data.
+layout(set = 0, binding = 1) buffer StateBuf {
+    uint[] state;
+};
+
+layout(set = 0, binding = 2) buffer AllocBuf {
     uint n_elements;
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
 };
 
-layout(set = 0, binding = 2) buffer BinsBuf {
+layout(set = 0, binding = 3) buffer BinsBuf {
     uint[] bins;
 };
 
 #include "annotated.h"
+#include "state.h"
 #include "bins.h"
 
 // scale factors useful for converting coordinates to bins
 #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
 #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
 
+// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
+#define INFINITY (1.0 / 0.0)
+
 // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
@@ -37,6 +46,14 @@ shared uint sh_chunk_start[N_TILE];
 shared uint sh_chunk_end[N_TILE];
 shared uint sh_chunk_jump[N_TILE];
 
+shared float sh_right_edge[N_TILE];
+
+#define StateBuf_stride (4 + 2 * State_size)
+
+StateRef state_aggregate_ref(uint partition_ix) {
+    return StateRef(8 + partition_ix * StateBuf_stride);
+}
+
 void main() {
     BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
     uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
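The stride arithmetic above is the contract between this shader and the element processing pass: each partition contributes a 4-byte word plus two State records to the state buffer (presumably a status word, an aggregate, and a prefix), and the aggregate for partition ix is found at byte 8 + ix * StateBuf_stride. A small host-side sketch of the same arithmetic, assuming the State_size of 52 from the state.h change further down; the constant and function names here are illustrative, not piet-gpu API:

// Hypothetical host-side mirror of the GLSL addressing above.
// Assumes State_size = 52 bytes, as updated in state.h in this commit.
const STATE_SIZE: u32 = 52;
const STATE_BUF_STRIDE: u32 = 4 + 2 * STATE_SIZE; // 108 bytes per partition

/// Byte offset of the aggregate State record for a partition,
/// matching `state_aggregate_ref` in the shader.
fn state_aggregate_offset(partition_ix: u32) -> u32 {
    8 + partition_ix * STATE_BUF_STRIDE
}

fn main() {
    // Partition 0's aggregate starts at byte 8, partition 3's at 8 + 3 * 108 = 332.
    assert_eq!(state_aggregate_offset(0), 8);
    assert_eq!(state_aggregate_offset(3), 332);
}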
@@ -65,6 +82,7 @@ void main() {
         tag = Annotated_tag(ref);
     }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
+    float my_right_edge = INFINITY;
     switch (tag) {
     case Annotated_Line:
         AnnoLineSeg line = Annotated_Line_read(ref);
@@ -82,8 +100,37 @@ void main() {
         y0 = int(floor(fill.bbox.y * SY));
         x1 = int(ceil(fill.bbox.z * SX));
         y1 = int(ceil(fill.bbox.w * SY));
+        my_right_edge = x1;
         break;
     }
+
+    // If the last element in this partition is a fill edge, then we need to do a
+    // look-forward to find the right edge of its corresponding fill. That data is
+    // recorded in aggregates computed in the element processing pass.
+    if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_Line) {
+        uint aggregate_ix = (my_tile + 1) * ELEMENT_BINNING_RATIO;
+        // This is sequential but the expectation is that the amount of
+        // look-forward is small (performance may degrade in the case
+        // of massively complex paths).
+        do {
+            StateRef agg_ref = state_aggregate_ref(aggregate_ix);
+            my_right_edge = State_read(agg_ref).right_edge;
+            aggregate_ix++;
+        } while (isinf(my_right_edge));
+    }
+
+    // Now propagate right_edge backward, from fill to segment.
+    for (uint i = 0; i < LG_N_TILE; i++) {
+        // Note: we could try to cut down on write bandwidth here if the value hasn't
+        // changed, but not sure it's worth the complexity to track.
+        sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
+        barrier();
+        if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
+            my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
+        }
+        barrier();
+    }
+
     // At this point, we run an iterator over the coverage area,
     // trying to keep divergence low.
     // Right now, it's just a bbox, but we'll get finer with
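The two added steps above, the look-forward in the last lane followed by the log-step backward propagation, leave each segment holding the right edge of the nearest fill at or after it in element order. A scalar reference of the same idea, as a sketch only: sequential Rust in place of a workgroup, the aggregate look-forward replaced by a caller-supplied lookahead value, and a snapshot vector standing in for sh_right_edge plus the barriers. The function name and shape are illustrative:

// Scalar reference of the per-partition right_edge resolution (sketch only).
// `values[i]` is a segment's right_edge, f32::INFINITY until a fill publishes it;
// `lookahead` stands in for the aggregate look-forward performed by the last lane.
fn propagate_right_edge(values: &mut [f32], lookahead: f32) {
    let n = values.len();
    if let Some(last) = values.last_mut() {
        if last.is_infinite() {
            *last = lookahead; // the shader reads this from later partitions' aggregates
        }
    }
    // Log-step backward propagation: an infinite entry copies from 2^i to its right,
    // so every segment ends up with the nearest finite value at or after its index.
    let mut step = 1;
    while step < n {
        let snapshot: Vec<f32> = values.to_vec(); // plays the role of sh_right_edge + barrier
        for i in 0..n {
            if values[i].is_infinite() && i + step < n {
                values[i] = snapshot[i + step];
            }
        }
        step <<= 1;
    }
}

fn main() {
    // Two segments, their fill (right edge 40.0), then two segments whose fill
    // lives in a later partition (lookahead 72.0).
    let mut v = vec![f32::INFINITY, f32::INFINITY, 40.0, f32::INFINITY, f32::INFINITY];
    propagate_right_edge(&mut v, 72.0);
    assert_eq!(v, vec![40.0, 40.0, 40.0, 72.0, 72.0]);
}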
@@ -141,6 +188,7 @@ void main() {
             chunk_n = element_count - chunk_n;
         } else {
             chunk_end = ~0;
+            chunk_new_start = ~0;
             chunk_n = element_count;
         }
         sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;

Binary file not shown.

@@ -10,7 +10,7 @@
 #define N_ROWS 4
 #define WG_SIZE 32
 #define LG_WG_SIZE 5
-#define TILE_SIZE (WG_SIZE * N_ROWS)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
 
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;
 
@@ -81,6 +81,7 @@ State combine_state(State a, State b) {
     c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
     c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
     c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
+    c.right_edge = (a.flags & FLAG_SET_BBOX) != 0 ? a.right_edge : (a.flags & FLAG_RESET_BBOX) != 0 ? a.bbox.z : c.right_edge;
     c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
     c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
     return c;
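The added right_edge combine mirrors the bbox flag handling: if the left operand already carries a settled bbox (FLAG_SET_BBOX), its right_edge wins; if it only reset the bbox, its own bbox.z is used. The final fallback as written reads c.right_edge, which has not yet been assigned at that point; the natural reading is that it falls through to the right operand's value. A sketch of that reading, with illustrative flag bits rather than the shader's actual values:

// One reading of the right_edge combine rule above (sketch; not the shader's code).
const FLAG_SET_BBOX: u32 = 0x2;   // illustrative bit values
const FLAG_RESET_BBOX: u32 = 0x4;

#[derive(Clone, Copy)]
struct State {
    bbox_right: f32, // bbox.z
    right_edge: f32,
    flags: u32,
}

fn combine_right_edge(a: State, b: State) -> f32 {
    if (a.flags & FLAG_SET_BBOX) != 0 {
        a.right_edge // left operand already carries a settled right edge
    } else if (a.flags & FLAG_RESET_BBOX) != 0 {
        a.bbox_right // left operand started a fresh bbox; use its right side
    } else {
        b.right_edge // otherwise the right operand's value propagates
    }
}

fn main() {
    let fill = State { bbox_right: 40.0, right_edge: 40.0, flags: FLAG_SET_BBOX };
    let segment = State { bbox_right: 0.0, right_edge: f32::INFINITY, flags: 0 };
    // A segment followed by its fill: the aggregate exposes the fill's right edge.
    assert_eq!(combine_right_edge(segment, fill), 40.0);
}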
@@ -143,6 +144,7 @@ shared vec4 sh_mat[WG_SIZE];
 shared vec2 sh_translate[WG_SIZE];
 shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
+shared float sh_right_edge[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
 
 shared uint sh_tile_ix;
@@ -158,7 +160,7 @@ void main() {
     barrier();
     uint tile_ix = sh_tile_ix;
 
-    uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
     th_state[0] = map_element(ref);
@@ -172,6 +174,7 @@ void main() {
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
     sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
     sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+    sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
     sh_flags[gl_LocalInvocationID.x] = agg.flags;
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         barrier();
@@ -190,6 +193,7 @@ void main() {
         sh_translate[gl_LocalInvocationID.x] = agg.translate;
         sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
         sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+        sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
         sh_flags[gl_LocalInvocationID.x] = agg.flags;
     }
 
@@ -198,6 +202,7 @@ void main() {
     exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
     exclusive.translate = vec2(0.0, 0.0);
     exclusive.linewidth = 1.0; //TODO should be 0.0
+    exclusive.right_edge = 0.0;
     exclusive.flags = 0;
 
     // Publish aggregate for this partition
@@ -250,6 +255,7 @@ void main() {
             other.translate = sh_translate[ix];
             other.bbox = sh_bbox[ix];
             other.linewidth = sh_width[ix];
+            other.right_edge = sh_right_edge[ix];
             other.flags = sh_flags[ix];
             row = combine_state(row, other);
         }

Binary file not shown.

@@ -51,9 +51,14 @@
 #define N_TILE_X 16
 #define N_TILE_Y 16
 #define N_TILE (N_TILE_X * N_TILE_Y)
+#define LG_N_TILE 8
 #define N_SLICE (N_TILE / 32)
 // Number of workgroups for binning kernel
 #define N_WG 16
 
+// This is the ratio of the number of elements in a binning workgroup
+// over the number of elements in a partition workgroup.
+#define ELEMENT_BINNING_RATIO 4
+
 #define BIN_INITIAL_ALLOC 64
 #define BIN_ALLOC 256
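LG_N_TILE is the base-2 logarithm of N_TILE (16 x 16 = 256 = 2^8); the binning shader's propagation loop above iterates LG_N_TILE times so a right_edge value can travel across the whole workgroup. A trivial consistency check, sketch only, using the literal values from this header:

fn main() {
    const N_TILE_X: u32 = 16;
    const N_TILE_Y: u32 = 16;
    const N_TILE: u32 = N_TILE_X * N_TILE_Y;
    const LG_N_TILE: u32 = 8;
    // LG_N_TILE must stay the log2 of N_TILE for the right_edge loop to cover the workgroup.
    assert_eq!(1u32 << LG_N_TILE, N_TILE);
}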

@@ -9,10 +9,11 @@ struct State {
     vec2 translate;
     vec4 bbox;
     float linewidth;
+    float right_edge;
     uint flags;
 };
 
-#define State_size 48
+#define State_size 52
 
 StateRef State_index(StateRef ref, uint index) {
     return StateRef(ref.offset + index * State_size);
@@ -32,12 +33,14 @@ State State_read(StateRef ref) {
     uint raw9 = state[ix + 9];
     uint raw10 = state[ix + 10];
     uint raw11 = state[ix + 11];
+    uint raw12 = state[ix + 12];
     State s;
     s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
     s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
     s.linewidth = uintBitsToFloat(raw10);
-    s.flags = raw11;
+    s.right_edge = uintBitsToFloat(raw11);
+    s.flags = raw12;
     return s;
 }
 
@@ -54,6 +57,7 @@ void State_write(StateRef ref, State s) {
     state[ix + 8] = floatBitsToUint(s.bbox.z);
     state[ix + 9] = floatBitsToUint(s.bbox.w);
     state[ix + 10] = floatBitsToUint(s.linewidth);
-    state[ix + 11] = s.flags;
+    state[ix + 11] = floatBitsToUint(s.right_edge);
+    state[ix + 12] = s.flags;
 }
 
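With right_edge slotted in before flags, a State record is now 13 32-bit words, which is where the new State_size of 52 comes from. The word layout implied by State_read and State_write above, written out as an illustrative sketch:

// Word offsets within a State record, as read/written above (13 words * 4 bytes = 52).
// mat:        words 0..=3   (vec4)
// translate:  words 4..=5   (vec2)
// bbox:       words 6..=9   (vec4)
// linewidth:  word  10
// right_edge: word  11      (new in this commit)
// flags:      word  12      (previously word 11)
const STATE_WORDS: u32 = 4 + 2 + 4 + 1 + 1 + 1;
const STATE_SIZE: u32 = STATE_WORDS * 4;

fn main() {
    assert_eq!(STATE_SIZE, 52); // matches the updated #define State_size
}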

@@ -185,10 +185,10 @@ impl<D: Device> Renderer<D> {
         ])
         ?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
         let bin_ds = device.create_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
             &[],
         )?;
 
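The host side has to track the shader's bindings: the binning pipeline now declares four buffers, and the slice passed to create_descriptor_set is in binding order, so state_buf slots in at binding 1 ahead of the renumbered alloc and bins buffers. A standalone sketch of that correspondence; the buffer names are taken from the diff, and the mapping itself is the point rather than the API:

fn main() {
    // Descriptor order passed to create_descriptor_set must match the GLSL bindings
    // declared in the binning shader.
    let bindings = [
        (0, "anno_buf", "AnnotatedBuf"),
        (1, "state_buf", "StateBuf (new in this commit)"),
        (2, "bin_alloc_buf_dev", "AllocBuf (was binding 1)"),
        (3, "bin_buf", "BinsBuf (was binding 2)"),
    ];
    for (binding, host_buffer, shader_block) in bindings {
        println!("binding {binding}: {host_buffer} -> {shader_block}");
    }
}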