Rework right_edge computation in elements

Trying to fit right_edge into the fancy monoid doesn't really work, so use a more straightforward approach and compute it from the aggregate. Also add yEdge logic (basically copying piet-metal). With a fix to ELEMENT_BINNING_RATIO (which I had simply gotten wrong), the example renders almost correctly, with small bounding box artifacts.
2025-01-09 20:31:29 +11:00 · 2020-05-20 16:36:09 -07:00 · 2020-05-20 16:36:09 -07:00 · a616b4d010
parent ed4ed30708
commit a616b4d010
12 changed files with 76 additions and 38 deletions
--- a/piet-gpu-types/src/state.rs
+++ b/piet-gpu-types/src/state.rs
@ -8,7 +8,6 @@ piet_gpu! {
            translate: [f32; 2],
            bbox: [f32; 4],
            linewidth: f32,
-            right_edge: f32,
            flags: u32,
        }
    }
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@ -50,10 +50,10 @@ shared uint sh_chunk_jump[N_TILE];

 shared float sh_right_edge[N_TILE];

-#define StateBuf_stride (4 + 2 * State_size)
+#define StateBuf_stride (8 + 2 * State_size)

-StateRef state_aggregate_ref(uint partition_ix) {
-    return StateRef(8 + partition_ix * StateBuf_stride);
+uint state_right_edge_index(uint partition_ix) {
+    return 2 + partition_ix * (StateBuf_stride / 4);
 }

 void main() {
@ -120,8 +120,7 @@ void main() {
            // look-forward is small (performance may degrade in the case
            // of massively complex paths).
            do {
-                StateRef agg_ref = state_aggregate_ref(aggregate_ix);
-                my_right_edge = State_read(agg_ref).right_edge;
+                my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
                aggregate_ix++;
            } while (isinf(my_right_edge));
        }
--- a/piet-gpu/shader/binning.spv
+++ b/piet-gpu/shader/binning.spv
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -12,7 +12,7 @@ build image.spv: glsl image.comp | scene.h

 build elements.spv: glsl elements.comp | scene.h state.h annotated.h

-build binning.spv: glsl binning.comp | annotated.h bins.h setup.h
+build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h

 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h

--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@ -310,6 +310,30 @@ void main() {

            switch (tag) {
            case Annotated_FillLine:
+                AnnoFillLineSeg fill_line = Annotated_FillLine_read(ref);
+                // This is basically the same logic as piet-metal, but should be made numerically robust.
+                vec2 tile_xy = vec2(tile_x * TILE_WIDTH_PX, tile_y * TILE_HEIGHT_PX);
+                float yEdge = mix(fill_line.p0.y, fill_line.p1.y, (tile_xy.x - fill_line.p0.x) / (fill_line.p1.x - fill_line.p0.x));
+                if (min(fill_line.p0.x, fill_line.p1.x) < tile_xy.x && yEdge >= tile_xy.y && yEdge < tile_xy.y + TILE_HEIGHT_PX) {
+                    Segment edge_seg;
+                    if (fill_line.p0.x > fill_line.p1.x) {
+                        fill_line.p1 = vec2(tile_xy.x, yEdge);
+                        edge_seg.start = fill_line.p1;
+                        edge_seg.end = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
+                    } else {
+                        fill_line.p0 = vec2(tile_xy.x, yEdge);
+                        edge_seg.start = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
+                        edge_seg.end = fill_line.p0;
+                    }
+                    alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
+                    Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), edge_seg);
+                    chunk_n_segs++;
+                }
+                Segment fill_seg = Segment(fill_line.p0, fill_line.p1);
+                alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
+                Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), fill_seg);
+                chunk_n_segs++;
+                break;
            case Annotated_StrokeLine:
                AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
                Segment seg = Segment(line.p0, line.p1);
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@ -34,14 +34,14 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
 #include "state.h"
 #include "annotated.h"

-#define StateBuf_stride (4 + 2 * State_size)
+#define StateBuf_stride (8 + 2 * State_size)

 StateRef state_aggregate_ref(uint partition_ix) {
-    return StateRef(8 + partition_ix * StateBuf_stride);
+    return StateRef(12 + partition_ix * StateBuf_stride);
 }

 StateRef state_prefix_ref(uint partition_ix) {
-    return StateRef(8 + partition_ix * StateBuf_stride + State_size);
+    return StateRef(12 + partition_ix * StateBuf_stride + State_size);
 }

 uint state_flag_index(uint partition_ix) {
@ -81,13 +81,12 @@ State combine_state(State a, State b) {
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
-    c.right_edge = (a.flags & FLAG_SET_BBOX) != 0 ? a.right_edge : (a.flags & FLAG_RESET_BBOX) != 0 ? a.bbox.z : c.right_edge;
    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
    return c;
 }

-State map_element(ElementRef ref) {
+State map_element(ElementRef ref, inout bool is_fill) {
    // TODO: it would *probably* be more efficient to make the memory read patterns less
    // divergent, though it would be more wasted memory.
    uint tag = Element_tag(ref);
@ -97,6 +96,7 @@ State map_element(ElementRef ref) {
    c.translate = vec2(0.0, 0.0);
    c.linewidth = 1.0; // TODO should be 0.0
    c.flags = 0;
+    is_fill = false;
    switch (tag) {
    case Element_FillLine:
    case Element_StrokeLine:
@ -115,6 +115,8 @@ State map_element(ElementRef ref) {
        c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
        break;
    case Element_Fill:
+        is_fill = true;
+        // fall-through
    case Element_Stroke:
        c.flags = FLAG_RESET_BBOX;
        break;
@ -145,9 +147,10 @@ shared vec4 sh_mat[WG_SIZE];
 shared vec2 sh_translate[WG_SIZE];
 shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
-shared float sh_right_edge[WG_SIZE];
 shared uint sh_flags[WG_SIZE];

+shared uint sh_min_fill;
+
 shared uint sh_tile_ix;
 shared State sh_prefix;

@ -157,6 +160,7 @@ void main() {
    // 4.4 of prefix sum paper).
    if (gl_LocalInvocationID.x == 0) {
        sh_tile_ix = atomicAdd(state[0], 1);
+        sh_min_fill = ~0;
    }
    barrier();
    uint tile_ix = sh_tile_ix;
@ -164,18 +168,24 @@ void main() {
    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
    ElementRef ref = ElementRef(ix * Element_size);

-    th_state[0] = map_element(ref);
+    bool is_fill;
+    uint my_min_fill = ~0;
+    th_state[0] = map_element(ref, is_fill);
+    if (is_fill) my_min_fill = ix;
    for (uint i = 1; i < N_ROWS; i++) {
        // discussion question: would it be faster to load using more coherent patterns
        // into thread memory? This is kinda strided.
-        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
+        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
+        if (is_fill && my_min_fill == ~0) {
+            my_min_fill = ix + i;
+        }
    }
+    atomicMin(sh_min_fill, my_min_fill);
    State agg = th_state[N_ROWS - 1];
    sh_mat[gl_LocalInvocationID.x] = agg.mat;
    sh_translate[gl_LocalInvocationID.x] = agg.translate;
    sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
    sh_width[gl_LocalInvocationID.x] = agg.linewidth;
-    sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
    sh_flags[gl_LocalInvocationID.x] = agg.flags;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
@ -194,7 +204,6 @@ void main() {
        sh_translate[gl_LocalInvocationID.x] = agg.translate;
        sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
        sh_width[gl_LocalInvocationID.x] = agg.linewidth;
-        sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
        sh_flags[gl_LocalInvocationID.x] = agg.flags;
    }

@ -203,7 +212,6 @@ void main() {
    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
    exclusive.translate = vec2(0.0, 0.0);
    exclusive.linewidth = 1.0; //TODO should be 0.0
-    exclusive.right_edge = 0.0;
    exclusive.flags = 0;

    // Publish aggregate for this partition
@ -244,6 +252,7 @@ void main() {
        }
    }
    barrier();
+    my_min_fill = sh_min_fill;
    if (tile_ix != 0) {
        exclusive = sh_prefix;
    }
@ -256,12 +265,17 @@ void main() {
        other.translate = sh_translate[ix];
        other.bbox = sh_bbox[ix];
        other.linewidth = sh_width[ix];
-        other.right_edge = sh_right_edge[ix];
        other.flags = sh_flags[ix];
        row = combine_state(row, other);
    }
+    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
+        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
+    }
    for (uint i = 0; i < N_ROWS; i++) {
        State st = combine_state(row, th_state[i]);
+        if (my_min_fill == ix + i) {
+            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
+        }
        // We write the state now for development purposes, but the
        // actual goal is to write transformed and annotated elements.
        //State_write(StateRef((ix + i) * State_size), st);
--- a/piet-gpu/shader/elements.spv
+++ b/piet-gpu/shader/elements.spv
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@ -58,7 +58,7 @@

 // This is the ratio of the number of elements in a binning workgroup
 // over the number of elements in a partition workgroup.
-#define ELEMENT_BINNING_RATIO 4
+#define ELEMENT_BINNING_RATIO 2

 #define BIN_INITIAL_ALLOC 64
 #define BIN_ALLOC 256
--- a/piet-gpu/shader/state.h
+++ b/piet-gpu/shader/state.h
@ -9,11 +9,10 @@ struct State {
    vec2 translate;
    vec4 bbox;
    float linewidth;
-    float right_edge;
    uint flags;
 };

-#define State_size 52
+#define State_size 48

 StateRef State_index(StateRef ref, uint index) {
    return StateRef(ref.offset + index * State_size);
@ -33,14 +32,12 @@ State State_read(StateRef ref) {
    uint raw9 = state[ix + 9];
    uint raw10 = state[ix + 10];
    uint raw11 = state[ix + 11];
-    uint raw12 = state[ix + 12];
    State s;
    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
    s.linewidth = uintBitsToFloat(raw10);
-    s.right_edge = uintBitsToFloat(raw11);
-    s.flags = raw12;
+    s.flags = raw11;
    return s;
 }

@ -57,7 +54,6 @@ void State_write(StateRef ref, State s) {
    state[ix + 8] = floatBitsToUint(s.bbox.z);
    state[ix + 9] = floatBitsToUint(s.bbox.w);
    state[ix + 10] = floatBitsToUint(s.linewidth);
-    state[ix + 11] = floatBitsToUint(s.right_edge);
-    state[ix + 12] = s.flags;
+    state[ix + 11] = s.flags;
 }

--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -46,8 +46,8 @@ pub fn render_scene(rc: &mut impl RenderContext) {
        let circle = Circle::new(center, radius);
        rc.fill(circle, &color);
    }
-    /*
    let mut path = BezPath::new();
+    /*
    path.move_to((100.0, 1150.0));
    path.line_to((200.0, 1200.0));
    path.line_to((150.0, 1250.0));
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@ -215,6 +215,7 @@ impl PietGpuRenderContext {
                match el {
                    PathEl::MoveTo(p) => {
                        let scene_pt = to_f32_2(p);
+                        start_pt = Some(scene_pt);
                        last_pt = Some(scene_pt);
                    }
                    PathEl::LineTo(p) => {
@ -228,11 +229,13 @@ impl PietGpuRenderContext {
                    }
                    PathEl::ClosePath => {
                        if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                            let seg = LineSeg {
-                                p0: last,
-                                p1: start,
-                            };
-                            self.encode_line_seg(seg, is_fill);
+                            if last != start {
+                                let seg = LineSeg {
+                                    p0: last,
+                                    p1: start,
+                                };
+                                self.encode_line_seg(seg, is_fill);
+                            }
                        }
                    }
                    _ => (),
@ -246,6 +249,7 @@ impl PietGpuRenderContext {
                match el {
                    PathEl::MoveTo(p) => {
                        let scene_pt = to_f32_2(p);
+                        start_pt = Some(scene_pt);
                        last_pt = Some(scene_pt);
                    }
                    PathEl::LineTo(p) => {
@ -283,11 +287,13 @@ impl PietGpuRenderContext {
                    }
                    PathEl::ClosePath => {
                        if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                            let seg = LineSeg {
-                                p0: last,
-                                p1: start,
-                            };
-                            self.encode_line_seg(seg, is_fill);
+                            if last != start {
+                                let seg = LineSeg {
+                                    p0: last,
+                                    p1: start,
+                                };
+                                self.encode_line_seg(seg, is_fill);
+                            }
                        }
                    }
                }