Continue building out pipeline

Plumbs the new tiling scheme to k4. This works (stroke only) but still
has some performance issues.
Raph Levien 2020-06-03 09:28:43 -07:00
parent 294f6fd1db
commit 70a9c17e23
11 changed files with 94 additions and 168 deletions
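For orientation before the hunks: a minimal sketch (not part of the commit) of the k4 (kernel4.comp) stroke branch after this change, reconstructed from the kernel4.comp and ptcl.h diffs below. CmdStroke now carries a plain uint tile_ref rather than a SegChunkRef, and the fine kernel walks the per-tile TileSeg linked list directly:

// Sketch only; CmdStroke, TileSegRef, TileSeg, CHUNK and CHUNK_DY are the names used in the
// hunks below, and xy / cmd_ref come from the surrounding kernel4 main loop.
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
// tile_ref is a raw uint because ptcl.h cannot express a cross-module Ref<Tile>.
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
    TileSeg seg = TileSeg_read(tile_seg_ref);
    vec2 line_vec = seg.end - seg.start;
    for (uint k = 0; k < CHUNK; k++) {
        // Distance from each pixel in this thread's chunk to the segment.
        vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
        dpos.y += float(k * CHUNK_DY);
        float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
        df[k] = min(df[k], length(line_vec * t - dpos));
    }
    tile_seg_ref = seg.next;        // follow the per-tile segment list
} while (tile_seg_ref.offset != 0); // offset 0 terminates the list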


@@ -13,8 +13,9 @@ piet_gpu! {
end: [f32; 2],
}
struct CmdStroke {
// Consider a specialization to one segment.
seg_ref: Ref<SegChunk>,
// This is really a Ref<Tile>, but we don't have cross-module
// references.
tile_ref: u32,
half_width: f32,
rgba_color: u32,
}


@@ -171,7 +171,7 @@ fn main() -> Result<(), Error> {
let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?;
let query_pool = device.create_query_pool(5)?;
let query_pool = device.create_query_pool(7)?;
let mut ctx = PietGpuRenderContext::new();
if let Some(input) = matches.value_of("INPUT") {
@@ -204,14 +204,16 @@ fn main() -> Result<(), Error> {
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
/*
println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
*/
println!("Binning kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
println!("Coarse raster kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
println!("Render kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
/*
let mut data: Vec<u32> = Default::default();
device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
piet_gpu::dump_k1_data(&data);
//trace_ptcl(&data);
*/
let mut img_data: Vec<u8> = Default::default();
// Note: because png can use a `&[u8]` slice, we could avoid an extra copy


@@ -15,17 +15,22 @@ layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
};
layout(set = 0, binding = 2) buffer AllocBuf {
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint n_elements;
uint alloc;
};
layout(set = 0, binding = 3) buffer PtclBuf {
layout(set = 0, binding = 4) buffer PtclBuf {
uint[] ptcl;
};
#include "annotated.h"
#include "bins.h"
#include "tile.h"
#include "ptcl.h"
#define LG_N_PART_READ 8
@@ -197,37 +202,11 @@ void main() {
tag = Annotated_tag(ref);
}
// Setup for coverage algorithm.
float a, b, c;
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
uint my_slice = th_ix / 32;
uint my_mask = 1 << (th_ix & 31);
switch (tag) {
case Annotated_FillLine:
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
float dx = line.p1.x - line.p0.x;
float dy = line.p1.y - line.p0.y;
if (tag == Annotated_FillLine) {
// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
if (dy < 0) {
atomicOr(sh_bd_sign[my_slice], my_mask);
} else {
atomicAnd(sh_bd_sign[my_slice], ~my_mask);
}
}
atomicOr(sh_is_segment[my_slice], my_mask);
// Set up for per-scanline coverage formula, below.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
b = invslope; // Note: assumes square tiles, otherwise scale.
a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
break;
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
@@ -237,10 +216,6 @@ void main() {
xmax = fill.bbox.z;
ymin = fill.bbox.y;
ymax = fill.bbox.w;
// Just let the clamping to xmin and xmax determine the bounds.
a = 0.0;
b = 0.0;
c = 1e9;
break;
default:
ymin = 0;
@@ -254,37 +229,23 @@ void main() {
// Compute bounding box in tiles and clip to this bin.
int x0 = int(floor((xmin - xy0.x) * SX));
int x1 = int(ceil((xmax - xy0.x) * SX));
int xr = int(ceil((right_edge - xy0.x) * SX));
int y0 = int(floor((ymin - xy0.y) * SY));
int y1 = int(ceil((ymax - xy0.y) * SY));
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
xr = clamp(xr, 0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
float t = a + b * float(y0);
for (uint y = y0; y < y1; y++) {
uint xx0 = clamp(int(floor(t - c)), x0, x1);
uint xx1 = clamp(int(ceil(t + c)), x0, x1);
for (uint x = xx0; x < xx1; x++) {
for (uint x = x0; x < x1; x++) {
atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
}
if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
// Assign backdrop to all tiles to the right of the ray crossing the
// top edge of this tile, up to the right edge of the fill bbox.
float xray = t - 0.5 * b;
xx0 = max(int(ceil(xray)), 0);
for (uint x = xx0; x < xr; x++) {
atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
}
}
t += b;
}
barrier();
// We've computed coverage and other info for each element in the input, now for
// the output stage. We'll do segments first using a more parallel algorithm.
/*
uint seg_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
@@ -372,45 +333,29 @@ void main() {
Segment seg = Segment(line.p0, line.p1, y_edge);
Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
}
*/
// Output non-segment elements for this tile. The thread does a sequential walk
// through the non-segment elements, and for segments, count and backdrop are
// aggregated using bit counting.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
uint bd_bitmap = sh_backdrop[0][th_ix];
uint bd_sign = sh_bd_sign[0];
uint is_segment = sh_is_segment[0];
uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
seg_count = 0;
while (true) {
uint nonseg_bitmap = bitmap & ~is_segment;
if (nonseg_bitmap == 0) {
backdrop += count_backdrop(bd_bitmap, bd_sign);
seg_count += bitCount(bitmap & is_segment);
if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
bd_bitmap = sh_backdrop[slice_ix][th_ix];
bd_sign = sh_bd_sign[slice_ix];
is_segment = sh_is_segment[slice_ix];
nonseg_bitmap = bitmap & ~is_segment;
if (nonseg_bitmap == 0) {
if (bitmap == 0) {
continue;
}
}
uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
uint element_ix = sh_elements[element_ref_ix];
// Bits up to and including the lsb
uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
seg_count += bitCount(bitmap & bd_mask & is_segment);
// Clear bits that have been consumed.
bd_bitmap &= ~bd_mask;
bitmap &= ~bd_mask;
// Clear LSB
bitmap &= bitmap - 1;
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
@@ -419,6 +364,7 @@ void main() {
tag = Annotated_tag(ref);
switch (tag) {
/*
case Annotated_Fill:
if (last_chunk_n > 0 || seg_count > 0) {
SegChunkRef chunk_ref = SegChunkRef(0);
@@ -460,63 +406,34 @@ void main() {
seg_count = 0;
backdrop = 0;
break;
*/
case Annotated_Stroke:
// TODO: reduce divergence & code duplication? Much of the
// fill and stroke processing is in common.
if (last_chunk_n > 0 || seg_count > 0) {
SegChunkRef chunk_ref = SegChunkRef(0);
if (seg_count > 0) {
chunk_ref = alloc_seg_chunk();
SegChunk chunk;
chunk.n = seg_count;
chunk.next = SegChunkRef(0);
uint seg_offset = seg_alloc + seg_start * Segment_size;
chunk.segs = SegmentRef(seg_offset);
SegChunk_write(chunk_ref, chunk);
// Because the only elements we're processing right now are
// paths, we can just use the element index as the path index.
// In future, when we're doing a bunch of stuff, the path index
// should probably be stored in the annotated element.
uint path_ix = element_ix;
Path path = Path_read(PathRef(path_ix * Path_size));
// It may be we have a strong guarantee this will always be `true`, but
// I prefer not to take chances.
if (tile_x >= path.bbox.x && tile_x < path.bbox.z && tile_y >= path.bbox.y && tile_y < path.bbox.w) {
uint stride = path.bbox.z - path.bbox.x;
uint tile_subix = (tile_y - path.bbox.y) * stride + tile_x - path.bbox.x;
Tile tile = Tile_read(Tile_index(path.tiles, tile_subix));
if (tile.tile.offset != 0) {
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.tile_ref = tile.tile.offset;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
}
if (last_chunk_n > 0) {
SegChunk chunk;
chunk.n = last_chunk_n;
chunk.next = chunk_ref;
chunk.segs = last_chunk_segs;
SegChunk_write(last_chunk_ref, chunk);
} else {
first_seg_chunk = chunk_ref;
}
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.seg_ref = first_seg_chunk;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
last_chunk_n = 0;
}
seg_start += seg_count;
seg_count = 0;
break;
default:
// This shouldn't happen, but just in case.
seg_start++;
break;
}
}
if (seg_count > 0) {
SegChunkRef chunk_ref = alloc_seg_chunk();
if (last_chunk_n > 0) {
SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
} else {
first_seg_chunk = chunk_ref;
}
// TODO: free two registers by writing count and segments ref now,
// as opposed to deferring SegChunk write until all fields are known.
last_chunk_ref = chunk_ref;
last_chunk_n = seg_count;
uint seg_offset = seg_alloc + seg_start * Segment_size;
last_chunk_segs = SegmentRef(seg_offset);
}
barrier();
rd_ix += N_TILE;

Binary file not shown.


@@ -17,9 +17,14 @@ layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl;
};
layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
layout(set = 0, binding = 1) buffer TileBuf {
uint[] tile;
};
layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
#include "ptcl.h"
#include "tile.h"
#include "setup.h"
@@ -57,28 +62,25 @@ void main() {
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
SegChunkRef seg_chunk_ref = stroke.seg_ref;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
SegmentRef segs = seg_chunk.segs;
for (int i = 0; i < seg_chunk.n; i++) {
Segment seg = Segment_read(Segment_index(segs, i));
vec2 line_vec = seg.end - seg.start;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
dpos.y += float(k * CHUNK_DY);
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
TileSeg seg = TileSeg_read(tile_seg_ref);
vec2 line_vec = seg.end - seg.start;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
dpos.y += float(k * CHUNK_DY);
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
seg_chunk_ref = seg_chunk.next;
} while (seg_chunk_ref.offset != 0);
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
}
break;
/*
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_ref);
// Probably better to store as float, but conversion is no doubt cheap.
@@ -117,6 +119,7 @@ void main() {
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
}
break;
*/
case Cmd_Solid:
CmdSolid solid = Cmd_Solid_read(cmd_ref);
fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;

Binary file not shown.


@@ -78,7 +78,7 @@ void main() {
int stride = bbox.z - bbox.x;
int base = (y0 - bbox.y) * stride - bbox.x;
// TODO: can be tighter, use c to bound width
uint n_tile_alloc = uint(stride * (bbox.w - bbox.y));
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
// Consider using subgroups to aggregate atomic add.
uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
TileSeg tile_seg;

Binary file not shown.


@@ -68,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
}
struct CmdStroke {
SegChunkRef seg_ref;
uint tile_ref;
float half_width;
uint rgba_color;
};
@@ -220,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdStroke s;
s.seg_ref = SegChunkRef(raw0);
s.tile_ref = raw0;
s.half_width = uintBitsToFloat(raw1);
s.rgba_color = raw2;
return s;
@@ -228,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.seg_ref.offset;
ptcl[ix + 0] = s.tile_ref;
ptcl[ix + 1] = floatBitsToUint(s.half_width);
ptcl[ix + 2] = s.rgba_color;
}


@@ -183,9 +183,9 @@ impl<D: Device> Renderer<D> {
device.write_buffer(&scene_buf, &scene)?;
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, host)?;
let pathseg_buf = device.create_buffer(64 * 1024 * 1024, host)?;
let tile_buf = device.create_buffer(64 * 1024 * 1024, host)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let pathseg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let tile_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
@@ -228,10 +228,10 @@ impl<D: Device> Renderer<D> {
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
// TODO: constants
let bin_alloc_start = ((n_elements + 255) & !255) * 8;
let bin_alloc_start = ((n_paths + 255) & !255) * 8;
device.write_buffer(
&bin_alloc_buf_host,
&[n_elements as u32, 0, bin_alloc_start as u32],
&[n_paths as u32, 0, bin_alloc_start as u32],
)?;
let bin_code = include_bytes!("../shader/binning.spv");
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
@@ -250,16 +250,20 @@ impl<D: Device> Renderer<D> {
&[n_elements as u32, coarse_alloc_start as u32],
)?;
let coarse_code = include_bytes!("../shader/coarse.spv");
let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?;
let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 5, 0)?;
let coarse_ds = device.create_descriptor_set(
&coarse_pipeline,
&[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf],
&[&anno_buf, &bin_buf, &tile_buf, &coarse_alloc_buf_dev, &ptcl_buf],
&[],
)?;
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?;
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 2, 1)?;
let k4_ds = device.create_descriptor_set(
&k4_pipeline,
&[&ptcl_buf, &tile_buf],
&[&image_dev]
)?;
Ok(Renderer {
scene_buf,
@@ -328,32 +332,31 @@ impl<D: Device> Renderer<D> {
&self.path_ds,
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
);
/*
cmd_buf.write_timestamp(&query_pool, 3);
// Note: this barrier is not needed as an actual dependency between
// pipeline stages, but I am keeping it in so that timer queries are
// easier to interpret.
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.bin_pipeline,
&self.bin_ds,
(((self.n_elements + 255) / 256) as u32, 1, 1),
(((self.n_paths + 255) / 256) as u32, 1, 1),
);
*/
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
/*
cmd_buf.dispatch(
&self.coarse_pipeline,
&self.coarse_ds,
(WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
);
*/
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier();
/*
cmd_buf.dispatch(
&self.k4_pipeline,
&self.k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 5);
*/
cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}


@@ -49,8 +49,8 @@ impl PicoSvg {
for item in &self.items {
match item {
Item::Fill(fill_item) => {
rc.fill(&fill_item.path, &fill_item.color);
//rc.stroke(&fill_item.path, &fill_item.color, 1.0);
//rc.fill(&fill_item.path, &fill_item.color);
rc.stroke(&fill_item.path, &fill_item.color, 1.0);
}
Item::Stroke(stroke_item) => {
rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);