Merge pull request #32 from eliasnaur/master

Clean up, add maximum workgroup size control
2025-01-10 12:41:30 +11:00 · 2020-09-16 08:44:48 +02:00 · 2020-09-16 08:44:48 +02:00 · 02a6bfbb6c
parent a73e7cf282 ac3ac3ddff
commit 02a6bfbb6c
12 changed files with 13 additions and 37 deletions
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -15,7 +15,7 @@
 #include "setup.h"
-#define LG_BACKDROP_WG 8
+#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)
 layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -17,8 +17,6 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
 layout(set = 0, binding = 1) buffer AllocBuf {
    uint n_elements; // paths
    // Will be incremented atomically to claim tiles
    uint tile_ix;
    uint alloc;
 };
@@ -42,8 +40,6 @@ shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
 shared uint sh_chunk_start[N_TILE];
 shared float sh_right_edge[N_TILE];
 void main() {
    uint my_n_elements = n_elements;
    uint my_partition = gl_WorkGroupID.x;
--- a/piet-gpu/shader/binning.spv
+++ b/piet-gpu/shader/binning.spv
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -41,7 +41,7 @@ layout(set = 0, binding = 4) buffer PtclBuf {
 #include "tile.h"
 #include "ptcl.h"
-#define LG_N_PART_READ 8
+#define LG_N_PART_READ (7 + LG_WG_FACTOR)
 #define N_PART_READ (1 << LG_N_PART_READ)
 shared uint sh_elements[N_TILE];
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
--- a/piet-gpu/shader/elements.spv
+++ b/piet-gpu/shader/elements.spv
--- a/piet-gpu/shader/kernel4.spv
+++ b/piet-gpu/shader/kernel4.spv
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -108,27 +108,9 @@ void main() {
    PathStrokeLine line;
    float dx;
    switch (tag) {
    /*
    case PathSeg_FillLine:
    case PathSeg_StrokeLine:
        line = PathSeg_StrokeLine_read(ref);
        xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
        xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
        ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
        ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
        dx = line.p1.x - line.p0.x;
        float dy = line.p1.y - line.p0.y;
        // Set up for per-scanline coverage formula, below.
        float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
        c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
        b = invslope; // Note: assumes square tiles, otherwise scale.
        a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
        break;
    */
    case PathSeg_FillCubic:
    case PathSeg_StrokeCubic:
        PathStrokeCubic cubic = PathSeg_StrokeCubic_read(ref);
        // Commented out code is for computing error bound on conversion to quadratics
        vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
        float err = err_v.x * err_v.x + err_v.y * err_v.y;
        // The number of quadratics.
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -3,6 +3,11 @@
 // Much of this will be made dynamic in various ways, but for now it's easiest
 // to hardcode and keep all in one place.
 // A LG_WG_FACTOR of n scales workgroup sizes by 2^n. Use 0 for a
 // maximum workgroup size of 128, or 1 for a maximum size of 256.
 #define LG_WG_FACTOR 1
 #define WG_FACTOR (1<<LG_WG_FACTOR)
 // TODO: compute all these
 #define WIDTH_IN_TILES 128
@@ -10,21 +15,14 @@
 #define TILE_WIDTH_PX 16
 #define TILE_HEIGHT_PX 16
 // TODO: make the image size dynamic.
 #define IMAGE_WIDTH (WIDTH_IN_TILES*TILE_WIDTH_PX)
 #define IMAGE_HEIGHT (HEIGHT_IN_TILES*TILE_HEIGHT_PX)
 #define PTCL_INITIAL_ALLOC 1024
 // Stuff for new algorithm follows; some of the above should get
 // deleted.
 // These should probably be renamed and/or reworked. In the binning
 // kernel, they represent the number of bins. Also, the workgroup size
 // of that kernel is equal to the number of bins, but should probably
 // be more flexible (it's 512 in the K&L paper).
 #define N_TILE_X 16
-#define N_TILE_Y 16
+#define N_TILE_Y (8 * WG_FACTOR)
 #define N_TILE (N_TILE_X * N_TILE_Y)
-#define LG_N_TILE 8
+#define LG_N_TILE (7 + LG_WG_FACTOR)
 #define N_SLICE (N_TILE / 32)
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -5,7 +5,7 @@
 #include "setup.h"
-#define LG_TILE_ALLOC_WG 8
+#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
 #define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
 layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
--- a/piet-gpu/shader/tile_alloc.spv
+++ b/piet-gpu/shader/tile_alloc.spv
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -227,14 +227,14 @@ impl<D: Device> Renderer<D> {
            &[],
        )?;
-        let bin_alloc_buf_host = device.create_buffer(12, host)?;
+        let bin_alloc_buf_host = device.create_buffer(8, host)?;
-        let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
+        let bin_alloc_buf_dev = device.create_buffer(8, dev)?;
        // TODO: constants
        let bin_alloc_start = ((n_paths + 255) & !255) * 8;
        device.write_buffer(
            &bin_alloc_buf_host,
-            &[n_paths as u32, 0, bin_alloc_start as u32],
+            &[n_paths as u32, bin_alloc_start as u32],
        )?;
        let bin_code = include_bytes!("../shader/binning.spv");
        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;