diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 090db1d..afe4d62 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -1,4 +1,14 @@
 // Propagation of tile backdrop for filling.
+//
+// Each thread reads one path element and calculates the number of spanned tiles
+// based on the bounding box.
+// In a further compaction step, the workgroup loops in parallel over the tile rows of each element.
+// For each row, the per-tile backdrop (as calculated in the previous coarse path segment kernel)
+// is read and propagated from left to right (prefix summed).
+//
+// Output state:
+//  - Each path element has an array of tiles covering the whole path, based on its bounding box
+//  - Each tile per path element contains the 'backdrop' and a list of subdivided path segments
 
 #version 450
 #extension GL_GOOGLE_include_directive : enable
@@ -17,8 +27,8 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
 // This is really only used for n_elements; maybe we can handle that
 // a different way, but it's convenient to have the same signature as
 // tile allocation.
-layout(set = 0, binding = 1) buffer AllocBuf {
-    uint n_elements;
+layout(set = 0, binding = 1) readonly buffer AllocBuf {
+    uint n_elements; // paths
     uint n_pathseg;
     uint alloc;
 };
@@ -39,6 +49,7 @@ void main() {
     uint element_ix = gl_GlobalInvocationID.x;
     AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
 
+    // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
     if (element_ix < n_elements) {
         uint tag = Annotated_tag(ref);
@@ -67,6 +78,7 @@ void main() {
         sh_row_count[th_ix] = row_count;
     }
     barrier();
+    // Work assignment: 1 thread : 1 path element row
     uint total_rows = sh_row_count[BACKDROP_WG - 1];
     for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
         // Binary search to find element
@@ -80,6 +92,7 @@ void main() {
         uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0);
         uint width = sh_row_width[el_ix];
         // Process one row sequentially
+        // Read backdrop value per tile and prefix sum it
         uint tile_el_ix = sh_row_base[el_ix] + seq_ix * 2 * width;
         uint sum = tile[tile_el_ix];
         for (uint x = 1; x < width; x++) {
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index a633c16..67d05df 100644
Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index d35c2d9..0ddeb7e 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -1,4 +1,8 @@
 // The binning stage of the pipeline.
+//
+// Each workgroup processes N_TILE paths.
+// Each thread processes one path and calculates an N_TILE_X x N_TILE_Y coverage mask
+// based on the path bounding box to bin the paths.
 
 #version 450
 #extension GL_GOOGLE_include_directive : enable
@@ -17,7 +21,7 @@ layout(set = 0, binding = 1) buffer StateBuf {
 };
 
 layout(set = 0, binding = 2) buffer AllocBuf {
-    uint n_elements;
+    uint n_elements; // paths
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
@@ -41,6 +45,7 @@ layout(set = 0, binding = 3) buffer BinsBuf {
 #define INFINITY (1.0 / 0.0)
 
 // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
+// Bitmaps are sliced: each 256-bit bitmap is split into 8 (N_SLICE) 32-bit submaps.
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
 shared uint sh_chunk_start[N_TILE];
@@ -72,17 +77,17 @@ void main() {
     }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
     float my_right_edge = INFINITY;
-    bool crosses_edge = false;
+    // bool crosses_edge = false;
     switch (tag) {
-    case Annotated_FillLine:
-    case Annotated_StrokeLine:
-        AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
-        x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
-        y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
-        x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
-        y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
-        crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY);
-        break;
+    // case Annotated_FillLine:
+    // case Annotated_StrokeLine:
+    //     AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
+    //     x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
+    //     y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
+    //     x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
+    //     y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
+    //     crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY);
+    //     break;
     case Annotated_Fill:
     case Annotated_Stroke:
         // Note: we take advantage of the fact that fills and strokes
@@ -98,6 +103,7 @@ void main() {
         break;
     }
 
+    /*
     // If the last element in this partition is a fill edge, then we need to do a
     // look-forward to find the right edge of its corresponding fill. That data is
     // recorded in aggregates computed in the element processing pass.
@@ -126,6 +132,7 @@ void main() {
     if (crosses_edge) {
         x1 = int(ceil(my_right_edge * SX));
     }
+    */
 
     // At this point, we run an iterator over the coverage area,
     // trying to keep divergence low.
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 524f9e4..0d30dfc 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index eec0bfe..3dcd856 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -1,4 +1,12 @@
 // The coarse rasterizer stage of the pipeline.
+//
+// As input we have the ordered partitions of paths from the binning phase and
+// the annotated tile list of segments and backdrop per path.
+//
+// Each workgroup operates on one bin by stream compacting
+// the elements corresponding to that bin.
+//
+// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be encoded.
 
 #version 450
 #extension GL_GOOGLE_include_directive : enable
@@ -369,7 +377,6 @@ void main() {
                     cmd_fill.rgba_color = fill.rgba_color;
                     Cmd_Fill_write(cmd_ref, cmd_fill);
                 } else {
-                    AnnoFill fill = Annotated_Fill_read(ref);
                     Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
                 }
                 cmd_ref.offset += Cmd_size;
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index ad24e6b..692b6c1 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index a88bc3d..7727b2c 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -59,6 +59,7 @@ void main() {
             }
             break;
         case Cmd_Stroke:
+            // Calculate distance field from all the line segments in this tile.
             CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
             float df[CHUNK];
             for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
@@ -81,6 +82,7 @@ void main() {
             }
             break;
         case Cmd_Fill:
+            // Calculate coverage based on backdrop + coverage of each line segment
             CmdFill fill = Cmd_Fill_read(cmd_ref);
             // Probably better to store as float, but conversion is no doubt cheap.
             float area[CHUNK];
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 8c720a2..7c0a1fc 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -167,7 +167,7 @@ impl<D: Device> Renderer<D> {
         let dev = MemFlags::device_local();
 
         let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
-        println!("scene: {} elements", n_elements);
+        println!("scene: {} elements, {} paths, {} path_segments", n_elements, n_paths, n_pathseg);
 
         let scene_buf = device
             .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)