diff --git a/piet-gpu-types/src/bins.rs b/piet-gpu-types/src/bins.rs new file mode 100644 index 0000000..88f16f1 --- /dev/null +++ b/piet-gpu-types/src/bins.rs @@ -0,0 +1,19 @@ +use piet_gpu_derive::piet_gpu; + +// The output of the binning stage, organized as a linked list of chunks. + +piet_gpu! { + #[gpu_write] + mod bins { + struct BinInstance { + element_ix: u32, + } + + struct BinChunk { + // First chunk can have n = 0, subsequent ones not. + n: u32, + next: Ref<BinChunk>, + // Instances follow + } + } +} diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs index 1759c4d..29ed806 100644 --- a/piet-gpu-types/src/lib.rs +++ b/piet-gpu-types/src/lib.rs @@ -1,4 +1,7 @@ +// Structures used only internally probably don't need to be pub. + pub mod annotated; +pub mod bins; pub mod encoder; pub mod fill_seg; pub mod ptcl; diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs index 68e6487..41ae021 100644 --- a/piet-gpu-types/src/main.rs +++ b/piet-gpu-types/src/main.rs @@ -7,6 +7,7 @@ fn main() { "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()), "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()), "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()), + "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()), "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()), "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()), "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()), diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 82f3491..4a4fed3 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -41,7 +41,7 @@ fn main() -> Result<(), Error> { let fence = device.create_fence(false)?; let mut cmd_buf = device.create_cmd_buf()?; - let query_pool = device.create_query_pool(2)?; + let query_pool = device.create_query_pool(3)?; let mut ctx = PietGpuRenderContext::new(); render_scene(&mut ctx); @@ -58,13 +58,14 @@ fn main() -> 
Result<(), Error> { cmd_buf.finish(); device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?; device.wait_and_reset(&[fence])?; - let timestamps = device.reap_query_pool(&query_pool).unwrap(); - println!("Element kernel time: {:.3}ms", timestamps[0] * 1e3); + let ts = device.reap_query_pool(&query_pool).unwrap(); + println!("Element kernel time: {:.3}ms", ts[0] * 1e3); + println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); /* - let mut data: Vec<u32> = Default::default(); - device.read_buffer(&renderer.state_buf, &mut data).unwrap(); - dump_state(&data); + let mut data: Vec<u32> = Default::default(); + device.read_buffer(&renderer.bin_buf, &mut data).unwrap(); + piet_gpu::dump_k1_data(&data); */ let mut img_data: Vec<u8> = Default::default(); diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp new file mode 100644 index 0000000..bf7bbae --- /dev/null +++ b/piet-gpu/shader/binning.comp @@ -0,0 +1,169 @@ +// The binning stage of the pipeline. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#define N_ROWS 4 +#define WG_SIZE 32 +#define LG_WG_SIZE 5 +#define TILE_SIZE (WG_SIZE * N_ROWS) + +// TODO: move these to setup file +#define N_TILE_X 16 +#define N_TILE_Y 16 +#define N_TILE (N_TILE_X * N_TILE_Y) +#define N_SLICE (N_TILE / 32) +#define N_WG 16 // Number of workgroups, should be 1 per SM + +#define BIN_INITIAL_ALLOC 64 +#define BIN_ALLOC 256 + +layout(local_size_x = N_TILE, local_size_y = 1) in; + +layout(set = 0, binding = 0) buffer AnnotatedBuf { + uint[] annotated; +}; + +layout(set = 0, binding = 1) buffer AllocBuf { + uint n_elements; + // Will be incremented atomically to claim tiles + uint tile_ix; + uint alloc; +}; + +layout(set = 0, binding = 2) buffer BinsBuf { + uint[] bins; +}; + +#include "annotated.h" +#include "bins.h" +#include "setup.h" + +// scale factors useful for converting coordinates to bins +#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX)) +#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX)) + +// 
Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. +shared uint bitmaps[N_SLICE][N_TILE]; +shared uint sh_my_tile; + +void main() { + BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC); + uint chunk_limit = chunk_ref.offset + BIN_INITIAL_ALLOC - BinInstance_size; + uint chunk_n = 0; + BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size); + while (true) { + if (gl_LocalInvocationID.x == 0) { + sh_my_tile = atomicAdd(tile_ix, 1); + } + barrier(); + uint my_tile = sh_my_tile; + if (my_tile * N_TILE >= n_elements) { + break; + } + + for (uint i = 0; i < N_SLICE; i++) { + bitmaps[i][gl_LocalInvocationID.x] = 0; + } + barrier(); + + // Read inputs and determine coverage of bins + uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x; + AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + uint tag = Annotated_tag(ref); + int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + switch (tag) { + case Annotated_Line: + AnnoLineSeg line = Annotated_Line_read(ref); + x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX)); + y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY)); + x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX)); + y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY)); + break; + case Annotated_Fill: + case Annotated_Stroke: + // Note: we take advantage of the fact that fills and strokes + // have compatible layout. + AnnoFill fill = Annotated_Fill_read(ref); + x0 = int(floor(fill.bbox.x * SX)); + y0 = int(floor(fill.bbox.y * SY)); + x1 = int(ceil(fill.bbox.z * SX)); + y1 = int(ceil(fill.bbox.w * SY)); + break; + } + // At this point, we run an iterator over the coverage area, + // trying to keep divergence low. + // Right now, it's just a bbox, but we'll get finer with + // segments. 
+ x0 = clamp(x0, 0, N_TILE_X); + x1 = clamp(x1, x0, N_TILE_X); + y0 = clamp(y0, 0, N_TILE_Y); + y1 = clamp(y1, y0, N_TILE_Y); + if (x0 == x1) y1 = y0; + int x = x0, y = y0; + uint my_slice = gl_LocalInvocationID.x / 32; + uint my_mask = 1 << (gl_LocalInvocationID.x & 31); + while (y < y1) { + atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask); + x++; + if (x == x1) { + x = x0; + y++; + } + } + + barrier(); + // Allocate output segments. + uint element_count = 0; + for (uint i = 0; i < N_SLICE; i++) { + element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); + } + // element_count is number of elements covering bin for this invocation. + if (element_count > 0 && chunk_n > 0) { + uint new_chunk = instance_ref.offset; + if (new_chunk + min(32, element_count * 4) > chunk_limit) { + new_chunk = atomicAdd(alloc, BIN_ALLOC); + chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size; + } + BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk))); + chunk_ref = BinChunkRef(new_chunk); + instance_ref = BinInstanceRef(new_chunk + BinChunk_size); + chunk_n = 0; + } + // TODO: allocate output here + + // Iterate over bits set. + uint slice_ix = 0; + uint bitmap = bitmaps[0][gl_LocalInvocationID.x]; + while (true) { + if (bitmap == 0) { + slice_ix++; + if (slice_ix == N_SLICE) { + break; + } + bitmap = bitmaps[slice_ix][gl_LocalInvocationID.x]; + if (bitmap == 0) { + continue; + } + } + element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap); + // At this point, element_ix refers to an element that covers this bin. 
+ + // TODO: batch allocated based on element_count; this is divergent + if (instance_ref.offset > chunk_limit) { + uint new_chunk = atomicAdd(alloc, BIN_ALLOC); + BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk))); + chunk_ref = BinChunkRef(new_chunk); + instance_ref = BinInstanceRef(new_chunk + BinChunk_size); + chunk_n = 0; + chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size; + } + BinInstance_write(instance_ref, BinInstance(element_ix)); + chunk_n++; + instance_ref.offset += BinInstance_size; + // clear LSB + bitmap &= bitmap - 1; + } + } + BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0))); +} diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv new file mode 100644 index 0000000..4cc5d36 Binary files /dev/null and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h new file mode 100644 index 0000000..3ce06e0 --- /dev/null +++ b/piet-gpu/shader/bins.h @@ -0,0 +1,60 @@ +// Code auto-generated by piet-gpu-derive + +struct BinInstanceRef { + uint offset; +}; + +struct BinChunkRef { + uint offset; +}; + +struct BinInstance { + uint element_ix; +}; + +#define BinInstance_size 4 + +BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { + return BinInstanceRef(ref.offset + index * BinInstance_size); +} + +struct BinChunk { + uint n; + BinChunkRef next; +}; + +#define BinChunk_size 8 + +BinChunkRef BinChunk_index(BinChunkRef ref, uint index) { + return BinChunkRef(ref.offset + index * BinChunk_size); +} + +BinInstance BinInstance_read(BinInstanceRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = bins[ix + 0]; + BinInstance s; + s.element_ix = raw0; + return s; +} + +void BinInstance_write(BinInstanceRef ref, BinInstance s) { + uint ix = ref.offset >> 2; + bins[ix + 0] = s.element_ix; +} + +BinChunk BinChunk_read(BinChunkRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = bins[ix + 0]; + uint raw1 = bins[ix + 1]; + BinChunk s; + s.n = raw0; + s.next = 
BinChunkRef(raw1); + return s; +} + +void BinChunk_write(BinChunkRef ref, BinChunk s) { + uint ix = ref.offset >> 2; + bins[ix + 0] = s.n; + bins[ix + 1] = s.next.offset; +} + diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index b429a71..4628fd2 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -20,4 +20,6 @@ build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h -build elements.spv: glsl elements.comp | scene.h state.h +build elements.spv: glsl elements.comp | scene.h state.h annotated.h + +build binning.spv: glsl binning.comp | annotated.h bins.h setup.h diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp index 1061fab..c31dd2e 100644 --- a/piet-gpu/shader/elements.comp +++ b/piet-gpu/shader/elements.comp @@ -1,3 +1,9 @@ +// The element processing stage, first in the pipeline. +// +// This stage is primarily about applying transforms and computing bounding +// boxes. It is organized as a scan over the input elements, producing +// annotated output elements.
+ #version 450 #extension GL_GOOGLE_include_directive : enable @@ -208,6 +214,13 @@ void main() { anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z); Annotated_Stroke_write(out_ref, anno_stroke); break; + case Element_Fill: + Fill fill = Element_Fill_read(this_ref); + AnnoFill anno_fill; + anno_fill.rgba_color = fill.rgba_color; + anno_fill.bbox = st.bbox; + Annotated_Fill_write(out_ref, anno_fill); + break; default: Annotated_Nop_write(out_ref); break; diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv index 41d9bc1..afb63b5 100644 Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 0753054..437a31a 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -32,6 +32,8 @@ const K2_PER_TILE_SIZE: usize = 8; const N_CIRCLES: usize = 1; +const N_WG: u32 = 16; + pub fn render_scene(rc: &mut impl RenderContext) { let mut rng = rand::thread_rng(); for _ in 0..N_CIRCLES { @@ -98,10 +100,10 @@ fn dump_scene(buf: &[u8]) { } #[allow(unused)] -fn dump_k1_data(k1_buf: &[u32]) { +pub fn dump_k1_data(k1_buf: &[u32]) { for i in 0..k1_buf.len() { if k1_buf[i] != 0 { - println!("{:4x}: {:8x}", i, k1_buf[i]); + println!("{:4x}: {:8x}", i * 4, k1_buf[i]); } } } @@ -114,10 +116,17 @@ pub struct Renderer { pub state_buf: D::Buffer, pub anno_buf: D::Buffer, + pub bin_buf: D::Buffer, el_pipeline: D::Pipeline, el_ds: D::DescriptorSet, + bin_pipeline: D::Pipeline, + bin_ds: D::DescriptorSet, + + bin_alloc_buf_host: D::Buffer, + bin_alloc_buf_dev: D::Buffer, + /* k1_alloc_buf_host: D::Buffer, k1_alloc_buf_dev: D::Buffer, @@ -149,6 +158,9 @@ impl Renderer { let host = MemFlags::host_coherent(); let dev = MemFlags::device_local(); + let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); + println!("scene: {} elements", n_elements); + let scene_buf = device .create_buffer(std::mem::size_of_val(&scene[..]) as u64, 
host) .unwrap(); @@ -159,6 +171,7 @@ impl Renderer { let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?; + let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; let el_code = include_bytes!("../shader/elements.spv"); @@ -169,8 +182,25 @@ impl Renderer { &[], )?; - let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); - println!("scene: {} elements", n_elements); + let bin_alloc_buf_host = device.create_buffer(12, host)?; + let bin_alloc_buf_dev = device.create_buffer(12, dev)?; + + // TODO: constants + let bin_alloc_start = 256 * 64 * N_WG; + device + .write_buffer(&bin_alloc_buf_host, &[ + n_elements as u32, + 0, + bin_alloc_start, + ]) + ?; + let bin_code = include_bytes!("../shader/binning.spv"); + let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?; + let bin_ds = device.create_descriptor_set( + &bin_pipeline, + &[&anno_buf, &bin_alloc_buf_dev, &bin_buf], + &[], + )?; /* let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?; @@ -253,14 +283,20 @@ impl Renderer { image_dev, el_pipeline, el_ds, + bin_pipeline, + bin_ds, state_buf, anno_buf, + bin_buf, + bin_alloc_buf_host, + bin_alloc_buf_dev, n_elements, }) } pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf, query_pool: &D::QueryPool) { cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev); + cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev); cmd_buf.memory_barrier(); cmd_buf.image_barrier( &self.image_dev, @@ -276,6 +312,13 @@ impl Renderer { ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &self.bin_pipeline, + &self.bin_ds, + (N_WG, 1, 1), + ); + cmd_buf.write_timestamp(&query_pool, 2); + cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } }