Merge branch 'master' into dx12

This commit is contained in:
Raph Levien 2021-05-16 10:19:06 -07:00
commit 619fc8d4eb
52 changed files with 3189 additions and 2263 deletions

Cargo.lock generated

@ -610,7 +610,19 @@ checksum = "5eb167c1febed0a496639034d0c76b3b74263636045db5489eee52143c246e73"
dependencies = [
"jni-sys",
"ndk-sys",
"num_enum",
"num_enum 0.4.3",
"thiserror",
]
[[package]]
name = "ndk"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8794322172319b972f528bf90c6b467be0079f1fa82780ffb431088e741a73ab"
dependencies = [
"jni-sys",
"ndk-sys",
"num_enum 0.5.1",
"thiserror",
]
@ -623,7 +635,21 @@ dependencies = [
"lazy_static",
"libc",
"log",
"ndk",
"ndk 0.2.1",
"ndk-macro",
"ndk-sys",
]
[[package]]
name = "ndk-glue"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5caf0c24d51ac1c905c27d4eda4fa0635bbe0de596b8f79235e0b17a4d29385"
dependencies = [
"lazy_static",
"libc",
"log",
"ndk 0.3.0",
"ndk-macro",
"ndk-sys",
]
@ -687,7 +713,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca565a7df06f3d4b485494f25ba05da1435950f4dc263440eda7a6fa9b8e36e4"
dependencies = [
"derivative",
"num_enum_derive",
"num_enum_derive 0.4.3",
]
[[package]]
name = "num_enum"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "226b45a5c2ac4dd696ed30fa6b94b057ad909c7b7fc2e0d0808192bced894066"
dependencies = [
"derivative",
"num_enum_derive 0.5.1",
]
[[package]]
@ -702,6 +738,18 @@ dependencies = [
"syn",
]
[[package]]
name = "num_enum_derive"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c0fd9eba1d5db0994a239e09c1be402d35622277e35468ba891aa5e3188ce7e"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "objc"
version = "0.2.7"
@ -773,11 +821,15 @@ name = "piet-gpu"
version = "0.1.0"
dependencies = [
"clap",
"ndk 0.3.0",
"ndk-glue 0.3.0",
"ndk-sys",
"piet",
"piet-gpu-hal",
"piet-gpu-types",
"png",
"rand",
"raw-window-handle",
"roxmltree",
"winit",
]
@ -797,7 +849,6 @@ version = "0.1.0"
dependencies = [
"ash",
"ash-window",
"once_cell",
"raw-window-handle",
"winapi 0.3.9",
"wio",
@ -1301,8 +1352,8 @@ dependencies = [
"log",
"mio",
"mio-extras",
"ndk",
"ndk-glue",
"ndk 0.2.1",
"ndk-glue 0.2.1",
"ndk-sys",
"objc",
"parking_lot",


@ -12,21 +12,15 @@ The main goal is to answer research questions about the future of 2D rendering:
* To what extent do "advanced" GPU features (subgroups, descriptor arrays) help?
* Can we improve quality and extend the imaging model in useful ways?
Another goal is to explore a standards-based, portable approach to GPU compute.
## Non-goals
## Blogs and other writing
There are a great number of concerns that need to be addressed in production:
Much of the research progress on piet-gpu is documented in blog entries. See [doc/blogs.md](doc/blogs.md) for pointers to those.
* Compatibility with older graphics hardware (including runtime detection)
* Asynchrony
* Swapchains and presentation
## Notes
A more detailed explanation will come. But for now, a few notes. Also refer to [Fast 2D rendering on GPU] and linked blog posts for more information.
There is a much larger and detailed [vision](doc/vision.md) that explains the longer-term goals of the project, and how we might get there.
### Why not gfx-hal?
@ -36,7 +30,7 @@ The hal layer in this repo is strongly inspired by gfx-hal, but with some differ
### Why not wgpu?
The case for wgpu is also strong, but it's even less mature. I'd love to see it become a solid foundation, at which point I'd use it as the main integration with [druid].
The case for wgpu is also strong, but it's even less mature. I'd love to see it become a solid foundation, at which point I'd use it as the main integration with [Druid].
In short, the goal is to facilitate the research now, collect the data, and then use that to choose a best path for shipping later.
@ -56,7 +50,6 @@ Contributions are welcome by pull request. The [Rust code of conduct] applies.
[SPIRV-Cross]: https://github.com/KhronosGroup/SPIRV-Cross
[Shader Model 6]: https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
[DXC]: https://github.com/microsoft/DirectXShaderCompiler
[druid]: https://github.com/xi-editor/druid
[Druid]: https://github.com/xi-editor/druid
[Unlicense]: https://unlicense.org/
[Rust code of conduct]: https://www.rust-lang.org/policies/code-of-conduct
[Fast 2D rendering on GPU]: https://raphlinus.github.io/rust/graphics/gpu/2020/06/13/fast-2d-rendering.html

doc/blogs.md Normal file

@ -0,0 +1,14 @@
# Blogs and writing
Much of the research progress on piet-gpu is documented in blog entries. Here are the most relevant:
* [Fast 2D rendering on GPU](https://raphlinus.github.io/rust/graphics/gpu/2020/06/13/fast-2d-rendering.html), Jun 13, 2020
* [A sort-middle architecture for 2D graphics](https://raphlinus.github.io/rust/graphics/gpu/2020/06/12/sort-middle.html), Jun 12, 2020
* [piet-gpu progress report](https://raphlinus.github.io/rust/graphics/gpu/2020/06/01/piet-gpu-progress.html), Jun 1, 2020
* [2D Graphics on Modern GPU](https://raphlinus.github.io/rust/graphics/gpu/2019/05/08/modern-2d.html), May 8, 2019
There are also some posts on GPU compute programming more generally that might be of interest:
* [The stack monoid](https://raphlinus.github.io/gpu/2020/09/05/stack-monoid.html), Sep 5, 2020
* [Prefix sum on Vulkan](https://raphlinus.github.io/gpu/2020/04/30/prefix-sum.html), Apr 30, 2020
* [GPU resources](https://raphlinus.github.io/gpu/2020/02/12/gpu-resources.html), Feb 12, 2020

doc/vision.md Normal file

@ -0,0 +1,183 @@
## The piet-gpu vision
Raph Levien, 2020-12-10
I've done several [blog posts](./blogs.md) about piet-gpu already, and more generally GPU compute, but this document is a little different in scope. Rather than showing off a prototype and presenting a research result, it will set forth a bold and ambitious plan for where this might go. I find this vision compelling, and it's motivated me to spend a lot of energy mastering some difficult material. The grand vision is much more than one person can do, so I'll do some of it myself and maybe inspire collaboration for the rest of it.
The full vision for piet-gpu is a 2D rendering engine that is considerably faster, higher quality, and more flexible than the current state of the art, and runs on a wide variety of hardware. I'll go into some detail about why I think this goal is possible and what kind of work is needed to get there.
The current state of the piet-gpu codebase is an early stage prototype, largely to test whether the ideas are viable and to gather empirical performance data on some of the more intensive parts of the rendering problem, so far mainly antialiased vector filling and stroking.
## Compute-based 2D rendering
The central theme of piet-gpu is to do most or all of the rendering steps in compute shaders. This is quite a different philosophy to the traditional rasterization-based approach to 2D rendering, which breaks the scene (on the CPU side) into a series of draw calls, which are then sent to the GPU. This works extremely well when the mapping to draw calls is simple (which is the case for imgui-style UI made up of text and simple graphic elements), but otherwise much less so. In using GPU compute extensively, piet-gpu draws much inspiration from [Spinel].
Using compute shaders has profound effects at two particular stages in the pipeline. First, in early stages, it lets the GPU ingest a scene description that is, as much as possible, a straightforward binary encoding of the scene. That, in turn, makes the CPU-side part of the job simple and efficient, allowing higher frame rates on complex scenes without jank.
Second, in the last stage (“fine rasterization”), compositing takes place within the compute shader, using vector registers rather than texture buffers in global memory for intermediate RGBA values.
Note that the benefits depend on the scene. For a static (or mostly static) scene, the CPU-side encoding cost might not matter much because it can be done ahead of time. Similarly, if the scene doesn't require sophisticated compositing, but is just a series of alpha-blended draws, existing rasterization pipelines can handle those very efficiently. But piet-gpu should fly with dynamic scenes with lots of masking and blending, where existing 2D engines would struggle.
The intermediate stages benefit too. The coarse rasterization step can employ sophisticated logic to enable optimizations on a per-tile granularity that would otherwise rely on brute force.
## Retained scene graph fragments
Applications vary in their degree of dynamism. At one extreme, the scene is mostly static, with perhaps a few variable elements and perhaps some animation done at compositing time (I think of this as the iPhone style of UI, as it's so well adapted to mechanisms like Core Animation). At the other extreme, every rendered frame is completely different from the one before, so encoding needs to be done entirely from scratch every time; these applications are well adapted to an “immediate mode” approach.
I'm most interested in cases in the middle. I believe the best approach is to split the encoding process so that the static parts of the scene graph can be encoded once into a retained scene graph fragment, then these fragments can be stitched together, along with the dynamically encoded parts of the scene, with a minimum of CPU effort.
Much of the piet-gpu architecture is geared towards supporting this goal. Notably, the global affine transformation is not baked into the encoding of vector paths, so the same binary encoding of a vector path can be instanced (perhaps multiple times within a scene) with different transforms. Applying the transform is done GPU-side, early in the [pipeline][sort-middle architecture]. Thus, animating the transform should be very efficient, and the vector paths will be re-rendered at full resolution with vector crispness.
Even so, fully realizing retained scene graph fragments will be one of the more difficult parts of the vision. It requires a good API to represent retained fragments, as well as incrementally update parameters such as transformation and opacity. It also requires a sophisticated approach to resource management so that resources backing the retained fragments can be efficiently cached GPU-side without hogging relatively scarce GPU memory. As such, I will focus on immediate mode first, as that is also an important case. But make no mistake, the goal of retaining scene fragments is motivating a number of design decisions, in particular leading me away from shortcuts such as applying affine transforms CPU-side during encoding.
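To make the retained-fragment idea concrete, here is a minimal Rust sketch of instancing an already-encoded fragment under different transforms. The names (`SceneFragment`, `SceneBuilder`, `Transform`) and the byte layout are hypothetical illustrations, not the current piet-gpu API; the point is only that the per-instance CPU work is writing a small transform header, while the path data is reused untouched and the transform is applied GPU-side early in the pipeline.

```rust
/// A 2x3 affine transform, row-major: [a, b, c, d, tx, ty]. Hypothetical type.
#[derive(Clone, Copy)]
struct Transform {
    mat: [f32; 6],
}

/// A retained, already-encoded piece of the scene; paths are stored with no
/// transform baked in, so the same bytes can be instanced many times.
struct SceneFragment {
    encoded: Vec<u8>,
}

struct SceneBuilder {
    buf: Vec<u8>,
}

impl SceneBuilder {
    /// Instance a retained fragment under a transform. Only the small
    /// per-instance header is freshly written; the fragment's bytes are
    /// reused as-is (copied here for simplicity, referenced in a fuller
    /// design with GPU-resident fragments), and the transform is applied
    /// in an early compute stage rather than during encoding.
    fn append_fragment(&mut self, frag: &SceneFragment, transform: Transform) {
        for x in transform.mat.iter() {
            self.buf.extend_from_slice(&x.to_le_bytes());
        }
        self.buf.extend_from_slice(&frag.encoded);
    }
}
```

Animating a fragment's transform then costs a handful of bytes per frame on the CPU, while the full-resolution re-render happens on the GPU.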
## Portable compute runtime
One challenge facing piet-gpu is the lack of adequate infrastructure for portable GPU compute. Most research is done on CUDA, as that is the only truly viable platform for GPU compute today, but that would make it essentially impossible to deploy the work on any hardware other than Nvidia.
I strongly believe that Vulkan is emerging as a viable low-level platform for utilizing GPU compute resources. I'm also not the only one thinking along these lines. The [VkFFT] project is an impressive demonstration that a Vulkan deployment of one math-intensive algorithm can be just as performant as the CUDA version. In addition, there are early steps toward running machine learning workloads on Vulkan, particularly TensorFlow Lite.
Of course, while it's possible to run Vulkan on a pretty wide range of hardware, it doesn't solve all portability problems. “Runs Vulkan” is not a binary, but rather a portal to a vast matrix of optional features and limits from the various combinations of hardware, drivers, and compatibility shims ([vulkan.gpuinfo.org] is an excellent resource). In particular, Apple forces the use of Metal. In theory, MoltenVK — or, more generally, the [Vulkan Portability Extension] — lets you run Vulkan code on Apple hardware, but in practice it doesn't quite work (see [#42]), and there are compatibility and integration advantages to DX12 over Vulkan on Windows; older CPU generations such as Haswell and Broadwell don't support Vulkan at all. To this end, I've started a portability layer (piet-gpu-hal) which should be able to run natively on these other APIs.
### Why not wgpu?
The compatibility layer has goals that overlap with [wgpu], and WebGPU more broadly. Why not just use that, as much of the Rust ecosystem has done?
It's *very* tempting, but there is also some divergence of goals. The main one is that to keep the piet-gpu runtime light and startup time quick, I really want to do ahead-of-time compilation of shaders, so that the binary embeds intermediate representation for the target platform (DXIL for Windows 10, etc.). Further, by using Vulkan directly, we can experiment with advanced features such as subgroups, the memory model, etc., which are not yet well supported in wgpu, though it certainly would be possible to add these features. I don't know how much these advanced features contribute, but that's one of the research questions to be addressed. If the gain is modest, then implementing them is a low priority. If the gain is significant, then that should increase motivation for runtimes such as wgpu to include them.
Also see the section on incremental present, below, which is another feature that is not yet well supported in wgpu, so working with lower level APIs should reduce the friction.
At the same time, wgpu continues to improve, including focus on making the runtime leaner (using the new [naga] shader compilation engine rather than spirv-cross is one such advance). My sense is this: a primary reason for piet-gpu to have its own compatibility layer is so that we can really clarify and sharpen the requirements for a more general GPU compute runtime.
### Compatibility fallback
One challenge of a compute-centric approach is that there is not (yet) an ironclad guarantee that the GPU and drivers will actually be able to handle the compute shaders and resource management patterns (the latter may actually be more of a challenge, as piet-gpu relies on [descriptor indexing] to address multiple images during fine rasterization).
There are a number of approaches to this problem, including building hybrid pipelines and otherwise doing lots of compatibility engineering, to target platforms well on their way to becoming obsolete. But I worry quite a bit about the complexity burden, as well as pressure away from the absolute best solution to a problem if it poses compatibility challenges.
I'm more inclined to fall back to CPU rendering. Projects such as [Blend2D] show that CPU rendering can be performant, though nowhere near as performant as a GPU. Of course, that means coming up with CPU implementations of the algorithms.
One intriguing possibility is to automatically translate the Vulkan compute shaders to CPU runnable code. This approach has the advantage of maintaining one codebase for the pipeline, reducing friction for adding new features, and guaranteeing pixel-perfect consistency. The biggest question is whether such an approach would be adequately performant. A very good way to get preliminary answers is to use [SwiftShader] or Mesa's [Lavapipe], which do JIT generation of CPU side code. Obviously, for reasons of startup time and binary size it would be better to ship ahead-of-time translated shaders, but that's a practical rather than conceptual problem.
There are examples of compile time translation of shaders to CPU code. An intriguing possibility is the [spirv to ispc translator], which doesn't seem to be actively developed, but would seem to be a path to reasonably good CPU performance from shaders. Another, actually used in production in WebRender, is [glsl-to-cxx].
A truly universal compute infrastructure with unified shader source would have implications far beyond 2D rendering. The domain most likely to invest in this area is AI (deployment to consumer hardware; for server side and in-house deployment, they'll obviously just use CUDA and neural accelerators). I'll also note that this problem is ostensibly within scope of OpenCL, but they have so far failed to deliver, largely because they've historically been entirely dependent on driver support from the GPU manufacturer. I expect *something* to happen.
There is another perfectly viable path this could take, less dependent on shader compilation infrastructure: a software renderer developed in parallel with the GPU one. Possible existing Rust code bases to draw on include [raqote] and [tiny-skia]. These make more sense as community sub-projects (see below).
## Text
An essential part of any 2D library is text rendering. This really breaks down into text layout and painting of glyphs. Both are important to get right.
The Piet of today is primarily an abstraction layer over platform 2D graphics libraries, and that's equally true of text. We've lately made some really good progress in a common [rich text API] and implementations over DirectWrite and Core Text. However, it is currently lacking a Linux backend. (As a placeholder, we use the Cairo “toy text API,” but that is unsatisfying for a number of reasons.)
I think we want to move away from abstracting over platform capabilities, for several reasons. One is that it's harder to ensure consistent results. Another is that it's hard to add new features, such as hz-style justification (see below). Thus, we follow a similar trajectory to Web browsers.
As a project related to piet-gpu, I'd love to build (or mentor someone to build) a text layout engine, in Rust, suitable for most UI work. This wouldn't be my first time; I wrote the original version of [Minikin], the text layout engine first shipped in Android Lollipop.
### Painting
Ultimately, I'd like piet-gpu to support 3 sources of glyph data for painting.
The first is bitmaps produced by the platform. These have the advantage of matching native UI, and also take maximum advantage of hinting and subpixel RGB rendering, thus improving contrast and clarity. These bitmaps would be rendered mostly CPU-side, and uploaded into a texture atlas. The actual rasterization is just texture lookups, and should be super efficient.
The second is dynamic vector rendering from glyph outlines. This source is best optimized for large text, animation (including supporting pinch-to-zoom style gestures), and possible extension into 3D, including VR and AR. The lack of hinting and RGB subpixel rendering is not a serious issue on high-dpi screens, and is not an expectation on mobile. Early measurements from piet-gpu suggest that it should be possible to maintain 60fps for text-heavy scenes on most GPUs, but power usage might not be ideal.
Thus, the third source is vector rendering through a glyph cache, something of a hybrid of the first two sources. Initially, the cache will be managed CPU-side during encoding (likely using [Guillotière], [Étagère], or something similar), but in the future we might explore GPU-side algorithms to manage the cache in parallel, reducing CPU requirements further.
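As a rough illustration of that third source, here is a sketch of the CPU-side cache lookup at encode time. The types and the `allocate`/`rasterize` callbacks are hypothetical stand-ins, not piet-gpu code: in practice the allocator would be something like Étagère's shelf packer, and rasterization could be a CPU rasterizer or the GPU vector path itself.

```rust
use std::collections::HashMap;

/// Key identifying a rasterized glyph; a real key would also include
/// variation coordinates and subpixel offset.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct GlyphKey {
    font_id: u32,
    glyph_id: u16,
    /// Pixels-per-em in 1/16 units, so nearby sizes don't collide.
    ppem_q4: u16,
}

/// Location of a cached glyph in the atlas texture.
#[derive(Clone, Copy)]
struct AtlasSlot {
    x: u16,
    y: u16,
    w: u16,
    h: u16,
}

struct GlyphCache {
    map: HashMap<GlyphKey, AtlasSlot>,
    /// Pending uploads to the atlas texture this frame.
    uploads: Vec<(AtlasSlot, Vec<u8>)>,
}

impl GlyphCache {
    /// At encode time: reuse the cached slot, or rasterize the outline and
    /// schedule an upload into the atlas.
    fn get_or_insert(
        &mut self,
        key: GlyphKey,
        allocate: impl FnOnce(u16, u16) -> AtlasSlot,
        rasterize: impl FnOnce() -> (u16, u16, Vec<u8>),
    ) -> AtlasSlot {
        if let Some(&slot) = self.map.get(&key) {
            return slot;
        }
        let (w, h, pixels) = rasterize();
        let slot = allocate(w, h);
        self.map.insert(key, slot);
        self.uploads.push((slot, pixels));
        slot
    }
}
```

The hit path is a hash lookup plus writing a textured-quad reference into the scene, so hot text should cost very little at encode time.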
### GPU-side variable fonts
A very intriguing possibility is to offload most of the work of rendering variable fonts to GPU. There are reasons to believe this would work well: [variable font technology] is fundamentally based on multiplying vectors of “deltas” with basis functions and adding those up, a task ideally suited to GPU.
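Concretely, the interpolation step alluded to above is, per point, just a weighted sum of deltas. The sketch below follows the OpenType gvar model in spirit (ignoring details such as intermediate regions and interpolation of untouched points); the types are illustrative rather than a real font-parsing API.

```rust
/// A 2D point in font units.
#[derive(Clone, Copy)]
struct Point {
    x: f32,
    y: f32,
}

/// One variation tuple: a per-point delta set plus the scalar weight
/// computed from the normalized axis coordinates.
struct Tuple {
    scalar: f32,
    deltas: Vec<Point>, // same length as the glyph's point list
}

/// instance[i] = default[i] + sum over tuples of (scalar * delta[i]).
/// Each point is independent, which is what makes this a good fit for a
/// compute shader: one lane per point, a tight loop over tuples.
fn apply_variations(default_points: &[Point], tuples: &[Tuple]) -> Vec<Point> {
    default_points
        .iter()
        .enumerate()
        .map(|(i, p)| {
            let mut out = *p;
            for t in tuples {
                out.x += t.scalar * t.deltas[i].x;
                out.y += t.scalar * t.deltas[i].y;
            }
            out
        })
        .collect()
}
```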
A challenge is representing the coordinate data and deltas in a GPU-friendly format; the [glyf] and [gvar] table formats are designed for compact data representation and (reasonably) simple decoding by scalar CPUs, but are challenging for massively parallel algorithms. Decoding to fixed-size numbers is straightforward but might use a lot of GPU memory and bandwidth to represent the font data (especially a problem for CJK fonts). One intriguing approach is to re-encode the underlying data using a self-synchronizing variable integer encoding, which would reduce the memory requirements but preserve the ability to do processing in parallel.
The major advantages of GPU-side variable font rendering are to allow efficient animation of variable font axes, and also to open up the possibility of adjusting the axes to improve text layout, for example to improve the quality of paragraph justification as pioneered by the [hz] prototype and recently demonstrated with [amstelvar], or to support calligraphic styles and complex scripts better, for example to make more beautiful [kashida] for Arabic, all without significantly reducing performance.
## Improving rendering quality
The question of quality in GPU 2D rendering has long been complex. Many rasterization based approaches are dependent on [MSAA] in the GPU's fixed-function pipeline, which may not always be available or perhaps only practical at lower settings (especially on mobile). Thus, GPU accelerated 2D rendering quality has gotten something of a bad name.
A compute-centric approach changes the story. All actual pixels are generated by code; the quality of the rendering is entirely up to the author of that code. The current piet-gpu codebase uses an exact-area approach to antialiasing (in the [tradition of libart]), and thus does not exhibit stepping or graininess characteristic of MSAA at low or medium settings. The quality should be the same as a good software renderer, because it *is* a software renderer, just one that happens to be running on hardware with orders of magnitude more parallelism than any reasonable CPU.
Even so, I believe it's possible to do even better. A CPU-bound renderer has barely enough performance to get pixels to the screen, so takes whatever shortcuts are needed to get the job done in that performance budget. A GPU typically has an order of magnitude more raw compute bandwidth, so there is headroom that can be used to improve quality.
The details of what I have in mind could be a blog post in and of itself, but I'll sketch out the highlights.
Perhaps the most important quality problem is that of so-called “conflation artifacts,” the seams that happen when compositing antialiased elements (see [#49]). Most of the academic literature on 2D rendering on GPU addresses this question. I think it's practical to do in the piet-gpu architecture, basically by swapping out soft-alpha compositing in the fine rasterizer with one based on supersampling. Some of the academic literature also takes the opportunity at that stage in the pipeline to apply a reconstruction filter more sophisticated than a box filter, but I am not yet convinced that the improvement is worth it, especially as physical display resolution increases.
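To see why the seams appear, consider two opaque shapes that share an edge and together exactly cover a pixel, so each has 50% coverage there. With coverage-as-alpha compositing, drawing the second over the first leaves total opacity 0.5 + 0.5 × (1 − 0.5) = 0.75, so 25% of the background shows through along an edge that is geometrically fully covered. A supersampling compositor resolves visibility per sample before filtering, so no such seam can appear.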
The next major area of potential quality improvement is getting gamma right. This is a surprisingly tricky area, as a theoretically “correct” approach to gamma often yields text and hairline strokes that appear weak and spindly. Another concern is document compatibility; simply changing the gamma of the colorspace in which alpha blending happens will change the color of the result. Likely, a perfect solution to this problem will require cooperation with the application driving the renderer; if it is designed with gamma-perfect rendering in mind, there is no real problem, but otherwise it's likely that various heuristics will need to be applied to get good results. (Note that [stem darkening] is one approach used specifically for text rendering, and among other things is a source of considerable variation between platforms.)
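A quick worked example of the “weak and spindly” effect: for black text on white, a pixel with 50% coverage blended in linear light ends up at linear 0.5, which encodes to roughly 188/255 in sRGB, a fairly light gray; blending naively in sRGB space gives 128/255, a noticeably darker mid-gray. Gamma-correct blending therefore lightens the antialiased fringes of dark strokes, which reads as thin text, and is part of why heuristics like stem darkening exist.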
When driving low-dpi displays (which still exist), one opportunity to improve quality is more sophisticated RGB [subpixel rendering]. Currently, that's basically text-only, but could be applied to vector rendering as well, and often doesn't survive sophisticated compositing, as an RGBA texture with a transparent background cannot represent RGB subpixel text. One solution is to do compositing with per-channel alpha, which can be done very efficiently when compositing in a compute shader, but would be a serious performance problem if intermediate texture buffers needed to be written out to global memory.
These potential quality improvements may well provide the answer to the question, “why move to a new rendering architecture instead of incrementally improving what we've got now?”
## Enriching the imaging model
There is consensus on “the modern 2D imaging model,” roughly encompassing PDF, SVG, HTML Canvas, and Direct2D, but it is not set in stone, and there is considerable variation in advanced features within those systems (for example, gradient meshes are more or less unique to PDF — the feature was proposed for SVG 2 but [then removed](http://libregraphicsworld.org/blog/entry/gradient-meshes-and-hatching-to-be-removed-from-svg-2-0)).
I like this consensus 2D imaging model because I feel it is extremely well suited for UI and documents of considerable richness and complexity, and is quite designer-friendly. There is also tension pulling away from it, I think for two reasons. One is that it is not always implemented efficiently on GPU, especially with deeply nested soft clipping and other nontrivial compositing requirements. The other is that it's possible to do things on GPU (especially using custom shaders) that are not easily possible with the standard 2D API. Shadertoy shows *many* things that are possible in shaders. One idea I'd like to explore is watercolor brush strokes (see [Computer-Generated Watercolor](https://grail.cs.washington.edu/projects/watercolor/paper_small.pdf) for inspiration). I think it would be possible to get pretty far with distance fields and procedural noise, and a simple function to go from those to paint values for paint-like compositing.
Another direction the imaging model should go is support for [HDR] (strong overlap with the gamma issue above). This will require color transformations for tone mapping in the compositing pipeline, which again can be written as shaders.
One interesting existing 2D engine with extension points is Direct2D, which lets users provide [Custom effects](https://docs.microsoft.com/en-us/windows/win32/direct2d/custom-effects) by linking in compute shaders. Of course, it is a major challenge to make such a thing portable, but I'm encouraged about building on existing GPU infrastructure efforts. In particular, over time, I think WebGPU could become a standard way to provide such an extension point portably.
Blurs are a specific case that should probably be done early, as they're very widely used in UI. In the general case, it will require allocating temporary buffers for the contents being blurred, which is not exactly in the spirit of piet-gpu compositing, largely because it requires a lot of resource management and pipeline building CPU-side, but is possible. I've already done research on a special case, a [blurred rounded rectangle], which can be computed extremely efficiently as a fairly simple shader. The encoder would apply a peephole-like optimization during encoding time, pattern matching the blurred contents and swapping in the more efficient shader when possible.
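A sketch of that peephole, with hypothetical names (`DrawItem`, `encode_blur_rrect`, `encode_general_blur`) rather than the actual piet-gpu encoding: the encoder pattern-matches a blur whose content is a single rounded rectangle and lowers it to the closed-form shader, falling back to the general temporary-buffer path otherwise.

```rust
/// Illustrative scene items, not the actual piet-gpu encoding.
enum DrawItem {
    RoundedRect { x: f32, y: f32, w: f32, h: f32, radius: f32, color: u32 },
    Blur { std_dev: f32, content: Vec<DrawItem> },
}

fn encode_blur(buf: &mut Vec<u8>, std_dev: f32, content: &[DrawItem]) {
    // Peephole: a blur of a single rounded rect has a cheap closed form,
    // so swap in the specialized shader instead of rendering the content
    // to a temporary buffer and blurring that.
    if let [DrawItem::RoundedRect { x, y, w, h, radius, color }] = content {
        encode_blur_rrect(buf, *x, *y, *w, *h, *radius, std_dev, *color);
    } else {
        encode_general_blur(buf, std_dev, content);
    }
}

// Stubs standing in for the two encoding paths.
fn encode_blur_rrect(_b: &mut Vec<u8>, _x: f32, _y: f32, _w: f32, _h: f32, _r: f32, _sd: f32, _c: u32) {}
fn encode_general_blur(_b: &mut Vec<u8>, _sd: f32, _items: &[DrawItem]) {}
```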
## Incremental present
In the old days, UI tracked “dirty rectangles,” and only redrew what actually changed, as computers just weren't fast enough to redraw the entire screen contents in a single refresh period. Games, on the other hand, need to redraw every pixel every frame, so the GPU pipeline became optimized for those, and many rendering engines got more relaxed about avoiding redrawing, as the GPU was plenty fast for that.
Today, the GPU is still plenty fast, but there are still gains to be had from incremental present, primarily power consumption. Blinking a cursor in a text editor should not run the battery down. Also, on low resource devices, incremental present can reduce latency and increase the chance of smooth running without dropped frames.
The tile-based architecture of piet-gpu is extremely well suited to incremental present, as the various pipeline stages are optimized to only do work within the viewport (render region). This is especially true for fine rasterization, which doesn't touch any work outside that region.
A small challenge is support by the GPU infrastructure, which tends to be more optimized for games than UI. DirectX has long had [good support](https://docs.microsoft.com/en-us/windows/win32/api/dxgi1_2/nf-dxgi1_2-idxgiswapchain1-present1). The Vulkan world is spottier, as it's available as an extension. That extension tends to be available on Linux (largely because [Gnome can make good use of it](https://feaneron.com/2019/10/05/incremental-present-in-gtk4/)), and some on Android, but in my experiments less so on desktop. And of course Metal can't do it at all.
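For the Vulkan side, the extension in question is `VK_KHR_incremental_present`. Below is a hedged sketch using ash; the structs are the standard Vulkan ones as generated by ash, but the surrounding swapchain setup, and enabling the extension at device creation, are assumed rather than shown.

```rust
use ash::vk;

/// Present a single damaged rectangle instead of the whole surface by
/// chaining VkPresentRegionsKHR into the present info. Assumes the
/// VK_KHR_incremental_present device extension was enabled at device creation.
unsafe fn present_damaged(
    swapchain_fn: &ash::extensions::khr::Swapchain,
    queue: vk::Queue,
    swapchain: vk::SwapchainKHR,
    image_index: u32,
    wait_semaphore: vk::Semaphore,
    damage: vk::Rect2D,
) -> Result<bool, vk::Result> {
    let rect = vk::RectLayerKHR {
        offset: damage.offset,
        extent: damage.extent,
        layer: 0,
    };
    let region = vk::PresentRegionKHR {
        rectangle_count: 1,
        p_rectangles: &rect,
    };
    let regions = vk::PresentRegionsKHR {
        swapchain_count: 1,
        p_regions: &region,
        ..Default::default()
    };
    let present_info = vk::PresentInfoKHR {
        p_next: &regions as *const _ as *const std::ffi::c_void,
        wait_semaphore_count: 1,
        p_wait_semaphores: &wait_semaphore,
        swapchain_count: 1,
        p_swapchains: &swapchain,
        p_image_indices: &image_index,
        ..Default::default()
    };
    swapchain_fn.queue_present(queue, &present_info)
}
```

The regions are a promise that everything outside them is unchanged since the previous present; the presentation engine is still free to update the whole surface, so the swapchain image must always hold a fully valid frame.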
## Roadmap and community
This vision is *very* ambitious. There's no way one person could do it all in a reasonable amount of time. It's a multi-year project at best, and that's not counting the year and a half since the first piet-metal prototype.
There are a few ways I plan to deal with this. First is to be explicit that it is a research project. That means that certain elements, especially dealing with compatibility, are a lower priority. Other projects in a similar space have sunk a lot of time and energy into working around driver bugs and dealing with the complex landscape of GPU capability diversity (especially on older devices and mobile). The initial goal is to prove that the concepts work on a reasonably modern GPU platform.
Another strategy is to split up the work so that at least some parts can be taken up by the community. There are a number of interesting subprojects. Also, it would be wonderful for the runtime work to be taken up by another project, as most of it is not specific to the needs of 2D rendering.
I'd really like to build a good open-source community around piet-gpu, and that's already starting to happen. The #gpu stream on [xi.zulipchat.com] hosts some really interesting discussions. In addition, the [gio] project is exploring adopting the compute shaders of piet-gpu (with the CPU runtime in Go) and has made substantive contributions to the code base. There's a lot of research potential in piet-gpu, and knowledge about GPU compute programming in general, that I think is valuable to share, so it's my intent to keep creating blog posts and other materials to spread that knowledge. Academic papers are also within scope, and I'm open to collaboration on those.
I'm really excited to see where this goes. I think there's the potential to build something truly great, and I look forward to working with others to realize that vision.
There's been some great discussion on [/r/rust](https://www.reddit.com/r/rust/comments/kal8ac/the_pietgpu_vision/).
[hz]: https://en.wikipedia.org/wiki/Hz-program
[spirv to ispc translator]: https://software.intel.com/content/www/us/en/develop/articles/spir-v-to-ispc-convert-gpu-compute-to-the-cpu.html
[tiny-skia]: https://github.com/RazrFalcon/tiny-skia
[raqote]: https://github.com/jrmuizel/raqote
[Blend2D]: https://blend2d.com/
[amstelvar]: https://variablefonts.typenetwork.com/topics/spacing/justification
[kashida]: https://andreasmhallberg.github.io/stretchable-kashida/
[SwiftShader]: https://swiftshader.googlesource.com/SwiftShader
[Lavapipe]: https://www.phoronix.com/scan.php?page=news_item&px=Mesa-Vulkan-Lavapipe
[glsl-to-cxx]: https://github.com/servo/webrender/tree/master/glsl-to-cxx
[sort-middle architecture]: https://raphlinus.github.io/rust/graphics/gpu/2020/06/12/sort-middle.html
[vulkan.gpuinfo.org]: https://vulkan.gpuinfo.org
[Vulkan Portability Extension]: https://www.khronos.org/blog/fighting-fragmentation-vulkan-portability-extension-released-implementations-shipping
[xi.zulipchat.com]: https://xi.zulipchat.com
[glyf]: https://docs.microsoft.com/en-us/typography/opentype/spec/glyf
[gvar]: https://docs.microsoft.com/en-us/typography/opentype/spec/gvar
[VkFFT]: https://github.com/DTolm/VkFFT
[Spinel]: https://fuchsia.googlesource.com/fuchsia/+/refs/heads/master/src/graphics/lib/compute/spinel/
[wgpu]: https://github.com/gfx-rs/wgpu
[naga]: https://github.com/gfx-rs/naga
[descriptor indexing]: http://chunkstories.xyz/blog/a-note-on-descriptor-indexing/
[rich text API]: https://www.cmyr.net/blog/piet-text-work.html
[Guillotière]: https://github.com/nical/guillotiere
[Étagère]: https://crates.io/crates/etagere
[variable font technology]: https://docs.microsoft.com/en-us/typography/opentype/spec/otvaroverview
[MSAA]: https://en.wikipedia.org/wiki/Multisample_anti-aliasing
[tradition of libart]: https://people.gnome.org/~mathieu/libart/internals.html
[stem darkening]: https://www.freetype.org/freetype2/docs/text-rendering-general.html
[subpixel rendering]: https://en.wikipedia.org/wiki/Subpixel_rendering
[HDR]: https://en.wikipedia.org/wiki/High-dynamic-range_imaging
[blurred rounded rectangle]: https://raphlinus.github.io/graphics/2020/04/21/blurred-rounded-rects.html
[gio]: https://gioui.org/
[Minikin]: https://android.googlesource.com/platform/frameworks/minikin/
[#42]: https://github.com/linebender/piet-gpu/issues/42
[#49]: https://github.com/linebender/piet-gpu/issues/49


@ -73,18 +73,31 @@ fn gen_derive_def(name: &str, size: usize, def: &LayoutTypeDef) -> proc_macro2::
let mut args = Vec::new();
let mut field_encoders = proc_macro2::TokenStream::new();
for (i, (offset, _ty)) in payload.iter().enumerate() {
let mut tag_field = None;
for (i, (offset, ty)) in payload.iter().enumerate() {
let field_id = format_ident!("f{}", i);
let field_encoder = quote! {
#field_id.encode_to(&mut buf[#offset..]);
};
field_encoders.extend(field_encoder);
if matches!(ty.ty, GpuType::Scalar(GpuScalar::TagFlags)) {
tag_field = Some(field_id.clone());
} else {
let field_encoder = quote! {
#field_id.encode_to(&mut buf[#offset..]);
};
field_encoders.extend(field_encoder);
}
args.push(field_id);
}
let tag = variant_ix as u32;
let tag_encode = match tag_field {
None => quote! {
buf[0..4].copy_from_slice(&#tag.to_le_bytes());
},
Some(tag_field) => quote! {
buf[0..4].copy_from_slice(&(#tag | ((*#tag_field as u32) << 16)).to_le_bytes());
},
};
let case = quote! {
#name_id::#variant_id(#(#args),*) => {
buf[0..4].copy_from_slice(&#tag.to_le_bytes());
#tag_encode
#field_encoders
}
};
@ -139,12 +152,15 @@ fn gen_derive_scalar_ty(ty: &GpuScalar) -> proc_macro2::TokenStream {
GpuScalar::U8 => quote!(u8),
GpuScalar::U16 => quote!(u16),
GpuScalar::U32 => quote!(u32),
GpuScalar::TagFlags => quote!(u16),
}
}
fn gen_encode_field(name: &str, offset: usize, ty: &GpuType) -> proc_macro2::TokenStream {
let name_id = format_ident!("{}", name);
match ty {
// encoding of flags into tag word is handled elsewhere
GpuType::Scalar(GpuScalar::TagFlags) => quote! {},
GpuType::Scalar(s) => {
let end = offset + s.size();
quote! {


@ -8,6 +8,11 @@ use crate::parse::{GpuScalar, GpuType};
pub fn gen_glsl(module: &LayoutModule) -> String {
let mut r = String::new();
writeln!(
&mut r,
"// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense\n"
)
.unwrap();
writeln!(&mut r, "// Code auto-generated by piet-gpu-derive\n").unwrap();
// Note: GLSL needs definitions before uses. We could do a topological sort here,
// but easiest for now to just require that in spec.
@ -24,23 +29,25 @@ pub fn gen_glsl(module: &LayoutModule) -> String {
(size, LayoutTypeDef::Enum(en)) => {
gen_enum_def(&mut r, name, en);
gen_item_def(&mut r, name, size.size);
gen_tag_def(&mut r, name);
}
}
}
for name in &module.def_names {
let def = module.defs.get(name).unwrap();
let is_mem = !module.name.eq(&"state") && !module.name.eq(&"scene");
match def {
(_size, LayoutTypeDef::Struct(fields)) => {
gen_struct_read(&mut r, &module.name, &name, fields);
gen_struct_read(&mut r, &module.name, &name, is_mem, fields);
if module.gpu_write {
gen_struct_write(&mut r, &module.name, &name, fields);
gen_struct_write(&mut r, &module.name, &name, is_mem, fields);
}
}
(_size, LayoutTypeDef::Enum(en)) => {
gen_enum_read(&mut r, &module.name, &name, en);
gen_enum_read(&mut r, &module.name, &name, is_mem, en);
if module.gpu_write {
gen_enum_write(&mut r, &module.name, &name, en);
gen_enum_write(&mut r, &module.name, &name, is_mem, en);
}
}
}
@ -86,18 +93,34 @@ fn gen_item_def(r: &mut String, name: &str, size: usize) {
writeln!(r, "}}\n").unwrap();
}
fn gen_tag_def(r: &mut String, name: &str) {
writeln!(r, "struct {}Tag {{", name).unwrap();
writeln!(r, " uint tag;").unwrap();
writeln!(r, " uint flags;").unwrap();
writeln!(r, "}};\n").unwrap();
}
fn gen_struct_read(
r: &mut String,
bufname: &str,
name: &str,
is_mem: bool,
fields: &[(String, usize, LayoutType)],
) {
writeln!(r, "{} {}_read({}Ref ref) {{", name, name, name).unwrap();
write!(r, "{} {}_read(", name, name).unwrap();
if is_mem {
write!(r, "Alloc a, ").unwrap();
}
writeln!(r, "{}Ref ref) {{", name).unwrap();
writeln!(r, " uint ix = ref.offset >> 2;").unwrap();
let coverage = crate::layout::struct_coverage(fields, false);
for (i, fields) in coverage.iter().enumerate() {
if !fields.is_empty() {
writeln!(r, " uint raw{} = {}[ix + {}];", i, bufname, i).unwrap();
if is_mem {
writeln!(r, " uint raw{} = read_mem(a, ix + {});", i, i).unwrap();
} else {
writeln!(r, " uint raw{} = {}[ix + {}];", i, bufname, i).unwrap();
}
}
}
writeln!(r, " {} s;", name).unwrap();
@ -124,26 +147,64 @@ fn gen_enum_read(
r: &mut String,
bufname: &str,
name: &str,
is_mem: bool,
variants: &[(String, Vec<(usize, LayoutType)>)],
) {
writeln!(r, "uint {}_tag({}Ref ref) {{", name, name).unwrap();
writeln!(r, " return {}[ref.offset >> 2];", bufname).unwrap();
if is_mem {
writeln!(r, "{}Tag {}_tag(Alloc a, {}Ref ref) {{", name, name, name).unwrap();
writeln!(r, " uint tag_and_flags = read_mem(a, ref.offset >> 2);").unwrap();
} else {
writeln!(r, "{}Tag {}_tag({}Ref ref) {{", name, name, name).unwrap();
writeln!(r, " uint tag_and_flags = {}[ref.offset >> 2];", bufname).unwrap();
}
writeln!(
r,
" return {}Tag(tag_and_flags & 0xffff, tag_and_flags >> 16);",
name
)
.unwrap();
writeln!(r, "}}\n").unwrap();
for (var_name, payload) in variants {
if payload.len() == 1 {
if let GpuType::InlineStruct(structname) = &payload[0].1.ty {
writeln!(
r,
"{} {}_{}_read({}Ref ref) {{",
structname, name, var_name, name
)
.unwrap();
writeln!(
r,
" return {}_read({}Ref(ref.offset + {}));",
structname, structname, payload[0].0
)
.unwrap();
let payload_ix = if payload.len() == 1 {
Some(0)
} else if payload.len() == 2 {
if matches!(payload[0].1.ty, GpuType::Scalar(GpuScalar::TagFlags)) {
Some(1)
} else {
None
}
} else {
None
};
if let Some(payload_ix) = payload_ix {
if let GpuType::InlineStruct(structname) = &payload[payload_ix].1.ty {
if is_mem {
writeln!(
r,
"{} {}_{}_read(Alloc a, {}Ref ref) {{",
structname, name, var_name, name
)
.unwrap();
writeln!(
r,
" return {}_read(a, {}Ref(ref.offset + {}));",
structname, structname, payload[0].0
)
.unwrap();
} else {
writeln!(
r,
"{} {}_{}_read({}Ref ref) {{",
structname, name, var_name, name
)
.unwrap();
writeln!(
r,
" return {}_read({}Ref(ref.offset + {}));",
structname, structname, payload[0].0
)
.unwrap();
}
writeln!(r, "}}\n").unwrap();
}
}
@ -225,6 +286,7 @@ fn gen_extract_scalar(offset: usize, ty: &GpuScalar) -> String {
GpuScalar::F16 | GpuScalar::F32 => extract_fbits(offset, ty.size()),
GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 => extract_ubits(offset, ty.size()),
GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => extract_ibits(offset, ty.size()),
GpuScalar::TagFlags => format!("0 /* TODO */"),
}
}
@ -297,9 +359,14 @@ fn gen_struct_write(
r: &mut String,
bufname: &str,
name: &str,
is_mem: bool,
fields: &[(String, usize, LayoutType)],
) {
writeln!(r, "void {}_write({}Ref ref, {} s) {{", name, name, name).unwrap();
write!(r, "void {}_write(", name).unwrap();
if is_mem {
write!(r, "Alloc a, ").unwrap();
}
writeln!(r, "{}Ref ref, {} s) {{", name, name).unwrap();
writeln!(r, " uint ix = ref.offset >> 2;").unwrap();
let coverage = crate::layout::struct_coverage(fields, true);
@ -375,13 +442,20 @@ fn gen_struct_write(
}
if !pieces.is_empty() {
write!(r, " {}[ix + {}] = ", bufname, i).unwrap();
if is_mem {
write!(r, " write_mem(a, ix + {}, ", i).unwrap();
} else {
write!(r, " {}[ix + {}] = ", bufname, i).unwrap();
}
for (j, piece) in pieces.iter().enumerate() {
if j != 0 {
write!(r, " | ").unwrap();
}
write!(r, "{}", piece).unwrap();
}
if is_mem {
write!(r, ")").unwrap();
}
writeln!(r, ";").unwrap();
}
}
@ -411,6 +485,7 @@ fn gen_pack_bits_scalar(ty: &GpuScalar, offset: usize, inner: &str) -> String {
}
}
GpuScalar::I32 => format!("uint({})", inner),
GpuScalar::TagFlags => format!("0"),
};
if shift == 0 {
bits
@ -423,38 +498,120 @@ fn gen_enum_write(
r: &mut String,
bufname: &str,
name: &str,
is_mem: bool,
variants: &[(String, Vec<(usize, LayoutType)>)],
) {
for (var_name, payload) in variants {
if payload.is_empty() {
writeln!(r, "void {}_{}_write({}Ref ref) {{", name, var_name, name).unwrap();
writeln!(
r,
" {}[ref.offset >> 2] = {}_{};",
bufname, name, var_name
)
.unwrap();
writeln!(r, "}}\n").unwrap();
} else if payload.len() == 1 {
if let GpuType::InlineStruct(structname) = &payload[0].1.ty {
if is_mem {
writeln!(
r,
"void {}_{}_write({}Ref ref, {} s) {{",
name, var_name, name, structname
"void {}_{}_write(Alloc a, {}Ref ref) {{",
name, var_name, name
)
.unwrap();
writeln!(
r,
" write_mem(a, ref.offset >> 2, {}_{});",
name, var_name
)
.unwrap();
} else {
writeln!(r, "void {}_{}_write({}Ref ref) {{", name, var_name, name).unwrap();
writeln!(
r,
" {}[ref.offset >> 2] = {}_{};",
bufname, name, var_name
)
.unwrap();
writeln!(
r,
" {}_write({}Ref(ref.offset + {}), s);",
structname, structname, payload[0].0
)
.unwrap();
}
writeln!(r, "}}\n").unwrap();
} else if payload.len() == 1 {
if let GpuType::InlineStruct(structname) = &payload[0].1.ty {
if is_mem {
writeln!(
r,
"void {}_{}_write(Alloc a, {}Ref ref, {} s) {{",
name, var_name, name, structname
)
.unwrap();
writeln!(
r,
" write_mem(a, ref.offset >> 2, {}_{});",
name, var_name
)
.unwrap();
writeln!(
r,
" {}_write(a, {}Ref(ref.offset + {}), s);",
structname, structname, payload[0].0
)
.unwrap();
} else {
writeln!(
r,
"void {}_{}_write({}Ref ref, {} s) {{",
name, var_name, name, structname
)
.unwrap();
writeln!(
r,
" {}[ref.offset >> 2] = {}_{};",
bufname, name, var_name
)
.unwrap();
writeln!(
r,
" {}_write({}Ref(ref.offset + {}), s);",
structname, structname, payload[0].0
)
.unwrap();
}
writeln!(r, "}}\n").unwrap();
}
} else if payload.len() == 2
&& matches!(payload[0].1.ty, GpuType::Scalar(GpuScalar::TagFlags))
{
if let GpuType::InlineStruct(structname) = &payload[1].1.ty {
if is_mem {
writeln!(
r,
"void {}_{}_write(Alloc a, {}Ref ref, uint flags, {} s) {{",
name, var_name, name, structname
)
.unwrap();
writeln!(
r,
" write_mem(a, ref.offset >> 2, (flags << 16) | {}_{});",
name, var_name
)
.unwrap();
writeln!(
r,
" {}_write(a, {}Ref(ref.offset + {}), s);",
structname, structname, payload[0].0
)
.unwrap();
} else {
writeln!(
r,
"void {}_{}_write({}Ref ref, uint flags, {} s) {{",
name, var_name, name, structname
)
.unwrap();
writeln!(
r,
" {}[ref.offset >> 2] = (flags << 16) | {}_{};",
bufname, name, var_name
)
.unwrap();
writeln!(
r,
" {}_write({}Ref(ref.offset + {}), s);",
structname, structname, payload[0].0
)
.unwrap();
}
writeln!(r, "}}\n").unwrap();
}
}
@ -490,7 +647,7 @@ fn glsl_scalar(s: &GpuScalar) -> &'static str {
match s {
GpuScalar::F16 | GpuScalar::F32 => "float",
GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => "int",
GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 => "uint",
GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 | GpuScalar::TagFlags => "uint",
}
}
@ -498,7 +655,7 @@ fn glsl_vecname(s: &GpuScalar) -> &'static str {
match s {
GpuScalar::F16 | GpuScalar::F32 => "vec",
GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => "ivec",
GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 => "uvec",
GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 | GpuScalar::TagFlags => "uvec",
}
}


@ -240,5 +240,5 @@ impl Size {
}
fn align_padding(offset: usize, alignment: usize) -> usize {
offset.wrapping_neg() & (alignment - 1)
offset.wrapping_neg() & (alignment.max(1) - 1)
}


@ -20,6 +20,7 @@ pub enum GpuScalar {
U8,
U16,
U32,
TagFlags,
}
/// An algebraic datatype.
@ -59,6 +60,7 @@ impl GpuScalar {
"u8" => Some(GpuScalar::U8),
"u16" => Some(GpuScalar::U16),
"u32" => Some(GpuScalar::U32),
"TagFlags" => Some(GpuScalar::TagFlags),
_ => None,
})
}
@ -72,6 +74,7 @@ impl GpuScalar {
GpuScalar::F32 | GpuScalar::I32 | GpuScalar::U32 => 4,
GpuScalar::I8 | GpuScalar::U8 => 1,
GpuScalar::F16 | GpuScalar::I16 | GpuScalar::U16 => 2,
GpuScalar::TagFlags => 0,
}
}
}


@ -8,7 +8,6 @@ edition = "2018"
[dependencies]
ash = "0.31"
once_cell = "1.3.1"
ash-window = "0.5"
raw-window-handle = "0.3"


@ -14,9 +14,9 @@ fn main() {
.unwrap();
buffer.write(&src).unwrap();
let code = include_bytes!("./shader/collatz.spv");
let pipeline = session.create_simple_compute_pipeline(code, 1, 0).unwrap();
let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap();
let descriptor_set = session
.create_descriptor_set(&pipeline, &[buffer.vk_buffer()], &[])
.create_simple_descriptor_set(&pipeline, &[&buffer])
.unwrap();
let query_pool = session.create_query_pool(2).unwrap();
let mut cmd_buf = session.cmd_buf().unwrap();


@ -8,13 +8,16 @@ use std::any::Any;
use std::sync::{Arc, Mutex, Weak};
use crate::vulkan;
use crate::{Device, Error};
use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::PipelineBuilder as PipelineBuilderTrait;
use crate::{Device, Error, GpuInfo, SamplerParams};
pub type MemFlags = <vulkan::VkDevice as Device>::MemFlags;
pub type Semaphore = <vulkan::VkDevice as Device>::Semaphore;
pub type Pipeline = <vulkan::VkDevice as Device>::Pipeline;
pub type DescriptorSet = <vulkan::VkDevice as Device>::DescriptorSet;
pub type QueryPool = <vulkan::VkDevice as Device>::QueryPool;
pub type Sampler = <vulkan::VkDevice as Device>::Sampler;
type Fence = <vulkan::VkDevice as Device>::Fence;
@ -29,6 +32,7 @@ struct SessionInner {
cmd_buf_pool: Mutex<Vec<(vulkan::CmdBuf, Fence)>>,
/// Command buffers that are still pending (so resources can't be freed).
pending: Mutex<Vec<SubmittedCmdBufInner>>,
gpu_info: GpuInfo,
}
pub struct CmdBuf {
@ -63,10 +67,16 @@ struct BufferInner {
session: Weak<SessionInner>,
}
pub struct PipelineBuilder(vulkan::PipelineBuilder);
pub struct DescriptorSetBuilder(vulkan::DescriptorSetBuilder);
impl Session {
pub fn new(device: vulkan::VkDevice) -> Session {
let gpu_info = device.query_gpu_info();
Session(Arc::new(SessionInner {
device,
gpu_info,
cmd_buf_pool: Default::default(),
pending: Default::default(),
}))
@ -158,31 +168,28 @@ impl Session {
self.0.device.create_semaphore()
}
/// This creates a pipeline that runs over the buffer.
/// This creates a pipeline that operates on some buffers and images.
///
/// The descriptor set layout is just some number of storage buffers and storage images (this might change).
pub unsafe fn create_simple_compute_pipeline(
&self,
code: &[u8],
n_buffers: u32,
n_images: u32,
) -> Result<Pipeline, Error> {
self.0
.device
.create_simple_compute_pipeline(code, n_buffers, n_images)
self.pipeline_builder()
.add_buffers(n_buffers)
.create_compute_pipeline(self, code)
}
/// Create a descriptor set for a simple pipeline that just references buffers and images.
///
/// Note: when we do portability, the signature will change to not reference the Vulkan types
/// directly.
pub unsafe fn create_descriptor_set(
/// Create a descriptor set for a simple pipeline that just references buffers.
pub unsafe fn create_simple_descriptor_set<'a>(
&self,
pipeline: &Pipeline,
bufs: &[&vulkan::Buffer],
images: &[&vulkan::Image],
buffers: impl IntoRefs<'a, Buffer>,
) -> Result<DescriptorSet, Error> {
self.0.device.create_descriptor_set(pipeline, bufs, images)
self.descriptor_set_builder()
.add_buffers(buffers)
.build(self, pipeline)
}
/// Create a query pool for timestamp queries.
@ -193,6 +200,22 @@ impl Session {
pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
self.0.device.fetch_query_pool(pool)
}
pub unsafe fn pipeline_builder(&self) -> PipelineBuilder {
PipelineBuilder(self.0.device.pipeline_builder())
}
pub unsafe fn descriptor_set_builder(&self) -> DescriptorSetBuilder {
DescriptorSetBuilder(self.0.device.descriptor_set_builder())
}
pub unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Sampler, Error> {
self.0.device.create_sampler(params)
}
pub fn gpu_info(&self) -> &GpuInfo {
&self.0.gpu_info
}
}
impl CmdBuf {
@ -299,3 +322,134 @@ impl Buffer {
Ok(())
}
}
impl PipelineBuilder {
/// Add buffers to the pipeline. Each has its own binding.
pub fn add_buffers(mut self, n_buffers: u32) -> Self {
self.0.add_buffers(n_buffers);
self
}
/// Add storage images to the pipeline. Each has its own binding.
pub fn add_images(mut self, n_images: u32) -> Self {
self.0.add_images(n_images);
self
}
/// Add a binding with a variable-size array of textures.
pub fn add_textures(mut self, max_textures: u32) -> Self {
self.0.add_textures(max_textures);
self
}
pub unsafe fn create_compute_pipeline(
self,
session: &Session,
code: &[u8],
) -> Result<Pipeline, Error> {
self.0.create_compute_pipeline(&session.0.device, code)
}
}
impl DescriptorSetBuilder {
pub fn add_buffers<'a>(mut self, buffers: impl IntoRefs<'a, Buffer>) -> Self {
let vk_buffers = buffers
.into_refs()
.map(|b| b.vk_buffer())
.collect::<Vec<_>>();
self.0.add_buffers(&vk_buffers);
self
}
pub fn add_images<'a>(mut self, images: impl IntoRefs<'a, Image>) -> Self {
let vk_images = images.into_refs().map(|i| i.vk_image()).collect::<Vec<_>>();
self.0.add_images(&vk_images);
self
}
pub fn add_textures<'a>(mut self, images: impl IntoRefs<'a, Image>) -> Self {
let vk_images = images.into_refs().map(|i| i.vk_image()).collect::<Vec<_>>();
self.0.add_textures(&vk_images);
self
}
pub unsafe fn build(
self,
session: &Session,
pipeline: &Pipeline,
) -> Result<DescriptorSet, Error> {
self.0.build(&session.0.device, pipeline)
}
}
// This lets us use either a slice or a vector. The type is clunky but it
// seems fine enough to use.
pub trait IntoRefs<'a, T: 'a> {
type Iterator: Iterator<Item = &'a T>;
fn into_refs(self) -> Self::Iterator;
}
impl<'a, T> IntoRefs<'a, T> for &'a [T] {
type Iterator = std::slice::Iter<'a, T>;
fn into_refs(self) -> Self::Iterator {
self.into_iter()
}
}
impl<'a, T> IntoRefs<'a, T> for &'a [&'a T] {
type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
fn into_refs(self) -> Self::Iterator {
self.into_iter().copied()
}
}
// TODO: this will benefit from const generics!
impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 1] {
type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
fn into_refs(self) -> Self::Iterator {
self.into_iter().copied()
}
}
impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 2] {
type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
fn into_refs(self) -> Self::Iterator {
self.into_iter().copied()
}
}
impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 3] {
type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
fn into_refs(self) -> Self::Iterator {
self.into_iter().copied()
}
}
impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 4] {
type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
fn into_refs(self) -> Self::Iterator {
self.into_iter().copied()
}
}
impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 5] {
type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
fn into_refs(self) -> Self::Iterator {
self.into_iter().copied()
}
}
impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 6] {
type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
fn into_refs(self) -> Self::Iterator {
self.into_iter().copied()
}
}
impl<'a, T> IntoRefs<'a, T> for Vec<&'a T> {
type Iterator = std::vec::IntoIter<&'a T>;
fn into_refs(self) -> Self::Iterator {
self.into_iter()
}
}


@ -18,10 +18,44 @@ pub enum ImageLayout {
BlitSrc,
BlitDst,
General,
ShaderRead,
}
/// The type of sampling for image lookup.
///
/// This could take a lot more params, such as filtering, repeat, behavior
/// at edges, etc., but for now we'll keep it simple.
#[derive(Copy, Clone, Debug)]
pub enum SamplerParams {
Nearest,
Linear,
}
#[derive(Clone, Debug)]
/// Information about the GPU.
pub struct GpuInfo {
/// The GPU supports descriptor indexing.
pub has_descriptor_indexing: bool,
/// The GPU supports subgroups.
///
/// Right now, this just checks for basic subgroup capability (as
/// required in Vulkan 1.1), and we should have finer grained
/// queries for shuffles, etc.
pub has_subgroups: bool,
/// Info about subgroup size control, if available.
pub subgroup_size: Option<SubgroupSize>,
/// The GPU supports a real, grown-ass memory model.
pub has_memory_model: bool,
}
#[derive(Clone, Debug)]
pub struct SubgroupSize {
min: u32,
max: u32,
}
pub trait Device: Sized {
type Buffer;
type Buffer: 'static;
type Image;
type MemFlags: MemFlags;
type Pipeline;
@ -30,6 +64,15 @@ pub trait Device: Sized {
type CmdBuf: CmdBuf<Self>;
type Fence;
type Semaphore;
type PipelineBuilder: PipelineBuilder<Self>;
type DescriptorSetBuilder: DescriptorSetBuilder<Self>;
type Sampler;
/// Query the GPU info.
///
/// This method may be expensive, so the hub should call it once and retain
/// the info.
fn query_gpu_info(&self) -> GpuInfo;
fn create_buffer(&self, size: u64, mem_flags: Self::MemFlags) -> Result<Self::Buffer, Error>;
@ -58,19 +101,48 @@ pub trait Device: Sized {
/// Maybe doesn't need result return?
unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>;
/// Start building a pipeline.
///
/// A pipeline is a bit of shader IR plus a signature for what kinds of resources
/// it expects.
unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder;
/// Start building a descriptor set.
///
/// A descriptor set is a binding of resources for a given pipeline.
unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder;
/// Create a simple compute pipeline that operates on buffers and storage images.
///
/// This is provided as a convenience but will probably go away, as the functionality
/// is subsumed by the builder.
unsafe fn create_simple_compute_pipeline(
&self,
code: &[u8],
n_buffers: u32,
n_images: u32,
) -> Result<Self::Pipeline, Error>;
) -> Result<Self::Pipeline, Error> {
let mut builder = self.pipeline_builder();
builder.add_buffers(n_buffers);
builder.add_images(n_images);
builder.create_compute_pipeline(self, code)
}
/// Create a descriptor set for a given pipeline, binding buffers and images.
///
/// This is provided as a convenience but will probably go away, as the functionality
/// is subsumed by the builder.
unsafe fn create_descriptor_set(
&self,
pipeline: &Self::Pipeline,
bufs: &[&Self::Buffer],
images: &[&Self::Image],
) -> Result<Self::DescriptorSet, Error>;
) -> Result<Self::DescriptorSet, Error> {
let mut builder = self.descriptor_set_builder();
builder.add_buffers(bufs);
builder.add_images(images);
builder.build(self, pipeline)
}
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error>;
@ -109,6 +181,8 @@ pub trait Device: Sized {
unsafe fn create_fence(&self, signaled: bool) -> Result<Self::Fence, Error>;
unsafe fn wait_and_reset(&self, fences: &[Self::Fence]) -> Result<(), Error>;
unsafe fn get_fence_status(&self, fence: Self::Fence) -> Result<bool, Error>;
unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
}
pub trait CmdBuf<D: Device> {
@ -150,7 +224,7 @@ pub trait CmdBuf<D: Device> {
/// This is readily supported in Vulkan, but for portability it is remarkably
/// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
/// kernel, or organize the code not to need it.
unsafe fn clear_buffer(&self, buffer: &D::Buffer);
unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
@ -176,3 +250,34 @@ pub trait MemFlags: Sized + Clone + Copy {
fn host_coherent() -> Self;
}
/// A builder for pipelines with more complex layouts.
pub trait PipelineBuilder<D: Device> {
/// Add buffers to the pipeline. Each has its own binding.
fn add_buffers(&mut self, n_buffers: u32);
/// Add storage images to the pipeline. Each has its own binding.
fn add_images(&mut self, n_images: u32);
/// Add a binding with a variable-size array of textures.
fn add_textures(&mut self, max_textures: u32);
unsafe fn create_compute_pipeline(self, device: &D, code: &[u8]) -> Result<D::Pipeline, Error>;
}
/// A builder for descriptor sets with more complex layouts.
///
/// Note: the order needs to match the pipeline building, and it also needs to
/// be buffers, then images, then textures.
pub trait DescriptorSetBuilder<D: Device> {
fn add_buffers(&mut self, buffers: &[&D::Buffer]);
/// Add an array of storage images.
///
/// The images need to be in `ImageLayout::General` layout.
fn add_images(&mut self, images: &[&D::Image]);
/// Add an array of textures.
///
/// The images need to be in `ImageLayout::ShaderRead` layout.
///
/// The same sampler is used for all textures, which is not very sophisticated;
/// we should have a way to vary the sampler.
fn add_textures(&mut self, images: &[&D::Image]);
unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result<D::DescriptorSet, Error>;
}
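For illustration only (not part of the diff): a minimal sketch of how a caller might drive the builder traits above, assuming a device implementing `Device`, compiled SPIR-V in `code`, and texture images already transitioned to `ImageLayout::ShaderRead`. The point is the ordering constraint in the `DescriptorSetBuilder` note: resources are bound in the same order the pipeline declared them.

    // Hypothetical sketch; the `Device`, `PipelineBuilder` and
    // `DescriptorSetBuilder` traits above must be in scope.
    unsafe fn build_with_textures<D: Device>(
        device: &D,
        code: &[u8],
        bufs: &[&D::Buffer],
        textures: &[&D::Image], // already in ImageLayout::ShaderRead
        max_textures: u32,
    ) -> Result<(D::Pipeline, D::DescriptorSet), Error> {
        // Declare the layout: buffers first, then the texture array.
        let mut pb = device.pipeline_builder();
        pb.add_buffers(bufs.len() as u32);
        pb.add_textures(max_textures);
        let pipeline = pb.create_compute_pipeline(device, code)?;
        // Bind concrete resources in exactly the same order.
        let mut dsb = device.descriptor_set_builder();
        dsb.add_buffers(bufs);
        dsb.add_textures(textures);
        let descriptor_set = dsb.build(device, &pipeline)?;
        Ok((pipeline, descriptor_set))
    }

In practice a caller would likely check `query_gpu_info().has_descriptor_indexing` before requesting a large `max_textures`, since the variable-size binding is only enabled when that capability is reported.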

View file

@ -2,20 +2,22 @@
use std::borrow::Cow;
use std::ffi::{CStr, CString};
use std::os::raw::c_char;
use std::sync::Arc;
use ash::extensions::{ext::DebugUtils, khr};
use ash::version::{DeviceV1_0, EntryV1_0, InstanceV1_0};
use ash::version::{DeviceV1_0, EntryV1_0, InstanceV1_0, InstanceV1_1};
use ash::{vk, Device, Entry, Instance};
use once_cell::sync::Lazy;
use crate::{Device as DeviceTrait, Error, ImageLayout};
use crate::{Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize};
pub struct VkInstance {
/// Retain the dynamic lib.
#[allow(unused)]
entry: Entry,
instance: Instance,
get_phys_dev_props: Option<vk::KhrGetPhysicalDeviceProperties2Fn>,
vk_version: u32,
_dbg_loader: Option<DebugUtils>,
_dbg_callbk: Option<vk::DebugUtilsMessengerEXT>,
}
@ -27,6 +29,7 @@ pub struct VkDevice {
queue: vk::Queue,
qfi: u32,
timestamp_period: f32,
gpu_info: GpuInfo,
}
struct RawDevice {
@ -71,6 +74,7 @@ pub struct Pipeline {
pipeline: vk::Pipeline,
descriptor_set_layout: vk::DescriptorSetLayout,
pipeline_layout: vk::PipelineLayout,
max_textures: u32,
}
pub struct DescriptorSet {
@ -90,6 +94,30 @@ pub struct QueryPool {
#[derive(Clone, Copy)]
pub struct MemFlags(vk::MemoryPropertyFlags);
pub struct PipelineBuilder {
bindings: Vec<vk::DescriptorSetLayoutBinding>,
binding_flags: Vec<vk::DescriptorBindingFlags>,
max_textures: u32,
has_descriptor_indexing: bool,
}
pub struct DescriptorSetBuilder {
buffers: Vec<vk::Buffer>,
images: Vec<vk::ImageView>,
textures: Vec<vk::ImageView>,
sampler: vk::Sampler,
}
struct Extensions {
exts: Vec<*const c_char>,
exist_exts: Vec<vk::ExtensionProperties>,
}
struct Layers {
layers: Vec<*const c_char>,
exist_layers: Vec<vk::LayerProperties>,
}
unsafe extern "system" fn vulkan_debug_callback(
message_severity: vk::DebugUtilsMessageSeverityFlagsEXT,
message_type: vk::DebugUtilsMessageTypeFlagsEXT,
@ -119,22 +147,6 @@ unsafe extern "system" fn vulkan_debug_callback(
vk::FALSE
}
static LAYERS: Lazy<Vec<&'static CStr>> = Lazy::new(|| {
let mut layers: Vec<&'static CStr> = vec![];
if cfg!(debug_assertions) {
layers.push(CStr::from_bytes_with_nul(b"VK_LAYER_KHRONOS_validation\0").unwrap());
}
layers
});
static EXTS: Lazy<Vec<&'static CStr>> = Lazy::new(|| {
let mut exts: Vec<&'static CStr> = vec![];
if cfg!(debug_assertions) {
exts.push(DebugUtils::name());
}
exts
});
impl VkInstance {
/// Create a new instance.
///
@ -150,50 +162,35 @@ impl VkInstance {
let app_name = CString::new("VkToy").unwrap();
let entry = Entry::new()?;
let exist_layers = entry.enumerate_instance_layer_properties()?;
let layers = LAYERS
.iter()
.filter_map(|&lyr| {
exist_layers
.iter()
.find(|x| CStr::from_ptr(x.layer_name.as_ptr()) == lyr)
.map(|_| lyr.as_ptr())
.or_else(|| {
println!(
"Unable to find layer: {}, have you installed the Vulkan SDK?",
lyr.to_string_lossy()
);
None
})
})
.collect::<Vec<_>>();
let exist_exts = entry.enumerate_instance_extension_properties()?;
let mut exts = EXTS
.iter()
.filter_map(|&ext| {
exist_exts
.iter()
.find(|x| CStr::from_ptr(x.extension_name.as_ptr()) == ext)
.map(|_| ext.as_ptr())
.or_else(|| {
println!(
"Unable to find extension: {}, have you installed the Vulkan SDK?",
ext.to_string_lossy()
);
None
})
})
.collect::<Vec<_>>();
let surface_extensions = match window_handle {
Some(ref handle) => ash_window::enumerate_required_extensions(*handle)?,
None => vec![],
};
for extension in surface_extensions {
exts.push(extension.as_ptr());
let mut layers = Layers::new(entry.enumerate_instance_layer_properties()?);
if cfg!(debug_assertions) {
layers
.try_add(CStr::from_bytes_with_nul(b"VK_LAYER_KHRONOS_validation\0").unwrap());
}
let mut exts = Extensions::new(entry.enumerate_instance_extension_properties()?);
let mut has_debug_ext = false;
if cfg!(debug_assertions) {
has_debug_ext = exts.try_add(DebugUtils::name());
}
// We'll need this to do runtime query of descriptor indexing.
let has_phys_dev_props = exts.try_add(vk::KhrGetPhysicalDeviceProperties2Fn::name());
if let Some(ref handle) = window_handle {
for ext in ash_window::enumerate_required_extensions(*handle)? {
exts.try_add(ext);
}
}
let supported_version = entry
.try_enumerate_instance_version()?
.unwrap_or(vk::make_version(1, 0, 0));
let vk_version = if supported_version >= vk::make_version(1, 1, 0) {
// We need Vulkan 1.1 to do subgroups; most other things can be extensions.
vk::make_version(1, 1, 0)
} else {
vk::make_version(1, 0, 0)
};
let instance = entry.create_instance(
&vk::InstanceCreateInfo::builder()
.application_info(
@ -201,14 +198,14 @@ impl VkInstance {
.application_name(&app_name)
.application_version(0)
.engine_name(&app_name)
.api_version(vk::make_version(1, 0, 0)),
.api_version(vk_version),
)
.enabled_layer_names(&layers)
.enabled_extension_names(&exts),
.enabled_layer_names(layers.as_ptrs())
.enabled_extension_names(exts.as_ptrs()),
None,
)?;
let (_dbg_loader, _dbg_callbk) = if cfg!(debug_assertions) {
let (_dbg_loader, _dbg_callbk) = if has_debug_ext {
let dbg_info = vk::DebugUtilsMessengerCreateInfoEXT::builder()
.message_severity(
vk::DebugUtilsMessageSeverityFlagsEXT::ERROR
@ -233,9 +230,21 @@ impl VkInstance {
None => None,
};
let get_phys_dev_props = if has_phys_dev_props {
Some(vk::KhrGetPhysicalDeviceProperties2Fn::load(|name| {
std::mem::transmute(
entry.get_instance_proc_addr(instance.handle(), name.as_ptr()),
)
}))
} else {
None
};
let vk_instance = VkInstance {
entry,
instance,
get_phys_dev_props,
vk_version,
_dbg_loader,
_dbg_callbk,
};
@ -256,19 +265,55 @@ impl VkInstance {
let (pdevice, qfi) =
choose_compute_device(&self.instance, &devices, surface).ok_or("no suitable device")?;
let mut has_descriptor_indexing = false;
if let Some(ref get_phys_dev_props) = self.get_phys_dev_props {
let mut descriptor_indexing_features =
vk::PhysicalDeviceDescriptorIndexingFeatures::builder();
// See https://github.com/MaikKlein/ash/issues/325 for why we do this workaround.
let mut features_v2 = vk::PhysicalDeviceFeatures2::default();
features_v2.p_next =
&mut descriptor_indexing_features as *mut _ as *mut std::ffi::c_void;
get_phys_dev_props.get_physical_device_features2_khr(pdevice, &mut features_v2);
has_descriptor_indexing = descriptor_indexing_features
.shader_storage_image_array_non_uniform_indexing
== vk::TRUE
&& descriptor_indexing_features.descriptor_binding_variable_descriptor_count
== vk::TRUE
&& descriptor_indexing_features.runtime_descriptor_array == vk::TRUE;
}
let queue_priorities = [1.0];
let queue_create_infos = [vk::DeviceQueueCreateInfo::builder()
.queue_family_index(qfi)
.queue_priorities(&queue_priorities)
.build()];
let extensions = match surface {
Some(_) => vec![khr::Swapchain::name().as_ptr()],
None => vec![],
};
let create_info = vk::DeviceCreateInfo::builder()
let mut descriptor_indexing = vk::PhysicalDeviceDescriptorIndexingFeatures::builder()
.shader_storage_image_array_non_uniform_indexing(true)
.descriptor_binding_variable_descriptor_count(true)
.runtime_descriptor_array(true);
let mut extensions = Extensions::new(
self.instance
.enumerate_device_extension_properties(pdevice)?,
);
if surface.is_some() {
extensions.try_add(khr::Swapchain::name());
}
if has_descriptor_indexing {
extensions.try_add(vk::KhrMaintenance3Fn::name());
extensions.try_add(vk::ExtDescriptorIndexingFn::name());
}
let has_subgroup_size = self.vk_version >= vk::make_version(1, 1, 0)
&& extensions.try_add(vk::ExtSubgroupSizeControlFn::name());
let has_memory_model = self.vk_version >= vk::make_version(1, 1, 0)
&& extensions.try_add(vk::KhrVulkanMemoryModelFn::name());
let mut create_info = vk::DeviceCreateInfo::builder()
.queue_create_infos(&queue_create_infos)
.enabled_extension_names(&extensions)
.build();
.enabled_extension_names(extensions.as_ptrs());
if has_descriptor_indexing {
create_info = create_info.push_next(&mut descriptor_indexing);
}
let device = self.instance.create_device(pdevice, &create_info, None)?;
let device_mem_props = self.instance.get_physical_device_memory_properties(pdevice);
@ -280,6 +325,28 @@ impl VkInstance {
let props = self.instance.get_physical_device_properties(pdevice);
let timestamp_period = props.limits.timestamp_period;
let subgroup_size = if has_subgroup_size {
let mut subgroup_props = vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT::default();
let mut properties =
vk::PhysicalDeviceProperties2::builder().push_next(&mut subgroup_props);
self.instance
.get_physical_device_properties2(pdevice, &mut properties);
Some(SubgroupSize {
min: subgroup_props.min_subgroup_size,
max: subgroup_props.max_subgroup_size,
})
} else {
None
};
// TODO: finer grained query of specific subgroup info.
let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
let gpu_info = GpuInfo {
has_descriptor_indexing,
has_subgroups,
subgroup_size,
has_memory_model,
};
Ok(VkDevice {
device,
@ -288,11 +355,14 @@ impl VkInstance {
qfi,
queue,
timestamp_period,
gpu_info,
})
}
pub unsafe fn swapchain(
&self,
width: usize,
height: usize,
device: &VkDevice,
surface: &VkSurface,
) -> Result<VkSwapchain, Error> {
@ -326,8 +396,13 @@ impl VkInstance {
.find(|mode| mode == &vk::PresentModeKHR::MAILBOX)
.unwrap_or(vk::PresentModeKHR::FIFO);
let image_count = 2; // TODO
let extent = capabilities.current_extent; // TODO: wayland for example will complain here ..
let image_count = capabilities.min_image_count;
let mut extent = capabilities.current_extent;
if extent.width == u32::MAX || extent.height == u32::MAX {
// We're deciding the size.
extent.width = width as u32;
extent.height = height as u32;
}
let create_info = vk::SwapchainCreateInfoKHR::builder()
.surface(surface.surface)
@ -375,6 +450,13 @@ impl crate::Device for VkDevice {
type MemFlags = MemFlags;
type Fence = vk::Fence;
type Semaphore = vk::Semaphore;
type PipelineBuilder = PipelineBuilder;
type DescriptorSetBuilder = DescriptorSetBuilder;
type Sampler = vk::Sampler;
fn query_gpu_info(&self) -> GpuInfo {
self.gpu_info.clone()
}
fn create_buffer(&self, size: u64, mem_flags: MemFlags) -> Result<Buffer, Error> {
unsafe {
@ -527,151 +609,22 @@ impl crate::Device for VkDevice {
Ok(device.get_fence_status(fence)?)
}
/// This creates a pipeline that runs over the buffer.
///
/// The descriptor set layout is just some number of storage buffers and storage images (this might change).
unsafe fn create_simple_compute_pipeline(
&self,
code: &[u8],
n_buffers: u32,
n_images: u32,
) -> Result<Pipeline, Error> {
let device = &self.device.device;
let mut bindings = Vec::new();
for i in 0..n_buffers {
bindings.push(
vk::DescriptorSetLayoutBinding::builder()
.binding(i)
.descriptor_type(vk::DescriptorType::STORAGE_BUFFER)
.descriptor_count(1)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build(),
);
unsafe fn pipeline_builder(&self) -> PipelineBuilder {
PipelineBuilder {
bindings: Vec::new(),
binding_flags: Vec::new(),
max_textures: 0,
has_descriptor_indexing: self.gpu_info.has_descriptor_indexing,
}
for i in n_buffers..n_buffers + n_images {
bindings.push(
vk::DescriptorSetLayoutBinding::builder()
.binding(i)
.descriptor_type(vk::DescriptorType::STORAGE_IMAGE)
.descriptor_count(1)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build(),
);
}
let descriptor_set_layout = device.create_descriptor_set_layout(
&vk::DescriptorSetLayoutCreateInfo::builder().bindings(&bindings),
None,
)?;
let descriptor_set_layouts = [descriptor_set_layout];
// Create compute pipeline.
let code_u32 = convert_u32_vec(code);
let compute_shader_module = device
.create_shader_module(&vk::ShaderModuleCreateInfo::builder().code(&code_u32), None)?;
let entry_name = CString::new("main").unwrap();
let pipeline_layout = device.create_pipeline_layout(
&vk::PipelineLayoutCreateInfo::builder().set_layouts(&descriptor_set_layouts),
None,
)?;
let pipeline = device
.create_compute_pipelines(
vk::PipelineCache::null(),
&[vk::ComputePipelineCreateInfo::builder()
.stage(
vk::PipelineShaderStageCreateInfo::builder()
.stage(vk::ShaderStageFlags::COMPUTE)
.module(compute_shader_module)
.name(&entry_name)
.build(),
)
.layout(pipeline_layout)
.build()],
None,
)
.map_err(|(_pipeline, err)| err)?[0];
Ok(Pipeline {
pipeline,
pipeline_layout,
descriptor_set_layout,
})
}
unsafe fn create_descriptor_set(
&self,
pipeline: &Pipeline,
bufs: &[&Buffer],
images: &[&Image],
) -> Result<DescriptorSet, Error> {
let device = &self.device.device;
let mut descriptor_pool_sizes = Vec::new();
if !bufs.is_empty() {
descriptor_pool_sizes.push(
vk::DescriptorPoolSize::builder()
.ty(vk::DescriptorType::STORAGE_BUFFER)
.descriptor_count(bufs.len() as u32)
.build(),
);
unsafe fn descriptor_set_builder(&self) -> DescriptorSetBuilder {
DescriptorSetBuilder {
buffers: Vec::new(),
images: Vec::new(),
textures: Vec::new(),
sampler: vk::Sampler::null(),
}
if !images.is_empty() {
descriptor_pool_sizes.push(
vk::DescriptorPoolSize::builder()
.ty(vk::DescriptorType::STORAGE_IMAGE)
.descriptor_count(images.len() as u32)
.build(),
);
}
let descriptor_pool = device.create_descriptor_pool(
&vk::DescriptorPoolCreateInfo::builder()
.pool_sizes(&descriptor_pool_sizes)
.max_sets(1),
None,
)?;
let descriptor_set_layouts = [pipeline.descriptor_set_layout];
let descriptor_sets = device
.allocate_descriptor_sets(
&vk::DescriptorSetAllocateInfo::builder()
.descriptor_pool(descriptor_pool)
.set_layouts(&descriptor_set_layouts),
)
.unwrap();
for (i, buf) in bufs.iter().enumerate() {
let buf_info = vk::DescriptorBufferInfo::builder()
.buffer(buf.buffer)
.offset(0)
.range(vk::WHOLE_SIZE)
.build();
device.update_descriptor_sets(
&[vk::WriteDescriptorSet::builder()
.dst_set(descriptor_sets[0])
.dst_binding(i as u32)
.descriptor_type(vk::DescriptorType::STORAGE_BUFFER)
.buffer_info(&[buf_info])
.build()],
&[],
);
}
for (i, image) in images.iter().enumerate() {
let binding = i + bufs.len();
let image_info = vk::DescriptorImageInfo::builder()
.sampler(vk::Sampler::null())
.image_view(image.image_view)
.image_layout(vk::ImageLayout::GENERAL)
.build();
device.update_descriptor_sets(
&[vk::WriteDescriptorSet::builder()
.dst_set(descriptor_sets[0])
.dst_binding(binding as u32)
.descriptor_type(vk::DescriptorType::STORAGE_IMAGE)
.image_info(&[image_info])
.build()],
&[],
);
}
Ok(DescriptorSet {
descriptor_set: descriptor_sets[0],
})
}
fn create_cmd_buf(&self) -> Result<CmdBuf, Error> {
@ -797,6 +750,32 @@ impl crate::Device for VkDevice {
device.unmap_memory(buffer.buffer_memory);
Ok(())
}
unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error> {
let device = &self.device.device;
let filter = match params {
SamplerParams::Linear => vk::Filter::LINEAR,
SamplerParams::Nearest => vk::Filter::NEAREST,
};
let sampler = device.create_sampler(
&vk::SamplerCreateInfo::builder()
.mag_filter(filter)
.min_filter(filter)
.mipmap_mode(vk::SamplerMipmapMode::LINEAR)
.address_mode_u(vk::SamplerAddressMode::CLAMP_TO_BORDER)
.address_mode_v(vk::SamplerAddressMode::CLAMP_TO_BORDER)
.address_mode_w(vk::SamplerAddressMode::CLAMP_TO_BORDER)
.mip_lod_bias(0.0)
.compare_op(vk::CompareOp::NEVER)
.min_lod(0.0)
.max_lod(0.0)
.border_color(vk::BorderColor::FLOAT_TRANSPARENT_BLACK)
.max_anisotropy(1.0)
.anisotropy_enable(false),
None,
)?;
Ok(sampler)
}
}
impl crate::CmdBuf<VkDevice> for CmdBuf {
@ -902,9 +881,10 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn clear_buffer(&self, buffer: &Buffer) {
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
let device = &self.device.device;
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, vk::WHOLE_SIZE, 0);
let size = size.unwrap_or(vk::WHOLE_SIZE);
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
@ -1032,6 +1012,234 @@ impl crate::MemFlags for MemFlags {
}
}
impl crate::PipelineBuilder<VkDevice> for PipelineBuilder {
fn add_buffers(&mut self, n_buffers: u32) {
let start = self.bindings.len() as u32;
for i in 0..n_buffers {
self.bindings.push(
vk::DescriptorSetLayoutBinding::builder()
.binding(start + i)
.descriptor_type(vk::DescriptorType::STORAGE_BUFFER)
.descriptor_count(1)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build(),
);
self.binding_flags
.push(vk::DescriptorBindingFlags::default());
}
}
fn add_images(&mut self, n_images: u32) {
let start = self.bindings.len() as u32;
for i in 0..n_images {
self.bindings.push(
vk::DescriptorSetLayoutBinding::builder()
.binding(start + i)
.descriptor_type(vk::DescriptorType::STORAGE_IMAGE)
.descriptor_count(1)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build(),
);
self.binding_flags
.push(vk::DescriptorBindingFlags::default());
}
}
fn add_textures(&mut self, max_textures: u32) {
let start = self.bindings.len() as u32;
self.bindings.push(
vk::DescriptorSetLayoutBinding::builder()
.binding(start)
.descriptor_type(vk::DescriptorType::STORAGE_IMAGE)
.descriptor_count(max_textures)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build(),
);
let flags = if self.has_descriptor_indexing {
vk::DescriptorBindingFlags::VARIABLE_DESCRIPTOR_COUNT
} else {
Default::default()
};
self.binding_flags.push(flags);
self.max_textures += max_textures;
}
unsafe fn create_compute_pipeline(
self,
device: &VkDevice,
code: &[u8],
) -> Result<Pipeline, Error> {
let device = &device.device.device;
let descriptor_set_layout = device.create_descriptor_set_layout(
&vk::DescriptorSetLayoutCreateInfo::builder()
.bindings(&self.bindings)
// It might be a slight optimization not to push this if max_textures = 0
.push_next(
&mut vk::DescriptorSetLayoutBindingFlagsCreateInfo::builder()
.binding_flags(&self.binding_flags)
.build(),
),
None,
)?;
let descriptor_set_layouts = [descriptor_set_layout];
// Create compute pipeline.
let code_u32 = convert_u32_vec(code);
let compute_shader_module = device
.create_shader_module(&vk::ShaderModuleCreateInfo::builder().code(&code_u32), None)?;
let entry_name = CString::new("main").unwrap();
let pipeline_layout = device.create_pipeline_layout(
&vk::PipelineLayoutCreateInfo::builder().set_layouts(&descriptor_set_layouts),
None,
)?;
let pipeline = device
.create_compute_pipelines(
vk::PipelineCache::null(),
&[vk::ComputePipelineCreateInfo::builder()
.stage(
vk::PipelineShaderStageCreateInfo::builder()
.stage(vk::ShaderStageFlags::COMPUTE)
.module(compute_shader_module)
.name(&entry_name)
.build(),
)
.layout(pipeline_layout)
.build()],
None,
)
.map_err(|(_pipeline, err)| err)?[0];
Ok(Pipeline {
pipeline,
pipeline_layout,
descriptor_set_layout,
max_textures: self.max_textures,
})
}
}
impl crate::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) {
self.buffers.extend(buffers.iter().map(|b| b.buffer));
}
fn add_images(&mut self, images: &[&Image]) {
self.images.extend(images.iter().map(|i| i.image_view));
}
fn add_textures(&mut self, images: &[&Image]) {
self.textures.extend(images.iter().map(|i| i.image_view));
}
unsafe fn build(self, device: &VkDevice, pipeline: &Pipeline) -> Result<DescriptorSet, Error> {
let device = &device.device.device;
let mut descriptor_pool_sizes = Vec::new();
if !self.buffers.is_empty() {
descriptor_pool_sizes.push(
vk::DescriptorPoolSize::builder()
.ty(vk::DescriptorType::STORAGE_BUFFER)
.descriptor_count(self.buffers.len() as u32)
.build(),
);
}
if !self.images.is_empty() {
descriptor_pool_sizes.push(
vk::DescriptorPoolSize::builder()
.ty(vk::DescriptorType::STORAGE_IMAGE)
.descriptor_count(self.images.len() as u32)
.build(),
);
}
if pipeline.max_textures > 0 {
descriptor_pool_sizes.push(
vk::DescriptorPoolSize::builder()
.ty(vk::DescriptorType::STORAGE_IMAGE)
.descriptor_count(pipeline.max_textures)
.build(),
);
}
let descriptor_pool = device.create_descriptor_pool(
&vk::DescriptorPoolCreateInfo::builder()
.pool_sizes(&descriptor_pool_sizes)
.max_sets(1),
None,
)?;
let descriptor_set_layouts = [pipeline.descriptor_set_layout];
let counts = &[pipeline.max_textures];
let variable_info = vk::DescriptorSetVariableDescriptorCountAllocateInfo::builder()
.descriptor_counts(counts);
let descriptor_sets = device
.allocate_descriptor_sets(
&vk::DescriptorSetAllocateInfo::builder()
.descriptor_pool(descriptor_pool)
.set_layouts(&descriptor_set_layouts)
.push_next(&mut variable_info.build()),
)
.unwrap();
let mut binding = 0;
// Maybe one call to update_descriptor_sets with an array of descriptor_writes?
for buf in &self.buffers {
device.update_descriptor_sets(
&[vk::WriteDescriptorSet::builder()
.dst_set(descriptor_sets[0])
.dst_binding(binding)
.descriptor_type(vk::DescriptorType::STORAGE_BUFFER)
.buffer_info(&[vk::DescriptorBufferInfo::builder()
.buffer(*buf)
.offset(0)
.range(vk::WHOLE_SIZE)
.build()])
.build()],
&[],
);
binding += 1;
}
for image in &self.images {
device.update_descriptor_sets(
&[vk::WriteDescriptorSet::builder()
.dst_set(descriptor_sets[0])
.dst_binding(binding)
.descriptor_type(vk::DescriptorType::STORAGE_IMAGE)
.image_info(&[vk::DescriptorImageInfo::builder()
.sampler(vk::Sampler::null())
.image_view(*image)
.image_layout(vk::ImageLayout::GENERAL)
.build()])
.build()],
&[],
);
binding += 1;
}
if !self.textures.is_empty() {
let infos = self
.textures
.iter()
.map(|texture| {
vk::DescriptorImageInfo::builder()
.sampler(self.sampler)
.image_view(*texture)
.image_layout(vk::ImageLayout::GENERAL)
.build()
})
.collect::<Vec<_>>();
device.update_descriptor_sets(
&[vk::WriteDescriptorSet::builder()
.dst_set(descriptor_sets[0])
.dst_binding(binding)
.descriptor_type(vk::DescriptorType::STORAGE_IMAGE)
.image_info(&infos)
.build()],
&[],
);
//binding += 1;
}
Ok(DescriptorSet {
descriptor_set: descriptor_sets[0],
})
}
}
impl VkSwapchain {
pub unsafe fn next(&mut self) -> Result<(usize, vk::Semaphore), Error> {
let acquisition_semaphore = self.acquisition_semaphores[self.acquisition_idx];
@ -1075,6 +1283,64 @@ impl VkSwapchain {
}
}
impl Extensions {
fn new(exist_exts: Vec<vk::ExtensionProperties>) -> Extensions {
Extensions {
exist_exts,
exts: vec![],
}
}
fn try_add(&mut self, ext: &'static CStr) -> bool {
unsafe {
if self
.exist_exts
.iter()
.find(|x| CStr::from_ptr(x.extension_name.as_ptr()) == ext)
.is_some()
{
self.exts.push(ext.as_ptr());
true
} else {
false
}
}
}
fn as_ptrs(&self) -> &[*const c_char] {
&self.exts
}
}
impl Layers {
fn new(exist_layers: Vec<vk::LayerProperties>) -> Layers {
Layers {
exist_layers,
layers: vec![],
}
}
fn try_add(&mut self, ext: &'static CStr) -> bool {
unsafe {
if self
.exist_layers
.iter()
.find(|x| CStr::from_ptr(x.layer_name.as_ptr()) == ext)
.is_some()
{
self.layers.push(ext.as_ptr());
true
} else {
false
}
}
}
fn as_ptrs(&self) -> &[*const c_char] {
&self.layers
}
}
unsafe fn choose_compute_device(
instance: &Instance,
devices: &[vk::PhysicalDevice],
@ -1136,5 +1402,6 @@ fn map_image_layout(layout: ImageLayout) -> vk::ImageLayout {
ImageLayout::BlitSrc => vk::ImageLayout::TRANSFER_SRC_OPTIMAL,
ImageLayout::BlitDst => vk::ImageLayout::TRANSFER_DST_OPTIMAL,
ImageLayout::General => vk::ImageLayout::GENERAL,
ImageLayout::ShaderRead => vk::ImageLayout::SHADER_READ_ONLY_OPTIMAL,
}
}

View file

@ -3,34 +3,33 @@ use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod annotated {
struct AnnoFill {
// The bbox is always first, as we take advantage of common
// layout when binning.
struct AnnoImage {
bbox: [f32; 4],
rgba_color: u32,
linewidth: f32,
index: u32,
offset: [i16; 2],
}
struct AnnoFillMask {
struct AnnoColor {
bbox: [f32; 4],
mask: f32,
}
struct AnnoStroke {
bbox: [f32; 4],
rgba_color: u32,
// For stroked fills.
// For the nonuniform scale case, this needs to be a 2x2 matrix.
// That's expected to be uncommon, so we could special-case it.
linewidth: f32,
rgba_color: u32,
}
struct AnnoClip {
struct AnnoBeginClip {
bbox: [f32; 4],
linewidth: f32,
}
struct AnnoEndClip {
bbox: [f32; 4],
}
enum Annotated {
Nop,
Stroke(AnnoStroke),
Fill(AnnoFill),
FillMask(AnnoFillMask),
FillMaskInv(AnnoFillMask),
BeginClip(AnnoClip),
EndClip(AnnoClip),
Color(TagFlags, AnnoColor),
Image(TagFlags, AnnoImage),
BeginClip(TagFlags, AnnoBeginClip),
EndClip(AnnoEndClip),
}
}
}

View file

@ -8,12 +8,5 @@ piet_gpu! {
struct BinInstance {
element_ix: u32,
}
struct BinChunk {
// First chunk can have n = 0, subsequent ones not.
n: u32,
next: Ref<BinChunk>,
// Instances follow
}
}
}

View file

@ -3,65 +3,20 @@ use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod pathseg {
struct PathFillLine {
p0: [f32; 2],
p1: [f32; 2],
path_ix: u32,
// A note: the layout of this struct is shared with
// PathStrokeLine. In that case, we actually write
// [0.0, 0.0] as the stroke field, to minimize divergence.
}
struct PathStrokeLine {
p0: [f32; 2],
p1: [f32; 2],
path_ix: u32,
// halfwidth in both x and y for binning
stroke: [f32; 2],
}
struct PathFillCubic {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
path_ix: u32,
// A note: the layout of this struct is shared with
// PathStrokeCubic. In that case, we actually write
// [0.0, 0.0] as the stroke field, to minimize divergence.
}
struct PathStrokeCubic {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
path_ix: u32,
// halfwidth in both x and y for binning
stroke: [f32; 2],
}
/*
struct PathQuad {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
stroke: [f32; 2],
}
struct PathCubic {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
path_ix: u32,
// trans_ix is the transform index. It is 1-based, 0 means no transformation.
trans_ix: u32,
// Halfwidth in both x and y for binning. For strokes only.
stroke: [f32; 2],
}
*/
enum PathSeg {
Nop,
FillLine(PathFillLine),
StrokeLine(PathStrokeLine),
FillCubic(PathFillCubic),
StrokeCubic(PathStrokeCubic),
/*
Quad(AnnoQuadSeg),
Cubic(AnnoCubicSeg),
*/
Cubic(TagFlags, PathCubic),
}
}
}
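Illustration only (not part of the diff): with `PathSeg` reduced to a single `Cubic` variant, lines and quadratics have to be expressed as cubics somewhere upstream. How that conversion is done is not shown in this hunk; the standard Bezier degree-elevation identities below are a sketch of the general idea, not code from this commit.

    // Standard degree elevation, shown as a sketch.
    fn lerp(a: [f32; 2], b: [f32; 2], t: f32) -> [f32; 2] {
        [a[0] + (b[0] - a[0]) * t, a[1] + (b[1] - a[1]) * t]
    }
    // A quadratic (p0, p1, p2) is exactly the cubic with these control points.
    fn quad_to_cubic(p0: [f32; 2], p1: [f32; 2], p2: [f32; 2]) -> [[f32; 2]; 4] {
        [p0, lerp(p0, p1, 2.0 / 3.0), lerp(p2, p1, 2.0 / 3.0), p2]
    }
    // A line (p0, p1) is the cubic with control points at 1/3 and 2/3.
    fn line_to_cubic(p0: [f32; 2], p1: [f32; 2]) -> [[f32; 2]; 4] {
        [p0, lerp(p0, p1, 1.0 / 3.0), lerp(p0, p1, 2.0 / 3.0), p1]
    }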

View file

@ -3,88 +3,41 @@ use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod ptcl {
struct CmdCircle {
center: [f32; 2],
radius: f32,
rgba_color: u32,
}
struct CmdLine {
start: [f32; 2],
end: [f32; 2],
}
struct CmdStroke {
// This is really a Ref<Tile>, but we don't have cross-module
// references.
tile_ref: u32,
half_width: f32,
rgba_color: u32,
}
struct CmdFill {
// As above, really Ref<Tile>
tile_ref: u32,
backdrop: i32,
}
struct CmdColor {
rgba_color: u32,
}
struct CmdFillMask {
tile_ref: u32,
backdrop: i32,
mask: f32,
struct CmdImage {
index: u32,
offset: [i16; 2],
}
struct CmdBeginClip {
tile_ref: u32,
backdrop: i32,
}
// This is mostly here for expedience and can always be optimized
// out for pure clips, but will be useful for blend groups.
struct CmdBeginSolidClip {
struct CmdAlpha {
alpha: f32,
}
struct CmdEndClip {
// This will be 1.0 for clips, but we can imagine blend groups.
alpha: f32,
}
struct CmdSolid {
rgba_color: u32,
}
struct CmdSolidMask {
mask: f32,
}
struct CmdJump {
new_ref: u32,
}
enum Cmd {
End,
Circle(CmdCircle),
Line(CmdLine),
Fill(CmdFill),
FillMask(CmdFillMask),
FillMaskInv(CmdFillMask),
BeginClip(CmdBeginClip),
BeginSolidClip(CmdBeginSolidClip),
EndClip(CmdEndClip),
Stroke(CmdStroke),
Solid(CmdSolid),
SolidMask(CmdSolidMask),
Solid,
Alpha(CmdAlpha),
Color(CmdColor),
Image(CmdImage),
BeginClip,
EndClip,
Jump(CmdJump),
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct Segment {
start: [f32; 2],
end: [f32; 2],
// This is used for fills only, but we're including it in
// the general structure for simplicity.
y_edge: f32,
}
struct SegChunk {
n: u32,
next: Ref<SegChunk>,
// Actually a reference to a variable-sized slice.
segs: Ref<Segment>,
}
}
}

View file

@ -1,7 +1,7 @@
use piet_gpu_derive::piet_gpu;
pub use self::scene::{
Clip, CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform,
Clip, CubicSeg, Element, FillColor, LineSeg, QuadSeg, SetFillMode, SetLineWidth, Transform,
};
piet_gpu! {
@ -22,14 +22,12 @@ piet_gpu! {
p2: [f32; 2],
p3: [f32; 2],
}
struct Fill {
struct FillColor {
rgba_color: u32,
}
struct FillMask {
mask: f32,
}
struct Stroke {
rgba_color: u32,
struct FillImage {
index: u32,
offset: [i16; 2],
}
struct SetLineWidth {
width: f32,
@ -42,27 +40,23 @@ piet_gpu! {
bbox: [f32; 4],
// TODO: add alpha?
}
struct SetFillMode {
fill_mode: u32,
}
enum Element {
Nop,
// Another approach to encoding would be to use a single
// variant but have a bool for fill/stroke. This could be
// packed into the tag, so the on-the-wire representation
// would be very similar to what's here.
StrokeLine(LineSeg),
FillLine(LineSeg),
StrokeQuad(QuadSeg),
FillQuad(QuadSeg),
StrokeCubic(CubicSeg),
FillCubic(CubicSeg),
Stroke(Stroke),
Fill(Fill),
Line(LineSeg),
Quad(QuadSeg),
Cubic(CubicSeg),
FillColor(FillColor),
SetLineWidth(SetLineWidth),
Transform(Transform),
FillMask(FillMask),
FillMaskInv(FillMask),
BeginClip(Clip),
EndClip(Clip),
FillImage(FillImage),
SetFillMode(SetFillMode),
}
}
}

View file

@ -11,6 +11,7 @@ piet_gpu! {
flags: u32,
path_count: u32,
pathseg_count: u32,
trans_count: u32,
}
}
}

View file

@ -13,10 +13,14 @@ piet_gpu! {
}
// Segments within a tile are represented as a linked list.
struct TileSeg {
start: [f32; 2],
end: [f32; 2],
origin: [f32; 2],
vector: [f32; 2],
y_edge: f32,
next: Ref<TileSeg>,
}
struct TransformSeg {
mat: [f32; 4],
translate: [f32; 2],
}
}
}
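Illustration only (not part of the diff): `TransformSeg` carries an affine transform as a 2x2 matrix plus a translation. Assuming the same column-major coefficient order as the scene-level `Transform` (an assumption; the layout is not spelled out in this hunk), applying it to a point would look like:

    // Hypothetical sketch; assumes mat = [a, b, c, d] with
    // x' = a*x + c*y + tx and y' = b*x + d*y + ty.
    fn apply_transform(mat: [f32; 4], translate: [f32; 2], p: [f32; 2]) -> [f32; 2] {
        [
            mat[0] * p[0] + mat[2] * p[1] + translate[0],
            mat[1] * p[0] + mat[3] * p[1] + translate[1],
        ]
    }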

View file

@ -14,6 +14,11 @@ path = "bin/cli.rs"
name = "winit"
path = "bin/winit.rs"
[[example]]
name = "android"
path = "bin/android.rs"
crate-type = ["cdylib"]
[dependencies.piet-gpu-hal]
path = "../piet-gpu-hal"
@ -27,3 +32,9 @@ rand = "0.7.3"
roxmltree = "0.13"
winit = "0.23"
clap = "2.33"
[target.'cfg(target_os = "android")'.dependencies]
ndk = "0.3"
ndk-sys = "0.2.0"
ndk-glue = "0.3"
raw-window-handle = "0.3"

176
piet-gpu/bin/android.rs Normal file
View file

@ -0,0 +1,176 @@
//! Android example
//!
//! Run using `cargo apk run --example android`
//!
//! Requires the [cargo-apk] tool.
//! [cargo-apk]: https://crates.io/crates/cargo-apk
use raw_window_handle::android::AndroidHandle;
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use ndk::native_window::NativeWindow;
use ndk_glue::Event;
use piet_gpu_hal::hub;
use piet_gpu_hal::vulkan::{QueryPool, VkInstance, VkSurface, VkSwapchain};
use piet_gpu_hal::{CmdBuf, Error, ImageLayout};
use piet_gpu::{render_scene, PietGpuRenderContext, Renderer};
#[cfg_attr(target_os = "android", ndk_glue::main(backtrace = "on"))]
fn main() {
my_main().unwrap();
}
struct MyHandle {
handle: AndroidHandle,
}
// State required to render and present the contents
struct GfxState {
session: hub::Session,
renderer: Renderer,
swapchain: VkSwapchain,
current_frame: usize,
last_frame_idx: usize,
submitted: Option<hub::SubmittedCmdBuf>,
query_pools: Vec<QueryPool>,
present_semaphores: Vec<hub::Semaphore>,
}
const WIDTH: usize = 1080;
const HEIGHT: usize = 2280;
const NUM_FRAMES: usize = 2;
fn my_main() -> Result<(), Error> {
let mut gfx_state = None;
loop {
for event in ndk_glue::poll_events() {
println!("got event {:?}", event);
match event {
Event::WindowCreated => {
let window = ndk_glue::native_window();
if let Some(window) = &*window {
let handle = get_handle(window);
let (instance, surface) = VkInstance::new(Some(&handle))?;
gfx_state = Some(GfxState::new(&instance, surface.as_ref())?);
} else {
println!("native window is sadly none");
}
}
Event::WindowRedrawNeeded => {
if let Some(gfx_state) = gfx_state.as_mut() {
for _ in 0..10 {
gfx_state.redraw();
}
}
}
_ => (),
}
}
}
}
fn get_handle(window: &NativeWindow) -> MyHandle {
println!(
"window = {:?}, {}x{}",
window.ptr(),
window.width(),
window.height()
);
let mut handle = AndroidHandle::empty();
handle.a_native_window = window.ptr().as_ptr() as *mut std::ffi::c_void;
MyHandle { handle }
}
unsafe impl HasRawWindowHandle for MyHandle {
fn raw_window_handle(&self) -> RawWindowHandle {
RawWindowHandle::Android(self.handle)
}
}
impl GfxState {
fn new(instance: &VkInstance, surface: Option<&VkSurface>) -> Result<GfxState, Error> {
unsafe {
let device = instance.device(surface)?;
let mut swapchain =
instance.swapchain(WIDTH / 2, HEIGHT / 2, &device, surface.unwrap())?;
let session = hub::Session::new(device);
let mut current_frame = 0;
let present_semaphores = (0..NUM_FRAMES)
.map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(8))
.collect::<Result<Vec<_>, Error>>()?;
let mut ctx = PietGpuRenderContext::new();
render_scene(&mut ctx);
let n_paths = ctx.path_count();
let n_pathseg = ctx.pathseg_count();
            let n_trans = ctx.trans_count();
let scene = ctx.get_scene_buf();
let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?;
let submitted: Option<hub::SubmittedCmdBuf> = None;
let current_frame = 0;
let last_frame_idx = 0;
Ok(GfxState {
session,
renderer,
swapchain,
current_frame,
last_frame_idx,
submitted,
query_pools,
present_semaphores,
})
}
}
fn redraw(&mut self) {
println!("redraw");
unsafe {
if let Some(submitted) = self.submitted.take() {
submitted.wait().unwrap();
let ts = self
.session
.fetch_query_pool(&self.query_pools[self.last_frame_idx])
.unwrap();
println!("render time: {:?}", ts);
}
let frame_idx = self.current_frame % NUM_FRAMES;
let (image_idx, acquisition_semaphore) = self.swapchain.next().unwrap();
let swap_image = self.swapchain.image(image_idx);
let query_pool = &self.query_pools[frame_idx];
let mut cmd_buf = self.session.cmd_buf().unwrap();
cmd_buf.begin();
self.renderer.record(&mut cmd_buf, &query_pool);
// Image -> Swapchain
cmd_buf.image_barrier(&swap_image, ImageLayout::Undefined, ImageLayout::BlitDst);
cmd_buf.blit_image(self.renderer.image_dev.vk_image(), &swap_image);
cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
cmd_buf.finish();
self.submitted = Some(
self.session
.run_cmd_buf(
cmd_buf,
&[acquisition_semaphore],
&[self.present_semaphores[frame_idx]],
)
.unwrap(),
);
self.last_frame_idx = frame_idx;
self.swapchain
.present(image_idx, &[self.present_semaphores[frame_idx]])
.unwrap();
self.current_frame += 1;
}
}
}

View file

@ -248,10 +248,11 @@ fn main() -> Result<(), Error> {
}
let n_paths = ctx.path_count();
let n_pathseg = ctx.pathseg_count();
let n_trans = ctx.trans_count();
let scene = ctx.get_scene_buf();
//dump_scene(&scene);
let renderer = Renderer::new(&session, scene, n_paths, n_pathseg)?;
let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?;
let image_buf =
session.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;

View file

@ -25,7 +25,8 @@ fn main() -> Result<(), Error> {
let (instance, surface) = VkInstance::new(Some(&window))?;
unsafe {
let device = instance.device(surface.as_ref())?;
let mut swapchain = instance.swapchain(&device, surface.as_ref().unwrap())?;
let mut swapchain =
instance.swapchain(WIDTH / 2, HEIGHT / 2, &device, surface.as_ref().unwrap())?;
let session = hub::Session::new(device);
let mut current_frame = 0;
@ -40,11 +41,13 @@ fn main() -> Result<(), Error> {
render_scene(&mut ctx);
let n_paths = ctx.path_count();
let n_pathseg = ctx.pathseg_count();
let n_trans = ctx.trans_count();
let scene = ctx.get_scene_buf();
let renderer = Renderer::new(&session, scene, n_paths, n_pathseg)?;
let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?;
let mut submitted: Option<hub::SubmittedCmdBuf> = None;
let mut last_frame_idx = 0;
event_loop.run(move |event, _, control_flow| {
*control_flow = ControlFlow::Poll; // `ControlFlow::Wait` if only re-render on event
@ -63,12 +66,16 @@ fn main() -> Result<(), Error> {
}
Event::RedrawRequested(window_id) if window_id == window.id() => {
let frame_idx = current_frame % NUM_FRAMES;
let query_pool = &query_pools[frame_idx];
// Note: this logic is a little strange. We have two sets of renderer
// resources, so we could have two frames in flight (submit two, wait on
// the first), but we actually just wait on the last submitted.
//
// Getting this right will take some thought.
if let Some(submitted) = submitted.take() {
submitted.wait().unwrap();
let ts = session.fetch_query_pool(query_pool).unwrap();
let ts = session.fetch_query_pool(&query_pools[last_frame_idx]).unwrap();
window.set_title(&format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
ts[6] * 1e3,
@ -82,8 +89,10 @@ fn main() -> Result<(), Error> {
));
}
let (image_idx, acquisition_semaphore) = swapchain.next().unwrap();
let swap_image = swapchain.image(image_idx);
let query_pool = &query_pools[frame_idx];
let mut cmd_buf = session.cmd_buf().unwrap();
cmd_buf.begin();
renderer.record(&mut cmd_buf, &query_pool);
@ -105,6 +114,7 @@ fn main() -> Result<(), Error> {
&[present_semaphores[frame_idx]],
)
.unwrap());
last_frame_idx = frame_idx;
swapchain
.present(image_idx, &[present_semaphores[frame_idx]])

View file

@ -1,18 +1,20 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Code auto-generated by piet-gpu-derive
struct AnnoFillRef {
struct AnnoImageRef {
uint offset;
};
struct AnnoFillMaskRef {
struct AnnoColorRef {
uint offset;
};
struct AnnoStrokeRef {
struct AnnoBeginClipRef {
uint offset;
};
struct AnnoClipRef {
struct AnnoEndClipRef {
uint offset;
};
@ -20,210 +22,204 @@ struct AnnotatedRef {
uint offset;
};
struct AnnoFill {
struct AnnoImage {
vec4 bbox;
float linewidth;
uint index;
ivec2 offset;
};
#define AnnoImage_size 28
AnnoImageRef AnnoImage_index(AnnoImageRef ref, uint index) {
return AnnoImageRef(ref.offset + index * AnnoImage_size);
}
struct AnnoColor {
vec4 bbox;
float linewidth;
uint rgba_color;
};
#define AnnoFill_size 20
#define AnnoColor_size 24
AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) {
return AnnoFillRef(ref.offset + index * AnnoFill_size);
AnnoColorRef AnnoColor_index(AnnoColorRef ref, uint index) {
return AnnoColorRef(ref.offset + index * AnnoColor_size);
}
struct AnnoFillMask {
struct AnnoBeginClip {
vec4 bbox;
float mask;
};
#define AnnoFillMask_size 20
AnnoFillMaskRef AnnoFillMask_index(AnnoFillMaskRef ref, uint index) {
return AnnoFillMaskRef(ref.offset + index * AnnoFillMask_size);
}
struct AnnoStroke {
vec4 bbox;
uint rgba_color;
float linewidth;
};
#define AnnoStroke_size 24
#define AnnoBeginClip_size 20
AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) {
return AnnoStrokeRef(ref.offset + index * AnnoStroke_size);
AnnoBeginClipRef AnnoBeginClip_index(AnnoBeginClipRef ref, uint index) {
return AnnoBeginClipRef(ref.offset + index * AnnoBeginClip_size);
}
struct AnnoClip {
struct AnnoEndClip {
vec4 bbox;
};
#define AnnoClip_size 16
#define AnnoEndClip_size 16
AnnoClipRef AnnoClip_index(AnnoClipRef ref, uint index) {
return AnnoClipRef(ref.offset + index * AnnoClip_size);
AnnoEndClipRef AnnoEndClip_index(AnnoEndClipRef ref, uint index) {
return AnnoEndClipRef(ref.offset + index * AnnoEndClip_size);
}
#define Annotated_Nop 0
#define Annotated_Stroke 1
#define Annotated_Fill 2
#define Annotated_FillMask 3
#define Annotated_FillMaskInv 4
#define Annotated_BeginClip 5
#define Annotated_EndClip 6
#define Annotated_size 28
#define Annotated_Color 1
#define Annotated_Image 2
#define Annotated_BeginClip 3
#define Annotated_EndClip 4
#define Annotated_size 32
AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
return AnnotatedRef(ref.offset + index * Annotated_size);
}
AnnoFill AnnoFill_read(AnnoFillRef ref) {
struct AnnotatedTag {
uint tag;
uint flags;
};
AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
AnnoFill s;
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
uint raw2 = read_mem(a, ix + 2);
uint raw3 = read_mem(a, ix + 3);
uint raw4 = read_mem(a, ix + 4);
uint raw5 = read_mem(a, ix + 5);
uint raw6 = read_mem(a, ix + 6);
AnnoImage s;
s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.rgba_color = raw4;
s.linewidth = uintBitsToFloat(raw4);
s.index = raw5;
s.offset = ivec2(int(raw6 << 16) >> 16, int(raw6) >> 16);
return s;
}
void AnnoFill_write(AnnoFillRef ref, AnnoFill s) {
void AnnoImage_write(Alloc a, AnnoImageRef ref, AnnoImage s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.bbox.x);
annotated[ix + 1] = floatBitsToUint(s.bbox.y);
annotated[ix + 2] = floatBitsToUint(s.bbox.z);
annotated[ix + 3] = floatBitsToUint(s.bbox.w);
annotated[ix + 4] = s.rgba_color;
write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
write_mem(a, ix + 5, s.index);
write_mem(a, ix + 6, (uint(s.offset.x) & 0xffff) | (uint(s.offset.y) << 16));
}
AnnoFillMask AnnoFillMask_read(AnnoFillMaskRef ref) {
AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
AnnoFillMask s;
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
uint raw2 = read_mem(a, ix + 2);
uint raw3 = read_mem(a, ix + 3);
uint raw4 = read_mem(a, ix + 4);
uint raw5 = read_mem(a, ix + 5);
AnnoColor s;
s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.mask = uintBitsToFloat(raw4);
s.linewidth = uintBitsToFloat(raw4);
s.rgba_color = raw5;
return s;
}
void AnnoFillMask_write(AnnoFillMaskRef ref, AnnoFillMask s) {
void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.bbox.x);
annotated[ix + 1] = floatBitsToUint(s.bbox.y);
annotated[ix + 2] = floatBitsToUint(s.bbox.z);
annotated[ix + 3] = floatBitsToUint(s.bbox.w);
annotated[ix + 4] = floatBitsToUint(s.mask);
write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
write_mem(a, ix + 5, s.rgba_color);
}
AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
uint raw5 = annotated[ix + 5];
AnnoStroke s;
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
uint raw2 = read_mem(a, ix + 2);
uint raw3 = read_mem(a, ix + 3);
uint raw4 = read_mem(a, ix + 4);
AnnoBeginClip s;
s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.rgba_color = raw4;
s.linewidth = uintBitsToFloat(raw5);
s.linewidth = uintBitsToFloat(raw4);
return s;
}
void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) {
void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.bbox.x);
annotated[ix + 1] = floatBitsToUint(s.bbox.y);
annotated[ix + 2] = floatBitsToUint(s.bbox.z);
annotated[ix + 3] = floatBitsToUint(s.bbox.w);
annotated[ix + 4] = s.rgba_color;
annotated[ix + 5] = floatBitsToUint(s.linewidth);
write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
}
AnnoClip AnnoClip_read(AnnoClipRef ref) {
AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
AnnoClip s;
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
uint raw2 = read_mem(a, ix + 2);
uint raw3 = read_mem(a, ix + 3);
AnnoEndClip s;
s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
void AnnoClip_write(AnnoClipRef ref, AnnoClip s) {
void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.bbox.x);
annotated[ix + 1] = floatBitsToUint(s.bbox.y);
annotated[ix + 2] = floatBitsToUint(s.bbox.z);
annotated[ix + 3] = floatBitsToUint(s.bbox.w);
write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
}
uint Annotated_tag(AnnotatedRef ref) {
return annotated[ref.offset >> 2];
AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) {
uint tag_and_flags = read_mem(a, ref.offset >> 2);
return AnnotatedTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
}
AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) {
return AnnoStroke_read(AnnoStrokeRef(ref.offset + 4));
AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref) {
return AnnoColor_read(a, AnnoColorRef(ref.offset + 4));
}
AnnoFill Annotated_Fill_read(AnnotatedRef ref) {
return AnnoFill_read(AnnoFillRef(ref.offset + 4));
AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref) {
return AnnoImage_read(a, AnnoImageRef(ref.offset + 4));
}
AnnoFillMask Annotated_FillMask_read(AnnotatedRef ref) {
return AnnoFillMask_read(AnnoFillMaskRef(ref.offset + 4));
AnnoBeginClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) {
return AnnoBeginClip_read(a, AnnoBeginClipRef(ref.offset + 4));
}
AnnoFillMask Annotated_FillMaskInv_read(AnnotatedRef ref) {
return AnnoFillMask_read(AnnoFillMaskRef(ref.offset + 4));
AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) {
return AnnoEndClip_read(a, AnnoEndClipRef(ref.offset + 4));
}
AnnoClip Annotated_BeginClip_read(AnnotatedRef ref) {
return AnnoClip_read(AnnoClipRef(ref.offset + 4));
void Annotated_Nop_write(Alloc a, AnnotatedRef ref) {
write_mem(a, ref.offset >> 2, Annotated_Nop);
}
AnnoClip Annotated_EndClip_read(AnnotatedRef ref) {
return AnnoClip_read(AnnoClipRef(ref.offset + 4));
void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s) {
write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_Color);
AnnoColor_write(a, AnnoColorRef(ref.offset + 4), s);
}
void Annotated_Nop_write(AnnotatedRef ref) {
annotated[ref.offset >> 2] = Annotated_Nop;
void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s) {
write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_Image);
AnnoImage_write(a, AnnoImageRef(ref.offset + 4), s);
}
void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) {
annotated[ref.offset >> 2] = Annotated_Stroke;
AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s);
void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginClip s) {
write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_BeginClip);
AnnoBeginClip_write(a, AnnoBeginClipRef(ref.offset + 4), s);
}
void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) {
annotated[ref.offset >> 2] = Annotated_Fill;
AnnoFill_write(AnnoFillRef(ref.offset + 4), s);
}
void Annotated_FillMask_write(AnnotatedRef ref, AnnoFillMask s) {
annotated[ref.offset >> 2] = Annotated_FillMask;
AnnoFillMask_write(AnnoFillMaskRef(ref.offset + 4), s);
}
void Annotated_FillMaskInv_write(AnnotatedRef ref, AnnoFillMask s) {
annotated[ref.offset >> 2] = Annotated_FillMaskInv;
AnnoFillMask_write(AnnoFillMaskRef(ref.offset + 4), s);
}
void Annotated_BeginClip_write(AnnotatedRef ref, AnnoClip s) {
annotated[ref.offset >> 2] = Annotated_BeginClip;
AnnoClip_write(AnnoClipRef(ref.offset + 4), s);
}
void Annotated_EndClip_write(AnnotatedRef ref, AnnoClip s) {
annotated[ref.offset >> 2] = Annotated_EndClip;
AnnoClip_write(AnnoClipRef(ref.offset + 4), s);
void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s) {
write_mem(a, ref.offset >> 2, Annotated_EndClip);
AnnoEndClip_write(a, AnnoEndClipRef(ref.offset + 4), s);
}
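Illustration only (not part of the diff): the generated readers and writers above pack the variant tag and the new per-element flags into one word, tag in the low 16 bits and flags in the high 16. The same layout in a small Rust sketch:

    // Mirrors the (flags << 16) | tag packing used by Annotated_*_write and
    // decoded by Annotated_tag above.
    fn pack_tag_word(tag: u32, flags: u32) -> u32 {
        (flags << 16) | (tag & 0xffff)
    }
    fn unpack_tag_word(word: u32) -> (u32, u32) {
        // Returns (tag, flags).
        (word & 0xffff, word >> 16)
    }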

View file

@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Propagation of tile backdrop for filling.
//
// Each thread reads one path element and calculates the number of spanned tiles
@ -13,6 +15,7 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
@ -20,54 +23,49 @@
layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
// This is really only used for n_elements; maybe we can handle that
// a different way, but it's convenient to have the same signature as
// tile allocation.
layout(set = 0, binding = 1) readonly buffer AllocBuf {
uint n_elements; // paths
uint n_pathseg;
uint alloc;
};
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
#include "annotated.h"
#include "tile.h"
shared uint sh_row_count[BACKDROP_WG];
shared uint sh_row_base[BACKDROP_WG];
shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG];
void main() {
uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
// Work assignment: 1 thread : 1 path element
uint row_count = 0;
if (element_ix < n_elements) {
uint tag = Annotated_tag(ref);
switch (tag) {
case Annotated_Fill:
case Annotated_FillMask:
case Annotated_FillMaskInv:
bool mem_ok = mem_error == NO_ERROR;
if (element_ix < conf.n_elements) {
AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
switch (tag.tag) {
case Annotated_Image:
case Annotated_BeginClip:
PathRef path_ref = PathRef(element_ix * Path_size);
Path path = Path_read(path_ref);
case Annotated_Color:
if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
break;
}
// Fall through.
PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
Path path = Path_read(conf.tile_alloc, path_ref);
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
row_count = path.bbox.w - path.bbox.y;
if (row_count == 1) {
// Paths that don't cross tile top edges don't have backdrops.
// Don't apply the optimization to paths that may cross the y = 0
            // top edge but are clipped to a single row.
if (row_count == 1 && path.bbox.y > 0) {
// Note: this can probably be expanded to width = 2 as
// long as it doesn't cross the left edge.
row_count = 0;
}
sh_row_base[th_ix] = (path.tiles.offset >> 2) + 1;
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
sh_row_alloc[th_ix] = path_alloc;
}
}
@ -93,16 +91,19 @@ void main() {
el_ix = probe;
}
}
uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0);
uint width = sh_row_width[el_ix];
// Process one row sequentially
// Read backdrop value per tile and prefix sum it
uint tile_el_ix = sh_row_base[el_ix] + seq_ix * 2 * width;
uint sum = tile[tile_el_ix];
for (uint x = 1; x < width; x++) {
tile_el_ix += 2;
sum += tile[tile_el_ix];
tile[tile_el_ix] = sum;
if (width > 0 && mem_ok) {
// Process one row sequentially
// Read backdrop value per tile and prefix sum it
Alloc tiles_alloc = sh_row_alloc[el_ix];
uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0);
uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width;
uint sum = read_mem(tiles_alloc, tile_el_ix);
for (uint x = 1; x < width; x++) {
tile_el_ix += 2;
sum += read_mem(tiles_alloc, tile_el_ix);
write_mem(tiles_alloc, tile_el_ix, sum);
}
}
}
}
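Illustration only (not part of the diff): the per-row loop above is just a running sum of signed per-tile backdrops. Conceptually, in Rust with signed values for clarity:

    // Conceptual sketch of the row pass: each tile ends up with the winding
    // contribution carried in from the tiles to its left.
    fn accumulate_backdrops(row: &mut [i32]) {
        for x in 1..row.len() {
            row[x] += row[x - 1];
        }
    }
    // e.g. [1, 0, 0, -1, 0] becomes [1, 1, 1, 0, 0]: the first three tiles
    // start inside the filled region (nonzero rule), the last two do not.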

Binary file not shown.

View file

@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
@ -7,21 +9,13 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
layout(set = 0, binding = 1) buffer AllocBuf {
uint n_elements; // paths
uint alloc;
};
layout(set = 0, binding = 2) buffer BinsBuf {
uint[] bins;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
#include "annotated.h"
@ -38,39 +32,41 @@ layout(set = 0, binding = 2) buffer BinsBuf {
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared uint sh_chunk_start[N_TILE];
shared Alloc sh_chunk_alloc[N_TILE];
shared bool sh_alloc_failed;
void main() {
uint my_n_elements = n_elements;
uint my_n_elements = conf.n_elements;
uint my_partition = gl_WorkGroupID.x;
for (uint i = 0; i < N_SLICE; i++) {
bitmaps[i][gl_LocalInvocationID.x] = 0;
}
if (gl_LocalInvocationID.x == 0) {
sh_alloc_failed = false;
}
barrier();
// Read inputs and determine coverage of bins
uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
uint tag = Annotated_Nop;
if (element_ix < my_n_elements) {
tag = Annotated_tag(ref);
tag = Annotated_tag(conf.anno_alloc, ref).tag;
}
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
switch (tag) {
case Annotated_Fill:
case Annotated_FillMask:
case Annotated_FillMaskInv:
case Annotated_Stroke:
case Annotated_Color:
case Annotated_Image:
case Annotated_BeginClip:
case Annotated_EndClip:
// Note: we take advantage of the fact that these drawing elements
// have the bbox at the same place in their layout.
AnnoFill fill = Annotated_Fill_read(ref);
x0 = int(floor(fill.bbox.x * SX));
y0 = int(floor(fill.bbox.y * SY));
x1 = int(ceil(fill.bbox.z * SX));
y1 = int(ceil(fill.bbox.w * SY));
AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
x0 = int(floor(clip.bbox.x * SX));
y0 = int(floor(clip.bbox.y * SY));
x1 = int(ceil(clip.bbox.z * SX));
y1 = int(ceil(clip.bbox.w * SY));
break;
}
@ -78,16 +74,18 @@ void main() {
// trying to keep divergence low.
// Right now, it's just a bbox, but we'll get finer with
// segments.
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
x0 = clamp(x0, 0, int(width_in_bins));
x1 = clamp(x1, x0, int(width_in_bins));
y0 = clamp(y0, 0, int(height_in_bins));
y1 = clamp(y1, y0, int(height_in_bins));
if (x0 == x1) y1 = y0;
int x = x0, y = y0;
uint my_slice = gl_LocalInvocationID.x / 32;
uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
while (y < y1) {
atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask);
atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
x++;
if (x == x1) {
x = x0;
@ -103,33 +101,42 @@ void main() {
count[i][gl_LocalInvocationID.x] = element_count;
}
// element_count is number of elements covering bin for this invocation.
uint chunk_start = 0;
Alloc chunk_alloc = new_alloc(0, 0, true);
if (element_count != 0) {
// TODO: aggregate atomic adds (subgroup is probably fastest)
chunk_start = atomicAdd(alloc, element_count * BinInstance_size);
sh_chunk_start[gl_LocalInvocationID.x] = chunk_start;
MallocResult chunk = malloc(element_count * BinInstance_size);
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed) {
sh_alloc_failed = true;
}
}
// Note: it might be more efficient for reading to do this in the
// other order (each bin is a contiguous sequence of partitions)
uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
bins[out_ix] = element_count;
bins[out_ix + 1] = chunk_start;
uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
write_mem(conf.bin_alloc, out_ix, element_count);
write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
barrier();
if (sh_alloc_failed || mem_error != NO_ERROR) {
return;
}
// Use similar strategy as Laine & Karras paper; loop over bbox of bins
// touched by this element
x = x0;
y = y0;
while (y < y1) {
uint bin_ix = y * N_TILE_X + x;
uint bin_ix = y * width_in_bins + x;
uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0) {
uint idx = bitCount(out_mask & (my_mask - 1));
if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix];
}
uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix));
Alloc out_alloc = sh_chunk_alloc[bin_ix];
uint out_offset = out_alloc.offset + idx * BinInstance_size;
BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
}
x++;
if (x == x1) {

Binary file not shown.

View file

@ -1,13 +1,11 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Code auto-generated by piet-gpu-derive
struct BinInstanceRef {
uint offset;
};
struct BinChunkRef {
uint offset;
};
struct BinInstance {
uint element_ix;
};
@ -18,43 +16,16 @@ BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
return BinInstanceRef(ref.offset + index * BinInstance_size);
}
struct BinChunk {
uint n;
BinChunkRef next;
};
#define BinChunk_size 8
BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
return BinChunkRef(ref.offset + index * BinChunk_size);
}
BinInstance BinInstance_read(BinInstanceRef ref) {
BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = bins[ix + 0];
uint raw0 = read_mem(a, ix + 0);
BinInstance s;
s.element_ix = raw0;
return s;
}
void BinInstance_write(BinInstanceRef ref, BinInstance s) {
void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) {
uint ix = ref.offset >> 2;
bins[ix + 0] = s.element_ix;
}
BinChunk BinChunk_read(BinChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = bins[ix + 0];
uint raw1 = bins[ix + 1];
BinChunk s;
s.n = raw0;
s.next = BinChunkRef(raw1);
return s;
}
void BinChunk_write(BinChunkRef ref, BinChunk s) {
uint ix = ref.offset >> 2;
bins[ix + 0] = s.n;
bins[ix + 1] = s.next.offset;
write_mem(a, ix + 0, s.element_ix);
}

View file

@ -5,7 +5,7 @@
glslang_validator = glslangValidator
rule glsl
command = $glslang_validator -V -o $out $in
command = $glslang_validator $flags -V -o $out $in
build elements.spv: glsl elements.comp | scene.h state.h annotated.h
@ -21,3 +21,6 @@ build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
build kernel4_idx.spv: glsl kernel4.comp | ptcl.h setup.h
flags = -DENABLE_IMAGE_INDICES

View file

@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The coarse rasterizer stage of the pipeline.
//
// As input we have the ordered partitions of paths from the binning phase and
@ -11,29 +13,13 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
};
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint n_elements;
uint alloc;
};
layout(set = 0, binding = 4) buffer PtclBuf {
uint[] ptcl;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
#include "annotated.h"
@ -48,7 +34,7 @@ shared uint sh_elements[N_TILE];
// Number of elements in the partition; prefix sum.
shared uint sh_part_count[N_PART_READ];
shared uint sh_part_elements[N_PART_READ];
shared Alloc sh_part_elements[N_PART_READ];
shared uint sh_bitmaps[N_SLICE][N_TILE];
@ -62,33 +48,96 @@ shared uint sh_tile_y0[N_TILE];
shared uint sh_tile_base[N_TILE];
shared uint sh_tile_stride[N_TILE];
// Perhaps cmd_limit should be a global? This is a style question.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset > cmd_limit) {
uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_ref, jump);
cmd_ref = CmdRef(new_cmd);
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
#ifdef MEM_DEBUG
// Store allocs only when MEM_DEBUG to save shared memory traffic.
shared Alloc sh_tile_alloc[N_TILE];
void write_tile_alloc(uint el_ix, Alloc a) {
sh_tile_alloc[el_ix] = a;
}
Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
return sh_tile_alloc[el_ix];
}
#else
void write_tile_alloc(uint el_ix, Alloc a) {
// No-op
}
Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
// All memory.
return new_alloc(0, memory.length()*4, mem_ok);
}
#endif
// The maximum number of commands per annotated element.
#define ANNO_COMMANDS 2
// Perhaps cmd_alloc should be a global? This is a style question.
bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset < cmd_limit) {
return true;
}
MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
if (new_cmd.failed) {
return false;
}
CmdJump jump = CmdJump(new_cmd.alloc.offset);
Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
cmd_alloc = new_cmd.alloc;
cmd_ref = CmdRef(cmd_alloc.offset);
// Reserve space for the maximum number of commands and a potential jump.
cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
return true;
}
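// Added note on the limit arithmetic above: each PTCL chunk keeps
// (ANNO_COMMANDS + 1) * Cmd_size bytes of headroom, enough for the at most
// ANNO_COMMANDS commands a single annotated element emits (e.g. a fill or
// solid from write_fill below plus a color, image or clip command) and the
// Cmd_Jump that links to the next chunk, so alloc_cmd only needs to be
// called once per element.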
void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) {
if (fill_mode_from_flags(flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
cmd_ref.offset += 4 + CmdFill_size;
} else {
Cmd_Solid_write(alloc, cmd_ref);
cmd_ref.offset += 4;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
}
}
void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
uint partition_ix = 0;
uint n_partitions = (n_elements + N_TILE - 1) / N_TILE;
uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
uint th_ix = gl_LocalInvocationID.x;
// Coordinates of top left of bin, in tiles.
uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;
// Per-tile state
uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x;
CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
// Reserve space for the maximum number of commands and a potential jump.
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
// The nesting depth of the clip stack
uint clip_depth = 0;
// State for the "clip zero" optimization. If it's nonzero, then we are
// currently in a clip for which the entire tile has an alpha of zero, and
// the value is the depth after the "begin clip" of that element.
uint clip_zero_depth = 0;
// State for the "clip one" optimization. If bit `i` is set, then that means
// that the clip pushed at depth `i` has an alpha of all one.
uint clip_one_mask = 0;
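// Added illustrative trace: if a BeginClip's tile has neither segments nor
// backdrop (the clip is fully transparent here), clip_zero_depth becomes
// clip_depth + 1 and everything up to the matching EndClip is dropped from
// this tile's command list. If instead the clip has no segments but a
// nonzero backdrop (it covers the whole tile), the corresponding bit of
// clip_one_mask is set and the BeginClip/EndClip pair itself is elided,
// with only the clipped contents being emitted.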
// I'm sure we can figure out how to do this with at least one fewer register...
// Items up to rd_ix have been read from sh_elements
@ -98,6 +147,14 @@ void main() {
// Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
uint part_start_ix = 0;
uint ready_ix = 0;
// Leave room for the fine rasterizer scratch allocation.
Alloc scratch_alloc = slice_mem(cmd_alloc, 0, Alloc_size);
cmd_ref.offset += Alloc_size;
uint num_begin_slots = 0;
uint begin_slot = 0;
bool mem_ok = mem_error == NO_ERROR;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
@ -109,9 +166,10 @@ void main() {
part_start_ix = ready_ix;
uint count = 0;
if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = bins[in_ix];
sh_part_elements[th_ix] = bins[in_ix + 1];
uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = read_mem(conf.bin_alloc, in_ix);
uint offset = read_mem(conf.bin_alloc, in_ix + 1);
sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size, mem_ok);
}
// prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) {
@ -135,7 +193,7 @@ void main() {
}
// use binary search to find element to read
uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix) {
if (ix >= wr_ix && ix < ready_ix && mem_ok) {
uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + ((N_PART_READ / 2) >> i);
@ -144,8 +202,9 @@ void main() {
}
}
ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]);
BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix));
Alloc bin_alloc = sh_part_elements[part_ix];
BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
sh_elements[th_ix] = inst.element_ix;
}
barrier();
@ -161,23 +220,21 @@ void main() {
AnnotatedRef ref;
if (th_ix + rd_ix < wr_ix) {
element_ix = sh_elements[th_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
tag = Annotated_tag(conf.anno_alloc, ref).tag;
}
// Bounding box of element in pixel coordinates.
uint tile_count;
switch (tag) {
case Annotated_Fill:
case Annotated_FillMask:
case Annotated_FillMaskInv:
case Annotated_Stroke:
case Annotated_Color:
case Annotated_Image:
case Annotated_BeginClip:
case Annotated_EndClip:
// We have one "path" for each element, even if the element isn't
// actually a path (currently EndClip, but images etc in the future).
uint path_ix = element_ix;
Path path = Path_read(PathRef(path_ix * Path_size));
Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
uint stride = path.bbox.z - path.bbox.x;
sh_tile_stride[th_ix] = stride;
int dx = int(path.bbox.x) - int(bin_tile_x);
@ -193,6 +250,8 @@ void main() {
// base relative to bin
uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
sh_tile_base[th_ix] = base;
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
write_tile_alloc(th_ix, path_alloc);
break;
default:
tile_count = 0;
@ -220,23 +279,21 @@ void main() {
el_ix = probe;
}
}
AnnotatedRef ref = AnnotatedRef(sh_elements[el_ix] * Annotated_size);
uint tag = Annotated_tag(ref);
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
uint tag = Annotated_tag(conf.anno_alloc, ref).tag;
uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
uint width = sh_tile_width[el_ix];
uint x = sh_tile_x0[el_ix] + seq_ix % width;
uint y = sh_tile_y0[el_ix] + seq_ix / width;
bool include_tile;
bool include_tile = false;
if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
include_tile = true;
} else {
Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
} else if (mem_ok) {
Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
// Include the path in the tile if
// - the tile contains at least a segment (tile offset non-zero)
// - the tile is completely covered (backdrop non-zero)
bool inside = tile.backdrop != 0;
bool fill = tag != Annotated_FillMaskInv;
include_tile = tile.tile.offset != 0 || inside == fill;
include_tile = tile.tile.offset != 0 || tile.backdrop != 0;
}
if (include_tile) {
uint el_slice = el_ix / 32;
@ -251,7 +308,7 @@ void main() {
// through the non-segment elements.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
while (true) {
while (mem_ok) {
if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
@ -271,80 +328,83 @@ void main() {
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag).
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
switch (tag) {
case Annotated_Fill:
Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoFill fill = Annotated_Fill_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
if (tile.tile.offset != 0) {
CmdFill cmd_fill;
cmd_fill.tile_ref = tile.tile.offset;
cmd_fill.backdrop = tile.backdrop;
cmd_fill.rgba_color = fill.rgba_color;
Cmd_Fill_write(cmd_ref, cmd_fill);
} else {
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
}
cmd_ref.offset += Cmd_size;
break;
case Annotated_BeginClip:
tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
alloc_cmd(cmd_ref, cmd_limit);
if (tile.tile.offset != 0) {
CmdBeginClip cmd_begin_clip;
cmd_begin_clip.tile_ref = tile.tile.offset;
cmd_begin_clip.backdrop = tile.backdrop;
Cmd_BeginClip_write(cmd_ref, cmd_begin_clip);
} else {
// TODO: here is where a bunch of optimization magic should happen
float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
Cmd_BeginSolidClip_write(cmd_ref, CmdBeginSolidClip(alpha));
}
cmd_ref.offset += Cmd_size;
break;
case Annotated_EndClip:
alloc_cmd(cmd_ref, cmd_limit);
Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0));
cmd_ref.offset += Cmd_size;
break;
case Annotated_FillMask:
case Annotated_FillMaskInv:
tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoFillMask fill_mask = Annotated_FillMask_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
if (tile.tile.offset != 0) {
CmdFillMask cmd_fill;
cmd_fill.tile_ref = tile.tile.offset;
cmd_fill.backdrop = tile.backdrop;
cmd_fill.mask = fill_mask.mask;
if (tag == Annotated_FillMask) {
Cmd_FillMask_write(cmd_ref, cmd_fill);
} else {
Cmd_FillMaskInv_write(cmd_ref, cmd_fill);
if (clip_zero_depth == 0) {
switch (tag.tag) {
case Annotated_Color:
Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
} else {
Cmd_SolidMask_write(cmd_ref, CmdSolidMask(fill_mask.mask));
write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
cmd_ref.offset += 4 + CmdColor_size;
break;
case Annotated_Image:
tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill_img.linewidth);
Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
cmd_ref.offset += 4 + CmdImage_size;
break;
case Annotated_BeginClip:
tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1;
} else if (tile.tile.offset == 0 && clip_depth < 32) {
clip_one_mask |= (1 << clip_depth);
} else {
AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
write_fill(cmd_alloc, cmd_ref, tag.flags, tile, begin_clip.linewidth);
Cmd_BeginClip_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
if (clip_depth < 32) {
clip_one_mask &= ~(1 << clip_depth);
}
begin_slot++;
num_begin_slots = max(num_begin_slots, begin_slot);
}
clip_depth++;
break;
case Annotated_EndClip:
clip_depth--;
if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
Cmd_Solid_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
begin_slot--;
Cmd_EndClip_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
}
break;
}
} else {
// In "clip zero" state, suppress all drawing
switch (tag.tag) {
case Annotated_BeginClip:
clip_depth++;
break;
case Annotated_EndClip:
if (clip_depth == clip_zero_depth) {
clip_zero_depth = 0;
}
clip_depth--;
break;
}
cmd_ref.offset += Cmd_size;
break;
case Annotated_Stroke:
tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.tile_ref = tile.tile.offset;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
break;
}
}
barrier();
@ -352,5 +412,15 @@ void main() {
rd_ix += N_TILE;
if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
}
Cmd_End_write(cmd_ref);
if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
Cmd_End_write(cmd_alloc, cmd_ref);
if (num_begin_slots > 0) {
// Write scratch allocation: one state per BeginClip per rasterizer chunk.
uint scratch_size = num_begin_slots * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
MallocResult scratch = malloc(scratch_size);
// Ignore scratch.failed; this stage doesn't use the allocation itself, and
// kernel4 checks for memory overflow before using it.
alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc);
}
}
}

Binary file not shown.

View file

@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
@ -7,6 +9,9 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
@ -14,44 +19,40 @@
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(set = 0, binding = 2) readonly buffer SceneBuf {
uint[] scene;
};
// It would be better to use the Vulkan memory model than
// "volatile" but shooting for compatibility here rather
// than doing things right.
layout(set = 0, binding = 1) volatile buffer StateBuf {
layout(set = 0, binding = 3) volatile buffer StateBuf {
uint part_counter;
uint[] state;
};
// The annotated results are stored here.
layout(set = 0, binding = 2) buffer AnnotatedBuf {
uint[] annotated;
};
// Path segments are stored here.
layout(set = 0, binding = 3) buffer PathSegBuf {
uint[] pathseg;
};
#include "scene.h"
#include "state.h"
#include "annotated.h"
#include "pathseg.h"
#include "tile.h"
#define StateBuf_stride (8 + 2 * State_size)
#define StateBuf_stride (4 + 2 * State_size)
StateRef state_aggregate_ref(uint partition_ix) {
return StateRef(12 + partition_ix * StateBuf_stride);
return StateRef(4 + partition_ix * StateBuf_stride);
}
StateRef state_prefix_ref(uint partition_ix) {
return StateRef(12 + partition_ix * StateBuf_stride + State_size);
return StateRef(4 + partition_ix * StateBuf_stride + State_size);
}
uint state_flag_index(uint partition_ix) {
return 1 + partition_ix * (StateBuf_stride / 4);
return partition_ix * (StateBuf_stride / 4);
}
// These correspond to X, A, P respectively in the prefix sum paper.
@ -62,6 +63,11 @@ uint state_flag_index(uint partition_ix) {
#define FLAG_SET_LINEWIDTH 1
#define FLAG_SET_BBOX 2
#define FLAG_RESET_BBOX 4
#define FLAG_SET_FILL_MODE 8
// Fill modes take up the next bit. Non-zero fill is 0, stroke is 1.
#define LG_FILL_MODE 4
#define FILL_MODE_BITS 1
#define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE)
// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate)
@ -87,17 +93,21 @@ State combine_state(State a, State b) {
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags;
c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags;
fill_mode &= FILL_MODE_MASK;
c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode;
c.path_count = a.path_count + b.path_count;
c.pathseg_count = a.pathseg_count + b.pathseg_count;
c.trans_count = a.trans_count + b.trans_count;
return c;
}
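// Added example of the combine rule above: in a scan, the right-hand operand
// wins whenever it carries a SET flag. combine_state(a, b) keeps a.linewidth
// unless b has FLAG_SET_LINEWIDTH, and likewise keeps a's fill-mode bit
// unless b has FLAG_SET_FILL_MODE; the path, pathseg and transform counts
// simply add.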
State map_element(ElementRef ref) {
// TODO: it would *probably* be more efficient to make the memory read patterns less
// divergent, though it would be more wasted memory.
uint tag = Element_tag(ref);
uint tag = Element_tag(ref).tag;
State c;
c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
c.mat = vec4(1.0, 0.0, 0.0, 1.0);
@ -106,32 +116,28 @@ State map_element(ElementRef ref) {
c.flags = 0;
c.path_count = 0;
c.pathseg_count = 0;
c.trans_count = 0;
switch (tag) {
case Element_FillLine:
case Element_StrokeLine:
LineSeg line = Element_FillLine_read(ref);
case Element_Line:
LineSeg line = Element_Line_read(ref);
c.bbox.xy = min(line.p0, line.p1);
c.bbox.zw = max(line.p0, line.p1);
c.pathseg_count = 1;
break;
case Element_FillQuad:
case Element_StrokeQuad:
QuadSeg quad = Element_FillQuad_read(ref);
case Element_Quad:
QuadSeg quad = Element_Quad_read(ref);
c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
c.pathseg_count = 1;
break;
case Element_FillCubic:
case Element_StrokeCubic:
CubicSeg cubic = Element_FillCubic_read(ref);
case Element_Cubic:
CubicSeg cubic = Element_Cubic_read(ref);
c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
c.pathseg_count = 1;
break;
case Element_Fill:
case Element_FillMask:
case Element_FillMaskInv:
case Element_Stroke:
case Element_FillColor:
case Element_FillImage:
case Element_BeginClip:
c.flags = FLAG_RESET_BBOX;
c.path_count = 1;
@ -148,6 +154,11 @@ State map_element(ElementRef ref) {
Transform t = Element_Transform_read(ref);
c.mat = t.mat;
c.translate = t.translate;
c.trans_count = 1;
break;
case Element_SetFillMode:
SetFillMode fm = Element_SetFillMode_read(ref);
c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE);
break;
}
return c;
@ -159,16 +170,7 @@ vec2 get_linewidth(State st) {
return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
}
// We should be able to use an array of structs but the NV shader compiler
// doesn't seem to like it :/
//shared State sh_state[WG_SIZE];
shared vec4 sh_mat[WG_SIZE];
shared vec2 sh_translate[WG_SIZE];
shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];
shared uint sh_path_count[WG_SIZE];
shared uint sh_pathseg_count[WG_SIZE];
shared State sh_state[WG_SIZE];
shared uint sh_part_ix;
shared State sh_prefix;
@ -178,7 +180,7 @@ void main() {
// Determine partition to process by atomic counter (described in Section
// 4.4 of prefix sum paper).
if (gl_LocalInvocationID.x == 0) {
sh_part_ix = atomicAdd(state[0], 1);
sh_part_ix = atomicAdd(part_counter, 1);
}
barrier();
uint part_ix = sh_part_ix;
@ -193,35 +195,15 @@ void main() {
th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
}
State agg = th_state[N_ROWS - 1];
sh_mat[gl_LocalInvocationID.x] = agg.mat;
sh_translate[gl_LocalInvocationID.x] = agg.translate;
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
sh_flags[gl_LocalInvocationID.x] = agg.flags;
sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
sh_state[gl_LocalInvocationID.x] = agg;
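// Added note: the loop below is a Hillis-Steele style inclusive scan over the
// State monoid; after step i each invocation holds the combination of up to
// 2^(i+1) preceding per-thread aggregates, so LG_WG_SIZE steps cover the
// whole workgroup.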
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1 << i)) {
State other;
uint ix = gl_LocalInvocationID.x - (1 << i);
other.mat = sh_mat[ix];
other.translate = sh_translate[ix];
other.bbox = sh_bbox[ix];
other.linewidth = sh_width[ix];
other.flags = sh_flags[ix];
other.path_count = sh_path_count[ix];
other.pathseg_count = sh_pathseg_count[ix];
State other = sh_state[gl_LocalInvocationID.x - (1 << i)];
agg = combine_state(other, agg);
}
barrier();
sh_mat[gl_LocalInvocationID.x] = agg.mat;
sh_translate[gl_LocalInvocationID.x] = agg.translate;
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
sh_flags[gl_LocalInvocationID.x] = agg.flags;
sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
sh_state[gl_LocalInvocationID.x] = agg;
}
State exclusive;
@ -232,6 +214,7 @@ void main() {
exclusive.flags = 0;
exclusive.path_count = 0;
exclusive.pathseg_count = 0;
exclusive.trans_count = 0;
// Publish aggregate for this partition
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
@ -302,15 +285,7 @@ void main() {
State row = exclusive;
if (gl_LocalInvocationID.x > 0) {
uint ix = gl_LocalInvocationID.x - 1;
State other;
other.mat = sh_mat[ix];
other.translate = sh_translate[ix];
other.bbox = sh_bbox[ix];
other.linewidth = sh_width[ix];
other.flags = sh_flags[ix];
other.path_count = sh_path_count[ix];
other.pathseg_count = sh_pathseg_count[ix];
State other = sh_state[gl_LocalInvocationID.x - 1];
row = combine_state(row, other);
}
for (uint i = 0; i < N_ROWS; i++) {
@ -320,125 +295,115 @@ void main() {
// gains to be had from stashing in shared memory or possibly
// registers (though register pressure is an issue).
ElementRef this_ref = Element_index(ref, i);
uint tag = Element_tag(this_ref);
switch (tag) {
case Element_FillLine:
case Element_StrokeLine:
LineSeg line = Element_StrokeLine_read(this_ref);
vec2 p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
vec2 p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
PathStrokeCubic path_cubic;
path_cubic.p0 = p0;
path_cubic.p1 = mix(p0, p1, 1.0 / 3.0);
path_cubic.p2 = mix(p1, p0, 1.0 / 3.0);
path_cubic.p3 = p1;
ElementTag tag = Element_tag(this_ref);
uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE);
bool is_stroke = fill_mode == MODE_STROKE;
switch (tag.tag) {
case Element_Line:
LineSeg line = Element_Line_read(this_ref);
PathCubic path_cubic;
path_cubic.p0 = line.p0;
path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0);
path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0);
path_cubic.p3 = line.p1;
path_cubic.path_ix = st.path_count;
if (tag == Element_StrokeLine) {
path_cubic.trans_ix = st.trans_count;
if (is_stroke) {
path_cubic.stroke = get_linewidth(st);
} else {
path_cubic.stroke = vec2(0.0);
}
// We do encoding a bit by hand to minimize divergence. Another approach
// would be to have a fill/stroke bool.
PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
pathseg[path_out_ref.offset >> 2] = out_tag;
PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
break;
case Element_FillQuad:
case Element_StrokeQuad:
QuadSeg quad = Element_StrokeQuad_read(this_ref);
p0 = st.mat.xy * quad.p0.x + st.mat.zw * quad.p0.y + st.translate;
p1 = st.mat.xy * quad.p1.x + st.mat.zw * quad.p1.y + st.translate;
vec2 p2 = st.mat.xy * quad.p2.x + st.mat.zw * quad.p2.y + st.translate;
path_cubic;
path_cubic.p0 = p0;
path_cubic.p1 = mix(p1, p0, 1.0 / 3.0);
path_cubic.p2 = mix(p1, p2, 1.0 / 3.0);
path_cubic.p3 = p2;
case Element_Quad:
QuadSeg quad = Element_Quad_read(this_ref);
path_cubic.p0 = quad.p0;
path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0);
path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0);
path_cubic.p3 = quad.p2;
path_cubic.path_ix = st.path_count;
if (tag == Element_StrokeQuad) {
path_cubic.trans_ix = st.trans_count;
if (is_stroke) {
path_cubic.stroke = get_linewidth(st);
} else {
path_cubic.stroke = vec2(0.0);
}
// We do encoding a bit by hand to minimize divergence. Another approach
// would be to have a fill/stroke bool.
path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic;
pathseg[path_out_ref.offset >> 2] = out_tag;
PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
break;
case Element_FillCubic:
case Element_StrokeCubic:
CubicSeg cubic = Element_StrokeCubic_read(this_ref);
path_cubic;
path_cubic.p0 = st.mat.xy * cubic.p0.x + st.mat.zw * cubic.p0.y + st.translate;
path_cubic.p1 = st.mat.xy * cubic.p1.x + st.mat.zw * cubic.p1.y + st.translate;
path_cubic.p2 = st.mat.xy * cubic.p2.x + st.mat.zw * cubic.p2.y + st.translate;
path_cubic.p3 = st.mat.xy * cubic.p3.x + st.mat.zw * cubic.p3.y + st.translate;
case Element_Cubic:
CubicSeg cubic = Element_Cubic_read(this_ref);
path_cubic.p0 = cubic.p0;
path_cubic.p1 = cubic.p1;
path_cubic.p2 = cubic.p2;
path_cubic.p3 = cubic.p3;
path_cubic.path_ix = st.path_count;
if (tag == Element_StrokeCubic) {
path_cubic.trans_ix = st.trans_count;
if (is_stroke) {
path_cubic.stroke = get_linewidth(st);
} else {
path_cubic.stroke = vec2(0.0);
}
// We do encoding a bit by hand to minimize divergence. Another approach
// would be to have a fill/stroke bool.
path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
pathseg[path_out_ref.offset >> 2] = out_tag;
PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
break;
case Element_Stroke:
Stroke stroke = Element_Stroke_read(this_ref);
AnnoStroke anno_stroke;
anno_stroke.rgba_color = stroke.rgba_color;
vec2 lw = get_linewidth(st);
anno_stroke.bbox = st.bbox + vec4(-lw, lw);
anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
AnnotatedRef out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
Annotated_Stroke_write(out_ref, anno_stroke);
break;
case Element_Fill:
Fill fill = Element_Fill_read(this_ref);
AnnoFill anno_fill;
case Element_FillColor:
FillColor fill = Element_FillColor_read(this_ref);
AnnoColor anno_fill;
anno_fill.rgba_color = fill.rgba_color;
anno_fill.bbox = st.bbox;
out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
Annotated_Fill_write(out_ref, anno_fill);
if (is_stroke) {
vec2 lw = get_linewidth(st);
anno_fill.bbox = st.bbox + vec4(-lw, lw);
anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
} else {
anno_fill.bbox = st.bbox;
anno_fill.linewidth = 0.0;
}
AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
break;
case Element_FillMask:
FillMask fill_mask = Element_FillMask_read(this_ref);
AnnoFillMask anno_fill_mask;
anno_fill_mask.mask = fill_mask.mask;
anno_fill_mask.bbox = st.bbox;
out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
Annotated_FillMask_write(out_ref, anno_fill_mask);
break;
case Element_FillMaskInv:
fill_mask = Element_FillMaskInv_read(this_ref);
anno_fill_mask.mask = fill_mask.mask;
// The inverse fill conceptually takes up the entire screen.
// TODO: Tighten bounds to contain only affected paths.
anno_fill_mask.bbox = vec4(0, 0, 1e9, 1e9);
out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
Annotated_FillMaskInv_write(out_ref, anno_fill_mask);
case Element_FillImage:
FillImage fill_img = Element_FillImage_read(this_ref);
AnnoImage anno_img;
anno_img.index = fill_img.index;
anno_img.offset = fill_img.offset;
if (is_stroke) {
vec2 lw = get_linewidth(st);
anno_img.bbox = st.bbox + vec4(-lw, lw);
anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
} else {
anno_img.bbox = st.bbox;
anno_img.linewidth = 0.0;
}
out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
break;
case Element_BeginClip:
Clip begin_clip = Element_BeginClip_read(this_ref);
AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox);
AnnoBeginClip anno_begin_clip;
// This is the absolute bbox, it's been transformed during encoding.
anno_begin_clip.bbox = begin_clip.bbox;
out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
Annotated_BeginClip_write(out_ref, anno_begin_clip);
if (is_stroke) {
vec2 lw = get_linewidth(st);
anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
} else {
anno_begin_clip.linewidth = 0.0;
}
out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
break;
case Element_EndClip:
Clip end_clip = Element_EndClip_read(this_ref);
// This bbox is expected to be the same as the begin one.
AnnoClip anno_end_clip = AnnoClip(end_clip.bbox);
out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
Annotated_EndClip_write(out_ref, anno_end_clip);
AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox);
out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
break;
case Element_Transform:
TransformSeg transform = TransformSeg(st.mat, st.translate);
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size);
TransformSeg_write(conf.trans_alloc, trans_ref, transform);
break;
}
}

Binary file not shown.

View file

@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
// in the per-tile command list to an image.
@ -6,187 +8,241 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
#ifdef ENABLE_IMAGE_INDICES
#extension GL_EXT_nonuniform_qualifier : enable
#endif
#include "mem.h"
#include "setup.h"
#define CHUNK 8
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK)
layout(local_size_x = TILE_WIDTH_PX, local_size_y = CHUNK_DY) in;
#define CHUNK_X 2
#define CHUNK_Y 4
#define CHUNK CHUNK_X * CHUNK_Y
#define CHUNK_DX (TILE_WIDTH_PX / CHUNK_X)
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
// Same concern that this should be readonly as in kernel 3.
layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl;
layout(set = 0, binding = 1) restrict readonly buffer ConfigBuf {
Config conf;
};
layout(set = 0, binding = 1) buffer TileBuf {
uint[] tile;
};
layout(rgba8, set = 0, binding = 2) uniform restrict writeonly image2D image;
layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
#ifdef ENABLE_IMAGE_INDICES
layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D images[];
#else
layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D images[1];
#endif
#include "ptcl.h"
#include "tile.h"
#define BLEND_STACK_SIZE 4
mediump vec3 tosRGB(mediump vec3 rgb) {
bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308));
mediump vec3 below = vec3(12.92)*rgb;
mediump vec3 above = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055);
return mix(below, above, cutoff);
}
// Calculate coverage based on backdrop + coverage of each line segment
float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
// Probably better to store as float, but conversion is no doubt cheap.
float area[CHUNK];
for (uint k = 0; k < CHUNK; k++) area[k] = float(backdrop);
TileSegRef tile_seg_ref = TileSegRef(tile_ref);
do {
TileSeg seg = TileSeg_read(tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
vec2 start = seg.start - my_xy;
vec2 end = seg.end - my_xy;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / (end.y - start.y);
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area[k] += a * (window.x - window.y);
}
area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
for (uint k = 0; k < CHUNK; k++) {
area[k] = min(abs(area[k]), 1.0);
mediump vec3 fromsRGB(mediump vec3 srgb) {
// Formula from EXT_sRGB.
bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045));
mediump vec3 below = srgb/vec3(12.92);
mediump vec3 above = pow((srgb + vec3(0.055))/vec3(1.055), vec3(2.4));
return mix(below, above, cutoff);
}
// unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color
// space.
mediump vec4 unpacksRGB(uint srgba) {
mediump vec4 color = unpackUnorm4x8(srgba).wzyx;
return vec4(fromsRGB(color.rgb), color.a);
}
// packsRGB packs a color in the linear color space into its 8-bit sRGB equivalent.
uint packsRGB(mediump vec4 rgba) {
rgba = vec4(tosRGB(rgba.rgb), rgba.a);
return packUnorm4x8(rgba.wzyx);
}
uvec2 chunk_offset(uint i) {
return uvec2(i % CHUNK_X * CHUNK_DX, i / CHUNK_X * CHUNK_DY);
}
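// Added example (assuming 16x16 pixel tiles as configured in setup.h): with
// CHUNK_X = 2 and CHUNK_Y = 4, CHUNK_DX = 8 and CHUNK_DY = 4, so each
// invocation owns CHUNK = 8 pixels at offsets (0,0), (8,0), (0,4), (8,4),
// ..., (8,12) from its base coordinate, and the 8x4 workgroup covers the
// full tile.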
mediump vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) {
mediump vec4 rgba[CHUNK];
for (uint i = 0; i < CHUNK; i++) {
ivec2 uv = ivec2(xy + chunk_offset(i)) + cmd_img.offset;
mediump vec4 fg_rgba;
#ifdef ENABLE_IMAGE_INDICES
fg_rgba = imageLoad(images[cmd_img.index], uv);
#else
fg_rgba = imageLoad(images[0], uv);
#endif
fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
rgba[i] = fg_rgba;
}
return area;
return rgba;
}
void main() {
uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x;
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
// Read scratch space allocation, written first in the command list.
Alloc scratch_alloc = alloc_read(cmd_alloc, cmd_ref.offset);
cmd_ref.offset += Alloc_size;
uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
vec2 xy = vec2(xy_uint);
vec3 rgb[CHUNK];
float mask[CHUNK];
uint blend_stack[BLEND_STACK_SIZE][CHUNK];
uint blend_sp = 0;
mediump vec4 rgba[CHUNK];
for (uint i = 0; i < CHUNK; i++) {
rgb[i] = vec3(0.5);
mask[i] = 1.0;
rgba[i] = vec4(0.0);
// TODO: remove this debug image support when the actual image method is plumbed.
#ifdef DEBUG_IMAGES
#ifdef ENABLE_IMAGE_INDICES
if (xy_uint.x < 1024 && xy_uint.y < 1024) {
rgba[i] = imageLoad(images[gl_WorkGroupID.x / 64], ivec2(xy_uint + chunk_offset(i))/4);
}
#else
if (xy_uint.x < 1024 && xy_uint.y < 1024) {
rgba[i] = imageLoad(images[0], ivec2(xy_uint + chunk_offset(i))/4);
}
#endif
#endif
}
while (true) {
uint tag = Cmd_tag(cmd_ref);
mediump float area[CHUNK];
uint clip_depth = 0;
bool mem_ok = mem_error == NO_ERROR;
while (mem_ok) {
uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
if (tag == Cmd_End) {
break;
}
switch (tag) {
case Cmd_Circle:
CmdCircle circle = Cmd_Circle_read(cmd_ref);
vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
for (uint i = 0; i < CHUNK; i++) {
float dy = float(i * CHUNK_DY);
float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
// TODO: sRGB
rgb[i] = mix(rgb[i], fg_rgba.rgb, mask[i] * alpha * fg_rgba.a);
}
break;
case Cmd_Stroke:
// Calculate distance field from all the line segments in this tile.
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df[CHUNK];
CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
mediump float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
TileSeg seg = TileSeg_read(tile_seg_ref);
vec2 line_vec = seg.end - seg.start;
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
vec2 line_vec = seg.vector;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
dpos.y += float(k * CHUNK_DY);
vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
dpos += vec2(chunk_offset(k));
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * alpha * fg_rgba.a);
area[k] = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
}
cmd_ref.offset += 4 + CmdStroke_size;
break;
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_ref);
float area[CHUNK];
area = computeArea(xy, fill.backdrop, fill.tile_ref);
fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop);
tile_seg_ref = TileSegRef(fill.tile_ref);
// Calculate coverage based on backdrop + coverage of each line segment
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 start = seg.origin - my_xy;
vec2 end = start + seg.vector;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / seg.vector.y;
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area[k] += a * (window.x - window.y);
}
area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
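// Added explanation of the accumulation above: for each pixel, start/end are
// the segment endpoints relative to the pixel origin, window clamps the
// segment's y extent to the pixel's unit height, and xs are the segment's x
// coordinates at those clamped ys. The value `a` is the analytically
// integrated fraction of the pixel lying to the right of the segment over
// that y range (a trapezoid clipped to [0, 1]), so a * (window.x - window.y)
// is a signed, winding-weighted area contribution. The final y_edge term
// adds the winding of segments that were clipped at the tile's left edge in
// path_coarse.comp.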
for (uint k = 0; k < CHUNK; k++) {
rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * area[k] * fg_rgba.a);
}
break;
case Cmd_FillMask:
CmdFillMask fill_mask = Cmd_FillMask_read(cmd_ref);
area = computeArea(xy, fill_mask.backdrop, fill_mask.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
mask[k] = mix(mask[k], fill_mask.mask, area[k]);
}
break;
case Cmd_FillMaskInv:
fill_mask = Cmd_FillMask_read(cmd_ref);
area = computeArea(xy, fill_mask.backdrop, fill_mask.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
mask[k] = mix(mask[k], fill_mask.mask, 1.0 - area[k]);
}
break;
case Cmd_BeginClip:
CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
}
blend_sp++;
break;
case Cmd_BeginSolidClip:
CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
float solid_alpha = begin_solid_clip.alpha;
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
}
blend_sp++;
break;
case Cmd_EndClip:
CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
blend_sp--;
for (uint k = 0; k < CHUNK; k++) {
vec4 rgba = unpackUnorm4x8(blend_stack[blend_sp][k]);
rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
area[k] = min(abs(area[k]), 1.0);
}
cmd_ref.offset += 4 + CmdFill_size;
break;
case Cmd_Solid:
CmdSolid solid = Cmd_Solid_read(cmd_ref);
fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * fg_rgba.a);
area[k] = 1.0;
}
cmd_ref.offset += 4;
break;
case Cmd_SolidMask:
CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_ref);
case Cmd_Alpha:
CmdAlpha alpha = Cmd_Alpha_read(cmd_alloc, cmd_ref);
for (uint k = 0; k < CHUNK; k++) {
mask[k] = solid_mask.mask;
area[k] = alpha.alpha;
}
cmd_ref.offset += 4 + CmdAlpha_size;
break;
case Cmd_Color:
CmdColor color = Cmd_Color_read(cmd_alloc, cmd_ref);
mediump vec4 fg = unpacksRGB(color.rgba_color);
for (uint k = 0; k < CHUNK; k++) {
mediump vec4 fg_k = fg * area[k];
rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
}
cmd_ref.offset += 4 + CmdColor_size;
break;
case Cmd_Image:
CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref);
mediump vec4 img[CHUNK] = fillImage(xy_uint, fill_img);
for (uint k = 0; k < CHUNK; k++) {
mediump vec4 fg_k = img[k] * area[k];
rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
}
cmd_ref.offset += 4 + CmdImage_size;
break;
case Cmd_BeginClip:
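// Added note: the scratch allocation (made at the end of coarse.comp) holds,
// per clip depth, CLIP_STATE_SIZE words for every pixel of the tile.
// BeginClip saves the accumulated color (packed sRGB) and the clip's coverage
// there and resets rgba; the matching EndClip below reads the saved state
// back and composites the clipped contents over it.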
uint base_ix = (scratch_alloc.offset >> 2) + CLIP_STATE_SIZE * (clip_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX +
gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y);
for (uint k = 0; k < CHUNK; k++) {
uvec2 offset = chunk_offset(k);
uint srgb = packsRGB(vec4(rgba[k]));
mediump float alpha = clamp(abs(area[k]), 0.0, 1.0);
write_mem(scratch_alloc, base_ix + 0 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX), srgb);
write_mem(scratch_alloc, base_ix + 1 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX), floatBitsToUint(alpha));
rgba[k] = vec4(0.0);
}
clip_depth++;
cmd_ref.offset += 4;
break;
case Cmd_EndClip:
clip_depth--;
base_ix = (scratch_alloc.offset >> 2) + CLIP_STATE_SIZE * (clip_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX +
gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y);
for (uint k = 0; k < CHUNK; k++) {
uvec2 offset = chunk_offset(k);
uint srgb = read_mem(scratch_alloc, base_ix + 0 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX));
uint alpha = read_mem(scratch_alloc, base_ix + 1 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX));
mediump vec4 bg = unpacksRGB(srgb);
mediump vec4 fg = rgba[k] * area[k] * uintBitsToFloat(alpha);
rgba[k] = bg * (1.0 - fg.a) + fg;
}
cmd_ref.offset += 4;
break;
case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
continue;
cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
cmd_alloc.offset = cmd_ref.offset;
break;
}
cmd_ref.offset += Cmd_size;
}
// TODO: sRGB
for (uint i = 0; i < CHUNK; i++) {
imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(rgb[i], 1.0));
imageStore(image, ivec2(xy_uint + chunk_offset(i)), vec4(tosRGB(rgba[i].rgb), rgba[i].a));
}
}

Binary file not shown.

Binary file not shown.

147
piet-gpu/shader/mem.h Normal file
View file

@ -0,0 +1,147 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
layout(set = 0, binding = 0) buffer Memory {
// offset into memory of the next allocation, initialized by the user.
uint mem_offset;
// mem_error tracks the status of memory accesses, initialized to NO_ERROR
// by the user. ERR_MALLOC_FAILED is reported for insufficient memory.
// If MEM_DEBUG is defined the following errors are reported:
// - ERR_OUT_OF_BOUNDS is reported for out of bounds writes.
// - ERR_UNALIGNED_ACCESS for memory access not aligned to 32-bit words.
uint mem_error;
uint[] memory;
};
// Uncomment this line to add the size field to Alloc and enable memory checks.
// Note that the Config struct in setup.h grows size fields as well.
//#define MEM_DEBUG
#define NO_ERROR 0
#define ERR_MALLOC_FAILED 1
#define ERR_OUT_OF_BOUNDS 2
#define ERR_UNALIGNED_ACCESS 3
#ifdef MEM_DEBUG
#define Alloc_size 16
#else
#define Alloc_size 8
#endif
// Alloc represents a memory allocation.
struct Alloc {
// offset in bytes into memory.
uint offset;
#ifdef MEM_DEBUG
// size in bytes of the allocation.
uint size;
#endif
};
struct MallocResult {
Alloc alloc;
// failed is true if the allocation overflowed memory.
bool failed;
};
// new_alloc synthesizes an Alloc from an offset and size.
Alloc new_alloc(uint offset, uint size, bool mem_ok) {
Alloc a;
a.offset = offset;
#ifdef MEM_DEBUG
if (mem_ok) {
a.size = size;
} else {
a.size = 0;
}
#endif
return a;
}
// malloc allocates size bytes of memory.
MallocResult malloc(uint size) {
MallocResult r;
uint offset = atomicAdd(mem_offset, size);
r.failed = offset + size > memory.length() * 4;
r.alloc = new_alloc(offset, size, !r.failed);
if (r.failed) {
atomicMax(mem_error, ERR_MALLOC_FAILED);
return r;
}
#ifdef MEM_DEBUG
if ((size & 3) != 0) {
r.failed = true;
atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
return r;
}
#endif
return r;
}
// touch_mem checks whether access to the memory word at offset is valid.
// If MEM_DEBUG is defined, touch_mem returns false if offset is out of bounds.
// Offset is in words.
bool touch_mem(Alloc alloc, uint offset) {
#ifdef MEM_DEBUG
if (offset < alloc.offset/4 || offset >= (alloc.offset + alloc.size)/4) {
atomicMax(mem_error, ERR_OUT_OF_BOUNDS);
return false;
}
#endif
return true;
}
// write_mem writes val to memory at offset.
// Offset is in words.
void write_mem(Alloc alloc, uint offset, uint val) {
if (!touch_mem(alloc, offset)) {
return;
}
memory[offset] = val;
}
// read_mem reads the value from memory at offset.
// Offset is in words.
uint read_mem(Alloc alloc, uint offset) {
if (!touch_mem(alloc, offset)) {
return 0;
}
uint v = memory[offset];
return v;
}
// slice_mem returns a sub-allocation inside another. Offset and size are in
// bytes, relative to a.offset.
Alloc slice_mem(Alloc a, uint offset, uint size) {
#ifdef MEM_DEBUG
if ((offset & 3) != 0 || (size & 3) != 0) {
atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
return Alloc(0, 0);
}
if (offset + size > a.size) {
// slice_mem is sometimes used to create slices that lie outside the parent
// allocation's bounds, but such slices are never written to.
return Alloc(0, 0);
}
return Alloc(a.offset + offset, size);
#else
return Alloc(a.offset + offset);
#endif
}
// alloc_write writes alloc to memory at offset bytes.
void alloc_write(Alloc a, uint offset, Alloc alloc) {
write_mem(a, offset >> 2, alloc.offset);
#ifdef MEM_DEBUG
write_mem(a, (offset >> 2) + 1, alloc.size);
#endif
}
// alloc_read reads an Alloc from memory at offset bytes.
Alloc alloc_read(Alloc a, uint offset) {
Alloc alloc;
alloc.offset = read_mem(a, offset >> 2);
#ifdef MEM_DEBUG
alloc.size = read_mem(a, (offset >> 2) + 1);
#endif
return alloc;
}
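// Illustrative usage sketch (added for clarity, not part of this header).
// A shader that needs room for `n` 4-byte records might do the following,
// where `n` and `val` are hypothetical:
//
//   MallocResult r = malloc(n * 4);
//   if (r.failed) {
//       return; // mem_error has already been set to ERR_MALLOC_FAILED.
//   }
//   write_mem(r.alloc, r.alloc.offset >> 2, val);
//   uint readback = read_mem(r.alloc, r.alloc.offset >> 2);
//
// Note that Alloc offsets and malloc sizes are in bytes, while read_mem and
// write_mem take offsets in 32-bit words, hence the >> 2.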

View file

@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Coarse rasterization of path segments.
// Allocation and initialization of tiles for paths.
@ -5,6 +7,7 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define LG_COARSE_WG 5
@ -12,18 +15,8 @@
layout(local_size_x = COARSE_WG, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer PathSegBuf {
uint[] pathseg;
};
layout(set = 0, binding = 1) buffer AllocBuf {
uint n_paths;
uint n_pathseg;
uint alloc;
};
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
#include "pathseg.h"
@ -95,22 +88,27 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
void main() {
uint element_ix = gl_GlobalInvocationID.x;
PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
uint tag = PathSeg_Nop;
if (element_ix < n_pathseg) {
tag = PathSeg_tag(ref);
PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
if (element_ix < conf.n_pathseg) {
tag = PathSeg_tag(conf.pathseg_alloc, ref);
}
// Setup for coverage algorithm.
float a, b, c;
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
PathStrokeLine line;
float dx;
switch (tag) {
case PathSeg_FillCubic:
case PathSeg_StrokeCubic:
PathStrokeCubic cubic = PathSeg_StrokeCubic_read(ref);
bool mem_ok = mem_error == NO_ERROR;
switch (tag.tag) {
case PathSeg_Cubic:
PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
uint trans_ix = cubic.trans_ix;
if (trans_ix > 0) {
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
}
vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
float err = err_v.x * err_v.x + err_v.y * err_v.y;
// The number of quadratics.
@ -131,8 +129,10 @@ void main() {
}
uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);
bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
uint path_ix = cubic.path_ix;
Path path = Path_read(PathRef(path_ix * Path_size));
Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
ivec4 bbox = ivec4(path.bbox);
vec2 p0 = cubic.p0;
qp0 = cubic.p0;
@ -162,22 +162,24 @@ void main() {
}
// Output line segment
xmin = min(p0.x, p1.x) - cubic.stroke.x;
xmax = max(p0.x, p1.x) + cubic.stroke.x;
ymin = min(p0.y, p1.y) - cubic.stroke.y;
ymax = max(p0.y, p1.y) + cubic.stroke.y;
// Bounding box of element in pixel coordinates.
float xmin = min(p0.x, p1.x) - cubic.stroke.x;
float xmax = max(p0.x, p1.x) + cubic.stroke.x;
float ymin = min(p0.y, p1.y) - cubic.stroke.y;
float ymax = max(p0.y, p1.y) + cubic.stroke.y;
float dx = p1.x - p0.x;
float dy = p1.y - p0.y;
// Set up for per-scanline coverage formula, below.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
b = invslope; // Note: assumes square tiles, otherwise scale.
a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
float b = invslope; // Note: assumes square tiles, otherwise scale.
float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
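// Added note on the setup above: assuming xc is initialized to a + b * y0 and
// advanced by b per scanline (as in the loop further down), xc is the
// segment's x position, in tile units, at the vertical midpoint of the
// current scanline; c bounds how far, again in tiles, the segment padded by
// its stroke half-widths can reach to either side of xc within that scanline.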
int x0 = int(floor((xmin) * SX));
int x1 = int(ceil((xmax) * SX));
int y0 = int(floor((ymin) * SY));
int y1 = int(ceil((ymax) * SY));
int x0 = int(floor(xmin * SX));
int x1 = int(floor(xmax * SX) + 1);
int y0 = int(floor(ymin * SY));
int y1 = int(floor(ymax * SY) + 1);
x0 = clamp(x0, bbox.x, bbox.z);
y0 = clamp(y0, bbox.y, bbox.w);
@ -189,48 +191,89 @@ void main() {
// TODO: can be tighter, use c to bound width
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
// Consider using subgroups to aggregate atomic add.
uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
if (tile_alloc.failed || !mem_ok) {
return;
}
uint tile_offset = tile_alloc.alloc.offset;
TileSeg tile_seg;
int xray = int(floor(p0.x*SX));
int last_xray = int(floor(p1.x*SX));
if (p0.y > p1.y) {
int tmp = xray;
xray = last_xray;
last_xray = tmp;
}
for (int y = y0; y < y1; y++) {
float tile_y0 = float(y * TILE_HEIGHT_PX);
if (tag == PathSeg_FillCubic && min(p0.y, p1.y) <= tile_y0) {
int xray = max(int(ceil(xc - 0.5 * b)), bbox.x);
if (xray < bbox.z) {
int backdrop = p1.y < p0.y ? 1 : -1;
TileRef tile_ref = Tile_index(path.tiles, uint(base + xray));
uint tile_el = tile_ref.offset >> 2;
atomicAdd(tile[tile_el + 1], backdrop);
int xbackdrop = max(xray + 1, bbox.x);
if (!is_stroke && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
int backdrop = p1.y < p0.y ? 1 : -1;
TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
uint tile_el = tile_ref.offset >> 2;
if (touch_mem(path_alloc, tile_el + 1)) {
atomicAdd(memory[tile_el + 1], backdrop);
}
}
int xx0 = clamp(int(floor(xc - c)), x0, x1);
int xx1 = clamp(int(ceil(xc + c)), x0, x1);
// next_xray is the xray for the next scanline; the line segment intersects
// all tiles between xray and next_xray.
int next_xray = last_xray;
if (y < y1 - 1) {
float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
next_xray = int(floor(x_edge*SX));
}
int min_xray = min(xray, next_xray);
int max_xray = max(xray, next_xray);
int xx0 = min(int(floor(xc - c)), min_xray);
int xx1 = max(int(ceil(xc + c)), max_xray + 1);
xx0 = clamp(xx0, x0, x1);
xx1 = clamp(xx1, x0, x1);
for (int x = xx0; x < xx1; x++) {
float tile_x0 = float(x * TILE_WIDTH_PX);
TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
uint tile_el = tile_ref.offset >> 2;
uint old = atomicExchange(tile[tile_el], tile_offset);
tile_seg.start = p0;
tile_seg.end = p1;
uint old = 0;
if (touch_mem(path_alloc, tile_el)) {
old = atomicExchange(memory[tile_el], tile_offset);
}
tile_seg.origin = p0;
tile_seg.vector = p1 - p0;
float y_edge = 0.0;
if (tag == PathSeg_FillCubic) {
if (!is_stroke) {
y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
if (min(p0.x, p1.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) {
if (min(p0.x, p1.x) < tile_x0) {
vec2 p = vec2(tile_x0, y_edge);
if (p0.x > p1.x) {
tile_seg.end = vec2(tile_x0, y_edge);
tile_seg.vector = p - p0;
} else {
tile_seg.start = vec2(tile_x0, y_edge);
tile_seg.origin = p;
tile_seg.vector = p1 - p;
}
} else {
// kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
// Nudge zeroes towards the intended sign.
if (tile_seg.vector.x == 0) {
tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
}
}
if (x <= min_xray || max_xray < x) {
// Reject inconsistent intersections.
y_edge = 1e9;
}
}
tile_seg.y_edge = y_edge;
tile_seg.next.offset = old;
TileSeg_write(TileSegRef(tile_offset), tile_seg);
TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
tile_offset += TileSeg_size;
}
xc += b;
base += stride;
xray = next_xray;
}
n_out += 1;

Binary file not shown.

View file

@@ -1,18 +1,8 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Code auto-generated by piet-gpu-derive
struct PathFillLineRef {
uint offset;
};
struct PathStrokeLineRef {
uint offset;
};
struct PathFillCubicRef {
uint offset;
};
struct PathStrokeCubicRef {
struct PathCubicRef {
uint offset;
};
@@ -20,234 +10,91 @@ struct PathSegRef {
uint offset;
};
struct PathFillLine {
vec2 p0;
vec2 p1;
uint path_ix;
};
#define PathFillLine_size 20
PathFillLineRef PathFillLine_index(PathFillLineRef ref, uint index) {
return PathFillLineRef(ref.offset + index * PathFillLine_size);
}
struct PathStrokeLine {
vec2 p0;
vec2 p1;
uint path_ix;
vec2 stroke;
};
#define PathStrokeLine_size 28
PathStrokeLineRef PathStrokeLine_index(PathStrokeLineRef ref, uint index) {
return PathStrokeLineRef(ref.offset + index * PathStrokeLine_size);
}
struct PathFillCubic {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
uint path_ix;
};
#define PathFillCubic_size 36
PathFillCubicRef PathFillCubic_index(PathFillCubicRef ref, uint index) {
return PathFillCubicRef(ref.offset + index * PathFillCubic_size);
}
struct PathStrokeCubic {
struct PathCubic {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
uint path_ix;
uint trans_ix;
vec2 stroke;
};
#define PathStrokeCubic_size 44
#define PathCubic_size 48
PathStrokeCubicRef PathStrokeCubic_index(PathStrokeCubicRef ref, uint index) {
return PathStrokeCubicRef(ref.offset + index * PathStrokeCubic_size);
PathCubicRef PathCubic_index(PathCubicRef ref, uint index) {
return PathCubicRef(ref.offset + index * PathCubic_size);
}
#define PathSeg_Nop 0
#define PathSeg_FillLine 1
#define PathSeg_StrokeLine 2
#define PathSeg_FillCubic 3
#define PathSeg_StrokeCubic 4
#define PathSeg_size 48
#define PathSeg_Cubic 1
#define PathSeg_size 52
PathSegRef PathSeg_index(PathSegRef ref, uint index) {
return PathSegRef(ref.offset + index * PathSeg_size);
}
PathFillLine PathFillLine_read(PathFillLineRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = pathseg[ix + 0];
uint raw1 = pathseg[ix + 1];
uint raw2 = pathseg[ix + 2];
uint raw3 = pathseg[ix + 3];
uint raw4 = pathseg[ix + 4];
PathFillLine s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.path_ix = raw4;
return s;
}
struct PathSegTag {
uint tag;
uint flags;
};
void PathFillLine_write(PathFillLineRef ref, PathFillLine s) {
PathCubic PathCubic_read(Alloc a, PathCubicRef ref) {
uint ix = ref.offset >> 2;
pathseg[ix + 0] = floatBitsToUint(s.p0.x);
pathseg[ix + 1] = floatBitsToUint(s.p0.y);
pathseg[ix + 2] = floatBitsToUint(s.p1.x);
pathseg[ix + 3] = floatBitsToUint(s.p1.y);
pathseg[ix + 4] = s.path_ix;
}
PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = pathseg[ix + 0];
uint raw1 = pathseg[ix + 1];
uint raw2 = pathseg[ix + 2];
uint raw3 = pathseg[ix + 3];
uint raw4 = pathseg[ix + 4];
uint raw5 = pathseg[ix + 5];
uint raw6 = pathseg[ix + 6];
PathStrokeLine s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.path_ix = raw4;
s.stroke = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
return s;
}
void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) {
uint ix = ref.offset >> 2;
pathseg[ix + 0] = floatBitsToUint(s.p0.x);
pathseg[ix + 1] = floatBitsToUint(s.p0.y);
pathseg[ix + 2] = floatBitsToUint(s.p1.x);
pathseg[ix + 3] = floatBitsToUint(s.p1.y);
pathseg[ix + 4] = s.path_ix;
pathseg[ix + 5] = floatBitsToUint(s.stroke.x);
pathseg[ix + 6] = floatBitsToUint(s.stroke.y);
}
PathFillCubic PathFillCubic_read(PathFillCubicRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = pathseg[ix + 0];
uint raw1 = pathseg[ix + 1];
uint raw2 = pathseg[ix + 2];
uint raw3 = pathseg[ix + 3];
uint raw4 = pathseg[ix + 4];
uint raw5 = pathseg[ix + 5];
uint raw6 = pathseg[ix + 6];
uint raw7 = pathseg[ix + 7];
uint raw8 = pathseg[ix + 8];
PathFillCubic s;
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
uint raw2 = read_mem(a, ix + 2);
uint raw3 = read_mem(a, ix + 3);
uint raw4 = read_mem(a, ix + 4);
uint raw5 = read_mem(a, ix + 5);
uint raw6 = read_mem(a, ix + 6);
uint raw7 = read_mem(a, ix + 7);
uint raw8 = read_mem(a, ix + 8);
uint raw9 = read_mem(a, ix + 9);
uint raw10 = read_mem(a, ix + 10);
uint raw11 = read_mem(a, ix + 11);
PathCubic s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
s.path_ix = raw8;
s.trans_ix = raw9;
s.stroke = vec2(uintBitsToFloat(raw10), uintBitsToFloat(raw11));
return s;
}
void PathFillCubic_write(PathFillCubicRef ref, PathFillCubic s) {
void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) {
uint ix = ref.offset >> 2;
pathseg[ix + 0] = floatBitsToUint(s.p0.x);
pathseg[ix + 1] = floatBitsToUint(s.p0.y);
pathseg[ix + 2] = floatBitsToUint(s.p1.x);
pathseg[ix + 3] = floatBitsToUint(s.p1.y);
pathseg[ix + 4] = floatBitsToUint(s.p2.x);
pathseg[ix + 5] = floatBitsToUint(s.p2.y);
pathseg[ix + 6] = floatBitsToUint(s.p3.x);
pathseg[ix + 7] = floatBitsToUint(s.p3.y);
pathseg[ix + 8] = s.path_ix;
write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
write_mem(a, ix + 4, floatBitsToUint(s.p2.x));
write_mem(a, ix + 5, floatBitsToUint(s.p2.y));
write_mem(a, ix + 6, floatBitsToUint(s.p3.x));
write_mem(a, ix + 7, floatBitsToUint(s.p3.y));
write_mem(a, ix + 8, s.path_ix);
write_mem(a, ix + 9, s.trans_ix);
write_mem(a, ix + 10, floatBitsToUint(s.stroke.x));
write_mem(a, ix + 11, floatBitsToUint(s.stroke.y));
}
PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = pathseg[ix + 0];
uint raw1 = pathseg[ix + 1];
uint raw2 = pathseg[ix + 2];
uint raw3 = pathseg[ix + 3];
uint raw4 = pathseg[ix + 4];
uint raw5 = pathseg[ix + 5];
uint raw6 = pathseg[ix + 6];
uint raw7 = pathseg[ix + 7];
uint raw8 = pathseg[ix + 8];
uint raw9 = pathseg[ix + 9];
uint raw10 = pathseg[ix + 10];
PathStrokeCubic s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
s.path_ix = raw8;
s.stroke = vec2(uintBitsToFloat(raw9), uintBitsToFloat(raw10));
return s;
PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) {
uint tag_and_flags = read_mem(a, ref.offset >> 2);
return PathSegTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
}
void PathStrokeCubic_write(PathStrokeCubicRef ref, PathStrokeCubic s) {
uint ix = ref.offset >> 2;
pathseg[ix + 0] = floatBitsToUint(s.p0.x);
pathseg[ix + 1] = floatBitsToUint(s.p0.y);
pathseg[ix + 2] = floatBitsToUint(s.p1.x);
pathseg[ix + 3] = floatBitsToUint(s.p1.y);
pathseg[ix + 4] = floatBitsToUint(s.p2.x);
pathseg[ix + 5] = floatBitsToUint(s.p2.y);
pathseg[ix + 6] = floatBitsToUint(s.p3.x);
pathseg[ix + 7] = floatBitsToUint(s.p3.y);
pathseg[ix + 8] = s.path_ix;
pathseg[ix + 9] = floatBitsToUint(s.stroke.x);
pathseg[ix + 10] = floatBitsToUint(s.stroke.y);
PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) {
return PathCubic_read(a, PathCubicRef(ref.offset + 4));
}
uint PathSeg_tag(PathSegRef ref) {
return pathseg[ref.offset >> 2];
void PathSeg_Nop_write(Alloc a, PathSegRef ref) {
write_mem(a, ref.offset >> 2, PathSeg_Nop);
}
PathFillLine PathSeg_FillLine_read(PathSegRef ref) {
return PathFillLine_read(PathFillLineRef(ref.offset + 4));
}
PathStrokeLine PathSeg_StrokeLine_read(PathSegRef ref) {
return PathStrokeLine_read(PathStrokeLineRef(ref.offset + 4));
}
PathFillCubic PathSeg_FillCubic_read(PathSegRef ref) {
return PathFillCubic_read(PathFillCubicRef(ref.offset + 4));
}
PathStrokeCubic PathSeg_StrokeCubic_read(PathSegRef ref) {
return PathStrokeCubic_read(PathStrokeCubicRef(ref.offset + 4));
}
void PathSeg_Nop_write(PathSegRef ref) {
pathseg[ref.offset >> 2] = PathSeg_Nop;
}
void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) {
pathseg[ref.offset >> 2] = PathSeg_FillLine;
PathFillLine_write(PathFillLineRef(ref.offset + 4), s);
}
void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) {
pathseg[ref.offset >> 2] = PathSeg_StrokeLine;
PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s);
}
void PathSeg_FillCubic_write(PathSegRef ref, PathFillCubic s) {
pathseg[ref.offset >> 2] = PathSeg_FillCubic;
PathFillCubic_write(PathFillCubicRef(ref.offset + 4), s);
}
void PathSeg_StrokeCubic_write(PathSegRef ref, PathStrokeCubic s) {
pathseg[ref.offset >> 2] = PathSeg_StrokeCubic;
PathStrokeCubic_write(PathStrokeCubicRef(ref.offset + 4), s);
void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) {
write_mem(a, ref.offset >> 2, (flags << 16) | PathSeg_Cubic);
PathCubic_write(a, PathCubicRef(ref.offset + 4), s);
}

View file

@@ -1,13 +1,7 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Code auto-generated by piet-gpu-derive
struct CmdCircleRef {
uint offset;
};
struct CmdLineRef {
uint offset;
};
struct CmdStrokeRef {
uint offset;
};
@@ -16,27 +10,15 @@ struct CmdFillRef {
uint offset;
};
struct CmdFillMaskRef {
struct CmdColorRef {
uint offset;
};
struct CmdBeginClipRef {
struct CmdImageRef {
uint offset;
};
struct CmdBeginSolidClipRef {
uint offset;
};
struct CmdEndClipRef {
uint offset;
};
struct CmdSolidRef {
uint offset;
};
struct CmdSolidMaskRef {
struct CmdAlphaRef {
uint offset;
};
@@ -48,44 +30,12 @@ struct CmdRef {
uint offset;
};
struct SegmentRef {
uint offset;
};
struct SegChunkRef {
uint offset;
};
struct CmdCircle {
vec2 center;
float radius;
uint rgba_color;
};
#define CmdCircle_size 16
CmdCircleRef CmdCircle_index(CmdCircleRef ref, uint index) {
return CmdCircleRef(ref.offset + index * CmdCircle_size);
}
struct CmdLine {
vec2 start;
vec2 end;
};
#define CmdLine_size 16
CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
return CmdLineRef(ref.offset + index * CmdLine_size);
}
struct CmdStroke {
uint tile_ref;
float half_width;
uint rgba_color;
};
#define CmdStroke_size 12
#define CmdStroke_size 8
CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
return CmdStrokeRef(ref.offset + index * CmdStroke_size);
@@ -94,76 +44,43 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
struct CmdFill {
uint tile_ref;
int backdrop;
uint rgba_color;
};
#define CmdFill_size 12
#define CmdFill_size 8
CmdFillRef CmdFill_index(CmdFillRef ref, uint index) {
return CmdFillRef(ref.offset + index * CmdFill_size);
}
struct CmdFillMask {
uint tile_ref;
int backdrop;
float mask;
};
#define CmdFillMask_size 12
CmdFillMaskRef CmdFillMask_index(CmdFillMaskRef ref, uint index) {
return CmdFillMaskRef(ref.offset + index * CmdFillMask_size);
}
struct CmdBeginClip {
uint tile_ref;
int backdrop;
};
#define CmdBeginClip_size 8
CmdBeginClipRef CmdBeginClip_index(CmdBeginClipRef ref, uint index) {
return CmdBeginClipRef(ref.offset + index * CmdBeginClip_size);
}
struct CmdBeginSolidClip {
float alpha;
};
#define CmdBeginSolidClip_size 4
CmdBeginSolidClipRef CmdBeginSolidClip_index(CmdBeginSolidClipRef ref, uint index) {
return CmdBeginSolidClipRef(ref.offset + index * CmdBeginSolidClip_size);
}
struct CmdEndClip {
float alpha;
};
#define CmdEndClip_size 4
CmdEndClipRef CmdEndClip_index(CmdEndClipRef ref, uint index) {
return CmdEndClipRef(ref.offset + index * CmdEndClip_size);
}
struct CmdSolid {
struct CmdColor {
uint rgba_color;
};
#define CmdSolid_size 4
#define CmdColor_size 4
CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) {
return CmdSolidRef(ref.offset + index * CmdSolid_size);
CmdColorRef CmdColor_index(CmdColorRef ref, uint index) {
return CmdColorRef(ref.offset + index * CmdColor_size);
}
struct CmdSolidMask {
float mask;
struct CmdImage {
uint index;
ivec2 offset;
};
#define CmdSolidMask_size 4
#define CmdImage_size 8
CmdSolidMaskRef CmdSolidMask_index(CmdSolidMaskRef ref, uint index) {
return CmdSolidMaskRef(ref.offset + index * CmdSolidMask_size);
CmdImageRef CmdImage_index(CmdImageRef ref, uint index) {
return CmdImageRef(ref.offset + index * CmdImage_size);
}
struct CmdAlpha {
float alpha;
};
#define CmdAlpha_size 4
CmdAlphaRef CmdAlpha_index(CmdAlphaRef ref, uint index) {
return CmdAlphaRef(ref.offset + index * CmdAlpha_size);
}
struct CmdJump {
@@ -177,382 +94,185 @@ CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
}
#define Cmd_End 0
#define Cmd_Circle 1
#define Cmd_Line 2
#define Cmd_Fill 3
#define Cmd_FillMask 4
#define Cmd_FillMaskInv 5
#define Cmd_BeginClip 6
#define Cmd_BeginSolidClip 7
#define Cmd_Fill 1
#define Cmd_Stroke 2
#define Cmd_Solid 3
#define Cmd_Alpha 4
#define Cmd_Color 5
#define Cmd_Image 6
#define Cmd_BeginClip 7
#define Cmd_EndClip 8
#define Cmd_Stroke 9
#define Cmd_Solid 10
#define Cmd_SolidMask 11
#define Cmd_Jump 12
#define Cmd_size 20
#define Cmd_Jump 9
#define Cmd_size 12
CmdRef Cmd_index(CmdRef ref, uint index) {
return CmdRef(ref.offset + index * Cmd_size);
}
struct Segment {
vec2 start;
vec2 end;
float y_edge;
struct CmdTag {
uint tag;
uint flags;
};
#define Segment_size 20
SegmentRef Segment_index(SegmentRef ref, uint index) {
return SegmentRef(ref.offset + index * Segment_size);
}
struct SegChunk {
uint n;
SegChunkRef next;
SegmentRef segs;
};
#define SegChunk_size 12
SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
return SegChunkRef(ref.offset + index * SegChunk_size);
}
CmdCircle CmdCircle_read(CmdCircleRef ref) {
CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw3 = ptcl[ix + 3];
CmdCircle s;
s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.radius = uintBitsToFloat(raw2);
s.rgba_color = raw3;
return s;
}
void CmdCircle_write(CmdCircleRef ref, CmdCircle s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.center.x);
ptcl[ix + 1] = floatBitsToUint(s.center.y);
ptcl[ix + 2] = floatBitsToUint(s.radius);
ptcl[ix + 3] = s.rgba_color;
}
CmdLine CmdLine_read(CmdLineRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw3 = ptcl[ix + 3];
CmdLine s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
void CmdLine_write(CmdLineRef ref, CmdLine s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.start.x);
ptcl[ix + 1] = floatBitsToUint(s.start.y);
ptcl[ix + 2] = floatBitsToUint(s.end.x);
ptcl[ix + 3] = floatBitsToUint(s.end.y);
}
CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
CmdStroke s;
s.tile_ref = raw0;
s.half_width = uintBitsToFloat(raw1);
s.rgba_color = raw2;
return s;
}
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.tile_ref;
ptcl[ix + 1] = floatBitsToUint(s.half_width);
ptcl[ix + 2] = s.rgba_color;
write_mem(a, ix + 0, s.tile_ref);
write_mem(a, ix + 1, floatBitsToUint(s.half_width));
}
CmdFill CmdFill_read(CmdFillRef ref) {
CmdFill CmdFill_read(Alloc a, CmdFillRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
CmdFill s;
s.tile_ref = raw0;
s.backdrop = int(raw1);
s.rgba_color = raw2;
return s;
}
void CmdFill_write(CmdFillRef ref, CmdFill s) {
void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.tile_ref;
ptcl[ix + 1] = uint(s.backdrop);
ptcl[ix + 2] = s.rgba_color;
write_mem(a, ix + 0, s.tile_ref);
write_mem(a, ix + 1, uint(s.backdrop));
}
CmdFillMask CmdFillMask_read(CmdFillMaskRef ref) {
CmdColor CmdColor_read(Alloc a, CmdColorRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdFillMask s;
s.tile_ref = raw0;
s.backdrop = int(raw1);
s.mask = uintBitsToFloat(raw2);
return s;
}
void CmdFillMask_write(CmdFillMaskRef ref, CmdFillMask s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.tile_ref;
ptcl[ix + 1] = uint(s.backdrop);
ptcl[ix + 2] = floatBitsToUint(s.mask);
}
CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
CmdBeginClip s;
s.tile_ref = raw0;
s.backdrop = int(raw1);
return s;
}
void CmdBeginClip_write(CmdBeginClipRef ref, CmdBeginClip s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.tile_ref;
ptcl[ix + 1] = uint(s.backdrop);
}
CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
CmdBeginSolidClip s;
s.alpha = uintBitsToFloat(raw0);
return s;
}
void CmdBeginSolidClip_write(CmdBeginSolidClipRef ref, CmdBeginSolidClip s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.alpha);
}
CmdEndClip CmdEndClip_read(CmdEndClipRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
CmdEndClip s;
s.alpha = uintBitsToFloat(raw0);
return s;
}
void CmdEndClip_write(CmdEndClipRef ref, CmdEndClip s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.alpha);
}
CmdSolid CmdSolid_read(CmdSolidRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
CmdSolid s;
uint raw0 = read_mem(a, ix + 0);
CmdColor s;
s.rgba_color = raw0;
return s;
}
void CmdSolid_write(CmdSolidRef ref, CmdSolid s) {
void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.rgba_color;
write_mem(a, ix + 0, s.rgba_color);
}
CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) {
CmdImage CmdImage_read(Alloc a, CmdImageRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
CmdSolidMask s;
s.mask = uintBitsToFloat(raw0);
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
CmdImage s;
s.index = raw0;
s.offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
return s;
}
void CmdSolidMask_write(CmdSolidMaskRef ref, CmdSolidMask s) {
void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.mask);
write_mem(a, ix + 0, s.index);
write_mem(a, ix + 1, (uint(s.offset.x) & 0xffff) | (uint(s.offset.y) << 16));
}
CmdJump CmdJump_read(CmdJumpRef ref) {
CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw0 = read_mem(a, ix + 0);
CmdAlpha s;
s.alpha = uintBitsToFloat(raw0);
return s;
}
void CmdAlpha_write(Alloc a, CmdAlphaRef ref, CmdAlpha s) {
uint ix = ref.offset >> 2;
write_mem(a, ix + 0, floatBitsToUint(s.alpha));
}
CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = read_mem(a, ix + 0);
CmdJump s;
s.new_ref = raw0;
return s;
}
void CmdJump_write(CmdJumpRef ref, CmdJump s) {
void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.new_ref;
write_mem(a, ix + 0, s.new_ref);
}
uint Cmd_tag(CmdRef ref) {
return ptcl[ref.offset >> 2];
CmdTag Cmd_tag(Alloc a, CmdRef ref) {
uint tag_and_flags = read_mem(a, ref.offset >> 2);
return CmdTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
}
CmdCircle Cmd_Circle_read(CmdRef ref) {
return CmdCircle_read(CmdCircleRef(ref.offset + 4));
CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) {
return CmdFill_read(a, CmdFillRef(ref.offset + 4));
}
CmdLine Cmd_Line_read(CmdRef ref) {
return CmdLine_read(CmdLineRef(ref.offset + 4));
CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) {
return CmdStroke_read(a, CmdStrokeRef(ref.offset + 4));
}
CmdFill Cmd_Fill_read(CmdRef ref) {
return CmdFill_read(CmdFillRef(ref.offset + 4));
CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) {
return CmdAlpha_read(a, CmdAlphaRef(ref.offset + 4));
}
CmdFillMask Cmd_FillMask_read(CmdRef ref) {
return CmdFillMask_read(CmdFillMaskRef(ref.offset + 4));
CmdColor Cmd_Color_read(Alloc a, CmdRef ref) {
return CmdColor_read(a, CmdColorRef(ref.offset + 4));
}
CmdFillMask Cmd_FillMaskInv_read(CmdRef ref) {
return CmdFillMask_read(CmdFillMaskRef(ref.offset + 4));
CmdImage Cmd_Image_read(Alloc a, CmdRef ref) {
return CmdImage_read(a, CmdImageRef(ref.offset + 4));
}
CmdBeginClip Cmd_BeginClip_read(CmdRef ref) {
return CmdBeginClip_read(CmdBeginClipRef(ref.offset + 4));
CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) {
return CmdJump_read(a, CmdJumpRef(ref.offset + 4));
}
CmdBeginSolidClip Cmd_BeginSolidClip_read(CmdRef ref) {
return CmdBeginSolidClip_read(CmdBeginSolidClipRef(ref.offset + 4));
void Cmd_End_write(Alloc a, CmdRef ref) {
write_mem(a, ref.offset >> 2, Cmd_End);
}
CmdEndClip Cmd_EndClip_read(CmdRef ref) {
return CmdEndClip_read(CmdEndClipRef(ref.offset + 4));
void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) {
write_mem(a, ref.offset >> 2, Cmd_Fill);
CmdFill_write(a, CmdFillRef(ref.offset + 4), s);
}
CmdStroke Cmd_Stroke_read(CmdRef ref) {
return CmdStroke_read(CmdStrokeRef(ref.offset + 4));
void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) {
write_mem(a, ref.offset >> 2, Cmd_Stroke);
CmdStroke_write(a, CmdStrokeRef(ref.offset + 4), s);
}
CmdSolid Cmd_Solid_read(CmdRef ref) {
return CmdSolid_read(CmdSolidRef(ref.offset + 4));
void Cmd_Solid_write(Alloc a, CmdRef ref) {
write_mem(a, ref.offset >> 2, Cmd_Solid);
}
CmdSolidMask Cmd_SolidMask_read(CmdRef ref) {
return CmdSolidMask_read(CmdSolidMaskRef(ref.offset + 4));
void Cmd_Alpha_write(Alloc a, CmdRef ref, CmdAlpha s) {
write_mem(a, ref.offset >> 2, Cmd_Alpha);
CmdAlpha_write(a, CmdAlphaRef(ref.offset + 4), s);
}
CmdJump Cmd_Jump_read(CmdRef ref) {
return CmdJump_read(CmdJumpRef(ref.offset + 4));
void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) {
write_mem(a, ref.offset >> 2, Cmd_Color);
CmdColor_write(a, CmdColorRef(ref.offset + 4), s);
}
void Cmd_End_write(CmdRef ref) {
ptcl[ref.offset >> 2] = Cmd_End;
void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) {
write_mem(a, ref.offset >> 2, Cmd_Image);
CmdImage_write(a, CmdImageRef(ref.offset + 4), s);
}
void Cmd_Circle_write(CmdRef ref, CmdCircle s) {
ptcl[ref.offset >> 2] = Cmd_Circle;
CmdCircle_write(CmdCircleRef(ref.offset + 4), s);
void Cmd_BeginClip_write(Alloc a, CmdRef ref) {
write_mem(a, ref.offset >> 2, Cmd_BeginClip);
}
void Cmd_Line_write(CmdRef ref, CmdLine s) {
ptcl[ref.offset >> 2] = Cmd_Line;
CmdLine_write(CmdLineRef(ref.offset + 4), s);
void Cmd_EndClip_write(Alloc a, CmdRef ref) {
write_mem(a, ref.offset >> 2, Cmd_EndClip);
}
void Cmd_Fill_write(CmdRef ref, CmdFill s) {
ptcl[ref.offset >> 2] = Cmd_Fill;
CmdFill_write(CmdFillRef(ref.offset + 4), s);
}
void Cmd_FillMask_write(CmdRef ref, CmdFillMask s) {
ptcl[ref.offset >> 2] = Cmd_FillMask;
CmdFillMask_write(CmdFillMaskRef(ref.offset + 4), s);
}
void Cmd_FillMaskInv_write(CmdRef ref, CmdFillMask s) {
ptcl[ref.offset >> 2] = Cmd_FillMaskInv;
CmdFillMask_write(CmdFillMaskRef(ref.offset + 4), s);
}
void Cmd_BeginClip_write(CmdRef ref, CmdBeginClip s) {
ptcl[ref.offset >> 2] = Cmd_BeginClip;
CmdBeginClip_write(CmdBeginClipRef(ref.offset + 4), s);
}
void Cmd_BeginSolidClip_write(CmdRef ref, CmdBeginSolidClip s) {
ptcl[ref.offset >> 2] = Cmd_BeginSolidClip;
CmdBeginSolidClip_write(CmdBeginSolidClipRef(ref.offset + 4), s);
}
void Cmd_EndClip_write(CmdRef ref, CmdEndClip s) {
ptcl[ref.offset >> 2] = Cmd_EndClip;
CmdEndClip_write(CmdEndClipRef(ref.offset + 4), s);
}
void Cmd_Stroke_write(CmdRef ref, CmdStroke s) {
ptcl[ref.offset >> 2] = Cmd_Stroke;
CmdStroke_write(CmdStrokeRef(ref.offset + 4), s);
}
void Cmd_Solid_write(CmdRef ref, CmdSolid s) {
ptcl[ref.offset >> 2] = Cmd_Solid;
CmdSolid_write(CmdSolidRef(ref.offset + 4), s);
}
void Cmd_SolidMask_write(CmdRef ref, CmdSolidMask s) {
ptcl[ref.offset >> 2] = Cmd_SolidMask;
CmdSolidMask_write(CmdSolidMaskRef(ref.offset + 4), s);
}
void Cmd_Jump_write(CmdRef ref, CmdJump s) {
ptcl[ref.offset >> 2] = Cmd_Jump;
CmdJump_write(CmdJumpRef(ref.offset + 4), s);
}
Segment Segment_read(SegmentRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw3 = ptcl[ix + 3];
uint raw4 = ptcl[ix + 4];
Segment s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.y_edge = uintBitsToFloat(raw4);
return s;
}
void Segment_write(SegmentRef ref, Segment s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.start.x);
ptcl[ix + 1] = floatBitsToUint(s.start.y);
ptcl[ix + 2] = floatBitsToUint(s.end.x);
ptcl[ix + 3] = floatBitsToUint(s.end.y);
ptcl[ix + 4] = floatBitsToUint(s.y_edge);
}
SegChunk SegChunk_read(SegChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
SegChunk s;
s.n = raw0;
s.next = SegChunkRef(raw1);
s.segs = SegmentRef(raw2);
return s;
}
void SegChunk_write(SegChunkRef ref, SegChunk s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.n;
ptcl[ix + 1] = s.next.offset;
ptcl[ix + 2] = s.segs.offset;
void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
write_mem(a, ref.offset >> 2, Cmd_Jump);
CmdJump_write(a, CmdJumpRef(ref.offset + 4), s);
}
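The CmdImage offset above (like the FillImage offset in the scene format later in this diff) packs two signed 16-bit values into one word, and the read side recovers them by shifting each half into the top bits before an arithmetic shift right. A small Rust sketch of the same bit pattern (the helper names are hypothetical, not part of the patch):

    fn pack_offset(x: i16, y: i16) -> u32 {
        // low half = x, high half = y, each stored as its 16-bit two's-complement pattern
        (x as u16 as u32) | ((y as u16 as u32) << 16)
    }

    fn unpack_offset(raw: u32) -> (i32, i32) {
        // mirrors ivec2(int(raw << 16) >> 16, int(raw) >> 16) in the GLSL read above
        (((raw << 16) as i32) >> 16, (raw as i32) >> 16)
    }

    // e.g. unpack_offset(pack_offset(-3, 7)) == (-3, 7)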

View file

@@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Code auto-generated by piet-gpu-derive
struct LineSegRef {
@@ -12,15 +14,11 @@ struct CubicSegRef {
uint offset;
};
struct FillRef {
struct FillColorRef {
uint offset;
};
struct FillMaskRef {
uint offset;
};
struct StrokeRef {
struct FillImageRef {
uint offset;
};
@@ -36,6 +34,10 @@ struct ClipRef {
uint offset;
};
struct SetFillModeRef {
uint offset;
};
struct ElementRef {
uint offset;
};
@@ -76,34 +78,25 @@ CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
return CubicSegRef(ref.offset + index * CubicSeg_size);
}
struct Fill {
struct FillColor {
uint rgba_color;
};
#define Fill_size 4
#define FillColor_size 4
FillRef Fill_index(FillRef ref, uint index) {
return FillRef(ref.offset + index * Fill_size);
FillColorRef FillColor_index(FillColorRef ref, uint index) {
return FillColorRef(ref.offset + index * FillColor_size);
}
struct FillMask {
float mask;
struct FillImage {
uint index;
ivec2 offset;
};
#define FillMask_size 4
#define FillImage_size 8
FillMaskRef FillMask_index(FillMaskRef ref, uint index) {
return FillMaskRef(ref.offset + index * FillMask_size);
}
struct Stroke {
uint rgba_color;
};
#define Stroke_size 4
StrokeRef Stroke_index(StrokeRef ref, uint index) {
return StrokeRef(ref.offset + index * Stroke_size);
FillImageRef FillImage_index(FillImageRef ref, uint index) {
return FillImageRef(ref.offset + index * FillImage_size);
}
struct SetLineWidth {
@@ -137,27 +130,38 @@ ClipRef Clip_index(ClipRef ref, uint index) {
return ClipRef(ref.offset + index * Clip_size);
}
struct SetFillMode {
uint fill_mode;
};
#define SetFillMode_size 4
SetFillModeRef SetFillMode_index(SetFillModeRef ref, uint index) {
return SetFillModeRef(ref.offset + index * SetFillMode_size);
}
#define Element_Nop 0
#define Element_StrokeLine 1
#define Element_FillLine 2
#define Element_StrokeQuad 3
#define Element_FillQuad 4
#define Element_StrokeCubic 5
#define Element_FillCubic 6
#define Element_Stroke 7
#define Element_Fill 8
#define Element_SetLineWidth 9
#define Element_Transform 10
#define Element_FillMask 11
#define Element_FillMaskInv 12
#define Element_BeginClip 13
#define Element_EndClip 14
#define Element_Line 1
#define Element_Quad 2
#define Element_Cubic 3
#define Element_FillColor 4
#define Element_SetLineWidth 5
#define Element_Transform 6
#define Element_BeginClip 7
#define Element_EndClip 8
#define Element_FillImage 9
#define Element_SetFillMode 10
#define Element_size 36
ElementRef Element_index(ElementRef ref, uint index) {
return ElementRef(ref.offset + index * Element_size);
}
struct ElementTag {
uint tag;
uint flags;
};
LineSeg LineSeg_read(LineSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
@@ -203,27 +207,21 @@ CubicSeg CubicSeg_read(CubicSegRef ref) {
return s;
}
Fill Fill_read(FillRef ref) {
FillColor FillColor_read(FillColorRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
Fill s;
FillColor s;
s.rgba_color = raw0;
return s;
}
FillMask FillMask_read(FillMaskRef ref) {
FillImage FillImage_read(FillImageRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
FillMask s;
s.mask = uintBitsToFloat(raw0);
return s;
}
Stroke Stroke_read(StrokeRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
Stroke s;
s.rgba_color = raw0;
uint raw1 = scene[ix + 1];
FillImage s;
s.index = raw0;
s.offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
return s;
}
@@ -260,40 +258,33 @@ Clip Clip_read(ClipRef ref) {
return s;
}
uint Element_tag(ElementRef ref) {
return scene[ref.offset >> 2];
SetFillMode SetFillMode_read(SetFillModeRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
SetFillMode s;
s.fill_mode = raw0;
return s;
}
LineSeg Element_StrokeLine_read(ElementRef ref) {
ElementTag Element_tag(ElementRef ref) {
uint tag_and_flags = scene[ref.offset >> 2];
return ElementTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
}
LineSeg Element_Line_read(ElementRef ref) {
return LineSeg_read(LineSegRef(ref.offset + 4));
}
LineSeg Element_FillLine_read(ElementRef ref) {
return LineSeg_read(LineSegRef(ref.offset + 4));
}
QuadSeg Element_StrokeQuad_read(ElementRef ref) {
QuadSeg Element_Quad_read(ElementRef ref) {
return QuadSeg_read(QuadSegRef(ref.offset + 4));
}
QuadSeg Element_FillQuad_read(ElementRef ref) {
return QuadSeg_read(QuadSegRef(ref.offset + 4));
}
CubicSeg Element_StrokeCubic_read(ElementRef ref) {
CubicSeg Element_Cubic_read(ElementRef ref) {
return CubicSeg_read(CubicSegRef(ref.offset + 4));
}
CubicSeg Element_FillCubic_read(ElementRef ref) {
return CubicSeg_read(CubicSegRef(ref.offset + 4));
}
Stroke Element_Stroke_read(ElementRef ref) {
return Stroke_read(StrokeRef(ref.offset + 4));
}
Fill Element_Fill_read(ElementRef ref) {
return Fill_read(FillRef(ref.offset + 4));
FillColor Element_FillColor_read(ElementRef ref) {
return FillColor_read(FillColorRef(ref.offset + 4));
}
SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
@@ -304,14 +295,6 @@ Transform Element_Transform_read(ElementRef ref) {
return Transform_read(TransformRef(ref.offset + 4));
}
FillMask Element_FillMask_read(ElementRef ref) {
return FillMask_read(FillMaskRef(ref.offset + 4));
}
FillMask Element_FillMaskInv_read(ElementRef ref) {
return FillMask_read(FillMaskRef(ref.offset + 4));
}
Clip Element_BeginClip_read(ElementRef ref) {
return Clip_read(ClipRef(ref.offset + 4));
}
@@ -320,3 +303,11 @@ Clip Element_EndClip_read(ElementRef ref) {
return Clip_read(ClipRef(ref.offset + 4));
}
FillImage Element_FillImage_read(ElementRef ref) {
return FillImage_read(FillImageRef(ref.offset + 4));
}
SetFillMode Element_SetFillMode_read(ElementRef ref) {
return SetFillMode_read(SetFillModeRef(ref.offset + 4));
}

View file

@@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Various constants for the sizes of groups and tiles.
// Much of this will be made dynamic in various ways, but for now it's easiest
@@ -8,15 +10,14 @@
#define LG_WG_FACTOR 1
#define WG_FACTOR (1<<LG_WG_FACTOR)
// TODO: compute all these
#define WIDTH_IN_TILES 128
#define HEIGHT_IN_TILES 96
#define TILE_WIDTH_PX 16
#define TILE_HEIGHT_PX 16
#define PTCL_INITIAL_ALLOC 1024
// This is now set in the ninja file during compilation
//#define ENABLE_IMAGE_INDICES
// These should probably be renamed and/or reworked. In the binning
// kernel, they represent the number of bins. Also, the workgroup size
// of that kernel is equal to the number of bins, but should probably
@@ -26,3 +27,28 @@
#define N_TILE (N_TILE_X * N_TILE_Y)
#define LG_N_TILE (7 + LG_WG_FACTOR)
#define N_SLICE (N_TILE / 32)
struct Config {
uint n_elements; // paths
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
};
// Fill modes.
#define MODE_NONZERO 0
#define MODE_STROKE 1
// Size of kernel4 clip state, in words.
#define CLIP_STATE_SIZE 2
// fill_mode_from_flags extracts the fill mode from tag flags.
uint fill_mode_from_flags(uint flags) {
return flags & 0x1;
}
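For orientation, the host side (CONFIG_SIZE = 10 * 4 and the config_buf_host.write call later in this diff) writes this struct as ten 32-bit words, so each Alloc field here occupies a single word. A minimal Rust mirror of that layout (a sketch; the #[repr(C)] struct is mine, not part of the patch):

    #[repr(C)]
    struct Config {
        n_elements: u32,      // paths
        n_pathseg: u32,
        width_in_tiles: u32,
        height_in_tiles: u32,
        tile_alloc: u32,      // base offsets into the shared memory buffer
        bin_alloc: u32,
        ptcl_alloc: u32,
        pathseg_alloc: u32,
        anno_alloc: u32,
        trans_alloc: u32,
    }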

View file

@@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Code auto-generated by piet-gpu-derive
struct StateRef {
@@ -12,9 +14,10 @@ struct State {
uint flags;
uint path_count;
uint pathseg_count;
uint trans_count;
};
#define State_size 56
#define State_size 60
StateRef State_index(StateRef ref, uint index) {
return StateRef(ref.offset + index * State_size);
@@ -36,6 +39,7 @@ State State_read(StateRef ref) {
uint raw11 = state[ix + 11];
uint raw12 = state[ix + 12];
uint raw13 = state[ix + 13];
uint raw14 = state[ix + 14];
State s;
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
@@ -44,6 +48,7 @@ State State_read(StateRef ref) {
s.flags = raw11;
s.path_count = raw12;
s.pathseg_count = raw13;
s.trans_count = raw14;
return s;
}
@@ -63,5 +68,6 @@ void State_write(StateRef ref, State s) {
state[ix + 11] = s.flags;
state[ix + 12] = s.path_count;
state[ix + 13] = s.pathseg_count;
state[ix + 14] = s.trans_count;
}

View file

@@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Code auto-generated by piet-gpu-derive
struct PathRef {
@@ -12,6 +14,10 @@ struct TileSegRef {
uint offset;
};
struct TransformSegRef {
uint offset;
};
struct Path {
uvec4 bbox;
TileRef tiles;
@@ -35,8 +41,8 @@ TileRef Tile_index(TileRef ref, uint index) {
}
struct TileSeg {
vec2 start;
vec2 end;
vec2 origin;
vec2 vector;
float y_edge;
TileSegRef next;
};
@@ -47,63 +53,98 @@ TileSegRef TileSeg_index(TileSegRef ref, uint index) {
return TileSegRef(ref.offset + index * TileSeg_size);
}
Path Path_read(PathRef ref) {
struct TransformSeg {
vec4 mat;
vec2 translate;
};
#define TransformSeg_size 24
TransformSegRef TransformSeg_index(TransformSegRef ref, uint index) {
return TransformSegRef(ref.offset + index * TransformSeg_size);
}
Path Path_read(Alloc a, PathRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tile[ix + 0];
uint raw1 = tile[ix + 1];
uint raw2 = tile[ix + 2];
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
uint raw2 = read_mem(a, ix + 2);
Path s;
s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
s.tiles = TileRef(raw2);
return s;
}
void Path_write(PathRef ref, Path s) {
void Path_write(Alloc a, PathRef ref, Path s) {
uint ix = ref.offset >> 2;
tile[ix + 0] = s.bbox.x | (s.bbox.y << 16);
tile[ix + 1] = s.bbox.z | (s.bbox.w << 16);
tile[ix + 2] = s.tiles.offset;
write_mem(a, ix + 0, s.bbox.x | (s.bbox.y << 16));
write_mem(a, ix + 1, s.bbox.z | (s.bbox.w << 16));
write_mem(a, ix + 2, s.tiles.offset);
}
Tile Tile_read(TileRef ref) {
Tile Tile_read(Alloc a, TileRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tile[ix + 0];
uint raw1 = tile[ix + 1];
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
Tile s;
s.tile = TileSegRef(raw0);
s.backdrop = int(raw1);
return s;
}
void Tile_write(TileRef ref, Tile s) {
void Tile_write(Alloc a, TileRef ref, Tile s) {
uint ix = ref.offset >> 2;
tile[ix + 0] = s.tile.offset;
tile[ix + 1] = uint(s.backdrop);
write_mem(a, ix + 0, s.tile.offset);
write_mem(a, ix + 1, uint(s.backdrop));
}
TileSeg TileSeg_read(TileSegRef ref) {
TileSeg TileSeg_read(Alloc a, TileSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tile[ix + 0];
uint raw1 = tile[ix + 1];
uint raw2 = tile[ix + 2];
uint raw3 = tile[ix + 3];
uint raw4 = tile[ix + 4];
uint raw5 = tile[ix + 5];
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
uint raw2 = read_mem(a, ix + 2);
uint raw3 = read_mem(a, ix + 3);
uint raw4 = read_mem(a, ix + 4);
uint raw5 = read_mem(a, ix + 5);
TileSeg s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.y_edge = uintBitsToFloat(raw4);
s.next = TileSegRef(raw5);
return s;
}
void TileSeg_write(TileSegRef ref, TileSeg s) {
void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) {
uint ix = ref.offset >> 2;
tile[ix + 0] = floatBitsToUint(s.start.x);
tile[ix + 1] = floatBitsToUint(s.start.y);
tile[ix + 2] = floatBitsToUint(s.end.x);
tile[ix + 3] = floatBitsToUint(s.end.y);
tile[ix + 4] = floatBitsToUint(s.y_edge);
tile[ix + 5] = s.next.offset;
write_mem(a, ix + 0, floatBitsToUint(s.origin.x));
write_mem(a, ix + 1, floatBitsToUint(s.origin.y));
write_mem(a, ix + 2, floatBitsToUint(s.vector.x));
write_mem(a, ix + 3, floatBitsToUint(s.vector.y));
write_mem(a, ix + 4, floatBitsToUint(s.y_edge));
write_mem(a, ix + 5, s.next.offset);
}
TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = read_mem(a, ix + 0);
uint raw1 = read_mem(a, ix + 1);
uint raw2 = read_mem(a, ix + 2);
uint raw3 = read_mem(a, ix + 3);
uint raw4 = read_mem(a, ix + 4);
uint raw5 = read_mem(a, ix + 5);
TransformSeg s;
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
return s;
}
void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) {
uint ix = ref.offset >> 2;
write_mem(a, ix + 0, floatBitsToUint(s.mat.x));
write_mem(a, ix + 1, floatBitsToUint(s.mat.y));
write_mem(a, ix + 2, floatBitsToUint(s.mat.z));
write_mem(a, ix + 3, floatBitsToUint(s.mat.w));
write_mem(a, ix + 4, floatBitsToUint(s.translate.x));
write_mem(a, ix + 5, floatBitsToUint(s.translate.y));
}

View file

@@ -1,8 +1,11 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Allocation and initialization of tiles for paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
@@ -10,18 +13,8 @@
layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
layout(set = 0, binding = 1) buffer AllocBuf {
uint n_elements;
uint n_pathseg;
uint alloc;
};
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
#include "annotated.h"
@@ -32,39 +25,37 @@ layout(set = 0, binding = 2) buffer TileBuf {
#define SY (1.0 / float(TILE_HEIGHT_PX))
shared uint sh_tile_count[TILE_ALLOC_WG];
shared uint sh_tile_alloc;
shared MallocResult sh_tile_alloc;
void main() {
uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
PathRef path_ref = PathRef(element_ix * Path_size);
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
uint tag = Annotated_Nop;
if (element_ix < n_elements) {
tag = Annotated_tag(ref);
if (element_ix < conf.n_elements) {
tag = Annotated_tag(conf.anno_alloc, ref).tag;
}
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
switch (tag) {
case Annotated_Fill:
case Annotated_FillMask:
case Annotated_FillMaskInv:
case Annotated_Stroke:
case Annotated_Color:
case Annotated_Image:
case Annotated_BeginClip:
case Annotated_EndClip:
// Note: we take advantage of the fact that fills, strokes, and
// clips have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
x0 = int(floor(fill.bbox.x * SX));
y0 = int(floor(fill.bbox.y * SY));
x1 = int(ceil(fill.bbox.z * SX));
y1 = int(ceil(fill.bbox.w * SY));
AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
x0 = int(floor(clip.bbox.x * SX));
y0 = int(floor(clip.bbox.y * SY));
x1 = int(ceil(clip.bbox.z * SX));
y1 = int(ceil(clip.bbox.w * SY));
break;
}
x0 = clamp(x0, 0, WIDTH_IN_TILES);
y0 = clamp(y0, 0, HEIGHT_IN_TILES);
x1 = clamp(x1, 0, WIDTH_IN_TILES);
y1 = clamp(y1, 0, HEIGHT_IN_TILES);
x0 = clamp(x0, 0, int(conf.width_in_tiles));
y0 = clamp(y0, 0, int(conf.height_in_tiles));
x1 = clamp(x1, 0, int(conf.width_in_tiles));
y1 = clamp(y1, 0, int(conf.height_in_tiles));
Path path;
path.bbox = uvec4(x0, y0, x1, y1);
@@ -76,33 +67,38 @@ void main() {
}
sh_tile_count[th_ix] = tile_count;
uint total_tile_count = tile_count;
// Prefix sum of sh_tile_count
for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
barrier();
if (th_ix >= (1 << i)) {
tile_count += sh_tile_count[th_ix - (1 << i)];
total_tile_count += sh_tile_count[th_ix - (1 << i)];
}
barrier();
sh_tile_count[th_ix] = tile_count;
sh_tile_count[th_ix] = total_tile_count;
}
if (th_ix == TILE_ALLOC_WG - 1) {
sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
sh_tile_alloc = malloc(total_tile_count * Tile_size);
}
barrier();
uint alloc_start = sh_tile_alloc;
MallocResult alloc_start = sh_tile_alloc;
if (alloc_start.failed || mem_error != NO_ERROR) {
return;
}
if (element_ix < n_elements) {
if (element_ix < conf.n_elements) {
uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
Path_write(path_ref, path);
Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count);
path.tiles = TileRef(tiles_alloc.offset);
Path_write(conf.tile_alloc, path_ref, path);
}
// Zero out allocated tiles efficiently
uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
uint start_ix = alloc_start >> 2;
uint start_ix = alloc_start.alloc.offset >> 2;
for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
// Note: this interleaving is faster than using Tile_write
// by a significant amount.
tile[start_ix + i] = 0;
write_mem(alloc_start.alloc, start_ix + i, 0);
}
}
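The workgroup loop above is a Hillis-Steele style inclusive scan: after it runs, sh_tile_count[i] holds the combined tile count of threads 0 through i, which is why each element takes its tile base from sh_tile_count[th_ix - 1]. Written sequentially, the result it computes looks like this (a sketch, not the shader):

    fn inclusive_prefix_sum(counts: &[u32]) -> Vec<u32> {
        // out[i] = counts[0] + counts[1] + ... + counts[i]
        let mut running = 0u32;
        counts
            .iter()
            .map(|&c| {
                running += c;
                running
            })
            .collect()
    }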

Binary file not shown.

View file

@@ -1,132 +0,0 @@
// Code auto-generated by piet-gpu-derive
struct InstanceRef {
uint offset;
};
struct JumpRef {
uint offset;
};
struct ChunkRef {
uint offset;
};
struct TileGroupRef {
uint offset;
};
struct Instance {
uint item_ref;
vec2 offset;
};
#define Instance_size 12
InstanceRef Instance_index(InstanceRef ref, uint index) {
return InstanceRef(ref.offset + index * Instance_size);
}
struct Jump {
TileGroupRef new_ref;
};
#define Jump_size 4
JumpRef Jump_index(JumpRef ref, uint index) {
return JumpRef(ref.offset + index * Jump_size);
}
struct Chunk {
uint chunk_n;
ChunkRef next;
};
#define Chunk_size 8
ChunkRef Chunk_index(ChunkRef ref, uint index) {
return ChunkRef(ref.offset + index * Chunk_size);
}
#define TileGroup_Instance 0
#define TileGroup_Jump 1
#define TileGroup_End 2
#define TileGroup_size 16
TileGroupRef TileGroup_index(TileGroupRef ref, uint index) {
return TileGroupRef(ref.offset + index * TileGroup_size);
}
Instance Instance_read(InstanceRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tilegroup[ix + 0];
uint raw1 = tilegroup[ix + 1];
uint raw2 = tilegroup[ix + 2];
Instance s;
s.item_ref = raw0;
s.offset = vec2(uintBitsToFloat(raw1), uintBitsToFloat(raw2));
return s;
}
void Instance_write(InstanceRef ref, Instance s) {
uint ix = ref.offset >> 2;
tilegroup[ix + 0] = s.item_ref;
tilegroup[ix + 1] = floatBitsToUint(s.offset.x);
tilegroup[ix + 2] = floatBitsToUint(s.offset.y);
}
Jump Jump_read(JumpRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tilegroup[ix + 0];
Jump s;
s.new_ref = TileGroupRef(raw0);
return s;
}
void Jump_write(JumpRef ref, Jump s) {
uint ix = ref.offset >> 2;
tilegroup[ix + 0] = s.new_ref.offset;
}
Chunk Chunk_read(ChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tilegroup[ix + 0];
uint raw1 = tilegroup[ix + 1];
Chunk s;
s.chunk_n = raw0;
s.next = ChunkRef(raw1);
return s;
}
void Chunk_write(ChunkRef ref, Chunk s) {
uint ix = ref.offset >> 2;
tilegroup[ix + 0] = s.chunk_n;
tilegroup[ix + 1] = s.next.offset;
}
uint TileGroup_tag(TileGroupRef ref) {
return tilegroup[ref.offset >> 2];
}
Instance TileGroup_Instance_read(TileGroupRef ref) {
return Instance_read(InstanceRef(ref.offset + 4));
}
Jump TileGroup_Jump_read(TileGroupRef ref) {
return Jump_read(JumpRef(ref.offset + 4));
}
void TileGroup_Instance_write(TileGroupRef ref, Instance s) {
tilegroup[ref.offset >> 2] = TileGroup_Instance;
Instance_write(InstanceRef(ref.offset + 4), s);
}
void TileGroup_Jump_write(TileGroupRef ref, Jump s) {
tilegroup[ref.offset >> 2] = TileGroup_Jump;
Jump_write(JumpRef(ref.offset + 4), s);
}
void TileGroup_End_write(TileGroupRef ref) {
tilegroup[ref.offset >> 2] = TileGroup_End;
}

View file

@@ -7,7 +7,7 @@ pub use render_ctx::PietGpuRenderContext;
use rand::{Rng, RngCore};
use piet::kurbo::{BezPath, Circle, Point, Vec2};
use piet::kurbo::{BezPath, Circle, Point, Shape, Vec2};
use piet::{Color, ImageFormat, RenderContext};
use piet_gpu_types::encoder::Encode;
@@ -73,7 +73,9 @@ pub fn render_scene(rc: &mut impl RenderContext) {
5.0,
);
//render_cardioid(rc);
render_tiger(rc);
render_clip_test(rc);
render_alpha_test(rc);
//render_tiger(rc);
}
#[allow(unused)]
@@ -94,6 +96,67 @@ fn render_cardioid(rc: &mut impl RenderContext) {
rc.stroke(&path, &Color::BLACK, 2.0);
}
#[allow(unused)]
fn render_clip_test(rc: &mut impl RenderContext) {
const N: usize = 16;
const X0: f64 = 50.0;
const Y0: f64 = 450.0;
// Note: if it gets much larger, it will exceed the 1MB scratch buffer.
// But this is a pretty demanding test.
const X1: f64 = 550.0;
const Y1: f64 = 950.0;
let step = 1.0 / ((N + 1) as f64);
for i in 0..N {
let t = ((i + 1) as f64) * step;
rc.save();
let mut path = BezPath::new();
path.move_to((X0, Y0));
path.line_to((X1, Y0));
path.line_to((X1, Y0 + t * (Y1 - Y0)));
path.line_to((X1 + t * (X0 - X1), Y1));
path.line_to((X0, Y1));
path.close_path();
rc.clip(path);
}
let rect = piet::kurbo::Rect::new(X0, Y0, X1, Y1);
rc.fill(rect, &Color::BLACK);
for _ in 0..N {
rc.restore();
}
}
#[allow(unused)]
fn render_alpha_test(rc: &mut impl RenderContext) {
// Alpha compositing tests.
rc.fill(
diamond(Point::new(1024.0, 100.0)),
&Color::Rgba32(0xff0000ff),
);
rc.fill(
diamond(Point::new(1024.0, 125.0)),
&Color::Rgba32(0x00ff0080),
);
rc.save();
rc.clip(diamond(Point::new(1024.0, 150.0)));
rc.fill(
diamond(Point::new(1024.0, 175.0)),
&Color::Rgba32(0x0000ff80),
);
rc.restore();
}
fn diamond(origin: Point) -> impl Shape {
let mut path = BezPath::new();
const SIZE: f64 = 50.0;
path.move_to((origin.x, origin.y - SIZE));
path.line_to((origin.x + SIZE, origin.y));
path.line_to((origin.x, origin.y + SIZE));
path.line_to((origin.x - SIZE, origin.y));
path.close_path();
return path;
}
#[allow(unused)]
fn render_tiger(rc: &mut impl RenderContext) {
let xml_str = std::str::from_utf8(include_bytes!("../Ghostscript_Tiger.svg")).unwrap();
let start = std::time::Instant::now();
@@ -126,15 +189,16 @@ pub fn dump_k1_data(k1_buf: &[u32]) {
pub struct Renderer {
pub image_dev: hub::Image, // resulting image
scene_buf: hub::Buffer,
scene_dev: hub::Buffer,
scene_buf_host: hub::Buffer,
scene_buf_dev: hub::Buffer,
pub state_buf: hub::Buffer,
pub anno_buf: hub::Buffer,
pub pathseg_buf: hub::Buffer,
pub tile_buf: hub::Buffer,
pub bin_buf: hub::Buffer,
pub ptcl_buf: hub::Buffer,
memory_buf_host: hub::Buffer,
memory_buf_dev: hub::Buffer,
state_buf: hub::Buffer,
config_buf_host: hub::Buffer,
config_buf_dev: hub::Buffer,
el_pipeline: hub::Pipeline,
el_ds: hub::DescriptorSet,
@@ -148,27 +212,21 @@ pub struct Renderer {
backdrop_pipeline: hub::Pipeline,
backdrop_ds: hub::DescriptorSet,
tile_alloc_buf_host: hub::Buffer,
tile_alloc_buf_dev: hub::Buffer,
bin_pipeline: hub::Pipeline,
bin_ds: hub::DescriptorSet,
bin_alloc_buf_host: hub::Buffer,
bin_alloc_buf_dev: hub::Buffer,
coarse_pipeline: hub::Pipeline,
coarse_ds: hub::DescriptorSet,
coarse_alloc_buf_host: hub::Buffer,
coarse_alloc_buf_dev: hub::Buffer,
k4_pipeline: hub::Pipeline,
k4_ds: hub::DescriptorSet,
n_elements: usize,
n_paths: usize,
n_pathseg: usize,
// Keep a reference to the image so that it is not destroyed.
_bg_image: hub::Image,
}
impl Renderer {
@@ -177,137 +235,142 @@ impl Renderer {
scene: &[u8],
n_paths: usize,
n_pathseg: usize,
n_trans: usize,
) -> Result<Self, Error> {
let host = MemFlags::host_coherent();
let dev = MemFlags::device_local();
let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
println!(
"scene: {} elements, {} paths, {} path_segments",
n_elements, n_paths, n_pathseg
"scene: {} elements, {} paths, {} path_segments, {} transforms",
n_elements, n_paths, n_pathseg, n_trans
);
let mut scene_buf = session
let mut scene_buf_host = session
.create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
.unwrap();
let scene_dev = session
let scene_buf_dev = session
.create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev)
.unwrap();
scene_buf.write(&scene)?;
scene_buf_host.write(&scene)?;
let state_buf = session.create_buffer(1 * 1024 * 1024, dev)?;
let anno_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
let pathseg_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
let tile_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
let bin_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
let ptcl_buf = session.create_buffer(48 * 1024 * 1024, dev)?;
let image_dev = session.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
let el_code = include_bytes!("../shader/elements.spv");
let el_pipeline = session.create_simple_compute_pipeline(el_code, 4, 0)?;
let el_ds = session.create_descriptor_set(
&el_pipeline,
&[
scene_dev.vk_buffer(),
state_buf.vk_buffer(),
anno_buf.vk_buffer(),
pathseg_buf.vk_buffer(),
],
&[],
)?;
let mut tile_alloc_buf_host = session.create_buffer(12, host)?;
let tile_alloc_buf_dev = session.create_buffer(12, dev)?;
const CONFIG_SIZE: u64 = 10 * 4; // Size of Config in setup.h.
let mut config_buf_host = session.create_buffer(CONFIG_SIZE, host)?;
let config_buf_dev = session.create_buffer(CONFIG_SIZE, dev)?;
// TODO: constants
const PATH_SIZE: usize = 12;
let tile_alloc_start = ((n_paths + 31) & !31) * PATH_SIZE;
tile_alloc_buf_host.write(&[n_paths as u32, n_pathseg as u32, tile_alloc_start as u32])?;
let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv");
let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 3, 0)?;
let tile_ds = session.create_descriptor_set(
&tile_pipeline,
&[
anno_buf.vk_buffer(),
tile_alloc_buf_dev.vk_buffer(),
tile_buf.vk_buffer(),
],
&[],
const BIN_SIZE: usize = 8;
const PATHSEG_SIZE: usize = 52;
const ANNO_SIZE: usize = 32;
const TRANS_SIZE: usize = 24;
let mut alloc = 0;
let tile_base = alloc;
alloc += ((n_paths + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_paths + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
let pathseg_base = alloc;
alloc += (n_pathseg * PATHSEG_SIZE + 3) & !3;
let anno_base = alloc;
alloc += (n_paths * ANNO_SIZE + 3) & !3;
let trans_base = alloc;
alloc += (n_trans * TRANS_SIZE + 3) & !3;
config_buf_host.write(&[
n_paths as u32,
n_pathseg as u32,
WIDTH_IN_TILES as u32,
HEIGHT_IN_TILES as u32,
tile_base as u32,
bin_base as u32,
ptcl_base as u32,
pathseg_base as u32,
anno_base as u32,
trans_base as u32,
])?;
let mut memory_buf_host = session.create_buffer(2 * 4, host)?;
let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?;
memory_buf_host.write(&[alloc as u32, 0 /* Overflow flag */])?;
let el_code = include_bytes!("../shader/elements.spv");
let el_pipeline = session.create_simple_compute_pipeline(el_code, 4)?;
let el_ds = session.create_simple_descriptor_set(
&el_pipeline,
&[&memory_buf_dev, &config_buf_dev, &scene_buf_dev, &state_buf],
)?;
let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv");
let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 2)?;
let tile_ds = session
.create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf_dev])?;
let path_alloc_code = include_bytes!("../shader/path_coarse.spv");
let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 3, 0)?;
let path_ds = session.create_descriptor_set(
&path_pipeline,
&[
pathseg_buf.vk_buffer(),
tile_alloc_buf_dev.vk_buffer(),
tile_buf.vk_buffer(),
],
&[],
)?;
let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 2)?;
let path_ds = session
.create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf_dev])?;
let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv");
let backdrop_pipeline =
session.create_simple_compute_pipeline(backdrop_alloc_code, 3, 0)?;
let backdrop_ds = session.create_descriptor_set(
let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
let backdrop_ds = session.create_simple_descriptor_set(
&backdrop_pipeline,
&[
anno_buf.vk_buffer(),
tile_alloc_buf_dev.vk_buffer(),
tile_buf.vk_buffer(),
],
&[],
&[&memory_buf_dev, &config_buf_dev],
)?;
let mut bin_alloc_buf_host = session.create_buffer(8, host)?;
let bin_alloc_buf_dev = session.create_buffer(8, dev)?;
// TODO: constants
let bin_alloc_start = ((n_paths + 255) & !255) * 8;
bin_alloc_buf_host.write(&[n_paths as u32, bin_alloc_start as u32])?;
let bin_code = include_bytes!("../shader/binning.spv");
let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 3, 0)?;
let bin_ds = session.create_descriptor_set(
&bin_pipeline,
&[
anno_buf.vk_buffer(),
bin_alloc_buf_dev.vk_buffer(),
bin_buf.vk_buffer(),
],
&[],
)?;
let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 2)?;
let bin_ds = session
.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf_dev])?;
let mut coarse_alloc_buf_host = session.create_buffer(8, host)?;
let coarse_alloc_buf_dev = session.create_buffer(8, dev)?;
let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
coarse_alloc_buf_host.write(&[n_paths as u32, coarse_alloc_start as u32])?;
let coarse_code = include_bytes!("../shader/coarse.spv");
let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 5, 0)?;
let coarse_ds = session.create_descriptor_set(
&coarse_pipeline,
&[
anno_buf.vk_buffer(),
bin_buf.vk_buffer(),
tile_buf.vk_buffer(),
coarse_alloc_buf_dev.vk_buffer(),
ptcl_buf.vk_buffer(),
],
&[],
)?;
let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 2)?;
let coarse_ds = session
.create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf_dev])?;
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 2, 1)?;
let k4_ds = session.create_descriptor_set(
&k4_pipeline,
&[ptcl_buf.vk_buffer(), tile_buf.vk_buffer()],
&[image_dev.vk_image()],
)?;
let bg_image = Self::make_test_bg_image(&session);
let k4_code = if session.gpu_info().has_descriptor_indexing {
&include_bytes!("../shader/kernel4_idx.spv")[..]
} else {
println!("doing non-indexed k4");
&include_bytes!("../shader/kernel4.spv")[..]
};
// This is an arbitrary limit on the number of textures that can be referenced by
// the fine rasterizer. To set it for real, we probably want to pay attention both
// to the device limit (maxDescriptorSetSampledImages) and to the number of
// images encoded (I believe there's a cost when allocating descriptor pools). If
// the limit can't be satisfied, then for compatibility we'll probably want to fall
// back to an atlasing approach.
//
// However, we're adding only one texture for now, so we use a tight bound to avoid
// a harmless Vulkan validation error.
let max_textures = 1;
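// A minimal sketch of how the bound might be computed once more textures are
// encoded; the field names below (`max_sampled_images`, `n_encoded_images`) are
// hypothetical and not part of the current API:
//
//     let max_textures = session
//         .gpu_info()
//         .max_sampled_images               // device maxDescriptorSetSampledImages
//         .min(n_encoded_images.max(1));    // stay tight to keep descriptor pools small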
let k4_pipeline = session
.pipeline_builder()
.add_buffers(2)
.add_images(1)
.add_textures(max_textures)
.create_compute_pipeline(&session, k4_code)?;
let k4_ds = session
.descriptor_set_builder()
.add_buffers(&[&memory_buf_dev, &config_buf_dev])
.add_images(&[&image_dev])
.add_textures(&[&bg_image])
.build(&session, &k4_pipeline)?;
Ok(Renderer {
scene_buf,
scene_dev,
scene_buf_host,
scene_buf_dev,
memory_buf_host,
memory_buf_dev,
state_buf,
config_buf_host,
config_buf_dev,
image_dev,
el_pipeline,
el_ds,
@@ -323,39 +386,27 @@ impl Renderer {
coarse_ds,
k4_pipeline,
k4_ds,
state_buf,
anno_buf,
pathseg_buf,
tile_buf,
bin_buf,
ptcl_buf,
tile_alloc_buf_host,
tile_alloc_buf_dev,
bin_alloc_buf_host,
bin_alloc_buf_dev,
coarse_alloc_buf_host,
coarse_alloc_buf_dev,
n_elements,
n_paths,
n_pathseg,
_bg_image: bg_image,
})
}
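/// Record the rendering commands: copy the host-side scene, config, and memory
/// buffers to their device copies, clear the element-processing state buffer, and
/// record the compute dispatches with timestamp queries in between.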
pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) {
cmd_buf.copy_buffer(self.scene_buf.vk_buffer(), self.scene_dev.vk_buffer());
cmd_buf.copy_buffer(
self.tile_alloc_buf_host.vk_buffer(),
self.tile_alloc_buf_dev.vk_buffer(),
self.scene_buf_host.vk_buffer(),
self.scene_buf_dev.vk_buffer(),
);
cmd_buf.copy_buffer(
self.bin_alloc_buf_host.vk_buffer(),
self.bin_alloc_buf_dev.vk_buffer(),
self.config_buf_host.vk_buffer(),
self.config_buf_dev.vk_buffer(),
);
cmd_buf.copy_buffer(
self.coarse_alloc_buf_host.vk_buffer(),
self.coarse_alloc_buf_dev.vk_buffer(),
self.memory_buf_host.vk_buffer(),
self.memory_buf_dev.vk_buffer(),
);
cmd_buf.clear_buffer(self.state_buf.vk_buffer());
cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(
self.image_dev.vk_image(),
@@ -405,7 +456,7 @@ impl Renderer {
cmd_buf.dispatch(
&self.coarse_pipeline,
&self.coarse_ds,
(WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
((WIDTH as u32 + 255) / 256, (HEIGHT as u32 + 255) / 256, 1),
);
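// The coarse dispatch above runs one workgroup per 256x256-pixel bin; the +255
// rounds up so partial bins at the right and bottom edges are still covered.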
cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.memory_barrier();
@@ -448,7 +499,6 @@ impl Renderer {
ImageLayout::BlitDst,
);
cmd_buf.copy_buffer_to_image(buffer.vk_buffer(), image.vk_image());
// TODO: instead of General, we might want ShaderReadOnly
cmd_buf.image_barrier(image.vk_image(), ImageLayout::BlitDst, ImageLayout::General);
cmd_buf.finish();
// Make sure not to drop the buffer and image until the command buffer completes.
@@ -459,4 +509,22 @@ impl Renderer {
Ok(image)
}
}
/// Make a test image.
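///
/// The image is a 256x256 opaque RGBA pattern: red ramps with x, green with y,
/// and blue is x XOR y.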
fn make_test_bg_image(session: &hub::Session) -> hub::Image {
const WIDTH: usize = 256;
const HEIGHT: usize = 256;
let mut buf = vec![255u8; WIDTH * HEIGHT * 4];
for y in 0..HEIGHT {
for x in 0..WIDTH {
let r = x as u8;
let g = y as u8;
let b = r ^ g;
buf[(y * WIDTH + x) * 4] = r;
buf[(y * WIDTH + x) * 4 + 1] = g;
buf[(y * WIDTH + x) * 4 + 2] = b;
}
}
Self::make_image(session, WIDTH, HEIGHT, &buf, ImageFormat::RgbaPremul).unwrap()
}
}
@@ -1,21 +1,19 @@
use std::{borrow::Cow, ops::RangeBounds};
use piet_gpu_types::encoder::{Encode, Encoder};
use piet_gpu_types::scene::{
Clip, CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform,
};
use piet::{
kurbo::{Affine, Insets, PathEl, Point, Rect, Shape, Size},
HitTestPosition, TextAttribute, TextStorage,
};
use piet::{
Color, Error, FixedGradient, FontFamily, HitTestPoint, ImageFormat, InterpolationMode,
IntoBrush, LineMetric, RenderContext, StrokeStyle, Text, TextLayout, TextLayoutBuilder,
};
use piet_gpu_types::encoder::{Encode, Encoder};
use piet_gpu_types::scene::{
Clip, CubicSeg, Element, FillColor, LineSeg, QuadSeg, SetFillMode, SetLineWidth, Transform,
};
pub struct PietGpuImage;
#[derive(Clone)]
@@ -32,12 +30,15 @@ pub struct PietGpuRenderContext {
// Will probably need direct access to the hal Device to create images etc.
inner_text: PietGpuText,
stroke_width: f32,
fill_mode: FillMode,
// We're tallying these cpu-side for expedience, but will probably
// move this to some kind of readback from element processing.
/// The count of elements that make it through to coarse rasterization.
path_count: usize,
/// The count of path segment elements.
pathseg_count: usize,
/// The count of transform elements.
trans_count: usize,
cur_transform: Affine,
state_stack: Vec<State>,
@@ -67,6 +68,14 @@ struct ClipElement {
bbox: Option<Rect>,
}
#[derive(Clone, Copy, PartialEq)]
enum FillMode {
// Fill path according to the non-zero winding rule.
Nonzero = 0,
// Fill stroked path.
Stroke = 1,
}
const TOLERANCE: f64 = 0.25;
impl PietGpuRenderContext {
@@ -80,8 +89,10 @@ impl PietGpuRenderContext {
elements,
inner_text,
stroke_width,
fill_mode: FillMode::Nonzero,
path_count: 0,
pathseg_count: 0,
trans_count: 0,
cur_transform: Affine::default(),
state_stack: Vec::new(),
clip_stack: Vec::new(),
@@ -100,6 +111,19 @@ impl PietGpuRenderContext {
pub fn pathseg_count(&self) -> usize {
self.pathseg_count
}
pub fn trans_count(&self) -> usize {
self.trans_count
}
}
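// Emits a SetFillMode element only when the mode actually changes, so runs of
// fills or strokes don't add redundant elements to the scene.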
fn set_fill_mode(ctx: &mut PietGpuRenderContext, fill_mode: FillMode) {
if ctx.fill_mode != fill_mode {
ctx.elements.push(Element::SetFillMode(SetFillMode {
fill_mode: fill_mode as u32,
}));
ctx.fill_mode = fill_mode;
}
}
impl RenderContext for PietGpuRenderContext {
@@ -113,7 +137,19 @@ impl RenderContext for PietGpuRenderContext {
}
fn solid_brush(&mut self, color: Color) -> Self::Brush {
PietGpuBrush::Solid(color.as_rgba_u32())
// kernel4 expects colors encoded in alpha-premultiplied sRGB:
//
// [α,sRGB(α⋅R),sRGB(α⋅G),sRGB(α⋅B)]
//
// See also http://ssp.impulsetrain.com/gamma-premult.html.
let (r, g, b, a) = color.as_rgba();
let premul = Color::rgba(
to_srgb(from_srgb(r) * a),
to_srgb(from_srgb(g) * a),
to_srgb(from_srgb(b) * a),
a,
);
PietGpuBrush::Solid(premul.as_rgba_u32())
}
fn gradient(&mut self, _gradient: impl Into<FixedGradient>) -> Result<Self::Brush, Error> {
@@ -129,6 +165,7 @@ impl RenderContext for PietGpuRenderContext {
.push(Element::SetLineWidth(SetLineWidth { width: width_f32 }));
self.stroke_width = width_f32;
}
set_fill_mode(self, FillMode::Stroke);
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
match brush {
PietGpuBrush::Solid(rgba_color) => {
@@ -136,8 +173,8 @@ impl RenderContext for PietGpuRenderContext {
self.accumulate_bbox(|| shape.bounding_box() + Insets::uniform(width * 0.5));
let path = shape.path_elements(TOLERANCE);
self.encode_path(path, false);
let stroke = Stroke { rgba_color };
self.elements.push(Element::Stroke(stroke));
let stroke = FillColor { rgba_color };
self.elements.push(Element::FillColor(stroke));
self.path_count += 1;
}
_ => (),
@@ -160,9 +197,10 @@ impl RenderContext for PietGpuRenderContext {
// Perhaps that should be added to kurbo.
self.accumulate_bbox(|| shape.bounding_box());
let path = shape.path_elements(TOLERANCE);
set_fill_mode(self, FillMode::Nonzero);
self.encode_path(path, true);
let fill = Fill { rgba_color };
self.elements.push(Element::Fill(fill));
let fill = FillColor { rgba_color };
self.elements.push(Element::FillColor(fill));
self.path_count += 1;
}
}
@@ -170,6 +208,7 @@ impl RenderContext for PietGpuRenderContext {
fn fill_even_odd(&mut self, _shape: impl Shape, _brush: &impl IntoBrush<Self>) {}
fn clip(&mut self, shape: impl Shape) {
set_fill_mode(self, FillMode::Nonzero);
let path = shape.path_elements(TOLERANCE);
self.encode_path(path, true);
let begin_ix = self.elements.len();
@@ -207,6 +246,7 @@ impl RenderContext for PietGpuRenderContext {
let a_inv = state.rel_transform.inverse();
self.elements
.push(Element::Transform(to_scene_transform(a_inv)));
self.trans_count += 1;
}
self.cur_transform = state.transform;
for _ in 0..state.n_clip {
@@ -228,6 +268,7 @@ impl RenderContext for PietGpuRenderContext {
fn transform(&mut self, transform: Affine) {
self.elements
.push(Element::Transform(to_scene_transform(transform)));
self.trans_count += 1;
if let Some(tos) = self.state_stack.last_mut() {
tos.rel_transform *= transform;
}
@@ -275,34 +316,40 @@ impl RenderContext for PietGpuRenderContext {
}
impl PietGpuRenderContext {
fn encode_line_seg(&mut self, seg: LineSeg, is_fill: bool) {
if is_fill {
self.elements.push(Element::FillLine(seg));
} else {
self.elements.push(Element::StrokeLine(seg));
}
fn encode_line_seg(&mut self, seg: LineSeg) {
self.elements.push(Element::Line(seg));
self.pathseg_count += 1;
}
fn encode_quad_seg(&mut self, seg: QuadSeg, is_fill: bool) {
if is_fill {
self.elements.push(Element::FillQuad(seg));
} else {
self.elements.push(Element::StrokeQuad(seg));
}
fn encode_quad_seg(&mut self, seg: QuadSeg) {
self.elements.push(Element::Quad(seg));
self.pathseg_count += 1;
}
fn encode_cubic_seg(&mut self, seg: CubicSeg, is_fill: bool) {
if is_fill {
self.elements.push(Element::FillCubic(seg));
} else {
self.elements.push(Element::StrokeCubic(seg));
}
fn encode_cubic_seg(&mut self, seg: CubicSeg) {
self.elements.push(Element::Cubic(seg));
self.pathseg_count += 1;
}
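// For fills, every subpath is explicitly closed: a ClosePath is emitted before each
// MoveTo (harmless when no subpath is open yet) and after the final element, so the
// fill only ever sees closed loops.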
fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
if is_fill {
self.encode_path_inner(
path.flat_map(|el| {
match el {
PathEl::MoveTo(..) => Some(PathEl::ClosePath),
_ => None,
}
.into_iter()
.chain(Some(el))
})
.chain(Some(PathEl::ClosePath)),
)
} else {
self.encode_path_inner(path)
}
}
fn encode_path_inner(&mut self, path: impl Iterator<Item = PathEl>) {
let flatten = false;
if flatten {
let mut start_pt = None;
@@ -320,7 +367,7 @@ impl PietGpuRenderContext {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.encode_line_seg(seg, is_fill);
self.encode_line_seg(seg);
last_pt = Some(scene_pt);
}
PathEl::ClosePath => {
@@ -330,7 +377,7 @@ impl PietGpuRenderContext {
p0: last,
p1: start,
};
self.encode_line_seg(seg, is_fill);
self.encode_line_seg(seg);
}
}
}
@@ -354,7 +401,7 @@ impl PietGpuRenderContext {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.encode_line_seg(seg, is_fill);
self.encode_line_seg(seg);
last_pt = Some(scene_pt);
}
PathEl::QuadTo(p1, p2) => {
@@ -365,7 +412,7 @@ impl PietGpuRenderContext {
p1: scene_p1,
p2: scene_p2,
};
self.encode_quad_seg(seg, is_fill);
self.encode_quad_seg(seg);
last_pt = Some(scene_p2);
}
PathEl::CurveTo(p1, p2, p3) => {
@@ -378,7 +425,7 @@ impl PietGpuRenderContext {
p2: scene_p2,
p3: scene_p3,
};
self.encode_cubic_seg(seg, is_fill);
self.encode_cubic_seg(seg);
last_pt = Some(scene_p3);
}
PathEl::ClosePath => {
@@ -388,7 +435,7 @@ impl PietGpuRenderContext {
p0: last,
p1: start,
};
self.encode_line_seg(seg, is_fill);
self.encode_line_seg(seg);
}
}
}
@@ -550,3 +597,21 @@ fn to_scene_transform(transform: Affine) -> Transform {
translate: [c[4] as f32, c[5] as f32],
}
}
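// Standard sRGB transfer functions (IEC 61966-2-1): `to_srgb` encodes a linear
// value and `from_srgb` decodes an encoded one, using the usual piecewise curve
// with breakpoints at 0.0031308 and 0.04045.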
fn to_srgb(f: f64) -> f64 {
if f <= 0.0031308 {
f * 12.92
} else {
let a = 0.055;
(1. + a) * f64::powf(f, f64::recip(2.4)) - a
}
}
fn from_srgb(f: f64) -> f64 {
if f <= 0.04045 {
f / 12.92
} else {
let a = 0.055;
f64::powf((f + a) * f64::recip(1. + a), 2.4)
}
}
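// A minimal usage sketch of the premultiplication math, using the helpers above:
// a half-opaque pure red. The asserted value is approximate and illustrative only.
#[cfg(test)]
mod premul_example {
    use super::*;

    #[test]
    fn half_opaque_red() {
        let (r, a) = (1.0f64, 0.5f64);
        // Linearize, scale by alpha, re-encode.
        let premul_r = to_srgb(from_srgb(r) * a);
        // sRGB re-encoding makes the stored channel brighter than a plain 0.5.
        assert!((premul_r - 0.7354).abs() < 1e-3);
    }
}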