mirror of https://github.com/italicsjenga/vello.git
synced 2025-01-08 20:01:30 +11:00

Remove the old pipeline (#209)

Remove the old hal and pipeline. Update the README to reflect this.

parent 1f9a4d5322
commit c689767782
.github/actions/shader/action.yml (vendored, 30 deletions)

@@ -1,30 +0,0 @@
-name: compile shaders
-
-runs:
-  using: 'composite'
-  steps:
-    - uses: seanmiddleditch/gha-setup-ninja@master
-
-    - name: setup SPIRV tools
-      # consider install-vulkan-sdk instead
-      uses: humbletim/setup-vulkan-sdk@v1.2.0
-      with:
-        vulkan-query-version: 1.3.204.0
-        vulkan-components: Glslang, SPIRV-Cross
-        vulkan-use-cache: true
-
-    - name: install DXC
-      uses: napokue/setup-dxc@v1.0.0
-
-    - name: 'run shader compilers: piet-gpu'
-      run: mkdir gen && ninja
-      shell: pwsh
-      working-directory: piet-gpu/shader
-    - name: 'run shader compilers: tests'
-      run: mkdir gen && ninja
-      shell: pwsh
-      working-directory: tests/shader
-    - name: 'run shader compilers: piet-gpu-hal/examples'
-      run: mkdir gen && ninja
-      shell: pwsh
-      working-directory: piet-gpu-hal/examples/shader
.github/workflows/push-shader.yml (vendored, 38 deletions)

@@ -1,38 +0,0 @@
-on:
-  push:
-    branches:
-      - dev
-
-jobs:
-  push-shaders:
-    runs-on: windows-latest
-    name: compile shaders and push to main
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          # need history to make the merge work
-          # possibly we can optimize this and set
-          # allow-unrelated-histories on merge
-          fetch-depth: 0
-      - name: prepare repo for compilation
-        run: |
-          git fetch origin main
-          git switch main
-          git config user.name "Commit by GitHub Action"
-          git config user.email "nobody@example.com"
-          git merge dev -m "merge from dev branch - ${{ github.ref_name }}"
-          sed -i '' '/shader\/gen/d' .gitignore
-          git add .gitignore
-          git rm -r --ignore-unmatch piet-gpu/shader/gen
-          git rm -r --ignore-unmatch tests/shader/gen
-          git rm -r --ignore-unmatch piet-gpu-hal/examples/shader/gen
-      - uses: ./.github/actions/shader
-      - name: commit compiled shaders
-        continue-on-error: true
-        run: |
-          git add piet-gpu/shader/gen
-          git add tests/shader/gen
-          git add piet-gpu-hal/examples/shader/gen
-          git commit -m "commit compiled shaders"
-      - name: push
-        run: git push origin main
.github/workflows/shader.yml (vendored, 12 deletions)

@@ -1,12 +0,0 @@
-on:
-  pull_request:
-    branches-ignore:
-      - main
-
-jobs:
-  push-shaders:
-    runs-on: windows-latest
-    name: compile shaders
-    steps:
-      - uses: actions/checkout@v3
-      - uses: ./.github/actions/shader
Cargo.lock (generated, 540 changed lines)

@@ -14,7 +14,7 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
 dependencies = [
- "getrandom 0.2.7",
+ "getrandom",
  "once_cell",
  "version_check",
 ]
@@ -28,15 +28,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "ansi_term"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
-dependencies = [
- "winapi",
-]
-
 [[package]]
 name = "arrayref"
 version = "0.3.6"
@@ -64,28 +55,6 @@ dependencies = [
  "libloading",
 ]
 
-[[package]]
-name = "ash-window"
-version = "0.12.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b912285a7c29f3a8f87ca6f55afc48768624e5e33ec17dbd2f2075903f5e35ab"
-dependencies = [
- "ash",
- "raw-window-handle 0.5.0",
- "raw-window-metal",
-]
-
-[[package]]
-name = "atty"
-version = "0.2.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
-dependencies = [
- "hermit-abi",
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -164,25 +133,6 @@ dependencies = [
  "vec_map",
 ]
 
-[[package]]
-name = "cbindgen"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51e3973b165dc0f435831a9e426de67e894de532754ff7a3f307c03ee5dec7dc"
-dependencies = [
- "clap 2.34.0",
- "heck",
- "indexmap",
- "log",
- "proc-macro2",
- "quote",
- "serde",
- "serde_json",
- "syn",
- "tempfile",
- "toml",
-]
-
 [[package]]
 name = "cc"
 version = "1.0.73"
@@ -201,45 +151,6 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
 
-[[package]]
-name = "clap"
-version = "2.34.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
-dependencies = [
- "ansi_term",
- "atty",
- "bitflags",
- "strsim 0.8.0",
- "textwrap 0.11.0",
- "unicode-width",
- "vec_map",
-]
-
-[[package]]
-name = "clap"
-version = "3.2.23"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5"
-dependencies = [
- "atty",
- "bitflags",
- "clap_lex",
- "indexmap",
- "strsim 0.10.0",
- "termcolor",
- "textwrap 0.16.0",
-]
-
-[[package]]
-name = "clap_lex"
-version = "0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
-dependencies = [
- "os_str_bytes",
-]
-
 [[package]]
 name = "cmake"
 version = "0.1.49"
@@ -392,38 +303,14 @@ dependencies = [
  "winapi",
 ]
 
-[[package]]
-name = "darling"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d706e75d87e35569db781a9b5e2416cff1236a47ed380831f959382ccd5f858"
-dependencies = [
- "darling_core 0.10.2",
- "darling_macro 0.10.2",
-]
-
 [[package]]
 name = "darling"
 version = "0.13.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c"
 dependencies = [
- "darling_core 0.13.4",
- "darling_macro 0.13.4",
-]
-
-[[package]]
-name = "darling_core"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
-dependencies = [
- "fnv",
- "ident_case",
- "proc-macro2",
- "quote",
- "strsim 0.9.3",
- "syn",
+ "darling_core",
+ "darling_macro",
 ]
 
 [[package]]
@@ -436,18 +323,7 @@ dependencies = [
  "ident_case",
  "proc-macro2",
  "quote",
- "strsim 0.10.0",
- "syn",
-]
-
-[[package]]
-name = "darling_macro"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
-dependencies = [
- "darling_core 0.10.2",
- "quote",
+ "strsim",
  "syn",
 ]
 
@@ -457,7 +333,7 @@ version = "0.13.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835"
 dependencies = [
- "darling_core 0.13.4",
+ "darling_core",
  "quote",
  "syn",
 ]
@@ -507,15 +383,6 @@ dependencies = [
  "pkg-config",
 ]
 
-[[package]]
-name = "fastrand"
-version = "1.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
-dependencies = [
- "instant",
-]
-
 [[package]]
 name = "flate2"
 version = "1.0.24"
@@ -622,17 +489,6 @@ dependencies = [
  "byteorder",
 ]
 
-[[package]]
-name = "getrandom"
-version = "0.1.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
-dependencies = [
- "cfg-if",
- "libc",
- "wasi 0.9.0+wasi-snapshot-preview1",
-]
-
 [[package]]
 name = "getrandom"
 version = "0.2.7"
@@ -641,7 +497,7 @@ checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6"
 dependencies = [
  "cfg-if",
  "libc",
- "wasi 0.11.0+wasi-snapshot-preview1",
+ "wasi",
 ]
 
 [[package]]
@@ -695,12 +551,6 @@ dependencies = [
  "bitflags",
 ]
 
-[[package]]
-name = "half"
-version = "1.8.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
-
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -710,24 +560,6 @@ dependencies = [
  "ahash",
 ]
 
-[[package]]
-name = "heck"
-version = "0.3.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
-dependencies = [
- "unicode-segmentation",
-]
-
-[[package]]
-name = "hermit-abi"
-version = "0.1.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "hexf-parse"
 version = "0.2.1"
@@ -762,12 +594,6 @@ dependencies = [
  "web-sys",
 ]
 
-[[package]]
-name = "itoa"
-version = "1.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc"
-
 [[package]]
 name = "jni-sys"
 version = "0.3.0"
@@ -794,15 +620,6 @@ dependencies = [
  "pkg-config",
 ]
 
-[[package]]
-name = "kurbo"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16cb54cd28cb3d2e964d9444ca185676a94fd9b7cce5f02b22c717947ed8e9a2"
-dependencies = [
- "arrayvec 0.5.2",
-]
-
 [[package]]
 name = "kurbo"
 version = "0.9.0"
@@ -923,7 +740,7 @@ checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de"
 dependencies = [
  "libc",
  "log",
- "wasi 0.11.0+wasi-snapshot-preview1",
+ "wasi",
  "windows-sys 0.42.0",
 ]
 
@@ -954,18 +771,6 @@ dependencies = [
  "unicode-xid",
 ]
 
-[[package]]
-name = "ndk"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8794322172319b972f528bf90c6b467be0079f1fa82780ffb431088e741a73ab"
-dependencies = [
- "jni-sys",
- "ndk-sys 0.2.2",
- "num_enum",
- "thiserror",
-]
-
 [[package]]
 name = "ndk"
 version = "0.7.0"
@@ -974,7 +779,7 @@ checksum = "451422b7e4718271c8b5b3aadf5adedba43dc76312454b387e98fae0fc951aa0"
 dependencies = [
  "bitflags",
  "jni-sys",
- "ndk-sys 0.4.0",
+ "ndk-sys",
  "num_enum",
  "raw-window-handle 0.5.0",
  "thiserror",
@@ -986,20 +791,6 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "27b02d87554356db9e9a873add8782d4ea6e3e58ea071a9adb9a2e8ddb884a8b"
 
-[[package]]
-name = "ndk-glue"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5caf0c24d51ac1c905c27d4eda4fa0635bbe0de596b8f79235e0b17a4d29385"
-dependencies = [
- "lazy_static",
- "libc",
- "log",
- "ndk 0.3.0",
- "ndk-macro 0.2.0",
- "ndk-sys 0.2.2",
-]
-
 [[package]]
 name = "ndk-glue"
 version = "0.7.0"
@@ -1008,46 +799,27 @@ checksum = "0434fabdd2c15e0aab768ca31d5b7b333717f03cf02037d5a0a3ff3c278ed67f"
 dependencies = [
  "libc",
  "log",
- "ndk 0.7.0",
+ "ndk",
  "ndk-context",
- "ndk-macro 0.3.0",
- "ndk-sys 0.4.0",
+ "ndk-macro",
+ "ndk-sys",
  "once_cell",
  "parking_lot",
 ]
 
-[[package]]
-name = "ndk-macro"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05d1c6307dc424d0f65b9b06e94f88248e6305726b14729fd67a5e47b2dc481d"
-dependencies = [
- "darling 0.10.2",
- "proc-macro-crate 0.1.5",
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "ndk-macro"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0df7ac00c4672f9d5aece54ee3347520b7e20f158656c7db2e6de01902eb7a6c"
 dependencies = [
- "darling 0.13.4",
- "proc-macro-crate 1.2.1",
+ "darling",
+ "proc-macro-crate",
  "proc-macro2",
  "quote",
  "syn",
 ]
 
-[[package]]
-name = "ndk-sys"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1bcdd74c20ad5d95aacd60ef9ba40fdf77f767051040541df557b7a9b2a2121"
-
 [[package]]
 name = "ndk-sys"
 version = "0.4.0"
@@ -1103,7 +875,7 @@ version = "0.5.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b0498641e53dd6ac1a4f22547548caa6864cc4933784319cd1775271c5a46ce"
 dependencies = [
- "proc-macro-crate 1.2.1",
+ "proc-macro-crate",
  "proc-macro2",
  "quote",
  "syn",
@@ -1134,12 +906,6 @@ version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1"
 
-[[package]]
-name = "os_str_bytes"
-version = "6.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff"
-
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -1168,7 +934,7 @@ name = "peniko"
 version = "0.1.0"
 source = "git+https://github.com/linebender/peniko#b83821720aa51a3942be5d20c71525a1ae61ac0a"
 dependencies = [
- "kurbo 0.9.0",
+ "kurbo",
  "smallvec",
 ]
 
@@ -1178,88 +944,6 @@ version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
 
-[[package]]
-name = "pgpu-render"
-version = "0.1.0"
-dependencies = [
- "cbindgen",
- "cocoa",
- "metal",
- "objc",
- "piet-gpu",
- "piet-gpu-hal",
- "piet-scene",
-]
-
-[[package]]
-name = "piet-gpu"
-version = "0.1.0"
-dependencies = [
- "bytemuck",
- "clap 3.2.23",
- "ndk 0.3.0",
- "ndk-glue 0.3.0",
- "ndk-sys 0.2.2",
- "piet-gpu-hal",
- "piet-gpu-types",
- "piet-scene",
- "png",
- "rand 0.8.5",
- "raw-window-handle 0.3.4",
- "raw-window-handle 0.5.0",
- "roxmltree",
- "winit 0.27.5",
-]
-
-[[package]]
-name = "piet-gpu-derive"
-version = "0.0.0"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "piet-gpu-hal"
-version = "0.1.0"
-dependencies = [
- "ash",
- "ash-window",
- "bitflags",
- "block",
- "bytemuck",
- "cocoa-foundation",
- "core-graphics-types",
- "foreign-types 0.3.2",
- "metal",
- "objc",
- "raw-window-handle 0.5.0",
- "smallvec",
- "winapi",
- "wio",
-]
-
-[[package]]
-name = "piet-gpu-tests"
-version = "0.1.0"
-dependencies = [
- "bytemuck",
- "clap 3.2.23",
- "kurbo 0.7.1",
- "piet-gpu",
- "piet-gpu-hal",
- "rand 0.7.3",
-]
-
-[[package]]
-name = "piet-gpu-types"
-version = "0.0.0"
-dependencies = [
- "half",
- "piet-gpu-derive",
-]
-
 [[package]]
 name = "piet-scene"
 version = "0.1.0"
@@ -1312,21 +996,6 @@ version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5da3b0203fd7ee5720aa0b5e790b591aa5d3f41c3ed2c34a3a393382198af2f7"
 
-[[package]]
-name = "ppv-lite86"
-version = "0.2.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
-
-[[package]]
-name = "proc-macro-crate"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785"
-dependencies = [
- "toml",
-]
-
 [[package]]
 name = "proc-macro-crate"
 version = "1.2.1"
@@ -1362,93 +1031,12 @@ dependencies = [
  "proc-macro2",
 ]
 
-[[package]]
-name = "rand"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
-dependencies = [
- "getrandom 0.1.16",
- "libc",
- "rand_chacha 0.2.2",
- "rand_core 0.5.1",
- "rand_hc",
-]
-
-[[package]]
-name = "rand"
-version = "0.8.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
-dependencies = [
- "libc",
- "rand_chacha 0.3.1",
- "rand_core 0.6.4",
-]
-
-[[package]]
-name = "rand_chacha"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
-dependencies = [
- "ppv-lite86",
- "rand_core 0.5.1",
-]
-
-[[package]]
-name = "rand_chacha"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
-dependencies = [
- "ppv-lite86",
- "rand_core 0.6.4",
-]
-
-[[package]]
-name = "rand_core"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
-dependencies = [
- "getrandom 0.1.16",
-]
-
-[[package]]
-name = "rand_core"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
-dependencies = [
- "getrandom 0.2.7",
-]
-
-[[package]]
-name = "rand_hc"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
-dependencies = [
- "rand_core 0.5.1",
-]
-
 [[package]]
 name = "range-alloc"
 version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "63e935c45e09cc6dcf00d2f0b2d630a58f4095320223d47fc68918722f0538b6"
 
-[[package]]
-name = "raw-window-handle"
-version = "0.3.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e28f55143d0548dad60bb4fbdc835a3d7ac6acc3324506450c5fdd6e42903a76"
-dependencies = [
- "libc",
- "raw-window-handle 0.4.3",
-]
-
 [[package]]
 name = "raw-window-handle"
 version = "0.4.3"
@@ -1467,18 +1055,6 @@ dependencies = [
  "cty",
 ]
 
-[[package]]
-name = "raw-window-metal"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d18241d631f19847a5f4cc0a3f81d978202c375573ab7d90ab14dcf0a9262ec"
-dependencies = [
- "cocoa",
- "core-graphics",
- "objc",
- "raw-window-handle 0.5.0",
-]
-
 [[package]]
 name = "redox_syscall"
 version = "0.2.16"
@@ -1488,15 +1064,6 @@ dependencies = [
  "bitflags",
 ]
 
-[[package]]
-name = "remove_dir_all"
-version = "0.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
-dependencies = [
- "winapi",
-]
-
 [[package]]
 name = "renderdoc-sys"
 version = "0.7.1"
@@ -1518,12 +1085,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
 
-[[package]]
-name = "ryu"
-version = "1.0.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
-
 [[package]]
 name = "safe_arch"
 version = "0.5.2"
@@ -1562,9 +1123,6 @@ name = "serde"
 version = "1.0.147"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965"
-dependencies = [
- "serde_derive",
-]
 
 [[package]]
 name = "serde_derive"
@@ -1577,17 +1135,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "serde_json"
-version = "1.0.87"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ce777b7b150d76b9cf60d28b55f5847135a003f7d7350c6be7a773508ce7d45"
-dependencies = [
- "itoa",
- "ryu",
- "serde",
-]
-
 [[package]]
 name = "servo-fontconfig"
 version = "0.5.1"
@@ -1659,18 +1206,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
-[[package]]
-name = "strsim"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
-
-[[package]]
-name = "strsim"
-version = "0.9.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c"
-
 [[package]]
 name = "strsim"
 version = "0.10.0"
@@ -1688,20 +1223,6 @@ dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "tempfile"
-version = "3.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
-dependencies = [
- "cfg-if",
- "fastrand",
- "libc",
- "redox_syscall",
- "remove_dir_all",
- "winapi",
-]
-
 [[package]]
 name = "termcolor"
 version = "1.1.3"
@@ -1711,21 +1232,6 @@ dependencies = [
  "winapi-util",
 ]
 
-[[package]]
-name = "textwrap"
-version = "0.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
-dependencies = [
- "unicode-width",
-]
-
-[[package]]
-name = "textwrap"
-version = "0.16.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
-
 [[package]]
 name = "thiserror"
 version = "1.0.37"
@@ -1786,12 +1292,6 @@ version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3"
 
-[[package]]
-name = "unicode-segmentation"
-version = "1.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
-
 [[package]]
 name = "unicode-width"
 version = "0.1.10"
@@ -1816,12 +1316,6 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
-[[package]]
-name = "wasi"
-version = "0.9.0+wasi-snapshot-preview1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
-
 [[package]]
 name = "wasi"
 version = "0.11.0+wasi-snapshot-preview1"
@@ -2228,8 +1722,8 @@ dependencies = [
  "libc",
  "log",
  "mio",
- "ndk 0.7.0",
- "ndk-glue 0.7.0",
+ "ndk",
+ "ndk-glue",
  "objc",
  "once_cell",
  "parking_lot",
Cargo.toml (14 changed lines)

@@ -1,19 +1,9 @@
 [workspace]
 resolver = "2"
 
-members = [
-    "pgpu-render",
-    "piet-gpu",
-    "piet-gpu-derive",
-    "piet-gpu-hal",
-    "piet-gpu-types",
-    "piet-scene",
-    "piet-wgsl",
-    "piet-wgsl/examples/winit",
-    "tests",
-]
+members = ["piet-scene", "piet-wgsl", "piet-wgsl/examples/winit"]
 
 [patch.crates-io]
 # Required for metal support to work on wgpu
 # TODO: remove when wgpu is upgraded to 0.15
-naga = { git = "https://github.com/gfx-rs/naga", rev="ddcd5d3121150b2b1beee6e54e9125ff31aaa9a2" }
+naga = { git = "https://github.com/gfx-rs/naga", rev = "ddcd5d3121150b2b1beee6e54e9125ff31aaa9a2" }
README.md (41 changed lines)

@@ -4,17 +4,20 @@ This repo contains the new prototype for a new compute-centric 2D GPU renderer.
 
 It succeeds the previous prototype, [piet-metal].
 
+The latest version is a middleware for [`wgpu`]. This is used as the rendering backend for
+[xilem], a UI toolkit.
+
+<!-- TODO: Are we transitioning to more production? If so, should we rewrite the README a bit? -->
+
 ## Goals
 
 The main goal is to answer research questions about the future of 2D rendering:
 
-* Is a compute-centered approach better than rasterization ([Direct2D])? How much so?
+- Is a compute-centered approach better than rasterization ([Direct2D])? How much so?
 
-* To what extent do "advanced" GPU features (subgroups, descriptor arrays) help?
+- To what extent do "advanced" GPU features (subgroups, descriptor arrays) help?
 
-* Can we improve quality and extend the imaging model in useful ways?
-
-Another goal is to explore a standards-based, portable approach to GPU compute.
+- Can we improve quality and extend the imaging model in useful ways?
 
 ## Blogs and other writing
 
@@ -22,17 +25,9 @@ Much of the research progress on piet-gpu is documented in blog entries. See [do
 
 There is a much larger and detailed [vision](doc/vision.md) that explains the longer-term goals of the project, and how we might get there.
 
-### Why not gfx-hal?
+## History
 
-It makes a lot of sense to use gfx-hal, as it addresses the ability to write kernel and runtime code once and run it portably. But in exploring it I've found some points of friction, especially in using more "advanced" features. To serve the research goals, I'm enjoying using Vulkan directly, through [ash], which I've found does a good job tracking Vulkan releases. One example is experimenting with `VK_EXT_subgroup_size_control`.
-
-The hal layer in this repo is strongly inspired by gfx-hal, but with some differences. One is that we're shooting for a compile-time pipeline to generate GPU IR on DX12 and Metal, while gfx-hal ships [SPIRV-Cross] in the runtime. To access [Shader Model 6], that would also require bundling [DXC] at runtime, which is not yet implemented (though it's certainly possible).
-
-### Why not wgpu?
-
-The case for wgpu is also strong, but it's even less mature. I'd love to see it become a solid foundation, at which point I'd use it as the main integration with [Druid].
-
-In short, the goal is to facilitate the research now, collect the data, and then use that to choose a best path for shipping later.
+A prior incarnation used a custom cross-API hal. An archive of this version can be found in the branches [`custom-hal-archive-with-shaders`] and [`custom-hal-archive`].
 
 ## License and contributions.
 
@@ -42,14 +37,12 @@ In addition, the shaders are provided under the terms of the [Unlicense](UNLICEN
 
 The dx12 backend was adapted from piet-dx12 by Brian Merchant.
 
-Contributions are welcome by pull request. The [Rust code of conduct] applies. Pull requests should be against the `dev` branch; see [shader_compilation.md] for explanation and details.
+Contributions are welcome by pull request. The [Rust code of conduct] applies.
 
 [piet-metal]: https://github.com/linebender/piet-metal
-[Direct2D]: https://docs.microsoft.com/en-us/windows/win32/direct2d/direct2d-portal
-[ash]: https://github.com/MaikKlein/ash
-[SPIRV-Cross]: https://github.com/KhronosGroup/SPIRV-Cross
-[Shader Model 6]: https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
-[DXC]: https://github.com/microsoft/DirectXShaderCompiler
-[Druid]: https://github.com/linebender/druid
-[Rust code of conduct]: https://www.rust-lang.org/policies/code-of-conduct
-[shader_compilation.md]: ./doc/shader_compilation.md
+[direct2d]: https://docs.microsoft.com/en-us/windows/win32/direct2d/direct2d-portal
+[`wgpu`]: https://wgpu.rs/
+[xilem]: https://github.com/linebender/xilem/
+[rust code of conduct]: https://www.rust-lang.org/policies/code-of-conduct
+[`custom-hal-archive-with-shaders`]: https://github.com/linebender/piet-gpu/tree/custom-hal-archive-with-shaders
+[`custom-hal-archive`]: https://github.com/linebender/piet-gpu/tree/custom-hal-archive
pgpu-render/Cargo.toml (22 deletions)

@@ -1,22 +0,0 @@
-[package]
-name = "pgpu-render"
-version = "0.1.0"
-description = "C interface for glyph rendering using piet-gpu."
-license = "MIT/Apache-2.0"
-edition = "2021"
-
-[lib]
-crate-type = ["cdylib"]
-
-[dependencies]
-piet-gpu = { path = "../piet-gpu" }
-piet-gpu-hal = { path = "../piet-gpu-hal" }
-piet-scene = { path = "../piet-scene" }
-
-[target.'cfg(all(not(target_arch = "wasm32"), any(target_os = "ios", target_os = "macos")))'.dependencies]
-metal = "0.24"
-objc = "0.2.7"
-cocoa = "0.24.0"
-
-[build-dependencies]
-cbindgen = "0.20.0"
pgpu-render/build.rs (30 deletions)

@@ -1,30 +0,0 @@
-// Copyright 2022 The piet-gpu authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Also licensed under MIT license, at your choice.
-
-extern crate cbindgen;
-
-use std::env;
-
-fn main() {
-    let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
-    cbindgen::Builder::new()
-        .with_crate(crate_dir)
-        .with_define("target_os", "ios", "__APPLE__")
-        .with_header("/** Automatically generated from pgpu-render/src/lib.rs with cbindgen. **/")
-        .generate()
-        .expect("Unable to generate bindings")
-        .write_to_file("pgpu.h");
-}
pgpu-render/pgpu.h (238 deletions)

@@ -1,238 +0,0 @@
-/** Automatically generated from pgpu-render/src/lib.rs with cbindgen. **/
-
-#include <cstdarg>
-#include <cstdint>
-#include <cstdlib>
-#include <ostream>
-#include <new>
-
-enum class PgpuBrushKind {
-  Solid = 0,
-};
-
-enum class PgpuFill {
-  NonZero = 0,
-  EvenOdd = 1,
-};
-
-enum class PgpuPathVerb {
-  MoveTo = 0,
-  LineTo = 1,
-  QuadTo = 2,
-  CurveTo = 3,
-  Close = 4,
-};
-
-/// Encoded (possibly color) outline for a glyph.
-struct PgpuGlyph;
-
-/// Context for loading and scaling glyphs.
-struct PgpuGlyphContext;
-
-/// Context for loading a scaling glyphs from a specific font.
-struct PgpuGlyphProvider;
-
-/// State and resources for rendering a scene.
-struct PgpuRenderer;
-
-/// Encoded streams and resources describing a vector graphics scene.
-struct PgpuScene;
-
-/// Builder for constructing an encoded scene.
-struct PgpuSceneBuilder;
-
-/// Encoded streams and resources describing a vector graphics scene fragment.
-struct PgpuSceneFragment;
-
-/// Affine transformation matrix.
-struct PgpuTransform {
-  float xx;
-  float yx;
-  float xy;
-  float yy;
-  float dx;
-  float dy;
-};
-
-struct PgpuColor {
-  uint8_t r;
-  uint8_t g;
-  uint8_t b;
-  uint8_t a;
-};
-
-union PgpuBrushData {
-  PgpuColor solid;
-};
-
-struct PgpuBrush {
-  PgpuBrushKind kind;
-  PgpuBrushData data;
-};
-
-struct PgpuPoint {
-  float x;
-  float y;
-};
-
-struct PgpuPathElement {
-  PgpuPathVerb verb;
-  PgpuPoint points[3];
-};
-
-struct PgpuPathIter {
-  void *context;
-  bool (*next_element)(void*, PgpuPathElement*);
-};
-
-/// Tag and value for a font variation axis.
-struct PgpuFontVariation {
-  /// Tag that specifies the axis.
-  uint32_t tag;
-  /// Requested setting for the axis.
-  float value;
-};
-
-/// Description of a font.
-struct PgpuFontDesc {
-  /// Pointer to the context of the font file.
-  const uint8_t *data;
-  /// Size of the font file data in bytes.
-  uintptr_t data_len;
-  /// Index of the requested font in the font file.
-  uint32_t index;
-  /// Unique identifier for the font.
-  uint64_t unique_id;
-  /// Requested size in pixels per em unit. Set to 0.0 for
-  /// unscaled outlines.
-  float ppem;
-  /// Pointer to array of font variation settings.
-  const PgpuFontVariation *variations;
-  /// Number of font variation settings.
-  uintptr_t variations_len;
-};
-
-/// Rectangle defined by minimum and maximum points.
-struct PgpuRect {
-  float x0;
-  float y0;
-  float x1;
-  float y1;
-};
-
-extern "C" {
-
-#if defined(__APPLE__)
-/// Creates a new piet-gpu renderer for the specified Metal device and
-/// command queue.
-///
-/// device: MTLDevice*
-/// queue: MTLCommandQueue*
-PgpuRenderer *pgpu_renderer_new(void *device, void *queue);
-#endif
-
-#if defined(__APPLE__)
-/// Renders a prepared scene into a texture target. Commands for rendering are
-/// recorded into the specified command buffer. Returns an id representing
-/// resources that may have been allocated during this process. After the
-/// command buffer has been retired, call `pgpu_renderer_release` with this id
-/// to drop any associated resources.
-///
-/// target: MTLTexture*
-/// cmdbuf: MTLCommandBuffer*
-uint32_t pgpu_renderer_render(PgpuRenderer *renderer,
-                              const PgpuScene *scene,
-                              void *target,
-                              void *cmdbuf);
-#endif
-
-/// Releases the internal resources associated with the specified id from a
-/// previous render operation.
-void pgpu_renderer_release(PgpuRenderer *renderer, uint32_t id);
-
-/// Destroys the piet-gpu renderer.
-void pgpu_renderer_destroy(PgpuRenderer *renderer);
-
-/// Creates a new, empty piet-gpu scene.
-PgpuScene *pgpu_scene_new();
-
-/// Destroys the piet-gpu scene.
-void pgpu_scene_destroy(PgpuScene *scene);
-
-/// Creates a new, empty piet-gpu scene fragment.
-PgpuSceneFragment *pgpu_scene_fragment_new();
-
-/// Destroys the piet-gpu scene fragment.
-void pgpu_scene_fragment_destroy(PgpuSceneFragment *fragment);
-
-/// Creates a new builder for filling a piet-gpu scene. The specified scene
-/// should not be accessed while the builder is live.
-PgpuSceneBuilder *pgpu_scene_builder_for_scene(PgpuScene *scene);
-
-/// Creates a new builder for filling a piet-gpu scene fragment. The specified
-/// scene fragment should not be accessed while the builder is live.
-PgpuSceneBuilder *pgpu_scene_builder_for_fragment(PgpuSceneFragment *fragment);
-
-/// Adds a glyph with the specified transform to the underlying scene.
-void pgpu_scene_builder_add_glyph(PgpuSceneBuilder *builder,
-                                  const PgpuGlyph *glyph,
-                                  const PgpuTransform *transform);
-
-/// Sets the current absolute transform for the scene builder.
-void pgpu_scene_builder_transform(PgpuSceneBuilder *builder, const PgpuTransform *transform);
-
-/// Fills a path using the specified fill style and brush. If the brush
-/// parameter is nullptr, a solid color white brush will be used. The
-/// brush_transform may be nullptr.
-void pgpu_scene_builder_fill_path(PgpuSceneBuilder *builder,
-                                  PgpuFill fill,
-                                  const PgpuBrush *brush,
-                                  const PgpuTransform *brush_transform,
-                                  PgpuPathIter *path);
-
-/// Appends a scene fragment to the underlying scene or fragment. The
-/// transform parameter represents an absolute transform to apply to
-/// the fragment. If it is nullptr, the fragment will be appended to
-/// the scene with an assumed identity transform regardless of the
-/// current transform state.
-void pgpu_scene_builder_append_fragment(PgpuSceneBuilder *builder,
-                                        const PgpuSceneFragment *fragment,
-                                        const PgpuTransform *transform);
-
-/// Finalizes the scene builder, making the underlying scene ready for
-/// rendering. This takes ownership and consumes the builder.
-void pgpu_scene_builder_finish(PgpuSceneBuilder *builder);
-
-/// Creates a new context for loading glyph outlines.
-PgpuGlyphContext *pgpu_glyph_context_new();
-
-/// Destroys the glyph context.
-void pgpu_glyph_context_destroy(PgpuGlyphContext *gcx);
-
-/// Creates a new glyph provider for the specified glyph context and font
-/// descriptor. May return nullptr if the font data is invalid. Only one glyph
-/// provider may be live for a glyph context.
-PgpuGlyphProvider *pgpu_glyph_provider_new(PgpuGlyphContext *gcx, const PgpuFontDesc *font);
-
-/// Returns an encoded outline for the specified glyph provider and glyph id.
-/// May return nullptr if the requested glyph is not available.
-PgpuGlyph *pgpu_glyph_provider_get(PgpuGlyphProvider *provider, uint16_t gid);
-
-/// Returns an encoded color outline for the specified glyph provider, color
-/// palette index and glyph id. May return nullptr if the requested glyph is
-/// not available.
-PgpuGlyph *pgpu_glyph_provider_get_color(PgpuGlyphProvider *provider,
-                                         uint16_t palette_index,
-                                         uint16_t gid);
-
-/// Destroys the glyph provider.
-void pgpu_glyph_provider_destroy(PgpuGlyphProvider *provider);
-
-/// Computes the bounding box for the glyph after applying the specified
-/// transform.
-PgpuRect pgpu_glyph_bbox(const PgpuGlyph *glyph, const float (*transform)[6]);
-
-/// Destroys the glyph.
-void pgpu_glyph_destroy(PgpuGlyph *glyph);
-
-} // extern "C"
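The deleted header above documents a create/build/render/release lifecycle plus a callback-based path iterator. As a minimal illustrative sketch (not part of the commit), here is how a C++ caller could have driven that lifecycle. It assumes the deleted pgpu.h is on the include path and that the Metal handles (`MTLDevice*`, `MTLCommandQueue*`, `MTLTexture*`, `MTLCommandBuffer*`) are created elsewhere; `draw_frame`, `VecIter`, and `vec_next` are hypothetical names, not part of the removed API.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>
#include "pgpu.h"  // the deleted header shown above (hypothetical include path)

// Adapter state: feeds a std::vector of elements through the C callback.
struct VecIter {
    const std::vector<PgpuPathElement>* elements;
    size_t index;
};

// Matches the bool (*)(void*, PgpuPathElement*) signature in PgpuPathIter.
static bool vec_next(void* context, PgpuPathElement* out) {
    auto* it = static_cast<VecIter*>(context);
    if (it->index >= it->elements->size()) return false;  // iteration done
    *out = (*it->elements)[it->index++];
    return true;
}

// device: MTLDevice*, queue: MTLCommandQueue*,
// target: MTLTexture*, cmdbuf: MTLCommandBuffer* (all created by the caller).
void draw_frame(void* device, void* queue, void* target, void* cmdbuf) {
    PgpuRenderer* renderer = pgpu_renderer_new(device, queue);
    PgpuScene* scene = pgpu_scene_new();
    PgpuSceneBuilder* builder = pgpu_scene_builder_for_scene(scene);

    // A small triangle path.
    std::vector<PgpuPathElement> els = {
        {PgpuPathVerb::MoveTo, {{10.0f, 10.0f}}},
        {PgpuPathVerb::LineTo, {{100.0f, 10.0f}}},
        {PgpuPathVerb::LineTo, {{55.0f, 90.0f}}},
        {PgpuPathVerb::Close, {}},
    };
    VecIter state{&els, 0};
    PgpuPathIter iter{&state, vec_next};

    // nullptr brush => solid white, per the fill_path contract above.
    pgpu_scene_builder_fill_path(builder, PgpuFill::NonZero, nullptr, nullptr, &iter);
    pgpu_scene_builder_finish(builder);  // consumes the builder

    uint32_t id = pgpu_renderer_render(renderer, scene, target, cmdbuf);
    // ...commit the command buffer and wait for it to retire, then:
    pgpu_renderer_release(renderer, id);
    pgpu_scene_destroy(scene);
    pgpu_renderer_destroy(renderer);
}
```

The pull-style `PgpuPathIter` lets the C side hand over path elements one at a time; the Rust implementation in src/lib.rs below simply collects them into a `Vec` before encoding the fill.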
@ -1,464 +0,0 @@
|
|||
// Copyright 2022 The piet-gpu authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Also licensed under MIT license, at your choice.
|
||||
|
||||
// We only really have implementations for IOS targets so far
|
||||
// Note that this is the same cfg that wgpu uses for metal support
|
||||
#![cfg_attr(
|
||||
not(all(
|
||||
not(target_arch = "wasm32"),
|
||||
any(target_os = "ios", target_os = "macos")
|
||||
)),
|
||||
allow(unused)
|
||||
)]
|
||||
|
||||
mod render;
|
||||
|
||||
use piet_scene::kurbo::{Affine, PathEl, Point};
|
||||
use piet_scene::{Brush, Color, Fill};
|
||||
use render::*;
|
||||
use std::ffi::c_void;
|
||||
use std::mem::transmute;
|
||||
|
||||
/// Creates a new piet-gpu renderer for the specified Metal device and
|
||||
/// command queue.
|
||||
///
|
||||
/// device: MTLDevice*
|
||||
/// queue: MTLCommandQueue*
|
||||
#[no_mangle]
|
||||
#[cfg(all(
|
||||
not(target_arch = "wasm32"),
|
||||
any(target_os = "ios", target_os = "macos")
|
||||
))]
|
||||
pub unsafe extern "C" fn pgpu_renderer_new(
|
||||
device: *mut c_void,
|
||||
queue: *mut c_void,
|
||||
) -> *mut PgpuRenderer {
|
||||
let device: &metal::DeviceRef = transmute(device);
|
||||
let queue: &metal::CommandQueueRef = transmute(queue);
|
||||
Box::into_raw(Box::new(PgpuRenderer::new(device, queue)))
|
||||
}
|
||||
|
||||
/// Renders a prepared scene into a texture target. Commands for rendering are
|
||||
/// recorded into the specified command buffer. Returns an id representing
|
||||
/// resources that may have been allocated during this process. After the
|
||||
/// command buffer has been retired, call `pgpu_renderer_release` with this id
|
||||
/// to drop any associated resources.
|
||||
///
|
||||
/// target: MTLTexture*
|
||||
/// cmdbuf: MTLCommandBuffer*
|
||||
#[no_mangle]
|
||||
#[cfg(all(
|
||||
not(target_arch = "wasm32"),
|
||||
any(target_os = "ios", target_os = "macos")
|
||||
))]
|
||||
pub unsafe extern "C" fn pgpu_renderer_render(
|
||||
renderer: *mut PgpuRenderer,
|
||||
scene: *const PgpuScene,
|
||||
target: *mut c_void,
|
||||
cmdbuf: *mut c_void,
|
||||
) -> u32 {
|
||||
let cmdbuf: &metal::CommandBufferRef = transmute(cmdbuf);
|
||||
let target: &metal::TextureRef = transmute(target);
|
||||
(*renderer).render(&*scene, cmdbuf, target)
|
||||
}
|
||||
|
||||
/// Releases the internal resources associated with the specified id from a
|
||||
/// previous render operation.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_renderer_release(renderer: *mut PgpuRenderer, id: u32) {
|
||||
(*renderer).release(id);
|
||||
}
|
||||
|
||||
/// Destroys the piet-gpu renderer.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_renderer_destroy(renderer: *mut PgpuRenderer) {
|
||||
Box::from_raw(renderer);
|
||||
}
|
||||
|
||||
/// Creates a new, empty piet-gpu scene.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_new() -> *mut PgpuScene {
|
||||
Box::into_raw(Box::new(PgpuScene::new()))
|
||||
}
|
||||
|
||||
/// Destroys the piet-gpu scene.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_destroy(scene: *mut PgpuScene) {
|
||||
Box::from_raw(scene);
|
||||
}
|
||||
|
||||
/// Creates a new, empty piet-gpu scene fragment.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_fragment_new() -> *mut PgpuSceneFragment {
|
||||
Box::into_raw(Box::new(PgpuSceneFragment::new()))
|
||||
}
|
||||
|
||||
/// Destroys the piet-gpu scene fragment.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_fragment_destroy(fragment: *mut PgpuSceneFragment) {
|
||||
Box::from_raw(fragment);
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Debug)]
|
||||
#[repr(C)]
|
||||
pub enum PgpuPathVerb {
|
||||
MoveTo = 0,
|
||||
LineTo = 1,
|
||||
QuadTo = 2,
|
||||
CurveTo = 3,
|
||||
Close = 4,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Default, Debug)]
|
||||
#[repr(C)]
|
||||
pub struct PgpuPoint {
|
||||
pub x: f32,
|
||||
pub y: f32,
|
||||
}
|
||||
|
||||
/// Rectangle defined by minimum and maximum points.
|
||||
#[derive(Copy, Clone, Default)]
|
||||
#[repr(C)]
|
||||
pub struct PgpuRect {
|
||||
pub x0: f32,
|
||||
pub y0: f32,
|
||||
pub x1: f32,
|
||||
pub y1: f32,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
#[repr(C)]
|
||||
pub struct PgpuPathElement {
|
||||
pub verb: PgpuPathVerb,
|
||||
pub points: [PgpuPoint; 3],
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
#[repr(C)]
|
||||
pub struct PgpuPathIter {
|
||||
pub context: *mut c_void,
|
||||
pub next_element: extern "C" fn(*mut c_void, *mut PgpuPathElement) -> bool,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Debug)]
|
||||
#[repr(C)]
|
||||
pub enum PgpuFill {
|
||||
NonZero = 0,
|
||||
EvenOdd = 1,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Debug)]
|
||||
#[repr(C)]
|
||||
pub enum PgpuBrushKind {
|
||||
Solid = 0,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
#[repr(C)]
|
||||
pub struct PgpuColor {
|
||||
pub r: u8,
|
||||
pub g: u8,
|
||||
pub b: u8,
|
||||
pub a: u8,
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
pub union PgpuBrushData {
|
||||
pub solid: PgpuColor,
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
pub struct PgpuBrush {
|
||||
pub kind: PgpuBrushKind,
|
||||
pub data: PgpuBrushData,
|
||||
}
|
||||
|
||||
/// Affine transformation matrix.
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
#[repr(C)]
|
||||
pub struct PgpuTransform {
|
||||
pub xx: f32,
|
||||
pub yx: f32,
|
||||
pub xy: f32,
|
||||
pub yy: f32,
|
||||
pub dx: f32,
|
||||
pub dy: f32,
|
||||
}
|
||||
|
||||
impl From<PgpuTransform> for Affine {
|
||||
fn from(xform: PgpuTransform) -> Self {
|
||||
Affine::new([
|
||||
xform.xx as f64,
|
||||
xform.yx as f64,
|
||||
xform.xy as f64,
|
||||
xform.yy as f64,
|
||||
xform.dx as f64,
|
||||
xform.dy as f64,
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new builder for filling a piet-gpu scene. The specified scene
|
||||
/// should not be accessed while the builder is live.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_builder_for_scene(
|
||||
scene: *mut PgpuScene,
|
||||
) -> *mut PgpuSceneBuilder<'static> {
|
||||
Box::into_raw(Box::new((*scene).builder()))
|
||||
}
|
||||
|
||||
/// Creates a new builder for filling a piet-gpu scene fragment. The specified
|
||||
/// scene fragment should not be accessed while the builder is live.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_builder_for_fragment(
|
||||
fragment: *mut PgpuSceneFragment,
|
||||
) -> *mut PgpuSceneBuilder<'static> {
|
||||
Box::into_raw(Box::new((*fragment).builder()))
|
||||
}
|
||||
|
||||
/// Adds a glyph with the specified transform to the underlying scene.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_builder_add_glyph(
|
||||
builder: *mut PgpuSceneBuilder<'static>,
|
||||
glyph: *const PgpuGlyph,
|
||||
transform: *const PgpuTransform,
|
||||
) {
|
||||
(*builder).add_glyph(&*glyph, &(*transform).into());
|
||||
}
|
||||
|
||||
impl Iterator for PgpuPathIter {
|
||||
type Item = PathEl;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut el = PgpuPathElement {
|
||||
verb: PgpuPathVerb::MoveTo,
|
||||
points: [PgpuPoint::default(); 3],
|
||||
};
|
||||
fn conv_pt(pt: PgpuPoint) -> Point {
|
||||
Point::new(pt.x as f64, pt.y as f64)
|
||||
}
|
||||
if (self.next_element)(self.context, &mut el as _) {
|
||||
let p = &el.points;
|
||||
Some(match el.verb {
|
||||
PgpuPathVerb::MoveTo => PathEl::MoveTo(conv_pt(p[0])),
|
||||
PgpuPathVerb::LineTo => PathEl::LineTo(conv_pt(p[0])),
|
||||
PgpuPathVerb::QuadTo => PathEl::QuadTo(conv_pt(p[0]), conv_pt(p[1])),
|
||||
PgpuPathVerb::CurveTo => {
|
||||
PathEl::CurveTo(conv_pt(p[0]), conv_pt(p[1]), conv_pt(p[2]))
|
||||
}
|
||||
PgpuPathVerb::Close => PathEl::ClosePath,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the current absolute transform for the scene builder.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_builder_transform(
|
||||
builder: *mut PgpuSceneBuilder<'static>,
|
||||
transform: *const PgpuTransform,
|
||||
) {
|
||||
if let Some(transform) = transform.as_ref() {
|
||||
(*builder).transform = (*transform).into();
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills a path using the specified fill style and brush. If the brush
|
||||
/// parameter is nullptr, a solid color white brush will be used. The
|
||||
/// brush_transform may be nullptr.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn pgpu_scene_builder_fill_path(
|
||||
builder: *mut PgpuSceneBuilder<'static>,
|
||||
fill: PgpuFill,
|
||||
brush: *const PgpuBrush,
|
||||
brush_transform: *const PgpuTransform,
|
||||
path: *mut PgpuPathIter,
|
||||
) {
|
||||
let fill = match fill {
|
||||
PgpuFill::NonZero => Fill::NonZero,
|
||||
PgpuFill::EvenOdd => Fill::EvenOdd,
|
||||
};
|
||||
let brush = if brush.is_null() {
|
||||
Brush::Solid(Color::rgb8(255, 255, 255))
|
||||
} else {
|
||||
match (*brush).kind {
|
||||
PgpuBrushKind::Solid => {
|
||||
let color = &(*brush).data.solid;
|
||||
Brush::Solid(Color::rgba8(color.r, color.g, color.b, color.a))
|
||||
}
|
||||
}
|
||||
};
|
||||
let brush_transform = if brush_transform.is_null() {
|
||||
None
|
||||
} else {
|
||||
Some((*brush_transform).into())
|
||||
};
|
||||
let path_els = (*path).collect::<Vec<_>>();
|
||||
(*builder).builder.fill(
|
||||
fill,
|
||||
(*builder).transform,
|
||||
&brush,
|
||||
brush_transform,
|
||||
&&path_els[..],
|
||||
);
|
||||
}
|
||||
|
||||
/// Appends a scene fragment to the underlying scene or fragment. The
|
||||
/// transform parameter represents an absolute transform to apply to
|
||||
/// the fragment. If it is nullptr, the fragment will be appended to
|
||||
/// the scene with an assumed identity transform regardless of the
|
||||
/// current transform state.
|
||||
#[no_mangle]
pub unsafe extern "C" fn pgpu_scene_builder_append_fragment(
    builder: *mut PgpuSceneBuilder<'static>,
    fragment: *const PgpuSceneFragment,
    transform: *const PgpuTransform,
) {
    let transform = if transform.is_null() {
        None
    } else {
        Some((*transform).into())
    };
    (*builder).builder.append(&(*fragment).0, transform);
}

/// Finalizes the scene builder, making the underlying scene ready for
/// rendering. This takes ownership and consumes the builder.
#[no_mangle]
pub unsafe extern "C" fn pgpu_scene_builder_finish(builder: *mut PgpuSceneBuilder<'static>) {
    let builder = Box::from_raw(builder);
    builder.finish();
}

/// Creates a new context for loading glyph outlines.
#[no_mangle]
pub unsafe extern "C" fn pgpu_glyph_context_new() -> *mut PgpuGlyphContext {
    Box::into_raw(Box::new(PgpuGlyphContext::new()))
}

/// Destroys the glyph context.
#[no_mangle]
pub unsafe extern "C" fn pgpu_glyph_context_destroy(gcx: *mut PgpuGlyphContext) {
    Box::from_raw(gcx);
}

/// Description of a font.
#[derive(Copy, Clone)]
#[repr(C)]
pub struct PgpuFontDesc {
    /// Pointer to the content of the font file.
    data: *const u8,
    /// Size of the font file data in bytes.
    data_len: usize,
    /// Index of the requested font in the font file.
    index: u32,
    /// Unique identifier for the font.
    unique_id: u64,
    /// Requested size in pixels per em unit. Set to 0.0 for
    /// unscaled outlines.
    ppem: f32,
    /// Pointer to array of font variation settings.
    variations: *const PgpuFontVariation,
    /// Number of font variation settings.
    variations_len: usize,
}

/// Creates a new glyph provider for the specified glyph context and font
/// descriptor. May return nullptr if the font data is invalid. Only one glyph
/// provider may be live for a glyph context.
#[no_mangle]
pub unsafe extern "C" fn pgpu_glyph_provider_new(
    gcx: *mut PgpuGlyphContext,
    font: *const PgpuFontDesc,
) -> *mut PgpuGlyphProvider<'static> {
    let font = &*font;
    let font_data = std::slice::from_raw_parts(font.data, font.data_len);
    let variations = std::slice::from_raw_parts(font.variations, font.variations_len);
    if let Some(provider) = (*gcx).new_provider(
        font_data,
        font.index,
        font.unique_id,
        font.ppem,
        false,
        variations,
    ) {
        Box::into_raw(Box::new(provider))
    } else {
        std::ptr::null_mut()
    }
}

/// Returns an encoded outline for the specified glyph provider and glyph id.
/// May return nullptr if the requested glyph is not available.
#[no_mangle]
pub unsafe extern "C" fn pgpu_glyph_provider_get(
    provider: *mut PgpuGlyphProvider,
    gid: u16,
) -> *mut PgpuGlyph {
    if let Some(glyph) = (*provider).get(gid) {
        Box::into_raw(Box::new(glyph))
    } else {
        std::ptr::null_mut()
    }
}

/// Returns an encoded color outline for the specified glyph provider, color
/// palette index and glyph id. May return nullptr if the requested glyph is
/// not available.
#[no_mangle]
pub unsafe extern "C" fn pgpu_glyph_provider_get_color(
    provider: *mut PgpuGlyphProvider,
    palette_index: u16,
    gid: u16,
) -> *mut PgpuGlyph {
    if let Some(glyph) = (*provider).get_color(palette_index, gid) {
        Box::into_raw(Box::new(glyph))
    } else {
        std::ptr::null_mut()
    }
}

/// Destroys the glyph provider.
#[no_mangle]
pub unsafe extern "C" fn pgpu_glyph_provider_destroy(provider: *mut PgpuGlyphProvider) {
    Box::from_raw(provider);
}

/// Computes the bounding box for the glyph after applying the specified
/// transform.
#[no_mangle]
pub unsafe extern "C" fn pgpu_glyph_bbox(
    glyph: *const PgpuGlyph,
    transform: &[f32; 6],
) -> PgpuRect {
    let transform: PgpuTransform = std::mem::transmute(*transform);
    let transform = transform.into();
    let rect = (*glyph).bbox(Some(transform));
    PgpuRect {
        x0: rect.min_x() as f32,
        y0: rect.min_y() as f32,
        x1: rect.max_x() as f32,
        y1: rect.max_y() as f32,
    }
}

/// Destroys the glyph.
#[no_mangle]
pub unsafe extern "C" fn pgpu_glyph_destroy(glyph: *mut PgpuGlyph) {
    Box::from_raw(glyph);
}
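Taken together, the glyph entry points above imply a create/use/destroy sequence. The following is a minimal sketch of that sequence, written in Rust for consistency with the rest of the file (illustrative only, not part of the removed file; `font_bytes` and `gid` are placeholders):

// Sketch only: assumes the FFI functions above are in scope and that
// `font_bytes` points at a valid font file.
unsafe fn outline_bbox_sketch(font_bytes: &[u8], gid: u16) -> Option<PgpuRect> {
    let gcx = pgpu_glyph_context_new();
    let desc = PgpuFontDesc {
        data: font_bytes.as_ptr(),
        data_len: font_bytes.len(),
        index: 0,
        unique_id: 0,
        ppem: 16.0,
        variations: std::ptr::null(),
        variations_len: 0,
    };
    // May be null if the font data is invalid, per the doc comment above.
    let provider = pgpu_glyph_provider_new(gcx, &desc);
    if provider.is_null() {
        pgpu_glyph_context_destroy(gcx);
        return None;
    }
    let glyph = pgpu_glyph_provider_get(provider, gid);
    let result = if glyph.is_null() {
        None
    } else {
        // Identity transform: bbox in the glyph's own coordinates.
        let identity: [f32; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
        let rect = pgpu_glyph_bbox(glyph, &identity);
        pgpu_glyph_destroy(glyph);
        Some(rect)
    };
    pgpu_glyph_provider_destroy(provider);
    pgpu_glyph_context_destroy(gcx);
    result
}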
@ -1,239 +0,0 @@
// Copyright 2022 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

use piet_gpu::{PixelFormat, RenderConfig};
use piet_gpu_hal::{QueryPool, Session};
use piet_scene::glyph::pinot::{types::Tag, FontDataRef};
use piet_scene::glyph::{GlyphContext, GlyphProvider};
use piet_scene::kurbo::{Affine, Point, Rect};
use piet_scene::{Scene, SceneFragment};

/// State and resources for rendering a scene.
pub struct PgpuRenderer {
    session: Session,
    pgpu_renderer: Option<piet_gpu::Renderer>,
    query_pool: QueryPool,
    width: u32,
    height: u32,
    is_color: bool,
}

impl PgpuRenderer {
    #[cfg(all(
        not(target_arch = "wasm32"),
        any(target_os = "ios", target_os = "macos")
    ))]
    pub fn new(device: &metal::DeviceRef, queue: &metal::CommandQueueRef) -> Self {
        let piet_device = piet_gpu_hal::Device::new_from_raw_mtl(device, queue);
        let session = Session::new(piet_device);
        let query_pool = session.create_query_pool(12).unwrap();
        Self {
            session,
            pgpu_renderer: None,
            query_pool,
            width: 0,
            height: 0,
            is_color: false,
        }
    }

    #[cfg(all(
        not(target_arch = "wasm32"),
        any(target_os = "ios", target_os = "macos")
    ))]
    pub fn render(
        &mut self,
        scene: &PgpuScene,
        cmdbuf: &metal::CommandBufferRef,
        target: &metal::TextureRef,
    ) -> u32 {
        let is_color = target.pixel_format() != metal::MTLPixelFormat::R8Unorm;
        let width = target.width() as u32;
        let height = target.height() as u32;
        if self.pgpu_renderer.is_none()
            || self.width != width
            || self.height != height
            || self.is_color != is_color
        {
            self.width = width;
            self.height = height;
            self.is_color = is_color;
            let format = if is_color {
                PixelFormat::Rgba8
            } else {
                PixelFormat::A8
            };
            let config = RenderConfig::new(width as usize, height as usize).pixel_format(format);
            unsafe {
                self.pgpu_renderer =
                    piet_gpu::Renderer::new_from_config(&self.session, config, 1).ok();
            }
        }
        unsafe {
            let mut cmd_buf = self.session.cmd_buf_from_raw_mtl(cmdbuf);
            let dst_image = self
                .session
                .image_from_raw_mtl(target, self.width, self.height);
            if let Some(renderer) = &mut self.pgpu_renderer {
                renderer.upload_scene(&scene.0, 0).unwrap();
                renderer.record(&mut cmd_buf, &self.query_pool, 0);
                // TODO later: we can bind the destination image and avoid the copy.
                cmd_buf.blit_image(&renderer.image_dev, &dst_image);
                cmd_buf.flush();
            }
        }
        0
    }

    pub fn release(&mut self, _id: u32) {
        // TODO: worry about freeing resources / managing overlapping submits
    }
}

/// Encoded streams and resources describing a vector graphics scene.
pub struct PgpuScene(pub Scene);

impl PgpuScene {
    pub fn new() -> Self {
        Self(Scene::default())
    }

    pub fn builder(&mut self) -> PgpuSceneBuilder {
        PgpuSceneBuilder {
            builder: piet_scene::SceneBuilder::for_scene(&mut self.0),
            transform: Affine::IDENTITY,
        }
    }
}

/// Encoded streams and resources describing a vector graphics scene fragment.
pub struct PgpuSceneFragment(pub SceneFragment);

impl PgpuSceneFragment {
    pub fn new() -> Self {
        Self(SceneFragment::default())
    }

    pub fn builder(&mut self) -> PgpuSceneBuilder {
        PgpuSceneBuilder {
            builder: piet_scene::SceneBuilder::for_fragment(&mut self.0),
            transform: Affine::IDENTITY,
        }
    }
}

/// Builder for constructing an encoded scene.
pub struct PgpuSceneBuilder<'a> {
    pub builder: piet_scene::SceneBuilder<'a>,
    pub transform: Affine,
}

impl<'a> PgpuSceneBuilder<'a> {
    pub fn add_glyph(&mut self, glyph: &PgpuGlyph, transform: &Affine) {
        self.builder.append(&glyph.fragment, Some(*transform));
    }

    pub fn finish(self) {
        self.builder.finish();
    }
}

/// Tag and value for a font variation axis.
#[derive(Copy, Clone)]
#[repr(C)]
pub struct PgpuFontVariation {
    /// Tag that specifies the axis.
    pub tag: u32,
    /// Requested setting for the axis.
    pub value: f32,
}

/// Context for loading and scaling glyphs.
pub struct PgpuGlyphContext(GlyphContext);

impl PgpuGlyphContext {
    pub fn new() -> Self {
        Self(GlyphContext::new())
    }

    pub fn new_provider<'a>(
        &'a mut self,
        font_data: &'a [u8],
        font_index: u32,
        font_id: u64,
        ppem: f32,
        hint: bool,
        variations: &[PgpuFontVariation],
    ) -> Option<PgpuGlyphProvider> {
        let font = FontDataRef::new(font_data).and_then(|f| f.get(font_index))?;
        Some(PgpuGlyphProvider(
            self.0.new_provider(
                &font,
                Some(font_id),
                ppem,
                hint,
                variations
                    .iter()
                    .map(|variation| (Tag(variation.tag), variation.value)),
            ),
        ))
    }
}

/// Context for loading and scaling glyphs from a specific font.
pub struct PgpuGlyphProvider<'a>(GlyphProvider<'a>);

impl<'a> PgpuGlyphProvider<'a> {
    pub fn get(&mut self, gid: u16) -> Option<PgpuGlyph> {
        let fragment = self.0.get(gid, None)?;
        Some(PgpuGlyph { fragment })
    }

    pub fn get_color(&mut self, palette_index: u16, gid: u16) -> Option<PgpuGlyph> {
        let fragment = self.0.get_color(palette_index, gid)?;
        Some(PgpuGlyph { fragment })
    }
}

/// Encoded (possibly color) outline for a glyph.
pub struct PgpuGlyph {
    fragment: SceneFragment,
}

impl PgpuGlyph {
    pub fn bbox(&self, transform: Option<Affine>) -> Rect {
        let points = self.fragment.points();
        if points.is_empty() {
            return Rect::default();
        }
        let mut points = points
            .iter()
            .map(|pt| Point::new(pt[0] as f64, pt[1] as f64));
        if let Some(transform) = &transform {
            // Note: the transform applies to the seed point as well as the rest.
            let mut rect =
                Rect::from_center_size(*transform * points.next().unwrap(), (0.0, 0.0));
            for point in points {
                rect = rect.union_pt(*transform * point);
            }
            rect
        } else {
            let mut rect = Rect::from_center_size(points.next().unwrap(), (0.0, 0.0));
            for point in points {
                rect = rect.union_pt(point);
            }
            rect
        }
    }
}
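Because `render` reconfigures its internal renderer whenever the target changes, a host can call it unconditionally every frame. A per-frame driver sketch (illustrative, not part of the removed file; assumes the macOS/iOS `cfg` above and host-provided Metal objects):

// Sketch, not part of the removed file: one frame of the expected usage.
fn frame(
    renderer: &mut PgpuRenderer,
    scene: &PgpuScene,
    cmdbuf: &metal::CommandBufferRef,
    target: &metal::TextureRef,
) {
    // render() lazily (re)creates the inner piet_gpu::Renderer whenever the
    // target size or pixel format changes, so calling it every frame is fine.
    let id = renderer.render(scene, cmdbuf, target);
    // Resource reclamation is a TODO in release(); the id is threaded through
    // so a future implementation can track overlapping submissions.
    renderer.release(id);
}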
@ -1,17 +0,0 @@
[package]
name = "piet-gpu-derive"
version = "0.0.0"
authors = ["Raph Levien <raph.levien@gmail.com>"]
description = "Proc macro derives for piet-gpu."
license = "MIT/Apache-2.0"
edition = "2018"
keywords = ["graphics", "2d"]
categories = ["rendering::graphics-api"]

[lib]
proc-macro = true

[dependencies]
syn = { version = "1.0.17", features = ["extra-traits", "full"] }
quote = "1.0.3"
proc-macro2 = "1.0.10"
@ -1,192 +0,0 @@
//! Generation of Rust derive functions for encoding.

use quote::{format_ident, quote};

use crate::layout::{LayoutModule, LayoutTypeDef};
use crate::parse::{GpuScalar, GpuType};

pub fn gen_derive(module: &LayoutModule) -> proc_macro2::TokenStream {
    let mut ts = proc_macro2::TokenStream::new();
    let module_name = format_ident!("{}", module.name);
    for name in &module.def_names {
        let def = module.defs.get(name).unwrap();
        ts.extend(gen_derive_def(name, def.0.size, &def.1));
    }
    quote! {
        mod #module_name {
            pub trait HalfToLeBytes {
                fn to_le_bytes(&self) -> [u8; 2];
            }

            impl HalfToLeBytes for half::f16 {
                fn to_le_bytes(&self) -> [u8; 2] {
                    self.to_bits().to_le_bytes()
                }
            }

            #ts
        }
    }
}

fn gen_derive_def(name: &str, size: usize, def: &LayoutTypeDef) -> proc_macro2::TokenStream {
    let name_id = format_ident!("{}", name);
    match def {
        LayoutTypeDef::Struct(fields) => {
            let mut gen_fields = proc_macro2::TokenStream::new();
            let mut encode_fields = proc_macro2::TokenStream::new();
            for (field_name, offset, ty) in fields {
                let field_name_id = format_ident!("{}", field_name);
                let gen_ty = gen_derive_ty(&ty.ty);
                let gen_field = quote! {
                    pub #field_name_id: #gen_ty,
                };
                gen_fields.extend(gen_field);

                encode_fields.extend(gen_encode_field(field_name, *offset, &ty.ty));
            }
            quote! {
                #[derive(Clone)]
                pub struct #name_id {
                    #gen_fields
                }

                impl crate::encoder::Encode for #name_id {
                    fn fixed_size() -> usize {
                        #size
                    }
                    fn encode_to(&self, buf: &mut [u8]) {
                        #encode_fields
                    }
                }
            }
        }
        LayoutTypeDef::Enum(variants) => {
            let mut gen_variants = proc_macro2::TokenStream::new();
            let mut cases = proc_macro2::TokenStream::new();
            for (variant_ix, (variant_name, payload)) in variants.iter().enumerate() {
                let variant_id = format_ident!("{}", variant_name);
                let field_tys = payload.iter().map(|(_offset, ty)| gen_derive_ty(&ty.ty));
                let variant = quote! {
                    #variant_id(#(#field_tys),*),
                };
                gen_variants.extend(variant);

                let mut args = Vec::new();
                let mut field_encoders = proc_macro2::TokenStream::new();
                let mut tag_field = None;
                for (i, (offset, ty)) in payload.iter().enumerate() {
                    let field_id = format_ident!("f{}", i);
                    if matches!(ty.ty, GpuType::Scalar(GpuScalar::TagFlags)) {
                        tag_field = Some(field_id.clone());
                    } else {
                        let field_encoder = quote! {
                            #field_id.encode_to(&mut buf[#offset..]);
                        };
                        field_encoders.extend(field_encoder);
                    }
                    args.push(field_id);
                }
                let tag = variant_ix as u32;
                let tag_encode = match tag_field {
                    None => quote! {
                        buf[0..4].copy_from_slice(&#tag.to_le_bytes());
                    },
                    Some(tag_field) => quote! {
                        buf[0..4].copy_from_slice(&(#tag | ((*#tag_field as u32) << 16)).to_le_bytes());
                    },
                };
                let case = quote! {
                    #name_id::#variant_id(#(#args),*) => {
                        #tag_encode
                        #field_encoders
                    }
                };
                cases.extend(case);
            }
            quote! {
                #[derive(Clone)]
                pub enum #name_id {
                    #gen_variants
                }

                impl crate::encoder::Encode for #name_id {
                    fn fixed_size() -> usize {
                        #size
                    }
                    fn encode_to(&self, buf: &mut [u8]) {
                        match self {
                            #cases
                        }
                    }
                }
            }
        }
    }
}

/// Generate a Rust type.
fn gen_derive_ty(ty: &GpuType) -> proc_macro2::TokenStream {
    match ty {
        GpuType::Scalar(s) => gen_derive_scalar_ty(s),
        GpuType::Vector(s, len) => {
            let scalar = gen_derive_scalar_ty(s);
            quote! { [#scalar; #len] }
        }
        GpuType::InlineStruct(name) => {
            let name_id = format_ident!("{}", name);
            quote! { #name_id }
        }
        GpuType::Ref(ty) => {
            let gen_ty = gen_derive_ty(ty);
            quote! { crate::encoder::Ref<#gen_ty> }
        }
    }
}

fn gen_derive_scalar_ty(ty: &GpuScalar) -> proc_macro2::TokenStream {
    match ty {
        GpuScalar::F16 => quote!(half::f16),
        GpuScalar::F32 => quote!(f32),
        GpuScalar::I8 => quote!(i8),
        GpuScalar::I16 => quote!(i16),
        GpuScalar::I32 => quote!(i32),
        GpuScalar::U8 => quote!(u8),
        GpuScalar::U16 => quote!(u16),
        GpuScalar::U32 => quote!(u32),
        GpuScalar::TagFlags => quote!(u16),
    }
}

fn gen_encode_field(name: &str, offset: usize, ty: &GpuType) -> proc_macro2::TokenStream {
    let name_id = format_ident!("{}", name);
    match ty {
        // encoding of flags into tag word is handled elsewhere
        GpuType::Scalar(GpuScalar::TagFlags) => quote! {},
        GpuType::Scalar(s) => {
            let end = offset + s.size();
            quote! {
                buf[#offset..#end].copy_from_slice(&self.#name_id.to_le_bytes());
            }
        }
        GpuType::Vector(s, len) => {
            let size = s.size();
            quote! {
                for i in 0..#len {
                    let offset = #offset + i * #size;
                    buf[offset..offset + #size].copy_from_slice(&self.#name_id[i].to_le_bytes());
                }
            }
        }
        GpuType::Ref(_) => {
            quote! {
                buf[#offset..#offset + 4].copy_from_slice(&self.#name_id.offset().to_le_bytes());
            }
        }
        _ => {
            quote! {
                self.#name_id.encode_to(&mut buf[#offset..]);
            }
        }
    }
}
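To make the expansion concrete, here is roughly what the templates above generate for a one-field struct (an illustrative sketch, not emitted verbatim; assumes a definition `Foo { bar: u32 }` of size 4, where `Foo` is a placeholder name):

// Approximate shape of the generated output:
#[derive(Clone)]
pub struct Foo {
    pub bar: u32,
}

impl crate::encoder::Encode for Foo {
    fn fixed_size() -> usize {
        4
    }
    fn encode_to(&self, buf: &mut [u8]) {
        // Scalar field at offset 0, 4 bytes wide, little-endian.
        buf[0..4].copy_from_slice(&self.bar.to_le_bytes());
    }
}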
@ -1,669 +0,0 @@
//! Generation of GLSL struct definitions and accessor functions.

use std::fmt::Write;
use std::ops::Deref;

use crate::layout::{LayoutModule, LayoutType, LayoutTypeDef};
use crate::parse::{GpuScalar, GpuType};

pub fn gen_glsl(module: &LayoutModule) -> String {
    let mut r = String::new();
    writeln!(
        &mut r,
        "// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense\n"
    )
    .unwrap();
    writeln!(&mut r, "// Code auto-generated by piet-gpu-derive\n").unwrap();
    // Note: GLSL needs definitions before uses. We could do a topological sort here,
    // but easiest for now to just require that in the spec.
    for name in &module.def_names {
        gen_refdef(&mut r, &name);
    }

    for name in &module.def_names {
        match module.defs.get(name).unwrap() {
            (size, LayoutTypeDef::Struct(fields)) => {
                gen_struct_def(&mut r, name, fields);
                gen_item_def(&mut r, name, size.size);
            }
            (size, LayoutTypeDef::Enum(en)) => {
                gen_enum_def(&mut r, name, en);
                gen_item_def(&mut r, name, size.size);
                gen_tag_def(&mut r, name);
            }
        }
    }

    for name in &module.def_names {
        let def = module.defs.get(name).unwrap();
        let is_mem = !module.name.eq(&"state") && !module.name.eq(&"scene");
        match def {
            (_size, LayoutTypeDef::Struct(fields)) => {
                gen_struct_read(&mut r, &module.name, &name, is_mem, fields);
                if module.gpu_write {
                    gen_struct_write(&mut r, &module.name, &name, is_mem, fields);
                }
            }
            (_size, LayoutTypeDef::Enum(en)) => {
                gen_enum_read(&mut r, &module.name, &name, is_mem, en);
                if module.gpu_write {
                    gen_enum_write(&mut r, &module.name, &name, is_mem, en);
                }
            }
        }
    }

    r
}

fn gen_refdef(r: &mut String, name: &str) {
    writeln!(r, "struct {}Ref {{", name).unwrap();
    writeln!(r, "    uint offset;").unwrap();
    writeln!(r, "}};\n").unwrap();
}

fn gen_struct_def(r: &mut String, name: &str, fields: &[(String, usize, LayoutType)]) {
    writeln!(r, "struct {} {{", name).unwrap();
    for (name, _offset, ty) in fields {
        writeln!(r, "    {} {};", glsl_type(&ty.ty), name).unwrap();
    }
    writeln!(r, "}};\n").unwrap();
}

fn gen_enum_def(r: &mut String, name: &str, variants: &[(String, Vec<(usize, LayoutType)>)]) {
    for (i, (var_name, _payload)) in variants.iter().enumerate() {
        writeln!(r, "#define {}_{} {}", name, var_name, i).unwrap();
    }
}

fn gen_item_def(r: &mut String, name: &str, size: usize) {
    writeln!(r, "#define {}_size {}\n", name, size).unwrap();
    writeln!(
        r,
        "{}Ref {}_index({}Ref ref, uint index) {{",
        name, name, name
    )
    .unwrap();
    writeln!(
        r,
        "    return {}Ref(ref.offset + index * {}_size);",
        name, name
    )
    .unwrap();
    writeln!(r, "}}\n").unwrap();
}

fn gen_tag_def(r: &mut String, name: &str) {
    writeln!(r, "struct {}Tag {{", name).unwrap();
    writeln!(r, "    uint tag;").unwrap();
    writeln!(r, "    uint flags;").unwrap();
    writeln!(r, "}};\n").unwrap();
}

fn gen_struct_read(
    r: &mut String,
    bufname: &str,
    name: &str,
    is_mem: bool,
    fields: &[(String, usize, LayoutType)],
) {
    write!(r, "{} {}_read(", name, name).unwrap();
    if is_mem {
        write!(r, "Alloc a, ").unwrap();
    }
    writeln!(r, "{}Ref ref) {{", name).unwrap();
    writeln!(r, "    uint ix = ref.offset >> 2;").unwrap();
    let coverage = crate::layout::struct_coverage(fields, false);
    for (i, fields) in coverage.iter().enumerate() {
        if !fields.is_empty() {
            if is_mem {
                writeln!(r, "    uint raw{} = read_mem(a, ix + {});", i, i).unwrap();
            } else {
                writeln!(r, "    uint raw{} = {}[ix + {}];", i, bufname, i).unwrap();
            }
        }
    }
    writeln!(r, "    {} s;", name).unwrap();

    let mut preload: bool = false;
    for (name, offset, ty) in fields {
        let (setup, extract) = gen_extract(*offset, &ty.ty, preload);
        writeln!(r, "{}    s.{} = {};", setup, name, extract).unwrap();

        if let GpuType::Scalar(GpuScalar::F16) = &ty.ty {
            if offset % 4 == 0 {
                preload = true;
                continue;
            }
        }
        preload = false;
    }

    writeln!(r, "    return s;").unwrap();
    writeln!(r, "}}\n").unwrap();
}

fn gen_enum_read(
    r: &mut String,
    bufname: &str,
    name: &str,
    is_mem: bool,
    variants: &[(String, Vec<(usize, LayoutType)>)],
) {
    if is_mem {
        writeln!(r, "{}Tag {}_tag(Alloc a, {}Ref ref) {{", name, name, name).unwrap();
        writeln!(r, "    uint tag_and_flags = read_mem(a, ref.offset >> 2);").unwrap();
    } else {
        writeln!(r, "{}Tag {}_tag({}Ref ref) {{", name, name, name).unwrap();
        writeln!(r, "    uint tag_and_flags = {}[ref.offset >> 2];", bufname).unwrap();
    }
    writeln!(
        r,
        "    return {}Tag(tag_and_flags & 0xffff, tag_and_flags >> 16);",
        name
    )
    .unwrap();
    writeln!(r, "}}\n").unwrap();
    for (var_name, payload) in variants {
        let payload_ix = if payload.len() == 1 {
            Some(0)
        } else if payload.len() == 2 {
            if matches!(payload[0].1.ty, GpuType::Scalar(GpuScalar::TagFlags)) {
                Some(1)
            } else {
                None
            }
        } else {
            None
        };
        if let Some(payload_ix) = payload_ix {
            if let GpuType::InlineStruct(structname) = &payload[payload_ix].1.ty {
                if is_mem {
                    writeln!(
                        r,
                        "{} {}_{}_read(Alloc a, {}Ref ref) {{",
                        structname, name, var_name, name
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    return {}_read(a, {}Ref(ref.offset + {}));",
                        structname, structname, payload[0].0
                    )
                    .unwrap();
                } else {
                    writeln!(
                        r,
                        "{} {}_{}_read({}Ref ref) {{",
                        structname, name, var_name, name
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    return {}_read({}Ref(ref.offset + {}));",
                        structname, structname, payload[0].0
                    )
                    .unwrap();
                }
                writeln!(r, "}}\n").unwrap();
            }
        }
        // TODO: support for variants that aren't one struct.
    }
}

fn gen_extract(offset: usize, ty: &GpuType, preload: bool) -> (String, String) {
    match ty {
        GpuType::Scalar(scalar) => {
            let setup = match scalar {
                GpuScalar::F16 => {
                    if preload {
                        String::new()
                    } else {
                        let ix = offset / 4;
                        format!("    vec2 halves{} = unpackHalf2x16(raw{});\n", ix, ix)
                    }
                }
                _ => String::new(),
            };

            (setup, gen_extract_scalar(offset, scalar))
        }
        GpuType::Vector(scalar, size) => {
            let is_f16 = match scalar {
                GpuScalar::F16 => true,
                _ => false,
            };

            let mut setup = String::new();
            let mut extract = glsl_type(ty);
            extract.push_str("(");
            for i in 0..*size {
                if i != 0 {
                    extract.push_str(", ");
                }

                if is_f16 && i % 2 == 0 {
                    let ix = (offset + i * scalar.size()) / 4;
                    let s = format!("    vec2 halves{} = unpackHalf2x16(raw{});\n", ix, ix);
                    setup.push_str(&s);
                };

                let el_offset = offset + i * scalar.size();
                extract.push_str(&gen_extract_scalar(el_offset, scalar));
            }
            extract.push_str(")");
            (setup, extract)
        }
        GpuType::InlineStruct(name) => (
            String::new(),
            format!(
                "{}_read({}Ref({}))",
                name,
                name,
                simplified_add("ref.offset", offset)
            ),
        ),
        GpuType::Ref(inner) => {
            if let GpuType::InlineStruct(name) = inner.deref() {
                (
                    String::new(),
                    format!(
                        "{}Ref({})",
                        name,
                        gen_extract_scalar(offset, &GpuScalar::U32)
                    ),
                )
            } else {
                panic!("only know how to deal with Ref of struct")
            }
        }
    }
}

fn gen_extract_scalar(offset: usize, ty: &GpuScalar) -> String {
    match ty {
        GpuScalar::F16 | GpuScalar::F32 => extract_fbits(offset, ty.size()),
        GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 => extract_ubits(offset, ty.size()),
        GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => extract_ibits(offset, ty.size()),
        GpuScalar::TagFlags => format!("0 /* TODO */"),
    }
}

fn extract_ubits(offset: usize, nbytes: usize) -> String {
    if nbytes == 4 {
        return format!("raw{}", offset / 4);
    }
    let mask = (1 << (nbytes * 8)) - 1;
    if offset % 4 == 0 {
        format!("raw{} & 0x{:x}", offset / 4, mask)
    } else if offset % 4 + nbytes == 4 {
        format!("raw{} >> {}", offset / 4, (offset % 4) * 8)
    } else {
        format!("(raw{} >> {}) & 0x{:x}", offset / 4, (offset % 4) * 8, mask)
    }
}
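// The masking arithmetic above is easiest to sanity-check with concrete
// offsets. The following test module is an illustrative addition (not part
// of the original file), assuming it compiles alongside this module:
#[cfg(test)]
mod extract_tests {
    use super::extract_ubits;

    #[test]
    fn ubits_examples() {
        // u16 aligned to a word boundary: mask only.
        assert_eq!(extract_ubits(0, 2), "raw0 & 0xffff");
        // u16 in the high half of a word: shift only.
        assert_eq!(extract_ubits(2, 2), "raw0 >> 16");
        // u8 in the middle of a word: shift and mask.
        assert_eq!(extract_ubits(1, 1), "(raw0 >> 8) & 0xff");
        // Full word: read the raw word directly.
        assert_eq!(extract_ubits(8, 4), "raw2");
    }
}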
fn extract_ibits(offset: usize, nbytes: usize) -> String {
    if nbytes == 4 {
        return format!("int(raw{})", offset / 4);
    }
    if offset % 4 + nbytes == 4 {
        format!("int(raw{}) >> {}", offset / 4, (offset % 4) * 8)
    } else {
        format!(
            "int(raw{} << {}) >> {}",
            offset / 4,
            ((4 - nbytes) - offset % 4) * 8,
            (4 - nbytes) * 8
        )
    }
}

fn extract_fbits(offset: usize, nbytes: usize) -> String {
    match nbytes {
        4 => format!("uintBitsToFloat(raw{})", offset / 4),
        2 => match offset % 4 {
            0 => {
                let ix = offset / 4;
                format!("halves{}.x", ix)
            }
            2 => format!("halves{}.y", offset / 4),
            _ => panic!("unexpected packing of f16 at offset {}", offset % 4),
        },
        _ => {
            panic!("unexpected extraction of float with nbytes = {}", nbytes);
        }
    }
}

// Writing

fn is_f16(ty: &GpuType) -> bool {
    match ty {
        GpuType::Scalar(GpuScalar::F16) => true,
        GpuType::Vector(GpuScalar::F16, _) => true,
        _ => false,
    }
}

fn is_f16_pair(field_ixs: &[usize], fields: &[(String, usize, LayoutType)]) -> bool {
    if field_ixs.len() == 2 {
        // Both fields covering this word must be f16 for pair packing.
        field_ixs.iter().all(|&ix| is_f16(&fields[ix].2.ty))
    } else {
        false
    }
}

fn gen_struct_write(
    r: &mut String,
    bufname: &str,
    name: &str,
    is_mem: bool,
    fields: &[(String, usize, LayoutType)],
) {
    write!(r, "void {}_write(", name).unwrap();
    if is_mem {
        write!(r, "Alloc a, ").unwrap();
    }
    writeln!(r, "{}Ref ref, {} s) {{", name, name).unwrap();
    writeln!(r, "    uint ix = ref.offset >> 2;").unwrap();
    let coverage = crate::layout::struct_coverage(fields, true);

    for (i, field_ixs) in coverage.iter().enumerate() {
        let mut pieces = Vec::new();

        if is_f16_pair(field_ixs, fields) {
            let (ix0, ix1) = (field_ixs[0], field_ixs[1]);
            let inner0 = format!("s.{}", fields[ix0].0);
            let inner1 = format!("s.{}", fields[ix1].0);
            pieces.push(format!("packHalf2x16(vec2({}, {}))", &inner0, &inner1));
        } else {
            for field_ix in field_ixs {
                let (name, offset, ty) = &fields[*field_ix];
                match &ty.ty {
                    GpuType::Scalar(scalar) => {
                        let inner = format!("s.{}", name);
                        pieces.push(gen_pack_bits_scalar(scalar, *offset, &inner));
                    }
                    GpuType::Vector(scalar, len) => {
                        let size = scalar.size();
                        let ix_lo = (i * 4 - offset) / size;
                        let ix_hi = ((4 + i * 4 - offset) / size).min(*len);
                        match scalar {
                            GpuScalar::F16 => {
                                if ix_hi - ix_lo == 2 {
                                    let inner0 =
                                        format!("s.{}.{}", name, &"xyzw"[ix_lo..ix_lo + 1]);
                                    let inner1 =
                                        format!("s.{}.{}", name, &"xyzw"[ix_lo + 1..ix_hi]);
                                    pieces.push(format!(
                                        "packHalf2x16(vec2({}, {}))",
                                        &inner0, &inner1
                                    ));
                                } else {
                                    let ix = ix_lo;
                                    let scalar_offset = offset + ix * size;
                                    let inner = format!("s.{}.{}", name, &"xyzw"[ix..ix + 1]);
                                    pieces.push(gen_pack_bits_scalar(
                                        scalar,
                                        scalar_offset,
                                        &inner,
                                    ));
                                }
                            }
                            _ => {
                                for ix in ix_lo..ix_hi {
                                    let scalar_offset = offset + ix * size;
                                    let inner = format!("s.{}.{}", name, &"xyzw"[ix..ix + 1]);
                                    pieces.push(gen_pack_bits_scalar(
                                        scalar,
                                        scalar_offset,
                                        &inner,
                                    ));
                                }
                            }
                        }
                    }
                    GpuType::InlineStruct(structname) => {
                        writeln!(
                            r,
                            "    {}_write({}Ref({}), s.{});",
                            structname,
                            structname,
                            simplified_add("ref.offset", *offset),
                            name
                        )
                        .unwrap();
                    }
                    GpuType::Ref(_) => pieces.push(format!("s.{}.offset", name)),
                }
            }
        }

        if !pieces.is_empty() {
            if is_mem {
                write!(r, "    write_mem(a, ix + {}, ", i).unwrap();
            } else {
                write!(r, "    {}[ix + {}] = ", bufname, i).unwrap();
            }
            for (j, piece) in pieces.iter().enumerate() {
                if j != 0 {
                    write!(r, " | ").unwrap();
                }
                write!(r, "{}", piece).unwrap();
            }
            if is_mem {
                write!(r, ")").unwrap();
            }
            writeln!(r, ";").unwrap();
        }
    }
    writeln!(r, "}}\n").unwrap();
}

fn gen_pack_bits_scalar(ty: &GpuScalar, offset: usize, inner: &str) -> String {
    let shift = (offset % 4) * 8;
    let bits = match ty {
        GpuScalar::F16 => format!("packHalf2x16(vec2({}, 0.0)) & 0xffff", inner),
        GpuScalar::F32 => format!("floatBitsToUint({})", inner),
        // Note: this doesn't mask small unsigned int types; the caller is
        // responsible for making sure they don't overflow.
        GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 => inner.into(),
        GpuScalar::I8 => {
            if shift == 24 {
                format!("uint({})", inner)
            } else {
                format!("(uint({}) & 0xff)", inner)
            }
        }
        GpuScalar::I16 => {
            if shift == 16 {
                format!("uint({})", inner)
            } else {
                format!("(uint({}) & 0xffff)", inner)
            }
        }
        GpuScalar::I32 => format!("uint({})", inner),
        GpuScalar::TagFlags => format!("0"),
    };
    if shift == 0 {
        bits
    } else {
        format!("({} << {})", bits, shift)
    }
}

fn gen_enum_write(
    r: &mut String,
    bufname: &str,
    name: &str,
    is_mem: bool,
    variants: &[(String, Vec<(usize, LayoutType)>)],
) {
    for (var_name, payload) in variants {
        if payload.is_empty() {
            if is_mem {
                writeln!(
                    r,
                    "void {}_{}_write(Alloc a, {}Ref ref) {{",
                    name, var_name, name
                )
                .unwrap();
                writeln!(
                    r,
                    "    write_mem(a, ref.offset >> 2, {}_{});",
                    name, var_name
                )
                .unwrap();
            } else {
                writeln!(r, "void {}_{}_write({}Ref ref) {{", name, var_name, name).unwrap();
                writeln!(
                    r,
                    "    {}[ref.offset >> 2] = {}_{};",
                    bufname, name, var_name
                )
                .unwrap();
            }
            writeln!(r, "}}\n").unwrap();
        } else if payload.len() == 1 {
            if let GpuType::InlineStruct(structname) = &payload[0].1.ty {
                if is_mem {
                    writeln!(
                        r,
                        "void {}_{}_write(Alloc a, {}Ref ref, {} s) {{",
                        name, var_name, name, structname
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    write_mem(a, ref.offset >> 2, {}_{});",
                        name, var_name
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    {}_write(a, {}Ref(ref.offset + {}), s);",
                        structname, structname, payload[0].0
                    )
                    .unwrap();
                } else {
                    writeln!(
                        r,
                        "void {}_{}_write({}Ref ref, {} s) {{",
                        name, var_name, name, structname
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    {}[ref.offset >> 2] = {}_{};",
                        bufname, name, var_name
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    {}_write({}Ref(ref.offset + {}), s);",
                        structname, structname, payload[0].0
                    )
                    .unwrap();
                }
                writeln!(r, "}}\n").unwrap();
            }
        } else if payload.len() == 2
            && matches!(payload[0].1.ty, GpuType::Scalar(GpuScalar::TagFlags))
        {
            if let GpuType::InlineStruct(structname) = &payload[1].1.ty {
                if is_mem {
                    writeln!(
                        r,
                        "void {}_{}_write(Alloc a, {}Ref ref, uint flags, {} s) {{",
                        name, var_name, name, structname
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    write_mem(a, ref.offset >> 2, (flags << 16) | {}_{});",
                        name, var_name
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    {}_write(a, {}Ref(ref.offset + {}), s);",
                        structname, structname, payload[0].0
                    )
                    .unwrap();
                } else {
                    writeln!(
                        r,
                        "void {}_{}_write({}Ref ref, uint flags, {} s) {{",
                        name, var_name, name, structname
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    {}[ref.offset >> 2] = (flags << 16) | {}_{};",
                        bufname, name, var_name
                    )
                    .unwrap();
                    writeln!(
                        r,
                        "    {}_write({}Ref(ref.offset + {}), s);",
                        structname, structname, payload[0].0
                    )
                    .unwrap();
                }
                writeln!(r, "}}\n").unwrap();
            }
        }
        // TODO: support for variants that aren't one struct.
    }
}

// Utility functions

fn glsl_type(ty: &GpuType) -> String {
    match ty {
        GpuType::Scalar(scalar) => glsl_scalar(scalar).into(),
        GpuType::Vector(scalar, size) => {
            if *size == 1 {
                glsl_scalar(scalar).into()
            } else {
                format!("{}{}", glsl_vecname(scalar), size)
            }
        }
        GpuType::InlineStruct(name) => name.clone(),
        GpuType::Ref(inner) => {
            if let GpuType::InlineStruct(name) = inner.deref() {
                format!("{}Ref", name)
            } else {
                panic!("only know how to deal with Ref of struct")
            }
        }
    }
}

// GLSL type that can contain the scalar value.
fn glsl_scalar(s: &GpuScalar) -> &'static str {
    match s {
        GpuScalar::F16 | GpuScalar::F32 => "float",
        GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => "int",
        GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 | GpuScalar::TagFlags => "uint",
    }
}

fn glsl_vecname(s: &GpuScalar) -> &'static str {
    match s {
        GpuScalar::F16 | GpuScalar::F32 => "vec",
        GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => "ivec",
        GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 | GpuScalar::TagFlags => "uvec",
    }
}

/// If `c = 0`, return `"var_name"`, else `"var_name + c"`.
fn simplified_add(var_name: &str, c: usize) -> String {
    if c == 0 {
        String::from(var_name)
    } else {
        format!("{} + {}", var_name, c)
    }
}
@ -1,244 +0,0 @@
//! Logic for layout of structures in memory.

// This is fairly simple now, but there are some extensions that are likely:
// * Addition of f16 types
//   + These will probably have 2-byte alignments to support `packHalf2x16`
// * 1 byte tag values (so small struct fields can be packed along with tag)
// * (Possibly) reordering for better packing

use std::collections::{HashMap, HashSet};

use crate::parse::{GpuModule, GpuType, GpuTypeDef};

#[derive(Clone)]
pub struct LayoutType {
    size: Size,
    pub ty: GpuType,
}

#[derive(Clone)]
pub enum LayoutTypeDef {
    /// Name, offset, field type. Make a separate struct?
    Struct(Vec<(String, usize, LayoutType)>),
    Enum(Vec<(String, Vec<(usize, LayoutType)>)>),
}

pub struct LayoutModule {
    pub name: String,
    pub def_names: Vec<String>,
    pub defs: HashMap<String, (Size, LayoutTypeDef)>,
    enum_variants: HashSet<String>,

    /// Generate shader code to write the module.
    ///
    /// This is derived from the presence of the `gpu_write` attribute in the source module.
    pub gpu_write: bool,
    /// Generate Rust code to encode the module.
    ///
    /// This is derived from the presence of the `rust_encode` attribute in the source module.
    pub rust_encode: bool,
}

struct LayoutSession<'a> {
    enum_variants: HashSet<String>,
    orig_defs: HashMap<String, &'a GpuTypeDef>,
    defs: HashMap<String, (Size, LayoutTypeDef)>,
}

#[derive(Clone, Copy)]
pub struct Size {
    pub size: usize,
    alignment: usize,
}

impl LayoutType {
    fn from_gpu(ty: &GpuType, session: &mut LayoutSession) -> LayoutType {
        let size = session.get_size(ty);
        LayoutType {
            size,
            ty: ty.clone(),
        }
    }
}

impl LayoutTypeDef {
    // Maybe have a type representing the tuple?
    fn from_gpu(def: &GpuTypeDef, session: &mut LayoutSession) -> (Size, LayoutTypeDef) {
        match def {
            GpuTypeDef::Struct(_name, fields) => {
                // TODO: We want to be able to pack enums more tightly, in particular
                // other struct fields along with the enum tag. Structs in that category
                // (first field has an alignment < 4, serve as enum variant) will have a
                // different layout. This is why we're tracking `is_enum_variant`.
                //
                // But it's a bit of YAGNI for now; we're currently reserving 4 bytes for
                // the tag, so structure layout doesn't care.
                let mut offset = 0;
                let mut result = Vec::new();
                for field in fields {
                    let layout_ty = LayoutType::from_gpu(&field.1, session);
                    offset += align_padding(offset, layout_ty.size.alignment);
                    let size = layout_ty.size.size;
                    result.push((field.0.clone(), offset, layout_ty));
                    offset += size;
                }
                offset += align_padding(offset, 4);
                let size = Size::new_struct(offset);
                (size, LayoutTypeDef::Struct(result))
            }
            GpuTypeDef::Enum(en) => {
                let mut result = Vec::new();
                let mut max_offset = 0;
                for variant in &en.variants {
                    let mut r2 = Vec::new();
                    let mut offset = 4;
                    for field in &variant.1 {
                        let layout_ty = LayoutType::from_gpu(field, session);
                        offset += align_padding(offset, layout_ty.size.alignment);
                        let size = layout_ty.size.size;
                        r2.push((offset, layout_ty));
                        offset += size;
                    }
                    max_offset = max_offset.max(offset);
                    result.push((variant.0.clone(), r2));
                }
                max_offset += align_padding(max_offset, 4);
                let size = Size::new_struct(max_offset);
                (size, LayoutTypeDef::Enum(result))
            }
        }
    }
}

impl LayoutModule {
    pub fn from_gpu(module: &GpuModule) -> LayoutModule {
        let def_names = module
            .defs
            .iter()
            .map(|def| def.name().to_owned())
            .collect::<Vec<_>>();
        let mut session = LayoutSession::new(module);
        for def in &module.defs {
            let _ = session.layout_def(def.name());
        }
        let gpu_write = module.attrs.contains("gpu_write");
        let rust_encode = module.attrs.contains("rust_encode");
        LayoutModule {
            name: module.name.clone(),
            gpu_write,
            rust_encode,
            def_names,
            enum_variants: session.enum_variants,
            defs: session.defs,
        }
    }

    #[allow(unused)]
    pub fn is_enum_variant(&self, name: &str) -> bool {
        self.enum_variants.contains(name)
    }
}

impl<'a> LayoutSession<'a> {
    fn new(module: &GpuModule) -> LayoutSession {
        let mut orig_defs = HashMap::new();
        let mut enum_variants = HashSet::new();
        for def in &module.defs {
            orig_defs.insert(def.name().to_owned(), def.clone());
            if let GpuTypeDef::Enum(en) = def {
                for variant in &en.variants {
                    if let Some(GpuType::InlineStruct(name)) = variant.1.first() {
                        enum_variants.insert(name.clone());
                    }
                }
            }
        }
        LayoutSession {
            enum_variants,
            orig_defs,
            defs: HashMap::new(),
        }
    }

    /// Do layout of one def.
    ///
    /// This might be called recursively.
    /// Note: expect stack overflow for circular dependencies.
    fn layout_def(&mut self, name: &str) -> Size {
        if let Some(def) = self.defs.get(name) {
            return def.0;
        }
        let def = self.orig_defs.get(name).unwrap();
        let layout = LayoutTypeDef::from_gpu(def, self);
        let size = layout.0;
        self.defs.insert(name.to_owned(), layout);
        size
    }

    fn get_size(&mut self, ty: &GpuType) -> Size {
        match ty {
            GpuType::Scalar(scalar) => Size::new(scalar.size()),
            GpuType::Vector(scalar, len) => Size::new(scalar.size() * len),
            GpuType::Ref(_) => Size::new(4),
            GpuType::InlineStruct(name) => self.layout_def(name),
        }
    }

    #[allow(unused)]
    fn is_enum_variant(&self, name: &str) -> bool {
        self.enum_variants.contains(name)
    }
}

/// Compute coverage of fields.
///
/// Each element of the result represents a list of fields for one 4-byte chunk of
/// the struct layout. Inline structs are only included if requested.
pub fn struct_coverage(
    fields: &[(String, usize, LayoutType)],
    include_inline: bool,
) -> Vec<Vec<usize>> {
    let mut result: Vec<Vec<usize>> = Vec::new();
    for (i, (_name, offset, ty)) in fields.iter().enumerate() {
        let size = match ty.ty {
            GpuType::Scalar(scalar) => scalar.size(),
            GpuType::Vector(scalar, len) => scalar.size() * len,
            GpuType::Ref(_) => 4,
            GpuType::InlineStruct(_) => {
                if include_inline {
                    4
                } else {
                    0
                }
            }
        };
        if size > 0 {
            for ix in (offset / 4)..(offset + size + 3) / 4 {
                if ix >= result.len() {
                    result.resize_with(ix + 1, Default::default);
                }
                result[ix].push(i);
            }
        }
    }
    result
}
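// For a concrete picture of what coverage means, the following test module is
// an illustrative addition (not part of the original file), assuming it lives
// as a child of this module: two u16 fields share the first word, and a
// following f32 gets the second.
#[cfg(test)]
mod coverage_tests {
    use super::*;
    use crate::parse::{GpuScalar, GpuType};

    #[test]
    fn two_u16s_share_a_word() {
        let u16_ty = || LayoutType {
            size: Size::new(2),
            ty: GpuType::Scalar(GpuScalar::U16),
        };
        let f32_ty = LayoutType {
            size: Size::new(4),
            ty: GpuType::Scalar(GpuScalar::F32),
        };
        let fields = vec![
            ("a".to_string(), 0, u16_ty()),
            ("b".to_string(), 2, u16_ty()),
            ("c".to_string(), 4, f32_ty),
        ];
        // Fields 0 and 1 pack into word 0; field 2 occupies word 1.
        assert_eq!(struct_coverage(&fields, false), vec![vec![0, 1], vec![2]]);
    }
}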
impl Size {
    fn new(size: usize) -> Size {
        // Note: there is a special case where we could do better:
        // `(u8, u16, u8)`, where the alignment could be 1. However,
        // this case can also be solved by reordering.
        let alignment = size.min(4);
        Size { size, alignment }
    }

    fn new_struct(size: usize) -> Size {
        let alignment = 4;
        Size { size, alignment }
    }
}

fn align_padding(offset: usize, alignment: usize) -> usize {
    offset.wrapping_neg() & (alignment.max(1) - 1)
}
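The closing helper uses a standard power-of-two trick: `offset.wrapping_neg() & (alignment - 1)` is exactly the padding that rounds `offset` up to a multiple of `alignment`. A quick check (an illustrative sketch, not part of the removed file):

#[cfg(test)]
mod padding_tests {
    use super::align_padding;

    #[test]
    fn rounds_up_to_alignment() {
        // Padding to the next 4-byte boundary: 0 -> 0, 1 -> 3, 2 -> 2, 3 -> 1.
        assert_eq!(align_padding(0, 4), 0);
        assert_eq!(align_padding(1, 4), 3);
        assert_eq!(align_padding(2, 4), 2);
        assert_eq!(align_padding(3, 4), 1);
        // Alignment 1 (or 0, clamped by `max(1)`) never needs padding.
        assert_eq!(align_padding(7, 1), 0);
        assert_eq!(align_padding(7, 0), 0);
    }
}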
@ -1,30 +0,0 @@
mod derive;
mod glsl;
mod layout;
mod parse;

use proc_macro::TokenStream;
use quote::{format_ident, quote};
use syn::parse_macro_input;

use layout::LayoutModule;
use parse::GpuModule;

#[proc_macro]
pub fn piet_gpu(input: TokenStream) -> TokenStream {
    let input = parse_macro_input!(input as syn::ItemMod);
    //println!("input: {:#?}", input);
    let module = GpuModule::from_syn(&input).unwrap();
    let layout = LayoutModule::from_gpu(&module);
    let glsl = glsl::gen_glsl(&layout);
    let gen_gpu_fn = format_ident!("gen_gpu_{}", layout.name);
    let mut expanded = quote! {
        pub fn #gen_gpu_fn() -> String {
            #glsl.into()
        }
    };
    if layout.rust_encode {
        expanded.extend(derive::gen_derive(&layout));
    }
    expanded.into()
}
@ -1,228 +0,0 @@
//! Parsing of the source

extern crate proc_macro;

use std::collections::HashSet;

use syn::{
    Expr, ExprLit, Fields, FieldsNamed, FieldsUnnamed, GenericArgument, ItemEnum, ItemStruct, Lit,
    PathArguments, TypeArray, TypePath,
};

/// A scalar that can be represented in a packed data structure.
#[derive(Clone, Copy, PartialEq)]
pub enum GpuScalar {
    F16,
    F32,
    I8,
    I16,
    I32,
    U8,
    U16,
    U32,
    TagFlags,
}

/// An algebraic datatype.
#[derive(Clone)]
pub enum GpuType {
    Scalar(GpuScalar),
    Vector(GpuScalar, usize),
    /// Used mostly for the body of enum variants.
    InlineStruct(String),
    Ref(Box<GpuType>),
}

pub struct GpuEnum {
    pub name: String,
    pub variants: Vec<(String, Vec<GpuType>)>,
}

pub enum GpuTypeDef {
    Struct(String, Vec<(String, GpuType)>),
    Enum(GpuEnum),
}

pub struct GpuModule {
    pub name: String,
    pub attrs: HashSet<String>,
    pub defs: Vec<GpuTypeDef>,
}

impl GpuScalar {
    fn from_syn(ty: &syn::Type) -> Option<Self> {
        ty_as_single_ident(ty).and_then(|ident| match ident.as_str() {
            "f32" => Some(GpuScalar::F32),
            "f16" => Some(GpuScalar::F16),
            "i8" => Some(GpuScalar::I8),
            "i16" => Some(GpuScalar::I16),
            "i32" => Some(GpuScalar::I32),
            "u8" => Some(GpuScalar::U8),
            "u16" => Some(GpuScalar::U16),
            "u32" => Some(GpuScalar::U32),
            "TagFlags" => Some(GpuScalar::TagFlags),
            _ => None,
        })
    }

    /// Size of scalar type.
    ///
    /// This is arguably a concern at the layout level, not syntax, but it's here because
    /// it's not likely to be variable, so reduces the total number of types.
    pub fn size(self) -> usize {
        match self {
            GpuScalar::F32 | GpuScalar::I32 | GpuScalar::U32 => 4,
            GpuScalar::I8 | GpuScalar::U8 => 1,
            GpuScalar::F16 | GpuScalar::I16 | GpuScalar::U16 => 2,
            GpuScalar::TagFlags => 0,
        }
    }
}

impl GpuType {
    fn from_syn(ty: &syn::Type) -> Result<Self, String> {
        //println!("gputype {:#?}", ty);
        if let Some(scalar) = GpuScalar::from_syn(ty) {
            return Ok(GpuType::Scalar(scalar));
        }
        if let Some(name) = ty_as_single_ident(ty) {
            // Note: we're not doing any validation here.
            return Ok(GpuType::InlineStruct(name));
        }
        match ty {
            syn::Type::Path(TypePath {
                path: syn::Path { segments, .. },
                ..
            }) => {
                if segments.len() == 1 {
                    let seg = &segments[0];
                    if seg.ident == "Ref" {
                        if let PathArguments::AngleBracketed(args) = &seg.arguments {
                            if args.args.len() == 1 {
                                if let GenericArgument::Type(inner) = &args.args[0] {
                                    let inner_ty = GpuType::from_syn(inner)?;
                                    return Ok(GpuType::Ref(Box::new(inner_ty)));
                                }
                            }
                        }
                    }
                }
                Err("unknown path case".into())
            }
            syn::Type::Array(TypeArray { elem, len, .. }) => {
                if let Some(elem) = GpuScalar::from_syn(&elem) {
                    if let Some(len) = expr_int_lit(len) {
                        // maybe sanity-check length here
                        Ok(GpuType::Vector(elem, len))
                    } else {
                        Err("can't deal with variable length scalar arrays".into())
                    }
                } else {
                    Err("can't deal with non-scalar arrays".into())
                }
            }
            _ => Err("unknown type".into()),
        }
    }
}

impl GpuTypeDef {
    fn from_syn(item: &syn::Item) -> Result<Self, String> {
        match item {
            syn::Item::Struct(ItemStruct {
                ident,
                fields: Fields::Named(FieldsNamed { named, .. }),
                ..
            }) => {
                let mut fields = Vec::new();
                for field in named {
                    let field_ty = GpuType::from_syn(&field.ty)?;
                    let field_name = field.ident.as_ref().ok_or("need name".to_string())?;
                    fields.push((field_name.to_string(), field_ty));
                }
                Ok(GpuTypeDef::Struct(ident.to_string(), fields))
            }
            syn::Item::Enum(ItemEnum {
                ident, variants, ..
            }) => {
                let mut v = Vec::new();
                for variant in variants {
                    let vname = variant.ident.to_string();
                    let mut fields = Vec::new();
                    if let Fields::Unnamed(FieldsUnnamed { unnamed, .. }) = &variant.fields {
                        for field in unnamed {
                            fields.push(GpuType::from_syn(&field.ty)?);
                        }
                    }
                    v.push((vname, fields));
                }
                let en = GpuEnum {
                    name: ident.to_string(),
                    variants: v,
                };
                Ok(GpuTypeDef::Enum(en))
            }
            _ => {
                eprintln!("{:#?}", item);
                Err("unknown item".into())
            }
        }
    }

    pub fn name(&self) -> &str {
        match self {
            GpuTypeDef::Struct(name, _) => name,
            GpuTypeDef::Enum(en) => &en.name,
        }
    }
}

impl GpuModule {
    pub fn from_syn(module: &syn::ItemMod) -> Result<Self, String> {
        let name = module.ident.to_string();
        let mut attrs = HashSet::new();
        for attr in &module.attrs {
            if let Some(id) = path_as_single_ident(&attr.path) {
                attrs.insert(id.to_owned());
            }
        }
        let mut defs = Vec::new();
        if let Some((_brace, items)) = &module.content {
            for item in items {
                let def = GpuTypeDef::from_syn(item)?;
                defs.push(def);
            }
        }
        Ok(GpuModule { name, attrs, defs })
    }
}

fn path_as_single_ident(path: &syn::Path) -> Option<String> {
    if path.segments.len() == 1 {
        let seg = &path.segments[0];
        if seg.arguments == PathArguments::None {
            return Some(seg.ident.to_string());
        }
    }
    None
}

fn ty_as_single_ident(ty: &syn::Type) -> Option<String> {
    if let syn::Type::Path(TypePath { path, .. }) = ty {
        path_as_single_ident(path)
    } else {
        None
    }
}

fn expr_int_lit(e: &Expr) -> Option<usize> {
    if let Expr::Lit(ExprLit {
        lit: Lit::Int(lit_int),
        ..
    }) = e
    {
        lit_int.base10_parse().ok()
    } else {
        None
    }
}
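A few examples of what the parser accepts (an illustrative test sketch, not part of the removed file; `PathSeg` is an arbitrary placeholder identifier):

#[cfg(test)]
mod parse_tests {
    use super::*;

    #[test]
    fn recognizes_scalar_vector_and_ref() {
        // A bare scalar ident becomes GpuType::Scalar.
        let ty: syn::Type = syn::parse_str("u32").unwrap();
        assert!(matches!(GpuType::from_syn(&ty), Ok(GpuType::Scalar(GpuScalar::U32))));

        // A fixed-length scalar array becomes GpuType::Vector.
        let ty: syn::Type = syn::parse_str("[f32; 4]").unwrap();
        assert!(matches!(GpuType::from_syn(&ty), Ok(GpuType::Vector(GpuScalar::F32, 4))));

        // Ref<T> wraps the inner type (here an inline struct name).
        let ty: syn::Type = syn::parse_str("Ref<PathSeg>").unwrap();
        assert!(matches!(GpuType::from_syn(&ty), Ok(GpuType::Ref(_))));
    }
}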
@ -1,33 +0,0 @@
[package]
name = "piet-gpu-hal"
version = "0.1.0"
authors = ["Raph Levien <raph.levien@gmail.com>"]
description = "An abstraction layer for running compute kernels on GPU."
license = "MIT/Apache-2.0"
edition = "2018"

[dependencies]
ash = { version = "0.37", features = ["loaded"] }
ash-window = "0.12"
raw-window-handle = "0.5"
bitflags = "1.3.2"
smallvec = "1.9"
bytemuck = "1.12.1"

[target.'cfg(target_os="windows")'.dependencies]
winapi = { version = "0.3.9", features = [
    'd3d12', 'd3d12sdklayers', 'd3dcommon', 'd3dcompiler', 'dxgi',
    'dxgi1_2', 'dxgi1_3', 'dxgi1_4', 'dxgidebug', 'dxgiformat', 'dxgitype',
    'libloaderapi', 'shellscalingapi', 'synchapi', 'winbase', 'windef',
    'winerror', 'winuser'
] }
wio = "0.2.2"

[target.'cfg(target_os="macos")'.dependencies]
metal = "0.24"
objc = "0.2.7"
block = "0.1.6"
cocoa-foundation = "0.1"
# Note: foreign-types is up to 0.5 but metal hasn't upgraded to it
foreign-types = "0.3.2"
core-graphics-types = "0.1.1"
@ -1,39 +0,0 @@
use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};

fn main() {
    let instance = Instance::new(InstanceFlags::empty()).unwrap();
    unsafe {
        let device = instance.device().unwrap();
        let session = Session::new(device);
        let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE;
        let src = (0..256).map(|x| x + 1).collect::<Vec<u32>>();
        let buffer = session.create_buffer_init(&src, usage).unwrap();
        let code = include_shader!(&session, "./shader/gen/collatz");
        let pipeline = session
            .create_compute_pipeline(code, &[BindType::Buffer])
            .unwrap();
        let descriptor_set = session
            .create_simple_descriptor_set(&pipeline, &[&buffer])
            .unwrap();
        let query_pool = session.create_query_pool(2).unwrap();
        let mut cmd_buf = session.cmd_buf().unwrap();
        cmd_buf.begin();
        cmd_buf.reset_query_pool(&query_pool);
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
        pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
        pass.end();
        cmd_buf.finish_timestamps(&query_pool);
        cmd_buf.host_barrier();
        cmd_buf.finish();
        let submitted = session.run_cmd_buf(cmd_buf, &[], &[]).unwrap();
        submitted.wait().unwrap();
        let timestamps = session.fetch_query_pool(&query_pool);
        let mut dst: Vec<u32> = Default::default();
        buffer.read(&mut dst).unwrap();
        for (i, val) in dst.iter().enumerate().take(16) {
            println!("{}: {}", i, val);
        }
        println!("{:?}", timestamps);
    }
}
@ -1,24 +0,0 @@
# Build file for shaders.

# You must have Vulkan tools in your path, or patch here.

glslang_validator = glslangValidator
spirv_cross = spirv-cross
dxc = dxc

rule glsl
  command = $glslang_validator -V -o $out $in

rule hlsl
  command = $spirv_cross --hlsl $in --output $out

rule dxil
  command = $dxc -T cs_6_0 $in -Fo $out

rule msl
  command = $spirv_cross --msl $in --output $out

build gen/collatz.spv: glsl collatz.comp
build gen/collatz.hlsl: hlsl gen/collatz.spv
build gen/collatz.dxil: dxil gen/collatz.hlsl
build gen/collatz.msl: msl gen/collatz.spv
@ -1,35 +0,0 @@
// Copied from wgpu hello-compute example

// TODO: delete or clean up attribution before releasing

#version 450
layout(local_size_x = 1) in;

layout(set = 0, binding = 0) buffer PrimeIndices {
    uint[] indices;
}; // this is used as both input and output for convenience

// The Collatz Conjecture states that for any integer n:
// If n is even, n = n/2
// If n is odd, n = 3n+1
// And repeat this process for each new n, you will always eventually reach 1.
// Though the conjecture has not been proven, no counterexample has ever been found.
// This function returns how many times this recurrence needs to be applied to reach 1.
uint collatz_iterations(uint n) {
    uint i = 0;
    while (n != 1) {
        // Use integer modulus; mod() would go through float conversion and
        // lose precision for large n.
        if (n % 2 == 0) {
            n = n / 2;
        } else {
            n = (3 * n) + 1;
        }
        i++;
    }
    return i;
}

void main() {
    uint index = gl_GlobalInvocationID.x;
    indices[index] = collatz_iterations(indices[index]);
}
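For cross-checking the shader's output on the CPU, an equivalent Rust routine (an illustrative sketch, not part of the removed files; the expected values match the first entries the example above prints for inputs 1, 2, 3):

fn collatz_iterations(mut n: u32) -> u32 {
    let mut i = 0;
    while n != 1 {
        if n % 2 == 0 {
            n /= 2;
        } else {
            n = 3 * n + 1;
        }
        i += 1;
    }
    i
}

fn main() {
    // 1 reaches 1 in 0 steps; 2 -> 1 in 1 step; 3 -> 10 -> 5 -> 16 -> 8 -> 4 -> 2 -> 1 in 7.
    assert_eq!(collatz_iterations(1), 0);
    assert_eq!(collatz_iterations(2), 1);
    assert_eq!(collatz_iterations(3), 7);
}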
@ -1,309 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! The generic trait for backends to implement.

use crate::{
    BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
    MapMode, SamplerParams,
};

pub trait Device: Sized {
    type Buffer: 'static;
    type Image;
    type Pipeline;
    type DescriptorSet;
    type QueryPool;
    type CmdBuf: CmdBuf<Self>;
    type Fence;
    type Semaphore;
    type DescriptorSetBuilder: DescriptorSetBuilder<Self>;
    type Sampler;
    type ShaderSource: ?Sized;

    /// Query the GPU info.
    ///
    /// This method may be expensive, so the hub should call it once and retain
    /// the info.
    fn query_gpu_info(&self) -> GpuInfo;

    fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Self::Buffer, Error>;

    /// Destroy a buffer.
    ///
    /// The same safety requirements hold as in Vulkan: the buffer cannot be used
    /// after this call, and all commands referencing this buffer must have completed.
    ///
    /// Maybe doesn't need result return?
    unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>;

    unsafe fn create_image2d(
        &self,
        width: u32,
        height: u32,
        format: ImageFormat,
    ) -> Result<Self::Image, Error>;

    /// Destroy an image.
    ///
    /// The same safety requirements hold as in Vulkan: the image cannot be used
    /// after this call, and all commands referencing this image must have completed.
    ///
    /// Use this only with images we created, not for swapchain images.
    ///
    /// Maybe doesn't need result return?
    unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>;

    /// Build a compute pipeline.
    ///
    /// A pipeline is a bit of shader IR plus a signature for what kinds of resources
    /// it expects.
    unsafe fn create_compute_pipeline(
        &self,
        code: &Self::ShaderSource,
        bind_types: &[BindType],
    ) -> Result<Self::Pipeline, Error>;

    /// Start building a descriptor set.
    ///
    /// A descriptor set is a binding of resources for a given pipeline.
    unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder;

    /// Create a descriptor set for a given pipeline, binding buffers and images.
    ///
    /// This is provided as a convenience but will probably go away, as the functionality
    /// is subsumed by the builder.
    unsafe fn create_descriptor_set(
        &self,
        pipeline: &Self::Pipeline,
        bufs: &[&Self::Buffer],
        images: &[&Self::Image],
    ) -> Result<Self::DescriptorSet, Error> {
        let mut builder = self.descriptor_set_builder();
        builder.add_buffers(bufs);
        builder.add_images(images);
        builder.build(self, pipeline)
    }

    /// Update a descriptor in a descriptor set.
    ///
    /// The index is the same as the binding number in Vulkan.
    ///
    /// # Safety
    ///
    /// The descriptor set must not be used in any in-flight command buffer. The index must be valid.
    /// The resource type must match that at descriptor set creation time.
    unsafe fn update_buffer_descriptor(
        &self,
        ds: &mut Self::DescriptorSet,
        index: u32,
        buf: &Self::Buffer,
    );

    /// Update a descriptor in a descriptor set.
    ///
    /// The index is the same as the binding number in Vulkan.
    ///
    /// # Safety
    ///
    /// The descriptor set must not be used in any in-flight command buffer. The index must be valid.
    /// The resource type must match that at descriptor set creation time.
    unsafe fn update_image_descriptor(
        &self,
        ds: &mut Self::DescriptorSet,
        index: u32,
        image: &Self::Image,
    );

    fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error>;

    /// If the command buffer was submitted, it must complete before this is called.
    unsafe fn destroy_cmd_buf(&self, cmd_buf: Self::CmdBuf) -> Result<(), Error>;

    fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error>;

    /// Get results from query pool, destroying it in the process.
    ///
    /// The returned vector is one less than the number of queries; the first is used as
    /// a baseline.
    ///
    /// # Safety
    /// All submitted commands that refer to this query pool must have completed.
    unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error>;

    unsafe fn run_cmd_bufs(
        &self,
        cmd_buf: &[&Self::CmdBuf],
        wait_semaphores: &[&Self::Semaphore],
        signal_semaphores: &[&Self::Semaphore],
        fence: Option<&mut Self::Fence>,
    ) -> Result<(), Error>;

    /// Map the buffer into addressable memory.
    ///
    /// # Safety
    ///
    /// The buffer must be valid to access. The offset + size must be within the
    /// buffer's allocation. The buffer must not already be mapped. Of course,
    /// the usual safety rules apply to the returned pointer.
    unsafe fn map_buffer(
        &self,
        buffer: &Self::Buffer,
        offset: u64,
        size: u64,
        mode: MapMode,
    ) -> Result<*mut u8, Error>;

    /// Unmap the buffer.
    ///
    /// # Safety
    ///
    /// The buffer must be mapped. The parameters must be the same as the map
    /// call.
    unsafe fn unmap_buffer(
        &self,
        buffer: &Self::Buffer,
        offset: u64,
        size: u64,
        mode: MapMode,
    ) -> Result<(), Error>;

    unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error>;
    unsafe fn create_fence(&self, signaled: bool) -> Result<Self::Fence, Error>;
    unsafe fn destroy_fence(&self, fence: Self::Fence) -> Result<(), Error>;
    unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error>;
    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error>;

    unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
}

/// The trait implemented by backend command buffer implementations.
///
/// Valid encoding is represented by a state machine (currently not validated,
/// but it is easy to imagine there might be at least debug validation). Most
/// methods are only valid in a particular state, and some move it to another
/// state.
pub trait CmdBuf<D: Device> {
    /// Begin encoding.
    ///
    /// State: init -> ready
    unsafe fn begin(&mut self);

    /// State: ready -> finished
    unsafe fn finish(&mut self);

    /// Commits any open command encoder.
    unsafe fn flush(&mut self);

    /// Return true if the command buffer is suitable for reuse.
    unsafe fn reset(&mut self) -> bool;

    /// Begin a compute pass.
    ///
    /// State: ready -> in_compute_pass
    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);

    /// Dispatch
    ///
    /// State: in_compute_pass
    unsafe fn dispatch(
        &mut self,
        pipeline: &D::Pipeline,
        descriptor_set: &D::DescriptorSet,
        workgroup_count: (u32, u32, u32),
        workgroup_size: (u32, u32, u32),
    );

    /// State: in_compute_pass -> ready
    unsafe fn end_compute_pass(&mut self);

    /// Insert an execution and memory barrier.
    ///
    /// Compute kernels (and other actions) after this barrier may read from buffers
    /// that were written before this barrier.
    unsafe fn memory_barrier(&mut self);

    /// Insert a barrier for host access to buffers.
    ///
    /// The host may read buffers written before this barrier, after the fence for
    /// the command buffer is signaled.
    ///
    /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/
    /// ("Host memory reads") for an explanation of this barrier.
    unsafe fn host_barrier(&mut self);

    unsafe fn image_barrier(
        &mut self,
        image: &D::Image,
        src_layout: ImageLayout,
        dst_layout: ImageLayout,
    );

    /// Clear the buffer.
    ///
    /// This is readily supported in Vulkan, but for portability it is remarkably
    /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
    /// kernel, or organize the code not to need it.
    unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);

    unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);

    unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);

    unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);

    // low portability, dx12 doesn't support it natively
    unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);

    /// Reset the query pool.
    ///
    /// The query pool must be reset before each use, to avoid validation errors.
    /// This is annoying, and we could tweak the API to make it implicit, doing
    /// the reset before the first timestamp write.
    unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool);

    unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32);

    /// Prepare the timestamps for reading. This isn't required on Vulkan but
    /// is required on (at least) DX12.
    unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}

    /// Begin a labeled section for debugging and profiling purposes.
    unsafe fn begin_debug_label(&mut self, _label: &str) {}

    /// End a section opened by `begin_debug_label`.
    unsafe fn end_debug_label(&mut self) {}
}

/// A builder for descriptor sets with more complex layouts.
///
/// Note: the order needs to match the pipeline building, and it also needs to
/// be buffers, then images, then textures.
pub trait DescriptorSetBuilder<D: Device> {
    fn add_buffers(&mut self, buffers: &[&D::Buffer]);
    /// Add an array of storage images.
    ///
    /// The images need to be in `ImageLayout::General` layout.
    fn add_images(&mut self, images: &[&D::Image]);
    /// Add an array of textures.
    ///
    /// The images need to be in `ImageLayout::ShaderRead` layout.
    ///
    /// The same sampler is used for all textures, which is not very sophisticated;
    /// we should have a way to vary the sampler.
    fn add_textures(&mut self, images: &[&D::Image]);
    unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result<D::DescriptorSet, Error>;
}
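To make the shape of this trait concrete, here is a hypothetical call sequence against any implementor. It is a sketch only: the BufferUsage combination and MapMode::Write are assumed spellings (only MAP_READ, MAP_WRITE, and MapMode::Read appear elsewhere in this diff), and error handling is elided to ?.

// Sketch: create a host-visible buffer, zero it through the map, destroy it.
unsafe fn roundtrip<D: Device>(device: &D) -> Result<(), Error> {
    let buf = device.create_buffer(1024, BufferUsage::MAP_READ | BufferUsage::MAP_WRITE)?;
    // Write 1 KiB of zeros through the mapped pointer.
    let ptr = device.map_buffer(&buf, 0, 1024, MapMode::Write)?;
    std::ptr::write_bytes(ptr, 0, 1024);
    device.unmap_buffer(&buf, 0, 1024, MapMode::Write)?;
    // Safety contract from the trait docs: nothing may still reference `buf`.
    device.destroy_buffer(&buf)?;
    Ok(())
}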
@ -1,81 +0,0 @@
// Copyright © 2021 piet-gpu developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! A simple best-fit allocator.

use std::collections::{BTreeMap, BTreeSet};

/// An allocator that tracks free ranges and returns best fit.
pub struct BestFit {
    // map offset to size of free block
    free_by_ix: BTreeMap<u32, u32>,
    // size and offset
    free_by_size: BTreeSet<(u32, u32)>,
}

impl BestFit {
    pub fn new(size: u32) -> BestFit {
        let mut free_by_ix = BTreeMap::new();
        free_by_ix.insert(0, size);
        let mut free_by_size = BTreeSet::new();
        free_by_size.insert((size, 0));
        BestFit {
            free_by_ix,
            free_by_size,
        }
    }

    pub fn alloc(&mut self, size: u32) -> Option<u32> {
        let block = *self.free_by_size.range((size, 0)..).next()?;
        let ix = block.1;
        self.free_by_ix.remove(&ix);
        self.free_by_size.remove(&block);
        let fragment_size = block.0 - size;
        if fragment_size > 0 {
            let fragment_ix = ix + size;
            self.free_by_ix.insert(fragment_ix, fragment_size);
            self.free_by_size.insert((fragment_size, fragment_ix));
        }
        Some(ix)
    }

    pub fn free(&mut self, ix: u32, size: u32) {
        let next_ix = size + ix;
        if let Some((&prev_ix, &prev_size)) = self.free_by_ix.range(..ix).rev().next() {
            if prev_ix + prev_size == ix {
                self.free_by_size.remove(&(prev_size, prev_ix));
                if let Some(&next_size) = self.free_by_ix.get(&next_ix) {
                    // consolidate with prev and next
                    let new_size = prev_size + size + next_size;
                    *self.free_by_ix.get_mut(&prev_ix).unwrap() = new_size;
                    self.free_by_ix.remove(&next_ix);
                    self.free_by_size.remove(&(next_size, next_ix));
                    self.free_by_size.insert((new_size, prev_ix));
                } else {
                    // consolidate with prev
                    let new_size = prev_size + size;
                    *self.free_by_ix.get_mut(&prev_ix).unwrap() = new_size;
                    self.free_by_size.insert((new_size, prev_ix));
                }
                return;
            }
        }
        if let Some(&next_size) = self.free_by_ix.get(&next_ix) {
            // consolidate with next
            let new_size = size + next_size;
            self.free_by_ix.remove(&next_ix);
            self.free_by_ix.insert(ix, new_size);
            self.free_by_size.remove(&(next_size, next_ix));
            self.free_by_size.insert((new_size, ix));
        } else {
            // new isolated free block
            self.free_by_ix.insert(ix, size);
            self.free_by_size.insert((size, ix));
        }
    }
}
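A short usage sketch of the allocator (assuming BestFit is in scope; the offsets follow from the best-fit policy above and the values are illustrative):

fn main() {
    let mut alloc = BestFit::new(64);
    let a = alloc.alloc(16).unwrap(); // takes [0, 16)
    let b = alloc.alloc(16).unwrap(); // best fit splits the remainder: [16, 32)
    assert_eq!((a, b), (0, 16));
    alloc.free(a, 16);                // [0, 16) is free again, isolated block
    alloc.free(b, 16);                // coalesces with [0, 16) and [32, 64)
    assert_eq!(alloc.alloc(64), Some(0)); // the whole range is one block again
}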
@ -1,150 +0,0 @@
// Copyright © 2021 piet-gpu developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! An abstraction for writing to GPU buffers.

use bytemuck::Pod;

/// A GPU buffer to be filled.
pub struct BufWrite {
    ptr: *mut u8,
    len: usize,
    capacity: usize,
}

impl BufWrite {
    pub(crate) fn new(ptr: *mut u8, len: usize, capacity: usize) -> BufWrite {
        BufWrite { ptr, len, capacity }
    }

    /// Append a plain data object to the buffer.
    ///
    /// Panics if capacity is inadequate.
    #[inline]
    pub fn push(&mut self, item: impl Pod) {
        self.push_bytes(bytemuck::bytes_of(&item));
    }

    /// Extend with a slice of plain data objects.
    ///
    /// Panics if capacity is inadequate.
    #[inline]
    pub fn extend_slice(&mut self, slice: &[impl Pod]) {
        self.push_bytes(bytemuck::cast_slice(slice));
    }

    /// Extend with a byte slice.
    ///
    /// Panics if capacity is inadequate.
    #[inline]
    pub fn push_bytes(&mut self, bytes: &[u8]) {
        let len = bytes.len();
        assert!(self.capacity - self.len >= len);
        unsafe {
            std::ptr::copy_nonoverlapping(bytes.as_ptr(), self.ptr.add(self.len), len);
        }
        self.len += len;
    }

    /// Extend with zeros.
    ///
    /// Panics if capacity is inadequate.
    #[inline]
    pub fn fill_zero(&mut self, len: usize) {
        assert!(self.capacity - self.len >= len);
        unsafe {
            let slice = std::slice::from_raw_parts_mut(self.ptr.add(self.len), len);
            slice.fill(0);
        }
        self.len += len;
    }

    /// The total capacity of the buffer, in bytes.
    #[inline]
    pub fn capacity(&self) -> usize {
        self.capacity
    }

    /// Extend with an iterator over plain data objects.
    ///
    /// Currently, this doesn't panic, just truncates. That may change.
    // Note: when specialization lands, this can be another impl of
    // `Extend`.
    pub fn extend_ref_iter<'a, I, T: Pod + 'a>(&mut self, iter: I)
    where
        I: IntoIterator<Item = &'a T>,
    {
        let item_size = std::mem::size_of::<T>();
        if item_size == 0 {
            return;
        }
        let mut iter = iter.into_iter();
        let n_remaining = (self.capacity - self.len) / item_size;
        unsafe {
            let mut dst = self.ptr.add(self.len);
            for _ in 0..n_remaining {
                if let Some(item) = iter.next() {
                    std::ptr::copy_nonoverlapping(
                        bytemuck::bytes_of(item).as_ptr(),
                        dst,
                        item_size,
                    );
                    self.len += item_size;
                    dst = dst.add(item_size);
                } else {
                    break;
                }
            }
        }
        // TODO: should we test the iter and panic on overflow?
    }
}

impl std::ops::Deref for BufWrite {
    type Target = [u8];
    fn deref(&self) -> &[u8] {
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }
}

impl std::ops::DerefMut for BufWrite {
    fn deref_mut(&mut self) -> &mut [u8] {
        unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
    }
}

impl<T: Pod> std::iter::Extend<T> for BufWrite {
    fn extend<I>(&mut self, iter: I)
    where
        I: IntoIterator<Item = T>,
    {
        let item_size = std::mem::size_of::<T>();
        if item_size == 0 {
            return;
        }
        let mut iter = iter.into_iter();
        let n_remaining = (self.capacity - self.len) / item_size;
        unsafe {
            let mut dst = self.ptr.add(self.len);
            for _ in 0..n_remaining {
                if let Some(item) = iter.next() {
                    std::ptr::copy_nonoverlapping(
                        bytemuck::bytes_of(&item).as_ptr(),
                        dst,
                        item_size,
                    );
                    self.len += item_size;
                    dst = dst.add(item_size);
                } else {
                    break;
                }
            }
        }
        // TODO: should we test the iter and panic on overflow?
    }
}
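A crate-internal usage sketch (BufWrite::new is pub(crate), so external users receive a BufWrite over mapped GPU memory rather than constructing one; the Vec backing here is purely illustrative):

// Illustrative only: a Vec provides the backing storage, where the real code
// would hand out a pointer into a mapped buffer.
fn demo() {
    let mut backing = vec![0u8; 16];
    let mut writer = BufWrite::new(backing.as_mut_ptr(), 0, backing.len());
    writer.push(1u32);                  // 4 bytes of Pod data
    writer.extend_slice(&[2u32, 3u32]); // 8 more bytes
    writer.fill_zero(4);                // pad out to 16 bytes
    assert_eq!(writer.len(), 16);       // len() comes via Deref<Target = [u8]>
}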
@ -1,858 +0,0 @@
//! DX12 implementation of the HAL trait.

mod descriptor;
mod error;
mod wrappers;

use std::{
    cell::Cell,
    convert::{TryFrom, TryInto},
    mem, ptr,
    sync::{Arc, Mutex},
};

#[allow(unused)]
use winapi::shared::dxgi1_3; // for error reporting in debug mode
use winapi::shared::minwindef::TRUE;
use winapi::shared::{dxgi, dxgi1_2, dxgitype};
use winapi::um::d3d12;

use raw_window_handle::{RawDisplayHandle, RawWindowHandle};

use smallvec::SmallVec;

use crate::{
    BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
    MapMode, WorkgroupLimits,
};

use self::{
    descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
    wrappers::{
        CommandAllocator, CommandQueue, DescriptorHeap, Device, Factory4, Resource, ShaderByteCode,
    },
};

pub struct Dx12Instance {
    factory: Factory4,
}

pub struct Dx12Surface {
    hwnd: winapi::shared::windef::HWND,
}

pub struct Dx12Swapchain {
    swapchain: wrappers::SwapChain3,
    size: (u32, u32),
}

pub struct Dx12Device {
    device: Device,
    command_queue: CommandQueue,
    ts_freq: u64,
    gpu_info: GpuInfo,
    memory_arch: MemoryArchitecture,
    descriptor_pool: Mutex<DescriptorPool>,
}

#[derive(Clone)]
pub struct Buffer {
    resource: Resource,
    pub size: u64,
    // Always present except for query readback buffer.
    cpu_ref: Option<Arc<CpuHeapRefOwned>>,
    // Present when created with CLEAR usage. Heap is here for
    // the same reason it's in DescriptorSet, and might be removed
    // when CmdBuf has access to the descriptor pool.
    gpu_ref: Option<(Arc<GpuHeapRefOwned>, DescriptorHeap)>,
}

#[derive(Clone)]
pub struct Image {
    resource: Resource,
    // Present except for swapchain images.
    cpu_ref: Option<Arc<CpuHeapRefOwned>>,
    size: (u32, u32),
}

pub struct CmdBuf {
    c: wrappers::GraphicsCommandList,
    allocator: CommandAllocator,
    needs_reset: bool,
    end_query: Option<(wrappers::QueryHeap, u32)>,
}

pub struct Pipeline {
    pipeline_state: wrappers::PipelineState,
    root_signature: wrappers::RootSignature,
}

pub struct DescriptorSet {
    gpu_ref: GpuHeapRefOwned,
    // Note: the heap is only needed here so CmdBuf::dispatch can
    // use it easily. If CmdBuf had a reference to the Device (or just
    // the descriptor pool), we could get rid of this.
    heap: DescriptorHeap,
}

pub struct QueryPool {
    heap: wrappers::QueryHeap,
    // Maybe this should just be a Resource, not a full Buffer.
    buf: Buffer,
    n_queries: u32,
}

pub struct Fence {
    fence: wrappers::Fence,
    event: wrappers::Event,
    // This could as well be an atomic, if we needed to cross threads.
    val: Cell<u64>,
}

/// This will probably be renamed "PresentSem" or similar. I believe no
/// semaphore is needed for presentation on DX12.
pub struct Semaphore;

#[derive(Default)]
pub struct DescriptorSetBuilder {
    handles: SmallVec<[d3d12::D3D12_CPU_DESCRIPTOR_HANDLE; 16]>,
}

#[derive(PartialEq, Eq)]
enum MemoryArchitecture {
    /// Integrated graphics
    CacheCoherentUMA,
    /// Unified memory with no cache coherence (does this happen?)
    UMA,
    /// Discrete graphics
    NUMA,
}

impl Dx12Instance {
    /// Create a new instance.
    pub fn new() -> Result<Dx12Instance, Error> {
        unsafe {
            #[cfg(debug_assertions)]
            if let Err(e) = wrappers::enable_debug_layer() {
                // Maybe a better logging solution?
                println!("{}", e);
            }

            #[cfg(debug_assertions)]
            let factory_flags = dxgi1_3::DXGI_CREATE_FACTORY_DEBUG;

            #[cfg(not(debug_assertions))]
            let factory_flags: u32 = 0;

            let factory = Factory4::create(factory_flags)?;

            Ok(Dx12Instance { factory })
        }
    }

    /// Create a surface for the specified window handle.
    pub fn surface(
        &self,
        _display_handle: RawDisplayHandle,
        window_handle: RawWindowHandle,
    ) -> Result<Dx12Surface, Error> {
        if let RawWindowHandle::Win32(w) = window_handle {
            let hwnd = w.hwnd as *mut _;
            Ok(Dx12Surface { hwnd })
        } else {
            Err("can't create surface for window handle".into())
        }
    }

    /// Get a device suitable for compute workloads.
    pub fn device(&self) -> Result<Dx12Device, Error> {
        unsafe {
            let device = Device::create_device(&self.factory)?;
            let list_type = d3d12::D3D12_COMMAND_LIST_TYPE_DIRECT;
            let command_queue = device.create_command_queue(
                list_type,
                0,
                d3d12::D3D12_COMMAND_QUEUE_FLAG_NONE,
                0,
            )?;

            let ts_freq = command_queue.get_timestamp_frequency()?;
            let features_architecture = device.get_features_architecture()?;
            let uma = features_architecture.UMA == TRUE;
            let cc_uma = features_architecture.CacheCoherentUMA == TRUE;
            let memory_arch = match (uma, cc_uma) {
                (true, true) => MemoryArchitecture::CacheCoherentUMA,
                (true, false) => MemoryArchitecture::UMA,
                _ => MemoryArchitecture::NUMA,
            };
            let use_staging_buffers = memory_arch == MemoryArchitecture::NUMA;
            // These values are appropriate for Shader Model 5. When we open up
            // DXIL, fix this with proper dynamic queries.
            let gpu_info = GpuInfo {
                has_descriptor_indexing: false,
                has_subgroups: false,
                subgroup_size: None,
                workgroup_limits: WorkgroupLimits {
                    max_size: [1024, 1024, 64],
                    max_invocations: 1024,
                },
                has_memory_model: false,
                use_staging_buffers,
            };
            let descriptor_pool = Default::default();
            Ok(Dx12Device {
                device,
                command_queue,
                ts_freq,
                memory_arch,
                gpu_info,
                descriptor_pool,
            })
        }
    }

    pub unsafe fn swapchain(
        &self,
        width: usize,
        height: usize,
        device: &Dx12Device,
        surface: &Dx12Surface,
    ) -> Result<Dx12Swapchain, Error> {
        const FRAME_COUNT: u32 = 2;
        let desc = dxgi1_2::DXGI_SWAP_CHAIN_DESC1 {
            Width: width as u32,
            Height: height as u32,
            AlphaMode: dxgi1_2::DXGI_ALPHA_MODE_IGNORE,
            BufferCount: FRAME_COUNT,
            Format: winapi::shared::dxgiformat::DXGI_FORMAT_R8G8B8A8_UNORM,
            Flags: 0,
            BufferUsage: dxgitype::DXGI_USAGE_RENDER_TARGET_OUTPUT,
            SampleDesc: dxgitype::DXGI_SAMPLE_DESC {
                Count: 1,
                Quality: 0,
            },
            Scaling: dxgi1_2::DXGI_SCALING_STRETCH,
            Stereo: winapi::shared::minwindef::FALSE,
            SwapEffect: dxgi::DXGI_SWAP_EFFECT_FLIP_DISCARD,
        };
        let swapchain =
            self.factory
                .create_swapchain_for_hwnd(&device.command_queue, surface.hwnd, desc)?;
        let size = (width as u32, height as u32);
        Ok(Dx12Swapchain { swapchain, size })
    }
}

impl crate::backend::Device for Dx12Device {
    type Buffer = Buffer;

    type Image = Image;

    type Pipeline = Pipeline;

    type DescriptorSet = DescriptorSet;

    type QueryPool = QueryPool;

    type CmdBuf = CmdBuf;

    type Fence = Fence;

    type Semaphore = Semaphore;

    type DescriptorSetBuilder = DescriptorSetBuilder;

    type Sampler = ();

    // Currently due to type inflexibility this is hardcoded to either HLSL or
    // DXIL, but it would be nice to be able to handle both at runtime.
    type ShaderSource = [u8];

    fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Self::Buffer, Error> {
        // TODO: consider supporting BufferUsage::QUERY_RESOLVE here rather than
        // having a separate function.
        unsafe {
            let page_property = self.memory_arch.page_property(usage);
            let memory_pool = self.memory_arch.memory_pool(usage);
            // TODO: consider flag D3D12_HEAP_FLAG_ALLOW_SHADER_ATOMICS?
            let flags = d3d12::D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
            let resource = self.device.create_buffer(
                size,
                d3d12::D3D12_HEAP_TYPE_CUSTOM,
                page_property,
                memory_pool,
                d3d12::D3D12_RESOURCE_STATE_COMMON,
                flags,
            )?;
            let mut descriptor_pool = self.descriptor_pool.lock().unwrap();
            let cpu_ref = Arc::new(descriptor_pool.alloc_cpu(&self.device)?);
            let cpu_handle = descriptor_pool.cpu_handle(&cpu_ref);
            self.device
                .create_byte_addressed_buffer_unordered_access_view(
                    &resource,
                    cpu_handle,
                    0,
                    (size / 4).try_into()?,
                );
            let gpu_ref = if usage.contains(BufferUsage::CLEAR) {
                let gpu_ref = Arc::new(descriptor_pool.alloc_gpu(&self.device, 1)?);
                let gpu_handle = descriptor_pool.cpu_handle_of_gpu(&gpu_ref, 0);
                self.device.copy_descriptors(
                    &[gpu_handle],
                    &[1],
                    &[cpu_handle],
                    &[1],
                    d3d12::D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV,
                );
                let heap = descriptor_pool.gpu_heap(&gpu_ref).to_owned();
                Some((gpu_ref, heap))
            } else {
                None
            };
            Ok(Buffer {
                resource,
                size,
                cpu_ref: Some(cpu_ref),
                gpu_ref,
            })
        }
    }

    unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error> {
        buffer.resource.destroy();
        Ok(())
    }

    unsafe fn create_image2d(
        &self,
        width: u32,
        height: u32,
        format: ImageFormat,
    ) -> Result<Self::Image, Error> {
        let format = match format {
            ImageFormat::A8 => winapi::shared::dxgiformat::DXGI_FORMAT_R8_UNORM,
            ImageFormat::Rgba8 | ImageFormat::Surface => winapi::shared::dxgiformat::DXGI_FORMAT_R8G8B8A8_UNORM,
        };
        let resource = self
            .device
            .create_texture2d_buffer(width.into(), height, format, true)?;

        let mut descriptor_pool = self.descriptor_pool.lock().unwrap();
        let cpu_ref = Arc::new(descriptor_pool.alloc_cpu(&self.device)?);
        let cpu_handle = descriptor_pool.cpu_handle(&cpu_ref);
        self.device
            .create_unordered_access_view(&resource, cpu_handle);
        let size = (width, height);
        Ok(Image {
            resource,
            cpu_ref: Some(cpu_ref),
            size,
        })
    }

    unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error> {
        image.resource.destroy();
        Ok(())
    }

    fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
        let list_type = d3d12::D3D12_COMMAND_LIST_TYPE_DIRECT;
        let allocator = unsafe { self.device.create_command_allocator(list_type)? };
        let node_mask = 0;
        unsafe {
            let c = self
                .device
                .create_graphics_command_list(list_type, &allocator, None, node_mask)?;
            Ok(CmdBuf {
                c,
                allocator,
                needs_reset: false,
                end_query: None,
            })
        }
    }

    unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
        Ok(())
    }

    fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
        unsafe {
            let heap = self
                .device
                .create_query_heap(d3d12::D3D12_QUERY_HEAP_TYPE_TIMESTAMP, n_queries)?;
            let buf = self.create_readback_buffer((n_queries * 8) as u64)?;
            Ok(QueryPool {
                heap,
                buf,
                n_queries,
            })
        }
    }

    unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
        let mut buf = vec![0u64; pool.n_queries as usize];
        let size = mem::size_of_val(buf.as_slice());
        let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
        std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
        self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
        let tsp = (self.ts_freq as f64).recip();
        let result = buf.iter().map(|ts| *ts as f64 * tsp).collect();
        Ok(result)
    }

    unsafe fn run_cmd_bufs(
        &self,
        cmd_bufs: &[&Self::CmdBuf],
        _wait_semaphores: &[&Self::Semaphore],
        _signal_semaphores: &[&Self::Semaphore],
        fence: Option<&mut Self::Fence>,
    ) -> Result<(), Error> {
        // TODO: handle semaphores
        let lists = cmd_bufs
            .iter()
            .map(|c| c.c.as_raw_command_list())
            .collect::<SmallVec<[_; 4]>>();
        self.command_queue.execute_command_lists(&lists);
        if let Some(fence) = fence {
            let val = fence.val.get() + 1;
            fence.val.set(val);
            self.command_queue.signal(&fence.fence, val)?;
            fence.fence.set_event_on_completion(&fence.event, val)?;
        }
        Ok(())
    }

    unsafe fn map_buffer(
        &self,
        buffer: &Self::Buffer,
        offset: u64,
        size: u64,
        mode: MapMode,
    ) -> Result<*mut u8, Error> {
        let mapped = buffer.resource.map_buffer(offset, size, mode)?;
        Ok(mapped)
    }

    unsafe fn unmap_buffer(
        &self,
        buffer: &Self::Buffer,
        offset: u64,
        size: u64,
        mode: MapMode,
    ) -> Result<(), Error> {
        buffer.resource.unmap_buffer(offset, size, mode)?;
        Ok(())
    }

    unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error> {
        Ok(Semaphore)
    }

    unsafe fn create_fence(&self, signaled: bool) -> Result<Self::Fence, Error> {
        let fence = self.device.create_fence(0)?;
        let event = wrappers::Event::create(false, signaled)?;
        let val = Cell::new(0);
        Ok(Fence { fence, event, val })
    }

    unsafe fn destroy_fence(&self, _fence: Self::Fence) -> Result<(), Error> {
        Ok(())
    }

    unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error> {
        for fence in fences {
            // TODO: probably handle errors here.
            let _status = fence.event.wait(winapi::um::winbase::INFINITE);
        }
        Ok(())
    }

    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error> {
        let fence_val = fence.fence.get_value();
        Ok(fence_val == fence.val.get())
    }

    fn query_gpu_info(&self) -> crate::GpuInfo {
        self.gpu_info.clone()
    }

    unsafe fn create_compute_pipeline(
        &self,
        code: &Self::ShaderSource,
        bind_types: &[BindType],
    ) -> Result<Pipeline, Error> {
        if u32::try_from(bind_types.len()).is_err() {
            panic!("bind type length overflow");
        }
        let mut ranges = Vec::new();
        let mut i = 0;
        fn map_range_type(bind_type: BindType) -> d3d12::D3D12_DESCRIPTOR_RANGE_TYPE {
            match bind_type {
                BindType::Buffer | BindType::Image | BindType::ImageRead => {
                    d3d12::D3D12_DESCRIPTOR_RANGE_TYPE_UAV
                }
                BindType::BufReadOnly => d3d12::D3D12_DESCRIPTOR_RANGE_TYPE_SRV,
            }
        }
        while i < bind_types.len() {
            let range_type = map_range_type(bind_types[i]);
            let mut end = i + 1;
            while end < bind_types.len() && map_range_type(bind_types[end]) == range_type {
                end += 1;
            }
            let n_descriptors = (end - i) as u32;
            ranges.push(d3d12::D3D12_DESCRIPTOR_RANGE {
                RangeType: range_type,
                NumDescriptors: n_descriptors,
                BaseShaderRegister: i as u32,
                RegisterSpace: 0,
                OffsetInDescriptorsFromTableStart: d3d12::D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND,
            });
            i = end;
        }

        // We could always have ShaderSource as [u8] even when it's HLSL, and use the
        // magic number to distinguish. In any case, for now it's hardcoded as one or
        // the other.
        /*
        // HLSL code path
        #[cfg(debug_assertions)]
        let flags = winapi::um::d3dcompiler::D3DCOMPILE_DEBUG
            | winapi::um::d3dcompiler::D3DCOMPILE_SKIP_OPTIMIZATION;
        #[cfg(not(debug_assertions))]
        let flags = 0;
        let shader_blob = ShaderByteCode::compile(code, "cs_5_1", "main", flags)?;
        let shader = ShaderByteCode::from_blob(shader_blob);
        */

        // DXIL code path
        let shader = ShaderByteCode::from_slice(code);

        let mut root_parameter = d3d12::D3D12_ROOT_PARAMETER {
            ParameterType: d3d12::D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE,
            ShaderVisibility: d3d12::D3D12_SHADER_VISIBILITY_ALL,
            ..mem::zeroed()
        };
        *root_parameter.u.DescriptorTable_mut() = d3d12::D3D12_ROOT_DESCRIPTOR_TABLE {
            NumDescriptorRanges: ranges.len() as u32,
            pDescriptorRanges: ranges.as_ptr(),
        };
        let root_signature_desc = d3d12::D3D12_ROOT_SIGNATURE_DESC {
            NumParameters: 1,
            pParameters: &root_parameter,
            NumStaticSamplers: 0,
            pStaticSamplers: ptr::null(),
            Flags: d3d12::D3D12_ROOT_SIGNATURE_FLAG_NONE,
        };
        let root_signature_blob = wrappers::RootSignature::serialize_description(
            &root_signature_desc,
            d3d12::D3D_ROOT_SIGNATURE_VERSION_1,
        )?;
        let root_signature = self.device.create_root_signature(0, root_signature_blob)?;
        let desc = d3d12::D3D12_COMPUTE_PIPELINE_STATE_DESC {
            pRootSignature: root_signature.0.as_raw(),
            CS: shader.bytecode,
            NodeMask: 0,
            CachedPSO: d3d12::D3D12_CACHED_PIPELINE_STATE {
                pCachedBlob: ptr::null(),
                CachedBlobSizeInBytes: 0,
            },
            Flags: d3d12::D3D12_PIPELINE_STATE_FLAG_NONE,
        };
        let pipeline_state = self.device.create_compute_pipeline_state(&desc)?;

        Ok(Pipeline {
            pipeline_state,
            root_signature,
        })
    }

    unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder {
        DescriptorSetBuilder::default()
    }

    unsafe fn update_buffer_descriptor(
        &self,
        ds: &mut Self::DescriptorSet,
        index: u32,
        buf: &Self::Buffer,
    ) {
        let src_cpu_ref = buf.cpu_ref.as_ref().unwrap().handle();
        ds.gpu_ref
            .copy_one_descriptor(&self.device, src_cpu_ref, index);
    }

    unsafe fn update_image_descriptor(
        &self,
        ds: &mut Self::DescriptorSet,
        index: u32,
        image: &Self::Image,
    ) {
        let src_cpu_ref = image.cpu_ref.as_ref().unwrap().handle();
        ds.gpu_ref
            .copy_one_descriptor(&self.device, src_cpu_ref, index);
    }

    unsafe fn create_sampler(&self, _params: crate::SamplerParams) -> Result<Self::Sampler, Error> {
        todo!()
    }
}

impl Dx12Device {
    fn create_readback_buffer(&self, size: u64) -> Result<Buffer, Error> {
        unsafe {
            let resource = self.device.create_buffer(
                size,
                d3d12::D3D12_HEAP_TYPE_READBACK,
                d3d12::D3D12_CPU_PAGE_PROPERTY_UNKNOWN,
                d3d12::D3D12_MEMORY_POOL_UNKNOWN,
                d3d12::D3D12_RESOURCE_STATE_COPY_DEST,
                d3d12::D3D12_RESOURCE_FLAG_NONE,
            )?;
            let cpu_ref = None;
            let gpu_ref = None;
            Ok(Buffer {
                resource,
                size,
                cpu_ref,
                gpu_ref,
            })
        }
    }
}

impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
    unsafe fn begin(&mut self) {
        if self.needs_reset {}
    }

    unsafe fn finish(&mut self) {
        let _ = self.c.close();
        self.needs_reset = true;
    }

    unsafe fn flush(&mut self) {}

    unsafe fn reset(&mut self) -> bool {
        self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
    }

    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
        if let Some((pool, start, end)) = &desc.timer_queries {
            #[allow(irrefutable_let_patterns)]
            if let crate::hub::QueryPool::Dx12(pool) = pool {
                self.write_timestamp(pool, *start);
                self.end_query = Some((pool.heap.clone(), *end));
            }
        }
    }

    unsafe fn dispatch(
        &mut self,
        pipeline: &Pipeline,
        descriptor_set: &DescriptorSet,
        workgroup_count: (u32, u32, u32),
        _workgroup_size: (u32, u32, u32),
    ) {
        self.c.set_pipeline_state(&pipeline.pipeline_state);
        self.c
            .set_compute_pipeline_root_signature(&pipeline.root_signature);
        // TODO: persist heap ix and only set if changed.
        self.c.set_descriptor_heaps(&[&descriptor_set.heap]);
        self.c
            .set_compute_root_descriptor_table(0, descriptor_set.gpu_ref.gpu_handle());
        self.c
            .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
    }

    unsafe fn end_compute_pass(&mut self) {
        if let Some((heap, end)) = self.end_query.take() {
            self.c.end_timing_query(&heap, end);
        }
    }

    unsafe fn memory_barrier(&mut self) {
        // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
        // The "proper" way to do this would be to name the actual buffers participating
        // in the barrier. But it seems like this is a reasonable way to create a
        // global barrier.
        let bar = wrappers::create_uav_resource_barrier(ptr::null_mut());
        self.c.resource_barrier(&[bar]);
    }

    unsafe fn host_barrier(&mut self) {
        // My understanding is that a host barrier is not needed, but I am still hunting
        // down an authoritative source for that. Among other things, the docs for
        // Map suggest that it does the needed visibility operation.
        //
        // https://docs.microsoft.com/en-us/windows/win32/api/d3d12/nf-d3d12-id3d12resource-map
    }

    unsafe fn image_barrier(
        &mut self,
        image: &Image,
        src_layout: crate::ImageLayout,
        dst_layout: crate::ImageLayout,
    ) {
        let src_state = resource_state_for_image_layout(src_layout);
        let dst_state = resource_state_for_image_layout(dst_layout);
        if src_state != dst_state {
            let bar = wrappers::create_transition_resource_barrier(
                image.resource.get_mut(),
                src_state,
                dst_state,
            );
            self.c.resource_barrier(&[bar]);
        }
        // Always do a memory barrier in case of UAV image access. We probably
        // want to make these barriers more precise.
        self.memory_barrier();
    }

    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
        let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
        let (gpu_ref, heap) = buffer
            .gpu_ref
            .as_ref()
            .expect("Need to set CLEAR usage on buffer");
        // Same TODO as dispatch: track and only set if changed.
        self.c.set_descriptor_heaps(&[heap]);
        // Discussion question: would a compute shader be faster? Should measure.
        self.c.clear_uav(
            gpu_ref.gpu_handle(),
            cpu_ref.handle(),
            &buffer.resource,
            0,
            size,
        );
    }

    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
        // TODO: consider using copy_resource here (if sizes match)
        let size = src.size.min(dst.size);
        self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
    }

    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
        self.c
            .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
    }

    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
        self.c
            .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
    }

    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
        self.c.copy_resource(&src.resource, &dst.resource);
    }

    unsafe fn reset_query_pool(&mut self, _pool: &QueryPool) {}

    unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
        self.c.end_timing_query(&pool.heap, query);
    }

    unsafe fn finish_timestamps(&mut self, pool: &QueryPool) {
        self.c
            .resolve_timing_query_data(&pool.heap, 0, pool.n_queries, &pool.buf.resource, 0);
    }
}

impl crate::backend::DescriptorSetBuilder<Dx12Device> for DescriptorSetBuilder {
    fn add_buffers(&mut self, buffers: &[&Buffer]) {
        for buf in buffers {
            self.handles.push(buf.cpu_ref.as_ref().unwrap().handle());
        }
    }

    fn add_images(&mut self, images: &[&Image]) {
        for img in images {
            self.handles.push(img.cpu_ref.as_ref().unwrap().handle());
        }
    }

    fn add_textures(&mut self, images: &[&Image]) {
        for img in images {
            self.handles.push(img.cpu_ref.as_ref().unwrap().handle());
        }
    }

    unsafe fn build(
        self,
        device: &Dx12Device,
        _pipeline: &Pipeline,
    ) -> Result<DescriptorSet, Error> {
        let mut descriptor_pool = device.descriptor_pool.lock().unwrap();
        let n_descriptors = self.handles.len().try_into()?;
        let gpu_ref = descriptor_pool.alloc_gpu(&device.device, n_descriptors)?;
        gpu_ref.copy_descriptors(&device.device, &self.handles);
        let heap = descriptor_pool.gpu_heap(&gpu_ref).to_owned();
        Ok(DescriptorSet { gpu_ref, heap })
    }
}

impl MemoryArchitecture {
    // See https://msdn.microsoft.com/de-de/library/windows/desktop/dn788678(v=vs.85).aspx

    fn page_property(&self, usage: BufferUsage) -> d3d12::D3D12_CPU_PAGE_PROPERTY {
        if usage.contains(BufferUsage::MAP_READ) {
            d3d12::D3D12_CPU_PAGE_PROPERTY_WRITE_BACK
        } else if usage.contains(BufferUsage::MAP_WRITE) {
            if *self == MemoryArchitecture::CacheCoherentUMA {
                d3d12::D3D12_CPU_PAGE_PROPERTY_WRITE_BACK
            } else {
                d3d12::D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE
            }
        } else {
            d3d12::D3D12_CPU_PAGE_PROPERTY_NOT_AVAILABLE
        }
    }

    fn memory_pool(&self, usage: BufferUsage) -> d3d12::D3D12_MEMORY_POOL {
        if *self == MemoryArchitecture::NUMA
            && !usage.intersects(BufferUsage::MAP_READ | BufferUsage::MAP_WRITE)
        {
            d3d12::D3D12_MEMORY_POOL_L1
        } else {
            d3d12::D3D12_MEMORY_POOL_L0
        }
    }
}

fn resource_state_for_image_layout(layout: ImageLayout) -> d3d12::D3D12_RESOURCE_STATES {
    match layout {
        ImageLayout::Undefined => d3d12::D3D12_RESOURCE_STATE_COMMON,
        ImageLayout::Present => d3d12::D3D12_RESOURCE_STATE_PRESENT,
        ImageLayout::BlitSrc => d3d12::D3D12_RESOURCE_STATE_COPY_SOURCE,
        ImageLayout::BlitDst => d3d12::D3D12_RESOURCE_STATE_COPY_DEST,
        ImageLayout::General => d3d12::D3D12_RESOURCE_STATE_COMMON,
        ImageLayout::ShaderRead => d3d12::D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE,
    }
}

impl Dx12Swapchain {
    pub unsafe fn next(&mut self) -> Result<(usize, Semaphore), Error> {
        let idx = self.swapchain.get_current_back_buffer_index();
        Ok((idx as usize, Semaphore))
    }

    pub unsafe fn image(&self, idx: usize) -> Image {
        let buffer = self.swapchain.get_buffer(idx as u32);
        Image {
            resource: buffer,
            cpu_ref: None,
            size: self.size,
        }
    }

    pub unsafe fn present(
        &self,
        _image_idx: usize,
        _semaphores: &[&Semaphore],
    ) -> Result<bool, Error> {
        self.swapchain.present(1, 0)?;
        Ok(false)
    }
}
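The range-grouping loop in create_compute_pipeline collapses consecutive bindings of the same D3D12 range type into one descriptor range. A standalone sketch of that grouping, with the range type reduced to a two-variant enum:

#[derive(Clone, Copy, PartialEq, Debug)]
enum RangeType { Uav, Srv }

// Same grouping as the backend's while-loop: adjacent equal types merge into
// one (type, base register, count) range.
fn group(types: &[RangeType]) -> Vec<(RangeType, u32, u32)> {
    let mut ranges = Vec::new();
    let mut i = 0;
    while i < types.len() {
        let t = types[i];
        let mut end = i + 1;
        while end < types.len() && types[end] == t {
            end += 1;
        }
        ranges.push((t, i as u32, (end - i) as u32));
        i = end;
    }
    ranges
}

fn main() {
    use RangeType::*;
    // Two UAVs, one SRV, one UAV -> three ranges.
    assert_eq!(
        group(&[Uav, Uav, Srv, Uav]),
        vec![(Uav, 0, 2), (Srv, 2, 1), (Uav, 3, 1)]
    );
}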
@ -1,309 +0,0 @@
// Copyright © 2021 piet-gpu developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Descriptor management.

use std::{
    convert::TryInto,
    ops::Deref,
    sync::{Arc, Mutex, Weak},
};

use smallvec::SmallVec;
use winapi::um::d3d12::{
    D3D12_CPU_DESCRIPTOR_HANDLE, D3D12_DESCRIPTOR_HEAP_DESC, D3D12_DESCRIPTOR_HEAP_FLAG_NONE,
    D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV,
    D3D12_GPU_DESCRIPTOR_HANDLE,
};

use crate::{bestfit::BestFit, Error};

use super::wrappers::{DescriptorHeap, Device};

const CPU_CHUNK_SIZE: u32 = 256;
const GPU_CHUNK_SIZE: u32 = 4096;

#[derive(Default)]
pub struct DescriptorPool {
    cpu_visible: Vec<CpuHeap>,
    gpu_visible: Vec<GpuHeap>,
    free_list: Arc<Mutex<DescriptorFreeList>>,
}

#[derive(Default)]
pub struct DescriptorFreeList {
    cpu_free: Vec<Vec<u32>>,
    gpu_free: Vec<BestFit>,
}

struct CpuHeap {
    // Retained for lifetime reasons.
    #[allow(unused)]
    dx12_heap: DescriptorHeap,
    cpu_handle: D3D12_CPU_DESCRIPTOR_HANDLE,
    increment_size: u32,
}

pub struct CpuHeapRef {
    heap_ix: usize,
    offset: u32,
}

/// An owned reference to the CPU heap.
///
/// When dropped, the corresponding heap range will be freed.
pub struct CpuHeapRefOwned {
    heap_ref: CpuHeapRef,
    handle: D3D12_CPU_DESCRIPTOR_HANDLE,
    free_list: Weak<Mutex<DescriptorFreeList>>,
}

/// A shader-visible descriptor heap.
struct GpuHeap {
    dx12_heap: DescriptorHeap,
    cpu_handle: D3D12_CPU_DESCRIPTOR_HANDLE,
    gpu_handle: D3D12_GPU_DESCRIPTOR_HANDLE,
    increment_size: u32,
}

pub struct GpuHeapRef {
    heap_ix: usize,
    offset: u32,
    n: u32,
}

/// An owned reference to the GPU heap.
///
/// When dropped, the corresponding heap range will be freed.
pub struct GpuHeapRefOwned {
    heap_ref: GpuHeapRef,
    cpu_handle: D3D12_CPU_DESCRIPTOR_HANDLE,
    gpu_handle: D3D12_GPU_DESCRIPTOR_HANDLE,
    increment_size: u32,
    free_list: Weak<Mutex<DescriptorFreeList>>,
}

impl DescriptorPool {
    pub fn alloc_cpu(&mut self, device: &Device) -> Result<CpuHeapRefOwned, Error> {
        let free_list = &self.free_list;
        let mk_owned = |heap_ref, handle| CpuHeapRefOwned {
            heap_ref,
            handle,
            free_list: Arc::downgrade(free_list),
        };
        let mut free_list = free_list.lock().unwrap();
        for (heap_ix, free) in free_list.cpu_free.iter_mut().enumerate() {
            if let Some(offset) = free.pop() {
                let handle = self.cpu_visible[heap_ix].cpu_handle(offset);
                return Ok(mk_owned(CpuHeapRef { heap_ix, offset }, handle));
            }
        }
        unsafe {
            let heap_type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
            let desc = D3D12_DESCRIPTOR_HEAP_DESC {
                Type: heap_type,
                NumDescriptors: CPU_CHUNK_SIZE,
                Flags: D3D12_DESCRIPTOR_HEAP_FLAG_NONE,
                NodeMask: 0,
            };
            let dx12_heap = device.create_descriptor_heap(&desc)?;
            let mut free = (0..CPU_CHUNK_SIZE).rev().collect::<Vec<_>>();
            let offset = free.pop().unwrap();
            debug_assert_eq!(offset, 0);
            let heap_ref = CpuHeapRef {
                heap_ix: self.cpu_visible.len(),
                offset,
            };
            let cpu_handle = dx12_heap.get_cpu_descriptor_handle_for_heap_start();
            let increment_size = device.get_descriptor_increment_size(heap_type);
            let heap = CpuHeap {
                dx12_heap,
                cpu_handle,
                increment_size,
            };
            self.cpu_visible.push(heap);
            free_list.cpu_free.push(free);
            Ok(mk_owned(heap_ref, cpu_handle))
        }
    }

    pub fn cpu_handle(&self, cpu_ref: &CpuHeapRef) -> D3D12_CPU_DESCRIPTOR_HANDLE {
        self.cpu_visible[cpu_ref.heap_ix].cpu_handle(cpu_ref.offset)
    }

    pub fn alloc_gpu(&mut self, device: &Device, n: u32) -> Result<GpuHeapRefOwned, Error> {
        let free_list = &self.free_list;
        let heap_type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
        let increment_size = unsafe { device.get_descriptor_increment_size(heap_type) };
        let mk_owned = |heap_ref, cpu_handle, gpu_handle| GpuHeapRefOwned {
            heap_ref,
            cpu_handle,
            gpu_handle,
            increment_size,
            free_list: Arc::downgrade(free_list),
        };
        let mut free_list = free_list.lock().unwrap();
        for (heap_ix, free) in free_list.gpu_free.iter_mut().enumerate() {
            if let Some(offset) = free.alloc(n) {
                let heap = &self.gpu_visible[heap_ix];
                let cpu_handle = heap.cpu_handle(offset);
                let gpu_handle = heap.gpu_handle(offset);
                return Ok(mk_owned(
                    GpuHeapRef { heap_ix, offset, n },
                    cpu_handle,
                    gpu_handle,
                ));
            }
        }
        unsafe {
            let size = n.max(GPU_CHUNK_SIZE).next_power_of_two();
            let desc = D3D12_DESCRIPTOR_HEAP_DESC {
                Type: heap_type,
                NumDescriptors: size,
                Flags: D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE,
                NodeMask: 0,
            };
            let dx12_heap = device.create_descriptor_heap(&desc)?;
            let heap_ix = self.gpu_visible.len();
            let mut free = BestFit::new(size);
            let offset = free.alloc(n).unwrap();
            // We assume the first allocation is at 0, to avoid recomputing offsets.
            debug_assert_eq!(offset, 0);
            let cpu_handle = dx12_heap.get_cpu_descriptor_handle_for_heap_start();
            let gpu_handle = dx12_heap.get_gpu_descriptor_handle_for_heap_start();
            let increment_size = device.get_descriptor_increment_size(heap_type);
            let heap = GpuHeap {
                dx12_heap,
                cpu_handle,
                gpu_handle,
                increment_size,
            };
            self.gpu_visible.push(heap);
            free_list.gpu_free.push(free);
            Ok(mk_owned(
                GpuHeapRef { heap_ix, offset, n },
                cpu_handle,
                gpu_handle,
            ))
        }
    }

    pub fn cpu_handle_of_gpu(
        &self,
        gpu_ref: &GpuHeapRef,
        offset: u32,
    ) -> D3D12_CPU_DESCRIPTOR_HANDLE {
        debug_assert!(offset < gpu_ref.n);
        let dx12_heap = &self.gpu_visible[gpu_ref.heap_ix];
        dx12_heap.cpu_handle(gpu_ref.offset + offset)
    }

    pub fn gpu_heap(&self, gpu_ref: &GpuHeapRef) -> &DescriptorHeap {
        &self.gpu_visible[gpu_ref.heap_ix].dx12_heap
    }
}

impl DescriptorFreeList {
    fn free_cpu(&mut self, cpu_ref: &CpuHeapRef) {
        self.cpu_free[cpu_ref.heap_ix].push(cpu_ref.offset);
    }

    fn free_gpu(&mut self, gpu_ref: &GpuHeapRef) {
        self.gpu_free[gpu_ref.heap_ix].free(gpu_ref.offset, gpu_ref.n);
    }
}

impl Drop for CpuHeapRefOwned {
    fn drop(&mut self) {
        if let Some(a) = self.free_list.upgrade() {
            a.lock().unwrap().free_cpu(&self.heap_ref)
        }
    }
}

impl CpuHeapRefOwned {
    pub fn handle(&self) -> D3D12_CPU_DESCRIPTOR_HANDLE {
        self.handle
    }
}

impl GpuHeapRefOwned {
    pub fn gpu_handle(&self) -> D3D12_GPU_DESCRIPTOR_HANDLE {
        self.gpu_handle
    }

    pub unsafe fn copy_descriptors(&self, device: &Device, src: &[D3D12_CPU_DESCRIPTOR_HANDLE]) {
        // TODO: optimize a bit (use simple variant where appropriate)
        let n = src.len().try_into().unwrap();
        let sizes = (0..n).map(|_| 1).collect::<SmallVec<[u32; 16]>>();
        device.copy_descriptors(
            &[self.cpu_handle],
            &[n],
            src,
            &sizes,
            D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV,
        );
    }

    pub unsafe fn copy_one_descriptor(
        &self,
        device: &Device,
        src: D3D12_CPU_DESCRIPTOR_HANDLE,
        index: u32,
    ) {
        let mut dst = self.cpu_handle;
        dst.ptr += (index * self.increment_size) as usize;
        device.copy_one_descriptor(dst, src, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
    }
}

impl Deref for CpuHeapRefOwned {
    type Target = CpuHeapRef;

    fn deref(&self) -> &Self::Target {
        &self.heap_ref
    }
}

impl Drop for GpuHeapRefOwned {
    fn drop(&mut self) {
        if let Some(a) = self.free_list.upgrade() {
            a.lock().unwrap().free_gpu(&self.heap_ref)
        }
    }
}

impl Deref for GpuHeapRefOwned {
    type Target = GpuHeapRef;

    fn deref(&self) -> &Self::Target {
        &self.heap_ref
    }
}

impl CpuHeap {
    fn cpu_handle(&self, offset: u32) -> D3D12_CPU_DESCRIPTOR_HANDLE {
        let mut handle = self.cpu_handle;
        handle.ptr += (offset as usize) * (self.increment_size as usize);
        handle
    }
}

impl GpuHeap {
    fn cpu_handle(&self, offset: u32) -> D3D12_CPU_DESCRIPTOR_HANDLE {
        let mut handle = self.cpu_handle;
        handle.ptr += (offset as usize) * (self.increment_size as usize);
        handle
    }

    fn gpu_handle(&self, offset: u32) -> D3D12_GPU_DESCRIPTOR_HANDLE {
        let mut handle = self.gpu_handle;
        handle.ptr += (offset as u64) * (self.increment_size as u64);
        handle
    }
}
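The handle arithmetic in CpuHeap::cpu_handle and GpuHeap::gpu_handle is plain base-plus-stride addressing. A tiny sketch (the 32-byte increment is illustrative; the real value comes from get_descriptor_increment_size):

// Descriptor address = heap start + offset * per-descriptor increment.
fn descriptor_ptr(heap_start: usize, increment_size: usize, offset: usize) -> usize {
    heap_start + offset * increment_size
}

fn main() {
    // Descriptor 5 with a 32-byte increment sits 160 bytes past the start.
    assert_eq!(descriptor_ptr(0x1000, 32, 5), 0x10a0);
}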
@@ -1,85 +0,0 @@
// Copyright © 2019 piet-gpu developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! This is a Windows-specific error mechanism (adapted from piet-dx12),
//! but we should adapt it to be more general.

use winapi::shared::winerror;

pub enum Error {
    Hresult(winerror::HRESULT),
    ExplainedHr(&'static str, winerror::HRESULT),
}

impl std::fmt::Debug for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Error::Hresult(hr) => write!(f, "hresult {:x}", hr),
            Error::ExplainedHr(exp, hr) => {
                write!(f, "{}: ", exp)?;
                write_hr(f, *hr)
            }
        }
    }
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        std::fmt::Debug::fmt(self, f)
    }
}

impl std::error::Error for Error {}

/// Strings for errors we're likely to see.
///
/// See https://docs.microsoft.com/en-us/windows/win32/direct3ddxgi/dxgi-error
fn err_str_for_hr(hr: winerror::HRESULT) -> Option<&'static str> {
    Some(match hr as u32 {
        0x80004005 => "E_FAIL",
        0x80070057 => "E_INVALIDARG",
        0x887a0001 => "DXGI_ERROR_INVALID_CALL",
        0x887a0002 => "DXGI_ERROR_NOT_FOUND",
        0x887a0004 => "DXGI_ERROR_UNSUPPORTED",
        0x887a0005 => "DXGI_ERROR_DEVICE_REMOVED",
        0x887a0006 => "DXGI_ERROR_DEVICE_HUNG",
        _ => return None,
    })
}

fn write_hr(f: &mut std::fmt::Formatter, hr: winerror::HRESULT) -> std::fmt::Result {
    if let Some(err_str) = err_str_for_hr(hr) {
        write!(f, "{:x} ({})", hr, err_str)
    } else {
        write!(f, "{:x}", hr)
    }
}

pub type D3DResult<T> = (T, winerror::HRESULT);

pub fn error_if_failed_else_value<T>(result: D3DResult<T>) -> Result<T, Error> {
    let (result_value, hresult) = result;

    if winerror::SUCCEEDED(hresult) {
        Ok(result_value)
    } else {
        Err(Error::Hresult(hresult))
    }
}

pub fn error_if_failed_else_unit(hresult: winerror::HRESULT) -> Result<(), Error> {
    error_if_failed_else_value(((), hresult))
}

pub fn explain_error(hresult: winerror::HRESULT, explanation: &'static str) -> Result<(), Error> {
    if winerror::SUCCEEDED(hresult) {
        Ok(())
    } else {
        Err(Error::ExplainedHr(explanation, hresult))
    }
}
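As a usage sketch of the helpers above (not taken from the deleted call sites): explain_error pairs a raw HRESULT with a static string, and the Debug impl renders known codes by name. The sketch assumes Error and explain_error are in scope; the HRESULT constants come from winapi::shared::winerror.

// Minimal sketch of how the helpers compose.
use winapi::shared::winerror;

fn demo() -> Result<(), Error> {
    // S_OK passes through as Ok(()).
    explain_error(winerror::S_OK, "creating device")?;
    // E_FAIL becomes an ExplainedHr, which Debug-formats as
    // "creating device: 80004005 (E_FAIL)".
    explain_error(winerror::E_FAIL, "creating device")
}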
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,213 +0,0 @@
//! The cross-platform abstraction for a GPU device.
//!
//! This abstraction is inspired by gfx-hal, but is specialized to the needs of piet-gpu.
//! In time, it may go away and be replaced by either gfx-hal or wgpu.

use bitflags::bitflags;

mod backend;
mod bestfit;
mod bufwrite;
mod hub;

#[macro_use]
mod macros;

mod mux;

pub use crate::mux::{
    DescriptorSet, Device, Fence, Instance, Pipeline, QueryPool, Sampler, Semaphore, ShaderCode,
    Surface, Swapchain,
};
pub use bufwrite::BufWrite;
pub use hub::{
    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
    RetainResource, Session, SubmittedCmdBuf,
};

// TODO: because these are conditionally included, "cargo fmt" does not
// see them. Figure that out, possibly including running rustfmt manually.
mux_cfg! {
    #[cfg(vk)]
    mod vulkan;
}
mux_cfg! {
    #[cfg(dx12)]
    mod dx12;
}
#[cfg(target_os = "macos")]
mod metal;

/// The common error type for the crate.
///
/// This keeps things simple and can be expanded later.
pub type Error = Box<dyn std::error::Error>;

bitflags! {
    /// Options when creating an instance.
    #[derive(Default)]
    pub struct InstanceFlags: u32 {
        /// Prefer DX12 over Vulkan.
        const DX12 = 0x1;
        // TODO: discrete vs integrated selection
    }
}

/// The GPU backend that was selected.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum BackendType {
    Vulkan,
    Dx12,
    Metal,
}

/// An image layout state.
///
/// An image must be in a particular layout state to be used for
/// a purpose such as being bound to a shader.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ImageLayout {
    /// The initial state for a newly created image.
    Undefined,
    /// A swapchain image ready to be presented.
    Present,
    /// The source for a copy operation.
    BlitSrc,
    /// The destination for a copy operation.
    BlitDst,
    /// Read/write binding to a shader.
    General,
    /// Able to be sampled from by shaders.
    ShaderRead,
}

/// The type of sampling for image lookup.
///
/// This could take a lot more params, such as filtering, repeat, behavior
/// at edges, etc., but for now we'll keep it simple.
#[derive(Copy, Clone, Debug)]
pub enum SamplerParams {
    Nearest,
    Linear,
}

/// Image format.
#[derive(Copy, Clone, Debug)]
pub enum ImageFormat {
    // 8 bits per channel grayscale / alpha
    A8,
    // 8 bits per channel RGBA
    Rgba8,
    // Format that matches the target surface
    Surface,
}

bitflags! {
    /// The intended usage for a buffer, specified on creation.
    pub struct BufferUsage: u32 {
        /// The buffer can be mapped for reading CPU-side.
        const MAP_READ = 0x1;
        /// The buffer can be mapped for writing CPU-side.
        const MAP_WRITE = 0x2;
        /// The buffer can be copied from.
        const COPY_SRC = 0x4;
        /// The buffer can be copied to.
        const COPY_DST = 0x8;
        /// The buffer can be bound to a compute shader.
        const STORAGE = 0x80;
        /// The buffer can be used to store the results of queries.
        const QUERY_RESOLVE = 0x200;
        /// The buffer may be cleared.
        const CLEAR = 0x8000;
        // May add other types.
    }
}

/// The type of resource that will be bound to a slot in a shader.
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum BindType {
    /// A storage buffer with read/write access.
    Buffer,
    /// A storage buffer with read only access.
    BufReadOnly,
    /// A storage image.
    Image,
    /// A storage image with read only access.
    ///
    /// A note on this. None of the backends are currently making a
    /// distinction between Image and ImageRead as far as bindings go,
    /// but the `--hlsl-nonwritable-uav-texture-as-srv` option to
    /// spirv-cross (marked as unstable) would do so.
    ImageRead,
    // TODO: Uniform, Sampler, maybe others
}

/// Whether to map a buffer in read or write mode.
pub enum MapMode {
    /// Map for reading.
    Read,
    /// Map for writing.
    Write,
}

/// Information about the GPU.
#[derive(Clone, Debug)]
pub struct GpuInfo {
    /// The GPU supports descriptor indexing.
    pub has_descriptor_indexing: bool,
    /// The GPU supports subgroups.
    ///
    /// Right now, this just checks for basic subgroup capability (as
    /// required in Vulkan 1.1), and we should have finer grained
    /// queries for shuffles, etc.
    pub has_subgroups: bool,
    /// Limits on workgroup size for compute shaders.
    pub workgroup_limits: WorkgroupLimits,
    /// Info about subgroup size control, if available.
    pub subgroup_size: Option<SubgroupSize>,
    /// The GPU supports a real, grown-ass memory model.
    pub has_memory_model: bool,
    /// Whether staging buffers should be used.
    pub use_staging_buffers: bool,
}

/// The range of subgroup sizes supported by a back-end, when available.
///
/// The subgroup size is always a power of 2. The ability to specify
/// subgroup size for a compute shader is a newer feature, not always
/// available.
#[derive(Clone, Debug)]
pub struct SubgroupSize {
    pub min: u32,
    pub max: u32,
}

/// The range of workgroup sizes supported by a back-end.
#[derive(Clone, Debug)]
pub struct WorkgroupLimits {
    /// The maximum size each workgroup dimension can be.
    pub max_size: [u32; 3],
    /// The maximum overall invocations a workgroup can have. That is, the product of sizes in each
    /// dimension.
    pub max_invocations: u32,
}

/// Options for creating a compute pass.
#[derive(Default)]
pub struct ComputePassDescriptor<'a> {
    // Maybe label should go here? It does in wgpu and wgpu_hal.
    /// Timer query parameters.
    ///
    /// To record timer queries for a compute pass, set the query pool, start
    /// query index, and end query index here. The indices must be less than
    /// the size of the query pool.
    timer_queries: Option<(&'a QueryPool, u32, u32)>,
}

impl<'a> ComputePassDescriptor<'a> {
    pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
        ComputePassDescriptor {
            timer_queries: Some((pool, start_query, end_query)),
        }
    }
}
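As a small usage sketch (not from the deleted code), buffer usages combine as ordinary bitflags; a host-written staging buffer that is later copied from would be declared like this:

// Sketch of combining BufferUsage flags; the values mirror the constants above.
fn staging_usage() -> BufferUsage {
    BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC
}

fn check() {
    let usage = staging_usage();
    assert!(usage.contains(BufferUsage::MAP_WRITE));
    assert!(!usage.contains(BufferUsage::STORAGE));
}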
@@ -1,205 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! Macros, mostly to automate backend selection tedium.

#[doc(hidden)]
/// Configure an item to be included only for the given GPU.
#[macro_export]
macro_rules! mux_cfg {
    ( #[cfg(vk)] $($tokens:tt)* ) => {
        #[cfg(not(target_os="macos"))] $( $tokens )*
    };

    ( #[cfg(dx12)] $($tokens:tt)* ) => {
        #[cfg(target_os="windows")] $( $tokens )*
    };

    ( #[cfg(mtl)] $($tokens:tt)* ) => {
        #[cfg(target_os="macos")] $( $tokens )*
    };
}

#[doc(hidden)]
/// Define an enum with a variant per GPU.
#[macro_export]
macro_rules! mux_enum {
    ( $(#[$outer:meta])* $v:vis enum $name:ident {
        Vk($vk:ty),
        Dx12($dx12:ty),
        Mtl($mtl:ty),
    } ) => {
        $(#[$outer])* $v enum $name {
            #[cfg(not(target_os="macos"))]
            Vk($vk),
            #[cfg(target_os="windows")]
            Dx12($dx12),
            #[cfg(target_os="macos")]
            Mtl($mtl),
        }

        impl $name {
            $crate::mux_cfg! {
                #[cfg(vk)]
                #[allow(unused)]
                fn vk(&self) -> &$vk {
                    match self {
                        $name::Vk(x) => x,
                        _ => panic!("downcast error")
                    }
                }
            }
            $crate::mux_cfg! {
                #[cfg(vk)]
                #[allow(unused)]
                fn vk_mut(&mut self) -> &mut $vk {
                    match self {
                        $name::Vk(x) => x,
                        _ => panic!("downcast error")
                    }
                }
            }
            $crate::mux_cfg! {
                #[cfg(vk)]
                #[allow(unused)]
                fn vk_owned(self) -> $vk {
                    match self {
                        $name::Vk(x) => x,
                        _ => panic!("downcast error")
                    }
                }
            }

            $crate::mux_cfg! {
                #[cfg(dx12)]
                #[allow(unused)]
                fn dx12(&self) -> &$dx12 {
                    match self {
                        $name::Dx12(x) => x,
                        _ => panic!("downcast error")
                    }
                }
            }
            $crate::mux_cfg! {
                #[cfg(dx12)]
                #[allow(unused)]
                fn dx12_mut(&mut self) -> &mut $dx12 {
                    match self {
                        $name::Dx12(x) => x,
                        _ => panic!("downcast error")
                    }
                }
            }
            $crate::mux_cfg! {
                #[cfg(dx12)]
                #[allow(unused)]
                fn dx12_owned(self) -> $dx12 {
                    match self {
                        $name::Dx12(x) => x,
                        _ => panic!("downcast error")
                    }
                }
            }

            $crate::mux_cfg! {
                #[cfg(mtl)]
                #[allow(unused)]
                fn mtl(&self) -> &$mtl {
                    match self {
                        $name::Mtl(x) => x,
                    }
                }
            }
            $crate::mux_cfg! {
                #[cfg(mtl)]
                #[allow(unused)]
                fn mtl_mut(&mut self) -> &mut $mtl {
                    match self {
                        $name::Mtl(x) => x,
                    }
                }
            }
            $crate::mux_cfg! {
                #[cfg(mtl)]
                #[allow(unused)]
                fn mtl_owned(self) -> $mtl {
                    match self {
                        $name::Mtl(x) => x,
                    }
                }
            }
        }
    };
}

/// Define an enum with a variant per GPU for a Device associated type.
macro_rules! mux_device_enum {
    ( $(#[$outer:meta])* $assoc_type: ident) => {
        $crate::mux_enum! {
            $(#[$outer])*
            pub enum $assoc_type {
                Vk(<$crate::vulkan::VkDevice as $crate::backend::Device>::$assoc_type),
                Dx12(<$crate::dx12::Dx12Device as $crate::backend::Device>::$assoc_type),
                Mtl(<$crate::metal::MtlDevice as $crate::backend::Device>::$assoc_type),
            }
        }
    }
}

#[doc(hidden)]
/// A match statement where match arms are conditionally configured per GPU.
#[macro_export]
macro_rules! mux_match {
    ( $e:expr ;
        $vkname:ident::Vk($vkvar:ident) => $vkblock: block
        $dx12name:ident::Dx12($dx12var:ident) => $dx12block: block
        $mtlname:ident::Mtl($mtlvar:ident) => $mtlblock: block
    ) => {
        match $e {
            #[cfg(not(target_os="macos"))]
            $vkname::Vk($vkvar) => $vkblock
            #[cfg(target_os="windows")]
            $dx12name::Dx12($dx12var) => $dx12block
            #[cfg(target_os="macos")]
            $mtlname::Mtl($mtlvar) => $mtlblock
        }
    };

    ( $e:expr ;
        $vkname:ident::Vk($vkvar:ident) => $vkblock: expr,
        $dx12name:ident::Dx12($dx12var:ident) => $dx12block: expr,
        $mtlname:ident::Mtl($mtlvar:ident) => $mtlblock: expr,
    ) => {
        $crate::mux_match! { $e;
            $vkname::Vk($vkvar) => { $vkblock }
            $dx12name::Dx12($dx12var) => { $dx12block }
            $mtlname::Mtl($mtlvar) => { $mtlblock }
        }
    };
}

/// A convenience macro for selecting a shader from included files.
#[macro_export]
macro_rules! include_shader {
    ( $device:expr, $path_base:expr) => {
        $device.choose_shader(
            include_bytes!(concat!($path_base, ".spv")),
            include_str!(concat!($path_base, ".hlsl")),
            include_bytes!(concat!($path_base, ".dxil")),
            include_str!(concat!($path_base, ".msl")),
        )
    };
}
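To make the macros concrete: on Windows, mux_cfg! { #[cfg(dx12)] mod dx12; } expands to #[cfg(target_os="windows")] mod dx12;, and include_shader! expands to a choose_shader call over the four compiled artifacts. A hedged call-site sketch follows; the shader path is hypothetical.

// Hypothetical call site, inside a function with `device: &Device` in scope;
// "../shader/gen/clear" stands in for a path base with .spv/.hlsl/.dxil/.msl
// siblings. choose_shader picks the representation for the running backend.
let code = include_shader!(&device, "../shader/gen/clear");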
@@ -1,954 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

mod clear;
mod timer;
mod util;

use std::mem;
use std::sync::{Arc, Mutex};

use block::Block;
use cocoa_foundation::base::id;
use cocoa_foundation::foundation::{NSInteger, NSUInteger};
use foreign_types::ForeignType;
use objc::rc::autoreleasepool;
use objc::runtime::{Object, BOOL, YES};
use objc::{class, msg_send, sel, sel_impl};

use core_graphics_types::base::CGFloat;
use metal::{CommandBufferRef, MTLFeatureSet};

use raw_window_handle::{RawDisplayHandle, RawWindowHandle};

use crate::{
    BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
};

use util::*;

use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};

pub struct MtlInstance;

pub struct MtlDevice {
    device: metal::Device,
    cmd_queue: Arc<Mutex<metal::CommandQueue>>,
    gpu_info: GpuInfo,
    helpers: Arc<Helpers>,
    timer_set: Option<CounterSet>,
    counter_style: CounterStyle,
}

/// Type of counter sampling.
///
/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum CounterStyle {
    None,
    Stage,
    Command,
}

pub struct MtlSurface {
    layer: metal::MetalLayer,
}

pub struct MtlSwapchain {
    layer: metal::MetalLayer,
    cmd_queue: Arc<Mutex<metal::CommandQueue>>,
    drawable: Mutex<Option<metal::MetalDrawable>>,
    n_drawables: usize,
    drawable_ix: usize,
}

#[derive(Clone)]
pub struct Buffer {
    buffer: metal::Buffer,
    pub(crate) size: u64,
}

#[derive(Clone)]
pub struct Image {
    texture: metal::Texture,
    width: u32,
    height: u32,
}

// This is the way gfx-hal does it, but a more Vulkan-like strategy would be
// to have a semaphore that gets signaled from the command buffer's completion
// handler.
pub enum Fence {
    Idle,
    CmdBufPending(metal::CommandBuffer),
}

pub struct Semaphore;

pub struct CmdBuf {
    cmd_buf: metal::CommandBuffer,
    helpers: Arc<Helpers>,
    cur_encoder: Encoder,
    time_calibration: Arc<Mutex<TimeCalibration>>,
    counter_style: CounterStyle,
}

enum Encoder {
    None,
    Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
    Blit(metal::BlitCommandEncoder),
}

#[derive(Default)]
pub struct QueryPool {
    counter_sample_buf: Option<CounterSampleBuffer>,
    calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
}

pub struct Pipeline(metal::ComputePipelineState);

#[derive(Default)]
pub struct DescriptorSetBuilder(DescriptorSet);

#[derive(Default)]
pub struct DescriptorSet {
    buffers: Vec<Buffer>,
    images: Vec<Image>,
}

struct Helpers {
    clear_pipeline: metal::ComputePipelineState,
}

impl MtlInstance {
    pub fn new() -> Result<MtlInstance, Error> {
        Ok(MtlInstance)
    }

    pub unsafe fn surface(
        &self,
        _display_handle: RawDisplayHandle,
        window_handle: RawWindowHandle,
    ) -> Result<MtlSurface, Error> {
        if let RawWindowHandle::AppKit(handle) = window_handle {
            Ok(Self::make_surface(handle.ns_view as id, handle.ns_window as id).unwrap())
        } else {
            Err("can't create surface for window handle".into())
        }
    }

    unsafe fn make_surface(ns_view: id, ns_window: id) -> Option<MtlSurface> {
        let ca_ml_class = class!(CAMetalLayer);
        let is_ca_ml: BOOL = msg_send![ns_view, isKindOfClass: ca_ml_class];
        if is_ca_ml == YES {
            todo!("create surface from layer")
        }
        let layer: id = msg_send![ns_view, layer];
        let use_current = !layer.is_null() && {
            let result: BOOL = msg_send![layer, isKindOfClass: ca_ml_class];
            result == YES
        };
        let metal_layer = if use_current {
            mem::transmute::<_, &metal::MetalLayerRef>(layer).to_owned()
        } else {
            let metal_layer: metal::MetalLayer = msg_send![ca_ml_class, new];
            let () = msg_send![ns_view, setLayer: metal_layer.as_ref()];
            let () = msg_send![ns_view, setWantsLayer: YES];
            let bounds: CGRect = msg_send![ns_view, bounds];
            let () = msg_send![metal_layer, setFrame: bounds];

            if !ns_window.is_null() {
                let scale_factor: CGFloat = msg_send![ns_window, backingScaleFactor];
                let () = msg_send![metal_layer, setContentsScale: scale_factor];
            }
            // gfx-hal sets a delegate here
            metal_layer
        };
        let () = msg_send![metal_layer, setContentsGravity: kCAGravityTopLeft];
        Some(MtlSurface { layer: metal_layer })
    }

    // TODO might do some enumeration of devices

    pub fn device(&self) -> Result<MtlDevice, Error> {
        if let Some(device) = metal::Device::system_default() {
            let cmd_queue = device.new_command_queue();
            Ok(MtlDevice::new_from_raw_mtl(device, cmd_queue))
        } else {
            Err("can't create system default Metal device".into())
        }
    }

    pub unsafe fn swapchain(
        &self,
        _width: usize,
        _height: usize,
        device: &MtlDevice,
        surface: &MtlSurface,
    ) -> Result<MtlSwapchain, Error> {
        surface.layer.set_device(&device.device);
        let n_drawables = surface.layer.maximum_drawable_count() as usize;
        Ok(MtlSwapchain {
            layer: surface.layer.to_owned(),
            cmd_queue: device.cmd_queue.clone(),
            drawable: Default::default(),
            n_drawables,
            drawable_ix: 0,
        })
    }
}

impl MtlDevice {
    pub fn new_from_raw_mtl(device: metal::Device, cmd_queue: metal::CommandQueue) -> MtlDevice {
        let is_mac = device.supports_feature_set(MTLFeatureSet::macOS_GPUFamily1_v1);
        let is_ios = device.supports_feature_set(MTLFeatureSet::iOS_GPUFamily1_v1);
        let version = NSOperatingSystemVersion::get();

        let use_staging_buffers =
            if (is_mac && version.at_least(10, 15)) || (is_ios && version.at_least(13, 0)) {
                !device.has_unified_memory()
            } else {
                !device.is_low_power()
            };
        // TODO: these are conservative; we need to derive these from
        // supports_feature_set queries.
        let gpu_info = GpuInfo {
            has_descriptor_indexing: false,
            has_subgroups: false,
            subgroup_size: None,
            // The workgroup limits are taken from the minimum of a desktop installation;
            // we don't support iOS right now, but when testing on those devices these
            // values may need to change (or just be queried properly).
            workgroup_limits: WorkgroupLimits {
                max_size: [1024, 1024, 64],
                max_invocations: 1024,
            },
            has_memory_model: false,
            use_staging_buffers,
        };
        let helpers = Arc::new(Helpers {
            clear_pipeline: clear::make_clear_pipeline(&device),
        });
        // Timer stuff
        let timer_set = CounterSet::get_timer_counter_set(&device);
        let counter_style = if timer_set.is_some() {
            if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
                CounterStyle::Stage
            } else if device
                .supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
            {
                CounterStyle::Command
            } else {
                CounterStyle::None
            }
        } else {
            CounterStyle::None
        };
        MtlDevice {
            device,
            cmd_queue: Arc::new(Mutex::new(cmd_queue)),
            gpu_info,
            helpers,
            timer_set,
            counter_style,
        }
    }

    pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
        let cmd_buf = raw_cmd_buf;
        let helpers = self.helpers.clone();
        let cur_encoder = Encoder::None;
        let time_calibration = Default::default();
        CmdBuf {
            cmd_buf,
            helpers,
            cur_encoder,
            time_calibration,
            counter_style: self.counter_style,
        }
    }

    pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
        Image {
            texture,
            width,
            height,
        }
    }
}
impl crate::backend::Device for MtlDevice {
    type Buffer = Buffer;

    type Image = Image;

    type Pipeline = Pipeline;

    type DescriptorSet = DescriptorSet;

    type QueryPool = QueryPool;

    type CmdBuf = CmdBuf;

    type Fence = Fence;

    type Semaphore = Semaphore;

    type DescriptorSetBuilder = DescriptorSetBuilder;

    type Sampler = ();

    type ShaderSource = str;

    fn query_gpu_info(&self) -> crate::GpuInfo {
        self.gpu_info.clone()
    }

    fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Self::Buffer, Error> {
        let options = if usage.contains(BufferUsage::MAP_READ) {
            metal::MTLResourceOptions::StorageModeShared
                | metal::MTLResourceOptions::CPUCacheModeDefaultCache
        } else if usage.contains(BufferUsage::MAP_WRITE) {
            metal::MTLResourceOptions::StorageModeShared
                | metal::MTLResourceOptions::CPUCacheModeWriteCombined
        } else {
            metal::MTLResourceOptions::StorageModePrivate
        };
        let buffer = self.device.new_buffer(size, options);
        Ok(Buffer { buffer, size })
    }

    unsafe fn destroy_buffer(&self, _buffer: &Self::Buffer) -> Result<(), Error> {
        // This defers dropping until the buffer object is dropped. We probably need
        // to rethink buffer lifetime if descriptor sets can retain references.
        Ok(())
    }

    unsafe fn create_image2d(
        &self,
        width: u32,
        height: u32,
        format: ImageFormat,
    ) -> Result<Self::Image, Error> {
        let desc = metal::TextureDescriptor::new();
        desc.set_width(width as u64);
        desc.set_height(height as u64);
        // These are defaults so don't need to be explicitly set.
        //desc.set_depth(1);
        //desc.set_mipmap_level_count(1);
        let mtl_format = match format {
            ImageFormat::A8 => metal::MTLPixelFormat::R8Unorm,
            ImageFormat::Rgba8 => metal::MTLPixelFormat::RGBA8Unorm,
            ImageFormat::Surface => metal::MTLPixelFormat::BGRA8Unorm,
        };
        desc.set_pixel_format(mtl_format);
        desc.set_usage(metal::MTLTextureUsage::ShaderRead | metal::MTLTextureUsage::ShaderWrite);
        let texture = self.device.new_texture(&desc);
        Ok(Image {
            texture,
            width,
            height,
        })
    }

    unsafe fn destroy_image(&self, _image: &Self::Image) -> Result<(), Error> {
        // TODO figure out what we want to do here
        Ok(())
    }

    unsafe fn create_compute_pipeline(
        &self,
        code: &Self::ShaderSource,
        _bind_types: &[crate::BindType],
    ) -> Result<Self::Pipeline, Error> {
        let options = metal::CompileOptions::new();
        let library = self.device.new_library_with_source(code, &options)?;
        let function = library.get_function("main0", None)?;
        let pipeline = self
            .device
            .new_compute_pipeline_state_with_function(&function)?;
        Ok(Pipeline(pipeline))
    }

    unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder {
        DescriptorSetBuilder::default()
    }

    unsafe fn update_buffer_descriptor(
        &self,
        ds: &mut Self::DescriptorSet,
        index: u32,
        buf: &Self::Buffer,
    ) {
        ds.buffers[index as usize] = buf.clone();
    }

    unsafe fn update_image_descriptor(
        &self,
        ds: &mut Self::DescriptorSet,
        index: u32,
        image: &Self::Image,
    ) {
        ds.images[index as usize - ds.buffers.len()] = image.clone();
    }

    fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
        let cmd_queue = self.cmd_queue.lock().unwrap();
        // A discussion about autorelease pools.
        //
        // Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
        // you can have any two of correctness, ergonomics, and performance. Here we've
        // chosen the first two, using the pattern of a fine grained autorelease pool
        // to give the Obj-C object Rust-like lifetime semantics whenever objects are
        // created as autorelease (by convention, this is any object creation with an
        // Obj-C method name that doesn't begin with "new" or "alloc").
        //
        // To gain back some of the performance, we'd need a way to wrap an autorelease
        // pool over a chunk of work - that could be one frame of rendering, but for
        // tests that iterate a number of command buffer submissions, it would need to
        // be around that. On non-mac platforms, it would be a no-op.
        //
        // In any case, this way, the caller doesn't need to worry, and the performance
        // hit might not be so bad (perhaps we should measure).

        // consider new_command_buffer_with_unretained_references for performance
        let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
        let helpers = self.helpers.clone();
        let cur_encoder = Encoder::None;
        let time_calibration = Default::default();
        Ok(CmdBuf {
            cmd_buf,
            helpers,
            cur_encoder,
            time_calibration,
            counter_style: self.counter_style,
        })
    }

    unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
        Ok(())
    }

    fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
        if let Some(timer_set) = &self.timer_set {
            let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
                .ok_or("error creating timer query pool")?;
            return Ok(QueryPool {
                counter_sample_buf: Some(pool),
                calibration: Default::default(),
            });
        }
        Ok(QueryPool::default())
    }

    unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
        if let Some(raw) = &pool.counter_sample_buf {
            let resolved = raw.resolve();
            let calibration = pool.calibration.lock().unwrap();
            if let Some(calibration) = &*calibration {
                let calibration = calibration.lock().unwrap();
                let result = resolved
                    .iter()
                    .map(|time_ns| calibration.correlate(*time_ns))
                    .collect();
                return Ok(result);
            }
        }
        // Maybe should return None indicating it wasn't successful? But that might break.
        Ok(Vec::new())
    }

    unsafe fn run_cmd_bufs(
        &self,
        cmd_bufs: &[&Self::CmdBuf],
        _wait_semaphores: &[&Self::Semaphore],
        _signal_semaphores: &[&Self::Semaphore],
        fence: Option<&mut Self::Fence>,
    ) -> Result<(), Error> {
        unsafe fn add_scheduled_handler(
            cmd_buf: &metal::CommandBufferRef,
            block: &Block<(&CommandBufferRef,), ()>,
        ) {
            msg_send![cmd_buf, addScheduledHandler: block]
        }
        for cmd_buf in cmd_bufs {
            let time_calibration = cmd_buf.time_calibration.clone();
            let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
                let device: id = msg_send![buffer, device];
                let mut time_calibration = time_calibration.lock().unwrap();
                let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
                let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
                // TODO: only do this if supported.
                let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
            })
            .copy();
            add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
            let time_calibration = cmd_buf.time_calibration.clone();
            let completed_block =
                block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
                    let device: id = msg_send![buffer, device];
                    let mut time_calibration = time_calibration.lock().unwrap();
                    let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
                    let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
                    // TODO: only do this if supported.
                    let () =
                        msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
                })
                .copy();
            cmd_buf.cmd_buf.add_completed_handler(&completed_block);
            cmd_buf.cmd_buf.commit();
        }
        if let Some(last_cmd_buf) = cmd_bufs.last() {
            if let Some(fence) = fence {
                *fence = Fence::CmdBufPending(last_cmd_buf.cmd_buf.to_owned());
            }
        }
        Ok(())
    }

    unsafe fn map_buffer(
        &self,
        buffer: &Self::Buffer,
        offset: u64,
        _size: u64,
        _mode: MapMode,
    ) -> Result<*mut u8, Error> {
        let contents_ptr = buffer.buffer.contents();
        if contents_ptr.is_null() {
            return Err("probably trying to map private buffer".into());
        }
        Ok((contents_ptr as *mut u8).add(offset as usize))
    }

    unsafe fn unmap_buffer(
        &self,
        _buffer: &Self::Buffer,
        _offset: u64,
        _size: u64,
        _mode: MapMode,
    ) -> Result<(), Error> {
        Ok(())
    }

    unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error> {
        Ok(Semaphore)
    }

    unsafe fn create_fence(&self, _signaled: bool) -> Result<Self::Fence, Error> {
        // Doesn't handle signaled case. Maybe the fences should have more
        // limited functionality than, say, Vulkan.
        Ok(Fence::Idle)
    }

    unsafe fn destroy_fence(&self, _fence: Self::Fence) -> Result<(), Error> {
        Ok(())
    }

    unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error> {
        for fence in fences {
            match fence {
                Fence::Idle => (),
                Fence::CmdBufPending(cmd_buf) => {
                    cmd_buf.wait_until_completed();
                    // TODO: this would be a good place to check errors, currently
                    // dropped on the floor.
                    *fence = Fence::Idle;
                }
            }
        }
        Ok(())
    }

    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error> {
        match fence {
            Fence::Idle => Ok(true),
            Fence::CmdBufPending(cmd_buf) => {
                Ok(cmd_buf.status() == metal::MTLCommandBufferStatus::Completed)
            }
        }
    }

    unsafe fn create_sampler(&self, _params: crate::SamplerParams) -> Result<Self::Sampler, Error> {
        todo!()
    }
}
impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
    unsafe fn begin(&mut self) {}

    unsafe fn finish(&mut self) {
        self.flush_encoder();
    }

    unsafe fn flush(&mut self) {
        self.flush_encoder();
    }

    unsafe fn reset(&mut self) -> bool {
        false
    }

    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
        // TODO: we might want to get better about validation but the following
        // assert is likely to trigger, and also a case can be made that
        // validation should be done at the hub level, for consistency.
        //debug_assert!(matches!(self.cur_encoder, Encoder::None));
        self.flush_encoder();
        autoreleasepool(|| {
            let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
                (Some(queries), CounterStyle::Stage) => {
                    let descriptor: id =
                        msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
                    let attachments: id = msg_send![descriptor, sampleBufferAttachments];
                    let index: NSUInteger = 0;
                    let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
                    // Here we break the hub/mux separation a bit, for expedience
                    #[allow(irrefutable_let_patterns)]
                    if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
                        if let Some(sample_buf) = &query_pool.counter_sample_buf {
                            let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
                        }
                    }
                    let start_index = queries.1 as NSUInteger;
                    let end_index = queries.2 as NSUInteger;
                    let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
                    let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
                    (
                        msg_send![
                            self.cmd_buf,
                            computeCommandEncoderWithDescriptor: descriptor
                        ],
                        None,
                    )
                }
                (Some(queries), CounterStyle::Command) => {
                    let encoder = self.cmd_buf.new_compute_command_encoder();
                    #[allow(irrefutable_let_patterns)]
                    let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
                        if let Some(sample_buf) = &query_pool.counter_sample_buf {
                            let sample_index = queries.1 as NSUInteger;
                            let sample_buf = sample_buf.id();
                            let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
                            Some((sample_buf, queries.2))
                        } else {
                            None
                        }
                    } else {
                        None
                    };
                    (encoder, end_query)
                }
                _ => (self.cmd_buf.new_compute_command_encoder(), None),
            };
            self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
        });
    }

    unsafe fn dispatch(
        &mut self,
        pipeline: &Pipeline,
        descriptor_set: &DescriptorSet,
        workgroup_count: (u32, u32, u32),
        workgroup_size: (u32, u32, u32),
    ) {
        let encoder = self.compute_command_encoder();
        encoder.set_compute_pipeline_state(&pipeline.0);
        let mut buf_ix = 0;
        for buffer in &descriptor_set.buffers {
            encoder.set_buffer(buf_ix, Some(&buffer.buffer), 0);
            buf_ix += 1;
        }
        let mut img_ix = buf_ix;
        for image in &descriptor_set.images {
            encoder.set_texture(img_ix, Some(&image.texture));
            img_ix += 1;
        }
        let workgroup_count = metal::MTLSize {
            width: workgroup_count.0 as u64,
            height: workgroup_count.1 as u64,
            depth: workgroup_count.2 as u64,
        };
        let workgroup_size = metal::MTLSize {
            width: workgroup_size.0 as u64,
            height: workgroup_size.1 as u64,
            depth: workgroup_size.2 as u64,
        };
        encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
    }

    unsafe fn end_compute_pass(&mut self) {
        // TODO: might validate that we are in a compute encoder state
        self.flush_encoder();
    }

    unsafe fn memory_barrier(&mut self) {
        // We'll probably move to explicit barriers, but for now rely on
        // Metal's own tracking.
    }

    unsafe fn host_barrier(&mut self) {}

    unsafe fn image_barrier(
        &mut self,
        _image: &Image,
        _src_layout: crate::ImageLayout,
        _dst_layout: crate::ImageLayout,
    ) {
        // I think these are being tracked.
    }

    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
        let size = size.unwrap_or(buffer.size);
        let _ = self.compute_command_encoder();
        // Getting this directly is a workaround for a borrow checker issue.
        if let Encoder::Compute(e, _) = &self.cur_encoder {
            clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
        }
    }

    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
        let encoder = self.blit_command_encoder();
        let size = src.size.min(dst.size);
        encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
    }

    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
        let encoder = self.blit_command_encoder();
        assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
        let bytes_per_row = (src.width * 4) as NSUInteger;
        let src_size = metal::MTLSize {
            width: src.width as NSUInteger,
            height: src.height as NSUInteger,
            depth: 1,
        };
        let origin = metal::MTLOrigin { x: 0, y: 0, z: 0 };
        encoder.copy_from_texture_to_buffer(
            &src.texture,
            0,
            0,
            origin,
            src_size,
            &dst.buffer,
            0,
            bytes_per_row,
            bytes_per_row * src.height as NSUInteger,
            metal::MTLBlitOption::empty(),
        );
    }

    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
        let encoder = self.blit_command_encoder();
        assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
        let bytes_per_row = (dst.width * 4) as NSUInteger;
        let src_size = metal::MTLSize {
            width: dst.width as NSUInteger,
            height: dst.height as NSUInteger,
            depth: 1,
        };
        let origin = metal::MTLOrigin { x: 0, y: 0, z: 0 };
        encoder.copy_from_buffer_to_texture(
            &src.buffer,
            0,
            bytes_per_row,
            bytes_per_row * dst.height as NSUInteger,
            src_size,
            &dst.texture,
            0,
            0,
            origin,
            metal::MTLBlitOption::empty(),
        );
    }

    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
        let encoder = self.blit_command_encoder();
        let src_size = metal::MTLSize {
            width: src.width.min(dst.width) as NSUInteger,
            height: src.height.min(dst.height) as NSUInteger,
            depth: 1,
        };
        let origin = metal::MTLOrigin { x: 0, y: 0, z: 0 };
        encoder.copy_from_texture(
            &src.texture,
            0,
            0,
            origin,
            src_size,
            &dst.texture,
            0,
            0,
            origin,
        );
    }

    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
        let mut calibration = pool.calibration.lock().unwrap();
        *calibration = Some(self.time_calibration.clone());
    }

    unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
        if let Some(buf) = &pool.counter_sample_buf {
            if matches!(self.cur_encoder, Encoder::None) {
                self.cur_encoder =
                    Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
            }
            let sample_index = query as NSUInteger;
            if self.counter_style == CounterStyle::Command {
                match &self.cur_encoder {
                    Encoder::Compute(e, _) => {
                        let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
                    }
                    Encoder::None => unreachable!(),
                    _ => todo!(),
                }
            } else if self.counter_style == CounterStyle::Stage {
                match &self.cur_encoder {
                    Encoder::Compute(_e, _) => {
                        println!("write_timestamp is not supported for stage-style encoders");
                    }
                    _ => (),
                }
            }
        }
    }
}

impl CmdBuf {
    fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
        if !matches!(self.cur_encoder, Encoder::Compute(..)) {
            self.flush_encoder();
            self.cur_encoder =
                Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
        }
        if let Encoder::Compute(e, _) = &self.cur_encoder {
            e
        } else {
            unreachable!()
        }
    }

    fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
        if !matches!(self.cur_encoder, Encoder::Blit(_)) {
            self.flush_encoder();
            self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
        }
        if let Encoder::Blit(e) = &self.cur_encoder {
            e
        } else {
            unreachable!()
        }
    }

    fn flush_encoder(&mut self) {
        match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
            Encoder::Compute(e, Some((sample_buf, end_query))) => {
                let sample_index = end_query as NSUInteger;
                unsafe {
                    let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
                }
                e.end_encoding();
            }
            Encoder::Compute(e, None) => e.end_encoding(),
            Encoder::Blit(e) => e.end_encoding(),
            Encoder::None => (),
        }
    }
}

impl crate::backend::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
    fn add_buffers(&mut self, buffers: &[&Buffer]) {
        self.0.buffers.extend(buffers.iter().copied().cloned());
    }

    fn add_images(&mut self, images: &[&Image]) {
        self.0.images.extend(images.iter().copied().cloned());
    }

    fn add_textures(&mut self, images: &[&Image]) {
        self.add_images(images);
    }

    unsafe fn build(
        self,
        _device: &MtlDevice,
        _pipeline: &Pipeline,
    ) -> Result<DescriptorSet, Error> {
        Ok(self.0)
    }
}

impl MtlSwapchain {
    pub unsafe fn next(&mut self) -> Result<(usize, Semaphore), Error> {
        let drawable_ix = self.drawable_ix;
        self.drawable_ix = (drawable_ix + 1) % self.n_drawables;
        Ok((drawable_ix, Semaphore))
    }

    pub unsafe fn image(&self, _idx: usize) -> Image {
        let (drawable, texture) = autoreleasepool(|| {
            let drawable = self.layer.next_drawable().unwrap();
            (drawable.to_owned(), drawable.texture().to_owned())
        });
        *self.drawable.lock().unwrap() = Some(drawable);
        let size = self.layer.drawable_size();
        Image {
            texture,
            width: size.width.round() as u32,
            height: size.height.round() as u32,
        }
    }

    pub unsafe fn present(
        &self,
        _image_idx: usize,
        _semaphores: &[&Semaphore],
    ) -> Result<bool, Error> {
        let drawable = self.drawable.lock().unwrap().take();
        if let Some(drawable) = drawable {
            autoreleasepool(|| {
                let cmd_queue = self.cmd_queue.lock().unwrap();
                let cmd_buf = cmd_queue.new_command_buffer();
                cmd_buf.present_drawable(&drawable);
                cmd_buf.commit();
            });
        } else {
            println!("no drawable; present called without acquiring image?");
        }
        Ok(false)
    }
}

#[repr(C)]
struct NSOperatingSystemVersion {
    major: NSInteger,
    minor: NSInteger,
    patch: NSInteger,
}

impl NSOperatingSystemVersion {
    fn get() -> NSOperatingSystemVersion {
        unsafe {
            let process_info: *mut Object = msg_send![class!(NSProcessInfo), processInfo];
            msg_send![process_info, operatingSystemVersion]
        }
    }

    fn at_least(&self, major: u32, minor: u32) -> bool {
        let major = major as NSInteger;
        let minor = minor as NSInteger;
        self.major > major || (self.major == major && self.minor >= minor)
    }
}
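The autorelease-pool discussion in create_cmd_buf above suggests how to claw back performance: wrap a whole chunk of work (for example, one frame) in a single pool instead of one pool per object creation. A minimal sketch of that pattern, using the same objc::rc::autoreleasepool as the code above; render_frame is a placeholder:

use objc::rc::autoreleasepool;

// One pool per frame: Obj-C objects autoreleased while recording and
// submitting the frame are reclaimed when the closure returns.
fn run_frames(mut render_frame: impl FnMut(), n_frames: usize) {
    for _ in 0..n_frames {
        autoreleasepool(|| {
            render_frame();
        });
    }
}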
@@ -1,77 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! The compute shader and stage for clearing buffers.

use metal::{ComputePipelineState, Device};

const CLEAR_MSL: &str = r#"
using namespace metal;

struct ConfigBuf
{
    uint size;
    uint value;
};

kernel void main0(const device ConfigBuf& config [[buffer(0)]], device uint *data [[buffer(1)]], uint3 gid [[thread_position_in_grid]])
{
    uint ix = gid.x;
    if (ix < config.size)
    {
        data[ix] = config.value;
    }
}
"#;

pub fn make_clear_pipeline(device: &Device) -> ComputePipelineState {
    let options = metal::CompileOptions::new();
    let library = device.new_library_with_source(CLEAR_MSL, &options).unwrap();
    let function = library.get_function("main0", None).unwrap();
    device
        .new_compute_pipeline_state_with_function(&function)
        .unwrap()
}

pub fn encode_clear(
    encoder: &metal::ComputeCommandEncoderRef,
    clear_pipeline: &ComputePipelineState,
    buffer: &metal::Buffer,
    size: u64,
) {
    // TODO: should be more careful with overflow
    let size_in_u32s = (size / 4) as u32;
    encoder.set_compute_pipeline_state(clear_pipeline);
    let config = [size_in_u32s, 0];
    encoder.set_bytes(
        0,
        std::mem::size_of_val(&config) as u64,
        config.as_ptr() as *const _,
    );
    encoder.set_buffer(1, Some(buffer), 0);
    let n_wg = (size_in_u32s + 255) / 256;
    let workgroup_count = metal::MTLSize {
        width: n_wg as u64,
        height: 1,
        depth: 1,
    };
    let workgroup_size = metal::MTLSize {
        width: 256,
        height: 1,
        depth: 1,
    };
    encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
}
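The (size_in_u32s + 255) / 256 above is ceiling division: it sizes the grid so every u32 is covered, and the shader's bounds check discards the overhang in the last workgroup. A tiny worked check:

// Ceiling division used to size the dispatch above: ceil(n / 256).
fn n_workgroups(size_in_u32s: u32) -> u32 {
    (size_in_u32s + 255) / 256
}

fn main() {
    assert_eq!(n_workgroups(1), 1);   // one partial workgroup
    assert_eq!(n_workgroups(256), 1); // exactly one full workgroup
    assert_eq!(n_workgroups(257), 2); // one full plus one partial
}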
@@ -1,172 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! Support for timer queries.
//!
//! Likely some of this should be upstreamed into metal-rs.

use std::{ffi::CStr, ptr::null_mut};

use cocoa_foundation::{
    base::id,
    foundation::{NSRange, NSUInteger},
};
use metal::{DeviceRef, MTLStorageMode};
use objc::{class, msg_send, sel, sel_impl};

pub struct CounterSampleBuffer {
    id: id,
    count: u64,
}

pub struct CounterSet {
    id: id,
}

#[derive(Default)]
pub struct TimeCalibration {
    pub cpu_start_ts: u64,
    pub gpu_start_ts: u64,
    pub cpu_end_ts: u64,
    pub gpu_end_ts: u64,
}

impl Drop for CounterSampleBuffer {
    fn drop(&mut self) {
        unsafe { msg_send![self.id, release] }
    }
}

impl Clone for CounterSampleBuffer {
    fn clone(&self) -> CounterSampleBuffer {
        unsafe {
            CounterSampleBuffer {
                id: msg_send![self.id, retain],
                count: self.count,
            }
        }
    }
}

impl CounterSampleBuffer {
    pub fn id(&self) -> id {
        self.id
    }
}

impl Drop for CounterSet {
    fn drop(&mut self) {
        unsafe { msg_send![self.id, release] }
    }
}

impl CounterSet {
    pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
        unsafe {
            // TODO: version check
            let sets: id = msg_send![device, counterSets];
            let count: NSUInteger = msg_send![sets, count];
            for i in 0..count {
                let set: id = msg_send![sets, objectAtIndex: i];
                let name: id = msg_send![set, name];
                let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
                if name_cstr.to_bytes() == b"timestamp" {
                    return Some(CounterSet { id: set });
                }
            }
            None
        }
    }
}

// copied from metal-rs; should be in common utilities maybe?
fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
    let bytes = unsafe {
        let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
        bytes as *const u8
    };
    let len: NSUInteger = unsafe { msg_send![nsstr, length] };
    unsafe {
        let bytes = std::slice::from_raw_parts(bytes, len as usize);
        std::str::from_utf8(bytes).unwrap()
    }
}

impl CounterSampleBuffer {
    pub fn new(
        device: &DeviceRef,
        count: u64,
        counter_set: &CounterSet,
    ) -> Option<CounterSampleBuffer> {
        unsafe {
            let desc_cls = class!(MTLCounterSampleBufferDescriptor);
            let descriptor: id = msg_send![desc_cls, alloc];
            let _: id = msg_send![descriptor, init];
            let count = count as NSUInteger;
            let () = msg_send![descriptor, setSampleCount: count];
            let () = msg_send![descriptor, setCounterSet: counter_set.id];
            let () = msg_send![
                descriptor,
                setStorageMode: MTLStorageMode::Shared as NSUInteger
            ];
            let mut error: id = null_mut();
            let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
            let () = msg_send![descriptor, release];
            if !error.is_null() {
                let description = msg_send![error, localizedDescription];
                println!(
                    "error allocating sample buffer, code = {}",
                    nsstring_as_str(description)
                );
                let () = msg_send![error, release];
                return None;
            }
            Some(CounterSampleBuffer { id: buf, count })
        }
    }

    // Read the timestamps.
    //
    // Safety: the lifetime of the returned slice is wrong, it's actually autoreleased.
    pub unsafe fn resolve(&self) -> &[u64] {
        let range = NSRange::new(0, self.count);
        let data: id = msg_send![self.id, resolveCounterRange: range];
        if data.is_null() {
            &[]
        } else {
            let bytes: *const u64 = msg_send![data, bytes];
            std::slice::from_raw_parts(bytes, self.count as usize)
        }
    }
}

impl TimeCalibration {
    /// Convert GPU timestamp into CPU time base.
    ///
    /// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
    pub fn correlate(&self, raw_ts: u64) -> f64 {
        let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
        let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
        let adj_ts = if delta_gpu > 0 {
            let scale = delta_cpu as f64 / delta_gpu as f64;
            self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
        } else {
            // Default is ns on Apple Silicon; on other hardware this will be wrong
            raw_ts as f64
        };
        adj_ts * 1e-9
    }
}
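To make TimeCalibration::correlate concrete: the raw GPU timestamp is rescaled by delta_cpu / delta_gpu, rebased onto the CPU timeline, then converted from nanoseconds to seconds. A small check with made-up timestamps:

// Made-up calibration: a 1000 ns CPU span maps to 500 GPU ticks,
// so each GPU tick corresponds to 2 ns of CPU time.
fn correlate_check() {
    let cal = TimeCalibration {
        cpu_start_ts: 1_000,
        gpu_start_ts: 0,
        cpu_end_ts: 2_000,
        gpu_end_ts: 500,
    };
    // 250 ticks lands mid-span: 1_000 + 250 * 2 = 1_500 ns = 1.5e-6 s.
    let t = cal.correlate(250);
    assert!((t - 1.5e-6).abs() < 1e-12);
}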
@@ -1,39 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! Utilities and types for Metal integration

use core_graphics_types::{base::CGFloat, geometry::CGSize};

#[link(name = "QuartzCore", kind = "framework")]
extern "C" {
    #[allow(non_upper_case_globals)]
    pub static kCAGravityTopLeft: cocoa_foundation::base::id;
}

#[repr(C)]
#[derive(Clone, Copy, Debug, Default)]
pub struct CGPoint {
    pub x: CGFloat,
    pub y: CGFloat,
}

#[repr(C)]
#[derive(Clone, Copy, Debug, Default)]
pub struct CGRect {
    pub origin: CGPoint,
    pub size: CGSize,
}
@ -1,925 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! A multiplexer module that selects a back-end at runtime.

use raw_window_handle::RawDisplayHandle;
use raw_window_handle::RawWindowHandle;
use smallvec::SmallVec;

mux_cfg! {
    #[cfg(vk)]
    use crate::vulkan;
}
mux_cfg! {
    #[cfg(dx12)]
    use crate::dx12;
}
mux_cfg! {
    #[cfg(mtl)]
    use crate::metal;
}
use crate::backend::CmdBuf as CmdBufTrait;
use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::backend::Device as DeviceTrait;
use crate::BackendType;
use crate::BindType;
use crate::ComputePassDescriptor;
use crate::ImageFormat;
use crate::MapMode;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};

mux_enum! {
    /// An instance, selected from multiple backends.
    pub enum Instance {
        Vk(vulkan::VkInstance),
        Dx12(dx12::Dx12Instance),
        Mtl(metal::MtlInstance),
    }
}

mux_enum! {
    /// A device, selected from multiple backends.
    pub enum Device {
        Vk(vulkan::VkDevice),
        Dx12(dx12::Dx12Device),
        Mtl(metal::MtlDevice),
    }
}

mux_enum! {
    /// A surface, which can apply to one of multiple backends.
    pub enum Surface {
        Vk(vulkan::VkSurface),
        Dx12(dx12::Dx12Surface),
        Mtl(metal::MtlSurface),
    }
}

mux_enum! {
    /// A swapchain, which can apply to one of multiple backends.
    pub enum Swapchain {
        Vk(vulkan::VkSwapchain),
        Dx12(dx12::Dx12Swapchain),
        Mtl(metal::MtlSwapchain),
    }
}

mux_device_enum! { Buffer }
mux_device_enum! { Image }
mux_device_enum! {
    /// An object for waiting on command buffer completion.
    Fence }
mux_device_enum! {
    /// A semaphore for swapchain presentation.
    ///
    /// Depending on what kind of synchronization is needed for swapchain
    /// presentation by the back-end, this may or may not be a "real"
    /// semaphore.
    Semaphore }
mux_device_enum! {
    /// A pipeline object; basically a compiled shader.
    Pipeline }
mux_device_enum! { DescriptorSetBuilder }
mux_device_enum! {
    /// A descriptor set; a binding of resources for access by a shader.
    DescriptorSet }
mux_device_enum! { CmdBuf }
mux_device_enum! {
    /// An object for recording timer queries.
    QueryPool }
mux_device_enum! { Sampler }

/// The code for a shader, either as source or intermediate representation.
pub enum ShaderCode<'a> {
    /// SPIR-V (binary intermediate representation)
    Spv(&'a [u8]),
    /// HLSL (source)
    Hlsl(&'a str),
    /// DXIL (DX12 intermediate language)
    Dxil(&'a [u8]),
    /// Metal Shading Language (source)
    Msl(&'a str),
}

impl Instance {
    /// Create a new GPU instance.
    ///
    /// When multiple back-end GPU APIs are available (for example, Vulkan
    /// and DX12), this function selects one at runtime.
    ///
    /// When no surface is given, the instance is suitable for compute-only
    /// work.
    pub fn new(flags: InstanceFlags) -> Result<Instance, Error> {
        let mut backends = [BackendType::Vulkan, BackendType::Dx12];
        if flags.contains(InstanceFlags::DX12) {
            backends.swap(0, 1);
        }
        for backend in backends {
            if backend == BackendType::Vulkan {
                mux_cfg! {
                    #[cfg(vk)]
                    {
                        if let Ok(instance) = vulkan::VkInstance::new() {
                            return Ok(Instance::Vk(instance));
                        }
                    }
                }
            }
            if backend == BackendType::Dx12 {
                mux_cfg! {
                    #[cfg(dx12)]
                    {
                        if let Ok(instance) = dx12::Dx12Instance::new() {
                            return Ok(Instance::Dx12(instance));
                        }
                    }
                }
            }
        }
        mux_cfg! {
            #[cfg(mtl)]
            {
                if let Ok(instance) = metal::MtlInstance::new() {
                    return Ok(Instance::Mtl(instance));
                }
            }
        }
        // TODO: plumb creation errors through.
        Err("No suitable instances found".into())
    }

    /// Create a surface from the specified window handle.
    pub unsafe fn surface(
        &self,
        display_handle: RawDisplayHandle,
        window_handle: RawWindowHandle,
    ) -> Result<Surface, Error> {
        mux_match! { self;
            Instance::Vk(i) => i.surface(display_handle, window_handle).map(Surface::Vk),
            Instance::Dx12(i) => i.surface(display_handle, window_handle).map(Surface::Dx12),
            Instance::Mtl(i) => i.surface(display_handle, window_handle).map(Surface::Mtl),
        }
    }

    /// Create a device.
    ///
    /// The "device" is the low-level GPU abstraction for creating resources
    /// and submitting work. Most users of this library will want to wrap it in
    /// a "session", which is similar but provides many conveniences.
    pub unsafe fn device(&self) -> Result<Device, Error> {
        mux_match! { self;
            Instance::Vk(i) => i.device().map(Device::Vk),
            Instance::Dx12(i) => i.device().map(Device::Dx12),
            Instance::Mtl(i) => i.device().map(Device::Mtl),
        }
    }

    /// Create a swapchain.
    ///
    /// A swapchain is a small vector of images shared with the platform's
    /// presentation logic. To actually display pixels, the application writes
    /// into the swapchain images, then calls the present method to display
    /// them.
    pub unsafe fn swapchain(
        &self,
        width: usize,
        height: usize,
        device: &Device,
        surface: &Surface,
    ) -> Result<Swapchain, Error> {
        mux_match! { self;
            Instance::Vk(i) => i
                .swapchain(width, height, device.vk(), surface.vk())
                .map(Swapchain::Vk),
            Instance::Dx12(i) => i
                .swapchain(width, height, device.dx12(), surface.dx12())
                .map(Swapchain::Dx12),
            Instance::Mtl(i) => i
                .swapchain(width, height, device.mtl(), surface.mtl())
                .map(Swapchain::Mtl),
        }
    }
}
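
// Illustration only: a hypothetical caller of the flow above, assuming
// BackendType has exactly the three variants matched in backend_type() below;
// error handling is deferred to the caller via `?`.
fn select_backend_demo() -> Result<(), Error> {
    let instance = Instance::new(InstanceFlags::default())?;
    unsafe {
        let device = instance.device()?;
        // Report which back-end the runtime selection settled on.
        let name = match device.backend_type() {
            BackendType::Vulkan => "Vulkan",
            BackendType::Dx12 => "DX12",
            BackendType::Metal => "Metal",
        };
        println!("selected backend: {}", name);
    }
    Ok(())
}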

// This is basically re-exporting the backend device trait, and we could do that,
// but not doing so lets us diverge more easily (at the moment, the divergence is
// missing functionality).
impl Device {
    #[cfg(target_os = "macos")]
    pub fn new_from_raw_mtl(
        device: &::metal::DeviceRef,
        queue: &::metal::CommandQueueRef,
    ) -> Device {
        Device::Mtl(metal::MtlDevice::new_from_raw_mtl(
            device.to_owned(),
            queue.to_owned(),
        ))
    }

    #[cfg(target_os = "macos")]
    pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: &::metal::CommandBufferRef) -> CmdBuf {
        // Note: this will cause problems if we support multiple back-ends on mac,
        // but it will be a compile error.
        let Device::Mtl(d) = self;
        CmdBuf::Mtl(d.cmd_buf_from_raw_mtl(raw_cmd_buf.to_owned()))
    }

    #[cfg(target_os = "macos")]
    pub fn image_from_raw_mtl(
        &self,
        raw_texture: &::metal::TextureRef,
        width: u32,
        height: u32,
    ) -> Image {
        // Note: this will cause problems if we support multiple back-ends on mac,
        // but it will be a compile error.
        let Device::Mtl(d) = self;
        Image::Mtl(d.image_from_raw_mtl(raw_texture.to_owned(), width, height))
    }

    pub fn query_gpu_info(&self) -> GpuInfo {
        mux_match! { self;
            Device::Vk(d) => d.query_gpu_info(),
            Device::Dx12(d) => d.query_gpu_info(),
            Device::Mtl(d) => d.query_gpu_info(),
        }
    }

    pub fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Buffer, Error> {
        mux_match! { self;
            Device::Vk(d) => d.create_buffer(size, usage).map(Buffer::Vk),
            Device::Dx12(d) => d.create_buffer(size, usage).map(Buffer::Dx12),
            Device::Mtl(d) => d.create_buffer(size, usage).map(Buffer::Mtl),
        }
    }

    pub unsafe fn destroy_buffer(&self, buffer: &Buffer) -> Result<(), Error> {
        mux_match! { self;
            Device::Vk(d) => d.destroy_buffer(buffer.vk()),
            Device::Dx12(d) => d.destroy_buffer(buffer.dx12()),
            Device::Mtl(d) => d.destroy_buffer(buffer.mtl()),
        }
    }

    pub unsafe fn create_image2d(
        &self,
        width: u32,
        height: u32,
        format: ImageFormat,
    ) -> Result<Image, Error> {
        mux_match! { self;
            Device::Vk(d) => d.create_image2d(width, height, format).map(Image::Vk),
            Device::Dx12(d) => d.create_image2d(width, height, format).map(Image::Dx12),
            Device::Mtl(d) => d.create_image2d(width, height, format).map(Image::Mtl),
        }
    }

    pub unsafe fn destroy_image(&self, image: &Image) -> Result<(), Error> {
        mux_match! { self;
            Device::Vk(d) => d.destroy_image(image.vk()),
            Device::Dx12(d) => d.destroy_image(image.dx12()),
            Device::Mtl(d) => d.destroy_image(image.mtl()),
        }
    }

    pub unsafe fn create_fence(&self, signaled: bool) -> Result<Fence, Error> {
        mux_match! { self;
            Device::Vk(d) => d.create_fence(signaled).map(Fence::Vk),
            Device::Dx12(d) => d.create_fence(signaled).map(Fence::Dx12),
            Device::Mtl(d) => d.create_fence(signaled).map(Fence::Mtl),
        }
    }

    pub unsafe fn destroy_fence(&self, fence: Fence) -> Result<(), Error> {
        mux_match! { self;
            Device::Vk(d) => d.destroy_fence(fence.vk_owned()),
            Device::Dx12(d) => d.destroy_fence(fence.dx12_owned()),
            Device::Mtl(d) => d.destroy_fence(fence.mtl_owned()),
        }
    }

    // Consider changing Vec to iterator (as is done in gfx-hal)
    pub unsafe fn wait_and_reset(&self, fences: Vec<&mut Fence>) -> Result<(), Error> {
        mux_match! { self;
            Device::Vk(d) => {
                let fences = fences
                    .into_iter()
                    .map(|f| f.vk_mut())
                    .collect::<Vec<_>>();
                d.wait_and_reset(fences)
            }
            Device::Dx12(d) => {
                let fences = fences
                    .into_iter()
                    .map(|f| f.dx12_mut())
                    .collect::<Vec<_>>();
                d.wait_and_reset(fences)
            }
            Device::Mtl(d) => {
                let fences = fences
                    .into_iter()
                    .map(|f| f.mtl_mut())
                    .collect::<Vec<_>>();
                d.wait_and_reset(fences)
            }
        }
    }

    pub unsafe fn get_fence_status(&self, fence: &mut Fence) -> Result<bool, Error> {
        mux_match! { self;
            Device::Vk(d) => d.get_fence_status(fence.vk_mut()),
            Device::Dx12(d) => d.get_fence_status(fence.dx12_mut()),
            Device::Mtl(d) => d.get_fence_status(fence.mtl_mut()),
        }
    }

    pub unsafe fn create_semaphore(&self) -> Result<Semaphore, Error> {
        mux_match! { self;
            Device::Vk(d) => d.create_semaphore().map(Semaphore::Vk),
            Device::Dx12(d) => d.create_semaphore().map(Semaphore::Dx12),
            Device::Mtl(d) => d.create_semaphore().map(Semaphore::Mtl),
        }
    }

    pub unsafe fn create_compute_pipeline<'a>(
        &self,
        code: ShaderCode<'a>,
        bind_types: &[BindType],
    ) -> Result<Pipeline, Error> {
        mux_match! { self;
            Device::Vk(d) => {
                let shader_code = match code {
                    ShaderCode::Spv(spv) => spv,
                    // Panic or return "incompatible shader" error here?
                    _ => panic!("Vulkan backend requires shader code in SPIR-V format"),
                };
                d.create_compute_pipeline(shader_code, bind_types)
                    .map(Pipeline::Vk)
            }
            Device::Dx12(d) => {
                let shader_code = match code {
                    //ShaderCode::Hlsl(hlsl) => hlsl,
                    ShaderCode::Dxil(dxil) => dxil,
                    // Panic or return "incompatible shader" error here?
                    _ => panic!("DX12 backend requires shader code in DXIL format"),
                };
                d.create_compute_pipeline(shader_code, bind_types)
                    .map(Pipeline::Dx12)
            }
            Device::Mtl(d) => {
                let shader_code = match code {
                    ShaderCode::Msl(msl) => msl,
                    // Panic or return "incompatible shader" error here?
                    _ => panic!("Metal backend requires shader code in MSL format"),
                };
                d.create_compute_pipeline(shader_code, bind_types)
                    .map(Pipeline::Mtl)
            }
        }
    }

    pub unsafe fn descriptor_set_builder(&self) -> DescriptorSetBuilder {
        mux_match! { self;
            Device::Vk(d) => DescriptorSetBuilder::Vk(d.descriptor_set_builder()),
            Device::Dx12(d) => DescriptorSetBuilder::Dx12(d.descriptor_set_builder()),
            Device::Mtl(d) => DescriptorSetBuilder::Mtl(d.descriptor_set_builder()),
        }
    }

    pub unsafe fn update_buffer_descriptor(
        &self,
        ds: &mut DescriptorSet,
        index: u32,
        buffer: &Buffer,
    ) {
        mux_match! { self;
            Device::Vk(d) => d.update_buffer_descriptor(ds.vk_mut(), index, buffer.vk()),
            Device::Dx12(d) => d.update_buffer_descriptor(ds.dx12_mut(), index, buffer.dx12()),
            Device::Mtl(d) => d.update_buffer_descriptor(ds.mtl_mut(), index, buffer.mtl()),
        }
    }

    pub unsafe fn update_image_descriptor(
        &self,
        ds: &mut DescriptorSet,
        index: u32,
        image: &Image,
    ) {
        mux_match! { self;
            Device::Vk(d) => d.update_image_descriptor(ds.vk_mut(), index, image.vk()),
            Device::Dx12(d) => d.update_image_descriptor(ds.dx12_mut(), index, image.dx12()),
            Device::Mtl(d) => d.update_image_descriptor(ds.mtl_mut(), index, image.mtl()),
        }
    }

    pub fn create_cmd_buf(&self) -> Result<CmdBuf, Error> {
        mux_match! { self;
            Device::Vk(d) => d.create_cmd_buf().map(CmdBuf::Vk),
            Device::Dx12(d) => d.create_cmd_buf().map(CmdBuf::Dx12),
            Device::Mtl(d) => d.create_cmd_buf().map(CmdBuf::Mtl),
        }
    }

    pub unsafe fn destroy_cmd_buf(&self, cmd_buf: CmdBuf) -> Result<(), Error> {
        mux_match! { self;
            Device::Vk(d) => d.destroy_cmd_buf(cmd_buf.vk_owned()),
            Device::Dx12(d) => d.destroy_cmd_buf(cmd_buf.dx12_owned()),
            Device::Mtl(d) => d.destroy_cmd_buf(cmd_buf.mtl_owned()),
        }
    }

    pub fn create_query_pool(&self, n_queries: u32) -> Result<QueryPool, Error> {
        mux_match! { self;
            Device::Vk(d) => d.create_query_pool(n_queries).map(QueryPool::Vk),
            Device::Dx12(d) => d.create_query_pool(n_queries).map(QueryPool::Dx12),
            Device::Mtl(d) => d.create_query_pool(n_queries).map(QueryPool::Mtl),
        }
    }

    pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
        mux_match! { self;
            Device::Vk(d) => d.fetch_query_pool(pool.vk()),
            Device::Dx12(d) => d.fetch_query_pool(pool.dx12()),
            Device::Mtl(d) => d.fetch_query_pool(pool.mtl()),
        }
    }

    pub unsafe fn run_cmd_bufs(
        &self,
        cmd_bufs: &[&CmdBuf],
        wait_semaphores: &[&Semaphore],
        signal_semaphores: &[&Semaphore],
        fence: Option<&mut Fence>,
    ) -> Result<(), Error> {
        mux_match! { self;
            Device::Vk(d) => d.run_cmd_bufs(
                &cmd_bufs
                    .iter()
                    .map(|c| c.vk())
                    .collect::<SmallVec<[_; 4]>>(),
                &wait_semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::vk)
                    .collect::<SmallVec<[_; 4]>>(),
                &signal_semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::vk)
                    .collect::<SmallVec<[_; 4]>>(),
                fence.map(Fence::vk_mut),
            ),
            Device::Dx12(d) => d.run_cmd_bufs(
                &cmd_bufs
                    .iter()
                    .map(|c| c.dx12())
                    .collect::<SmallVec<[_; 4]>>(),
                &wait_semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::dx12)
                    .collect::<SmallVec<[_; 4]>>(),
                &signal_semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::dx12)
                    .collect::<SmallVec<[_; 4]>>(),
                fence.map(Fence::dx12_mut),
            ),
            Device::Mtl(d) => d.run_cmd_bufs(
                &cmd_bufs
                    .iter()
                    .map(|c| c.mtl())
                    .collect::<SmallVec<[_; 4]>>(),
                &wait_semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::mtl)
                    .collect::<SmallVec<[_; 4]>>(),
                &signal_semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::mtl)
                    .collect::<SmallVec<[_; 4]>>(),
                fence.map(Fence::mtl_mut),
            ),
        }
    }

    pub unsafe fn map_buffer(
        &self,
        buffer: &Buffer,
        offset: u64,
        size: u64,
        mode: MapMode,
    ) -> Result<*mut u8, Error> {
        mux_match! { self;
            Device::Vk(d) => d.map_buffer(buffer.vk(), offset, size, mode),
            Device::Dx12(d) => d.map_buffer(buffer.dx12(), offset, size, mode),
            Device::Mtl(d) => d.map_buffer(buffer.mtl(), offset, size, mode),
        }
    }

    pub unsafe fn unmap_buffer(
        &self,
        buffer: &Buffer,
        offset: u64,
        size: u64,
        mode: MapMode,
    ) -> Result<(), Error> {
        mux_match! { self;
            Device::Vk(d) => d.unmap_buffer(buffer.vk(), offset, size, mode),
            Device::Dx12(d) => d.unmap_buffer(buffer.dx12(), offset, size, mode),
            Device::Mtl(d) => d.unmap_buffer(buffer.mtl(), offset, size, mode),
        }
    }

    /// Choose shader code from the available choices.
    pub fn choose_shader<'a>(
        &self,
        _spv: &'a [u8],
        _hlsl: &'a str,
        _dxil: &'a [u8],
        _msl: &'a str,
    ) -> ShaderCode<'a> {
        mux_match! { self;
            Device::Vk(_d) => ShaderCode::Spv(_spv),
            Device::Dx12(_d) => ShaderCode::Dxil(_dxil),
            Device::Mtl(_d) => ShaderCode::Msl(_msl),
        }
    }

    pub fn backend_type(&self) -> BackendType {
        mux_match! { self;
            Device::Vk(_d) => BackendType::Vulkan,
            Device::Dx12(_d) => BackendType::Dx12,
            Device::Mtl(_d) => BackendType::Metal,
        }
    }
}
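
// Illustration only: a hypothetical caller of choose_shader above. The gen/
// paths are placeholder build outputs, and BindType::Buffer stands in for
// whatever bindings the kernel actually declares.
unsafe fn pipeline_demo(device: &Device) -> Result<Pipeline, Error> {
    let code = device.choose_shader(
        include_bytes!("gen/kernel.spv"),  // SPIR-V, consumed by Vulkan
        include_str!("gen/kernel.hlsl"),   // HLSL source (the DX12 path consumes DXIL instead)
        include_bytes!("gen/kernel.dxil"), // DXIL, consumed by DX12
        include_str!("gen/kernel.msl"),    // MSL, consumed by Metal
    );
    device.create_compute_pipeline(code, &[BindType::Buffer])
}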

impl DescriptorSetBuilder {
    pub fn add_buffers(&mut self, buffers: &[&Buffer]) {
        mux_match! { self;
            DescriptorSetBuilder::Vk(x) => x.add_buffers(
                &buffers
                    .iter()
                    .copied()
                    .map(Buffer::vk)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
            DescriptorSetBuilder::Dx12(x) => x.add_buffers(
                &buffers
                    .iter()
                    .copied()
                    .map(Buffer::dx12)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
            DescriptorSetBuilder::Mtl(x) => x.add_buffers(
                &buffers
                    .iter()
                    .copied()
                    .map(Buffer::mtl)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
        }
    }

    pub fn add_images(&mut self, images: &[&Image]) {
        mux_match! { self;
            DescriptorSetBuilder::Vk(x) => x.add_images(
                &images
                    .iter()
                    .copied()
                    .map(Image::vk)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
            DescriptorSetBuilder::Dx12(x) => x.add_images(
                &images
                    .iter()
                    .copied()
                    .map(Image::dx12)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
            DescriptorSetBuilder::Mtl(x) => x.add_images(
                &images
                    .iter()
                    .copied()
                    .map(Image::mtl)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
        }
    }

    pub fn add_textures(&mut self, images: &[&Image]) {
        mux_match! { self;
            DescriptorSetBuilder::Vk(x) => x.add_textures(
                &images
                    .iter()
                    .copied()
                    .map(Image::vk)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
            DescriptorSetBuilder::Dx12(x) => x.add_textures(
                &images
                    .iter()
                    .copied()
                    .map(Image::dx12)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
            DescriptorSetBuilder::Mtl(x) => x.add_textures(
                &images
                    .iter()
                    .copied()
                    .map(Image::mtl)
                    .collect::<SmallVec<[_; 8]>>(),
            ),
        }
    }

    pub unsafe fn build(
        self,
        device: &Device,
        pipeline: &Pipeline,
    ) -> Result<DescriptorSet, Error> {
        mux_match! { self;
            DescriptorSetBuilder::Vk(x) =>
                x.build(device.vk(), pipeline.vk()).map(DescriptorSet::Vk),
            DescriptorSetBuilder::Dx12(x) => x
                .build(device.dx12(), pipeline.dx12())
                .map(DescriptorSet::Dx12),
            DescriptorSetBuilder::Mtl(x) => x
                .build(device.mtl(), pipeline.mtl())
                .map(DescriptorSet::Mtl),
        }
    }
}

impl CmdBuf {
    pub unsafe fn begin(&mut self) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.begin(),
            CmdBuf::Dx12(c) => c.begin(),
            CmdBuf::Mtl(c) => c.begin(),
        }
    }

    pub unsafe fn flush(&mut self) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.flush(),
            CmdBuf::Dx12(c) => c.flush(),
            CmdBuf::Mtl(c) => c.flush(),
        }
    }

    pub unsafe fn finish(&mut self) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.finish(),
            CmdBuf::Dx12(c) => c.finish(),
            CmdBuf::Mtl(c) => c.finish(),
        }
    }

    pub unsafe fn reset(&mut self) -> bool {
        mux_match! { self;
            CmdBuf::Vk(c) => c.reset(),
            CmdBuf::Dx12(c) => c.reset(),
            CmdBuf::Mtl(c) => c.reset(),
        }
    }

    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.begin_compute_pass(desc),
            CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
            CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
        }
    }

    /// Dispatch a compute shader.
    ///
    /// Note that both the number of workgroups (`workgroup_count`) and the number of
    /// threads in a workgroup (`workgroup_size`) are given. The latter is needed on
    /// Metal, while it's baked into the shader on Vulkan and DX12.
    ///
    /// Perhaps we'll have a mechanism to plumb the latter value to configure the size
    /// of a workgroup using specialization constants in the future.
    pub unsafe fn dispatch(
        &mut self,
        pipeline: &Pipeline,
        descriptor_set: &DescriptorSet,
        workgroup_count: (u32, u32, u32),
        workgroup_size: (u32, u32, u32),
    ) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.dispatch(pipeline.vk(), descriptor_set.vk(), workgroup_count, workgroup_size),
            CmdBuf::Dx12(c) => c.dispatch(pipeline.dx12(), descriptor_set.dx12(), workgroup_count, workgroup_size),
            CmdBuf::Mtl(c) => c.dispatch(pipeline.mtl(), descriptor_set.mtl(), workgroup_count, workgroup_size),
        }
    }

    pub unsafe fn end_compute_pass(&mut self) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.end_compute_pass(),
            CmdBuf::Dx12(c) => c.end_compute_pass(),
            CmdBuf::Mtl(c) => c.end_compute_pass(),
        }
    }

    pub unsafe fn memory_barrier(&mut self) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.memory_barrier(),
            CmdBuf::Dx12(c) => c.memory_barrier(),
            CmdBuf::Mtl(c) => c.memory_barrier(),
        }
    }

    pub unsafe fn host_barrier(&mut self) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.host_barrier(),
            CmdBuf::Dx12(c) => c.host_barrier(),
            CmdBuf::Mtl(c) => c.host_barrier(),
        }
    }

    pub unsafe fn image_barrier(
        &mut self,
        image: &Image,
        src_layout: ImageLayout,
        dst_layout: ImageLayout,
    ) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.image_barrier(image.vk(), src_layout, dst_layout),
            CmdBuf::Dx12(c) => c.image_barrier(image.dx12(), src_layout, dst_layout),
            CmdBuf::Mtl(c) => c.image_barrier(image.mtl(), src_layout, dst_layout),
        }
    }

    pub unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.clear_buffer(buffer.vk(), size),
            CmdBuf::Dx12(c) => c.clear_buffer(buffer.dx12(), size),
            CmdBuf::Mtl(c) => c.clear_buffer(buffer.mtl(), size),
        }
    }

    pub unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.copy_buffer(src.vk(), dst.vk()),
            CmdBuf::Dx12(c) => c.copy_buffer(src.dx12(), dst.dx12()),
            CmdBuf::Mtl(c) => c.copy_buffer(src.mtl(), dst.mtl()),
        }
    }

    pub unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.copy_image_to_buffer(src.vk(), dst.vk()),
            CmdBuf::Dx12(c) => c.copy_image_to_buffer(src.dx12(), dst.dx12()),
            CmdBuf::Mtl(c) => c.copy_image_to_buffer(src.mtl(), dst.mtl()),
        }
    }

    pub unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.copy_buffer_to_image(src.vk(), dst.vk()),
            CmdBuf::Dx12(c) => c.copy_buffer_to_image(src.dx12(), dst.dx12()),
            CmdBuf::Mtl(c) => c.copy_buffer_to_image(src.mtl(), dst.mtl()),
        }
    }

    pub unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.blit_image(src.vk(), dst.vk()),
            CmdBuf::Dx12(c) => c.blit_image(src.dx12(), dst.dx12()),
            CmdBuf::Mtl(c) => c.blit_image(src.mtl(), dst.mtl()),
        }
    }

    pub unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.reset_query_pool(pool.vk()),
            CmdBuf::Dx12(c) => c.reset_query_pool(pool.dx12()),
            CmdBuf::Mtl(c) => c.reset_query_pool(pool.mtl()),
        }
    }

    pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.write_timestamp(pool.vk(), query),
            CmdBuf::Dx12(c) => c.write_timestamp(pool.dx12(), query),
            CmdBuf::Mtl(c) => c.write_timestamp(pool.mtl(), query),
        }
    }

    pub unsafe fn finish_timestamps(&mut self, pool: &QueryPool) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.finish_timestamps(pool.vk()),
            CmdBuf::Dx12(c) => c.finish_timestamps(pool.dx12()),
            CmdBuf::Mtl(c) => c.finish_timestamps(pool.mtl()),
        }
    }

    pub unsafe fn begin_debug_label(&mut self, label: &str) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.begin_debug_label(label),
            CmdBuf::Dx12(c) => c.begin_debug_label(label),
            CmdBuf::Mtl(c) => c.begin_debug_label(label),
        }
    }

    pub unsafe fn end_debug_label(&mut self) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.end_debug_label(),
            CmdBuf::Dx12(c) => c.end_debug_label(),
            CmdBuf::Mtl(c) => c.end_debug_label(),
        }
    }
}
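
// Illustration only: typical workgroup arithmetic for CmdBuf::dispatch above,
// for a 1D kernel whose workgroup size (256 here, an assumed value) matches
// the size compiled into the shader. Pipeline and descriptor set are assumed
// to have been created already.
unsafe fn dispatch_demo(cmd_buf: &mut CmdBuf, pipeline: &Pipeline, ds: &DescriptorSet) {
    const WG_SIZE: u32 = 256;
    let n_elements: u32 = 1 << 20;
    // Round up so a final partial workgroup covers the tail.
    let wg_count = (n_elements + WG_SIZE - 1) / WG_SIZE;
    cmd_buf.dispatch(pipeline, ds, (wg_count, 1, 1), (WG_SIZE, 1, 1));
}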

impl Buffer {
    pub fn size(&self) -> u64 {
        mux_match! { self;
            Buffer::Vk(b) => b.size,
            Buffer::Dx12(b) => b.size,
            Buffer::Mtl(b) => b.size,
        }
    }
}

impl Swapchain {
    pub unsafe fn next(&mut self) -> Result<(usize, Semaphore), Error> {
        mux_match! { self;
            Swapchain::Vk(s) => {
                let (idx, sem) = s.next()?;
                Ok((idx, Semaphore::Vk(sem)))
            }
            Swapchain::Dx12(s) => {
                let (idx, sem) = s.next()?;
                Ok((idx, Semaphore::Dx12(sem)))
            }
            Swapchain::Mtl(s) => {
                let (idx, sem) = s.next()?;
                Ok((idx, Semaphore::Mtl(sem)))
            }
        }
    }

    pub unsafe fn image(&self, idx: usize) -> crate::Image {
        crate::Image::wrap_swapchain_image(self.image_raw(idx))
    }

    pub unsafe fn image_raw(&self, idx: usize) -> Image {
        mux_match! { self;
            Swapchain::Vk(s) => Image::Vk(s.image(idx)),
            Swapchain::Dx12(s) => Image::Dx12(s.image(idx)),
            Swapchain::Mtl(s) => Image::Mtl(s.image(idx)),
        }
    }

    pub unsafe fn present(
        &self,
        image_idx: usize,
        semaphores: &[&Semaphore],
    ) -> Result<bool, Error> {
        mux_match! { self;
            Swapchain::Vk(s) => s.present(
                image_idx,
                &semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::vk)
                    .collect::<SmallVec<[_; 4]>>(),
            ),
            Swapchain::Dx12(s) => s.present(
                image_idx,
                &semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::dx12)
                    .collect::<SmallVec<[_; 4]>>(),
            ),
            Swapchain::Mtl(s) => s.present(
                image_idx,
                &semaphores
                    .iter()
                    .copied()
                    .map(Semaphore::mtl)
                    .collect::<SmallVec<[_; 4]>>(),
            ),
        }
    }
}
File diff suppressed because it is too large
@ -1,12 +0,0 @@
[package]
name = "piet-gpu-types"
version = "0.0.0"
authors = ["Raph Levien <raph.levien@gmail.com>"]
description = "The scene graph and internal GPU types for piet-gpu."
license = "MIT/Apache-2.0"
edition = "2018"
keywords = ["graphics", "2d"]

[dependencies]
piet-gpu-derive = { path = "../piet-gpu-derive" }
half = "1.5.0"
@ -1,45 +0,0 @@
use piet_gpu_derive::piet_gpu;

piet_gpu! {
    #[gpu_write]
    mod annotated {
        struct AnnoImage {
            bbox: [f32; 4],
            linewidth: f32,
            index: u32,
            offset: [i16; 2],
        }
        struct AnnoColor {
            bbox: [f32; 4],
            // For stroked fills.
            // For the nonuniform scale case, this needs to be a 2x2 matrix.
            // That's expected to be uncommon, so we could special-case it.
            linewidth: f32,
            rgba_color: u32,
        }
        struct AnnoLinGradient {
            bbox: [f32; 4],
            // For stroked fills.
            linewidth: f32,
            index: u32,
            line_x: f32,
            line_y: f32,
            line_c: f32,
        }
        struct AnnoBeginClip {
            bbox: [f32; 4],
            linewidth: f32,
        }
        struct AnnoEndClip {
            bbox: [f32; 4],
        }
        enum Annotated {
            Nop,
            Color(TagFlags, AnnoColor),
            LinGradient(TagFlags, AnnoLinGradient),
            Image(TagFlags, AnnoImage),
            BeginClip(TagFlags, AnnoBeginClip),
            EndClip(AnnoEndClip),
        }
    }
}
@ -1,12 +0,0 @@
use piet_gpu_derive::piet_gpu;

// The output of the binning stage, organized as a linked list of chunks.

piet_gpu! {
    #[gpu_write]
    mod bins {
        struct BinInstance {
            element_ix: u32,
        }
    }
}
@ -1,151 +0,0 @@
// Copyright 2020 The xi-editor authors.

//! New-style encoders (supporting proc macros)

pub struct A;

/// A reference to an encoded object within a buffer
#[derive(Clone, Copy, Debug)]
pub struct Ref<T> {
    offset: u32,
    _phantom: std::marker::PhantomData<T>,
}

pub struct Encoder {
    buf: Vec<u8>,
}

// TODO: we probably do want to encode slices, get rid of Sized bound
pub trait Encode: Sized {
    /// Size if it's a fixed-size object, otherwise 0.
    fn fixed_size() -> usize;

    /// Encoded size, for both fixed and variable sized objects.
    fn encoded_size(&self) -> usize {
        Self::fixed_size()
    }

    /// Encode into a buffer; panics if not appropriately sized.
    fn encode_to(&self, buf: &mut [u8]);

    /// Allocate a chunk and encode, returning a reference.
    fn encode(&self, encoder: &mut Encoder) -> Ref<Self> {
        let size = self.encoded_size();
        let (offset, buf) = encoder.alloc_chunk(size as u32);
        self.encode_to(buf);
        Ref::new(offset)
    }
}

impl<T> Ref<T> {
    fn new(offset: u32) -> Ref<T> {
        Ref {
            offset,
            _phantom: Default::default(),
        }
    }

    pub fn offset(&self) -> u32 {
        self.offset
    }

    pub fn transmute<U>(&self) -> Ref<U> {
        Ref::new(self.offset)
    }
}

impl Encoder {
    pub fn new() -> Encoder {
        Encoder { buf: Vec::new() }
    }

    pub fn alloc_chunk(&mut self, size: u32) -> (u32, &mut [u8]) {
        let offset = self.buf.len();
        self.buf.resize(size as usize + offset, 0);
        (offset as u32, &mut self.buf[offset..])
    }

    pub fn buf(&self) -> &[u8] {
        &self.buf
    }

    pub fn buf_mut(&mut self) -> &mut [u8] {
        &mut self.buf
    }
}

impl<T> Encode for Ref<T> {
    fn fixed_size() -> usize {
        4
    }

    fn encode_to(&self, buf: &mut [u8]) {
        buf[0..4].copy_from_slice(&self.offset.to_le_bytes());
    }
}

// Encode impls for scalar and small vector types are added as needed; it's a
// finite set of possibilities, so we could do it all with macros, but by hand
// is expedient.

impl Encode for u32 {
    fn fixed_size() -> usize {
        4
    }

    fn encode_to(&self, buf: &mut [u8]) {
        buf[0..4].copy_from_slice(&self.to_le_bytes());
    }
}

impl Encode for f32 {
    fn fixed_size() -> usize {
        4
    }

    fn encode_to(&self, buf: &mut [u8]) {
        buf[0..4].copy_from_slice(&self.to_le_bytes());
    }
}

impl Encode for [u16; 4] {
    fn fixed_size() -> usize {
        8
    }

    fn encode_to(&self, buf: &mut [u8]) {
        buf[0..2].copy_from_slice(&self[0].to_le_bytes());
        buf[2..4].copy_from_slice(&self[1].to_le_bytes());
        buf[4..6].copy_from_slice(&self[2].to_le_bytes());
        buf[6..8].copy_from_slice(&self[3].to_le_bytes());
    }
}

impl Encode for [f32; 2] {
    fn fixed_size() -> usize {
        8
    }

    fn encode_to(&self, buf: &mut [u8]) {
        buf[0..4].copy_from_slice(&self[0].to_le_bytes());
        buf[4..8].copy_from_slice(&self[1].to_le_bytes());
    }
}

// TODO: make this work for slices too, but need to deal with Sized bound
//
// Note: only works for vectors of fixed size objects.
impl<T: Encode> Encode for Vec<T> {
    fn fixed_size() -> usize {
        0
    }
    fn encoded_size(&self) -> usize {
        self.len() * T::fixed_size()
    }

    fn encode_to(&self, buf: &mut [u8]) {
        let size = T::fixed_size();
        for (ix, val) in self.iter().enumerate() {
            val.encode_to(&mut buf[ix * size..]);
        }
    }
}
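
For reference, a hand-written Encode impl for a new fixed-size type follows the same pattern as the impls above (Point here is hypothetical, mirroring the [f32; 2] impl):

    struct Point {
        x: f32,
        y: f32,
    }

    impl Encode for Point {
        fn fixed_size() -> usize {
            8 // two little-endian f32 values
        }

        fn encode_to(&self, buf: &mut [u8]) {
            buf[0..4].copy_from_slice(&self.x.to_le_bytes());
            buf[4..8].copy_from_slice(&self.y.to_le_bytes());
        }
    }

    // encode() allocates a chunk in the encoder and returns a typed offset.
    let mut encoder = Encoder::new();
    let r: Ref<Point> = Point { x: 1.0, y: 2.0 }.encode(&mut encoder);
    assert_eq!(r.offset(), 0);
    assert_eq!(encoder.buf().len(), 8);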
@ -1,12 +0,0 @@
// Structures used only internally probably don't need to be pub.

pub mod annotated;
pub mod bins;
pub mod encoder;
pub mod pathseg;
pub mod ptcl;
pub mod scene;
pub mod state;
pub mod test;
pub mod tile;
pub mod tilegroup;
@ -1,18 +0,0 @@
fn main() {
    let mod_name = std::env::args()
        .skip(1)
        .next()
        .expect("provide a module name");
    match mod_name.as_str() {
        "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
        "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
        "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
        "pathseg" => print!("{}", piet_gpu_types::pathseg::gen_gpu_pathseg()),
        "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
        "tile" => print!("{}", piet_gpu_types::tile::gen_gpu_tile()),
        "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
        "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
        "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
        _ => println!("Oops, unknown module name"),
    }
}
@ -1,22 +0,0 @@
use piet_gpu_derive::piet_gpu;

piet_gpu! {
    #[gpu_write]
    mod pathseg {
        struct PathCubic {
            p0: [f32; 2],
            p1: [f32; 2],
            p2: [f32; 2],
            p3: [f32; 2],
            path_ix: u32,
            // trans_ix is the transform index. It is 1-based; 0 means no transformation.
            trans_ix: u32,
            // Halfwidth in both x and y for binning. For strokes only.
            stroke: [f32; 2],
        }
        enum PathSeg {
            Nop,
            Cubic(TagFlags, PathCubic),
        }
    }
}
@ -1,63 +0,0 @@
use piet_gpu_derive::piet_gpu;

piet_gpu! {
    #[gpu_write]
    mod ptcl {
        struct CmdStroke {
            // This is really a Ref<Tile>, but we don't have cross-module
            // references.
            tile_ref: u32,
            half_width: f32,
        }
        struct CmdFill {
            // As above, really Ref<Tile>
            tile_ref: u32,
            backdrop: i32,
        }
        struct CmdColor {
            rgba_color: u32,
        }
        struct CmdLinGrad {
            index: u32,
            // line equation for gradient
            line_x: f32,
            line_y: f32,
            line_c: f32,
        }
        struct CmdRadGrad {
            index: u32,
            mat: [f32; 4],
            xlat: [f32; 2],
            c1: [f32; 2],
            ra: f32,
            roff: f32,
        }
        struct CmdImage {
            index: u32,
            offset: [i16; 2],
        }
        struct CmdAlpha {
            alpha: f32,
        }
        struct CmdEndClip {
            blend: u32,
        }
        struct CmdJump {
            new_ref: u32,
        }
        enum Cmd {
            End,
            Fill(CmdFill),
            Stroke(CmdStroke),
            Solid,
            Alpha(CmdAlpha),
            Color(CmdColor),
            LinGrad(CmdLinGrad),
            RadGrad(CmdRadGrad),
            Image(CmdImage),
            BeginClip,
            EndClip(CmdEndClip),
            Jump(CmdJump),
        }
    }
}
@ -1,69 +0,0 @@
use piet_gpu_derive::piet_gpu;

pub use self::scene::{
    Clip, CubicSeg, Element, FillColor, FillLinGradient, LineSeg, QuadSeg, SetFillMode,
    SetLineWidth, Transform,
};

piet_gpu! {
    #[rust_encode]
    mod scene {
        struct LineSeg {
            p0: [f32; 2],
            p1: [f32; 2],
        }
        struct QuadSeg {
            p0: [f32; 2],
            p1: [f32; 2],
            p2: [f32; 2],
        }
        struct CubicSeg {
            p0: [f32; 2],
            p1: [f32; 2],
            p2: [f32; 2],
            p3: [f32; 2],
        }
        struct FillColor {
            rgba_color: u32,
        }
        struct FillLinGradient {
            index: u32,
            p0: [f32; 2],
            p1: [f32; 2],
        }
        struct FillImage {
            index: u32,
            offset: [i16; 2],
        }
        struct SetLineWidth {
            width: f32,
        }
        struct Transform {
            mat: [f32; 4],
            translate: [f32; 2],
        }
        struct Clip {
            bbox: [f32; 4],
            // TODO: add alpha?
        }
        struct SetFillMode {
            fill_mode: u32,
        }
        enum Element {
            Nop,

            Line(LineSeg),
            Quad(QuadSeg),
            Cubic(CubicSeg),

            FillColor(FillColor),
            FillLinGradient(FillLinGradient),
            FillImage(FillImage),
            SetLineWidth(SetLineWidth),
            Transform(Transform),
            BeginClip(Clip),
            EndClip(Clip),
            SetFillMode(SetFillMode),
        }
    }
}
@ -1,17 +0,0 @@
use piet_gpu_derive::piet_gpu;

piet_gpu! {
    #[gpu_write]
    mod state {
        struct State {
            mat: [f32; 4],
            translate: [f32; 2],
            bbox: [f32; 4],
            linewidth: f32,
            flags: u32,
            path_count: u32,
            pathseg_count: u32,
            trans_count: u32,
        }
    }
}
@ -1,33 +0,0 @@
use piet_gpu_derive::piet_gpu;

piet_gpu! {
    #[rust_encode]
    #[gpu_write]
    mod test {
        struct StructA {
            a: f16,
            b: f16,
        }

        struct StructB {
            a: f16,
            b: u16,
            c: f16,
        }

        struct StructC {
            a: f16,
            b: u16,
            c: u16,
            d: f16,
        }

        struct StructD {
            a: [f16; 2],
        }

        struct StructE {
            a: [f16; 3],
        }
    }
}
@ -1,26 +0,0 @@
use piet_gpu_derive::piet_gpu;

piet_gpu! {
    #[gpu_write]
    mod tile {
        struct Path {
            bbox: [u16; 4],
            tiles: Ref<Tile>,
        }
        struct Tile {
            tile: Ref<TileSeg>,
            backdrop: i32,
        }
        // Segments within a tile are represented as a linked list.
        struct TileSeg {
            origin: [f32; 2],
            vector: [f32; 2],
            y_edge: f32,
            next: Ref<TileSeg>,
        }
        struct TransformSeg {
            mat: [f32; 4],
            translate: [f32; 2],
        }
    }
}
@ -1,39 +0,0 @@
use piet_gpu_derive::piet_gpu;

// Structures representing tilegroup instances (output of kernel 1).
// There are three outputs: the main instances, the stroke instances,
// and the fill instances. All three are conceptually a list of
// instances, but the encoding is slightly different. The first is
// encoded with Instance, Jump, and End. The other two are encoded
// as a linked list of Chunk.

// The motivation for the difference is that the first requires fewer
// registers to track state, but the second contains information that
// is useful up front for doing dynamic allocation in kernel 2, as
// well as increasing read parallelism; the "jump" approach really is
// geared to sequential reading. A sketch of the first encoding follows
// this file.

piet_gpu! {
    #[gpu_write]
    mod tilegroup {
        struct Instance {
            // Note: a better type would be `Ref<PietItem>` but to do that we
            // would need cross-module references. Punt for now.
            item_ref: u32,
            // A better type would be Point.
            offset: [f32; 2],
        }
        struct Jump {
            new_ref: Ref<TileGroup>,
        }
        struct Chunk {
            chunk_n: u32,
            next: Ref<Chunk>,
        }
        enum TileGroup {
            Instance(Instance),
            Jump(Jump),
            End,
        }
    }
}
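
To make the difference concrete, a self-contained sketch of walking the Instance/Jump/End encoding (plain Rust stand-ins for illustration, not the generated GPU-side readers):

    enum Item {
        Instance(u32), // an item_ref to process
        Jump(usize),   // index of the next run of items
        End,
    }

    fn walk(items: &[Item], start: usize) -> Vec<u32> {
        let mut out = Vec::new();
        let mut ix = start;
        loop {
            match &items[ix] {
                Item::Instance(item_ref) => {
                    out.push(*item_ref);
                    ix += 1; // sequential until a jump: cheap state, but geared to serial reads
                }
                Item::Jump(next) => ix = *next,
                Item::End => break,
            }
        }
        out
    }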
@ -1,48 +0,0 @@
[package]
name = "piet-gpu"
version = "0.1.0"
authors = ["Raph Levien <raph.levien@gmail.com>"]
description = "A compute-centric GPU 2D renderer."
readme = "README.md"
license = "MIT/Apache-2.0"
edition = "2018"

[[bin]]
name = "cli"
path = "bin/cli.rs"

[[bin]]
name = "winit"
path = "bin/winit.rs"

[[example]]
name = "android"
path = "bin/android.rs"
crate-type = ["cdylib"]

[dependencies.piet-gpu-hal]
path = "../piet-gpu-hal"

[dependencies.piet-gpu-types]
path = "../piet-gpu-types"

[dependencies.piet-scene]
path = "../piet-scene"

[dependencies]
png = "0.17.6"
rand = "0.8.5"
roxmltree = "0.13"
winit = { version = "0.27.3", default-features = false, features = ["x11", "wayland", "wayland-dlopen"] }
raw-window-handle = "0.5"
clap = "3.2.22"
bytemuck = { version = "1.7.2", features = ["derive"] }

[target.'cfg(target_os = "android")'.dependencies]
ndk = "0.3"
ndk-sys = "0.2.0"
ndk-glue = "0.3"
raw-window-handle = "0.3"

[package.metadata.android.application]
debuggable = true
@ -1,164 +0,0 @@
#![cfg(target_os = "android")]
//! Android example
//!
//! Run using `cargo apk run --example android`
//!
//! Requires the [cargo-apk] tool.
//! [cargo-apk]: https://crates.io/crates/cargo-apk

use raw_window_handle::{
    AndroidDisplayHandle, AndroidNdkWindowHandle, RawDisplayHandle, RawWindowHandle,
};

use ndk_glue::Event;

use piet_gpu_hal::{
    Error, ImageLayout, Instance, InstanceFlags, Semaphore, Session, Surface, Swapchain,
};

use piet_gpu::{samples, RenderDriver, Renderer, SimpleText};
use piet_scene::{Scene, SceneBuilder};

#[cfg_attr(target_os = "android", ndk_glue::main(backtrace = "on"))]
fn main() {
    my_main().unwrap();
}

// State required to render and present the contents
struct GfxState {
    session: Session,
    render_driver: RenderDriver,
    swapchain: Swapchain,
    current_frame: usize,
    present_semaphores: Vec<Semaphore>,
}

const NUM_FRAMES: usize = 2;

fn my_main() -> Result<(), Error> {
    let mut gfx_state = None;
    loop {
        for event in ndk_glue::poll_events() {
            println!("got event {:?}", event);
            match event {
                Event::WindowCreated => {
                    let window = ndk_glue::native_window();
                    if let Some(window) = &*window {
                        let width = window.width() as usize;
                        let height = window.height() as usize;
                        let instance = Instance::new(InstanceFlags::default())?;
                        let mut android_handle = AndroidNdkWindowHandle::empty();
                        android_handle.a_native_window = window.ptr().as_ptr() as *mut _;
                        let window_handle = RawWindowHandle::AndroidNdk(android_handle);
                        let display_handle =
                            RawDisplayHandle::Android(AndroidDisplayHandle::empty());
                        let surface = unsafe { instance.surface(display_handle, window_handle)? };
                        gfx_state = Some(GfxState::new(&instance, Some(&surface), width, height)?);
                    } else {
                        println!("native window is sadly none");
                    }
                }
                Event::WindowRedrawNeeded => {
                    if let Some(gfx_state) = gfx_state.as_mut() {
                        for _ in 0..1000 {
                            gfx_state.redraw();
                        }
                    }
                }
                _ => (),
            }
        }
    }
}

impl GfxState {
    fn new(
        instance: &Instance,
        surface: Option<&Surface>,
        width: usize,
        height: usize,
    ) -> Result<GfxState, Error> {
        unsafe {
            let device = instance.device()?;
            let swapchain = instance.swapchain(width, height, &device, surface.unwrap())?;
            let session = Session::new(device);
            let current_frame = 0;
            let present_semaphores = (0..NUM_FRAMES)
                .map(|_| session.create_semaphore())
                .collect::<Result<Vec<_>, Error>>()?;

            let renderer = Renderer::new(&session, width, height, NUM_FRAMES)?;
            let render_driver = RenderDriver::new(&session, NUM_FRAMES, renderer);

            Ok(GfxState {
                session,
                render_driver,
                swapchain,
                current_frame,
                present_semaphores,
            })
        }
    }

    fn redraw(&mut self) {
        println!("redraw");
        unsafe {
            let frame_idx = self.current_frame % NUM_FRAMES;
            let mut info_string = String::new();

            if self.current_frame >= NUM_FRAMES {
                let stats = self
                    .render_driver
                    .get_timing_stats(&self.session, frame_idx);
                info_string = stats.short_summary();
                println!("{}", info_string);
            }
            let mut text = SimpleText::new();
            let mut scene = Scene::default();
            let mut builder = SceneBuilder::for_scene(&mut scene);
            samples::render_anim_frame(&mut builder, self.current_frame);
            //samples::render_tiger(&mut builder, false);
            render_info(&mut text, &mut builder, &info_string);
            builder.finish();
            if let Err(e) = self.render_driver.upload_scene(&self.session, &scene) {
                println!("error in uploading: {}", e);
            }
            let (image_idx, acquisition_semaphore) = self.swapchain.next().unwrap();
            let swap_image = self.swapchain.image(image_idx);
            self.render_driver.run_coarse(&self.session).unwrap();
            let target = self.render_driver.record_fine(&self.session).unwrap();
            let cmd_buf = target.cmd_buf;

            // Image -> Swapchain
            cmd_buf.image_barrier(&swap_image, ImageLayout::Undefined, ImageLayout::BlitDst);
            cmd_buf.blit_image(target.image, &swap_image);
            cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);

            self.render_driver
                .submit(
                    &self.session,
                    &[&acquisition_semaphore],
                    &[&self.present_semaphores[frame_idx]],
                )
                .unwrap();

            self.swapchain
                .present(image_idx, &[&self.present_semaphores[frame_idx]])
                .unwrap();

            self.render_driver.next_buffer();
            self.current_frame += 1;
        }
    }
}

fn render_info(simple_text: &mut SimpleText, sb: &mut SceneBuilder, info: &str) {
    simple_text.add(
        sb,
        None,
        60.0,
        None,
        piet_scene::Affine::translate(110.0, 120.0),
        info,
    );
}
@ -1,291 +0,0 @@
|
|||
use std::fs::File;
|
||||
use std::io::BufWriter;
|
||||
use std::path::Path;
|
||||
|
||||
use clap::{App, Arg};
|
||||
|
||||
use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};
|
||||
|
||||
use piet_gpu::{samples, PicoSvg, RenderDriver, Renderer};
|
||||
use piet_scene::{Scene, SceneBuilder};
|
||||
|
||||
const WIDTH: usize = 2048;
|
||||
const HEIGHT: usize = 1536;
|
||||
|
||||
#[allow(unused)]
|
||||
fn dump_scene(buf: &[u8]) {
|
||||
for i in 0..(buf.len() / 4) {
|
||||
let mut buf_u32 = [0u8; 4];
|
||||
buf_u32.copy_from_slice(&buf[i * 4..i * 4 + 4]);
|
||||
println!("{:4x}: {:8x}", i * 4, u32::from_le_bytes(buf_u32));
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
fn dump_state(buf: &[u8]) {
|
||||
for i in 0..(buf.len() / 48) {
|
||||
let j = i * 48;
|
||||
let floats = (0..11)
|
||||
.map(|k| {
|
||||
let mut buf_f32 = [0u8; 4];
|
||||
buf_f32.copy_from_slice(&buf[j + k * 4..j + k * 4 + 4]);
|
||||
f32::from_le_bytes(buf_f32)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
println!(
|
||||
"{}: [{} {} {} {} {} {}] ({}, {})-({} {}) {} {}",
|
||||
i,
|
||||
floats[0],
|
||||
floats[1],
|
||||
floats[2],
|
||||
floats[3],
|
||||
floats[4],
|
||||
floats[5],
|
||||
floats[6],
|
||||
floats[7],
|
||||
floats[8],
|
||||
floats[9],
|
||||
floats[10],
|
||||
buf[j + 44]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Interpret the output of the binning stage, for diagnostic purposes.
|
||||
#[allow(unused)]
|
||||
fn trace_merge(buf: &[u32]) {
|
||||
for bin in 0..256 {
|
||||
println!("bin {}:", bin);
|
||||
let mut starts = (0..16)
|
||||
.map(|i| Some((bin * 16 + i) * 64))
|
||||
.collect::<Vec<Option<usize>>>();
|
||||
loop {
|
||||
let min_start = starts
|
||||
.iter()
|
||||
.map(|st| {
|
||||
st.map(|st| {
|
||||
if buf[st / 4] == 0 {
|
||||
!0
|
||||
} else {
|
||||
buf[st / 4 + 2]
|
||||
}
|
||||
})
|
||||
.unwrap_or(!0)
|
||||
})
|
||||
.min()
|
||||
.unwrap();
|
||||
if min_start == !0 {
|
||||
break;
|
||||
}
|
||||
let mut selected = !0;
|
||||
for i in 0..16 {
|
||||
if let Some(st) = starts[i] {
|
||||
if buf[st / 4] != 0 && buf[st / 4 + 2] == min_start {
|
||||
selected = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
let st = starts[selected].unwrap();
|
||||
println!("selected {}, start {:x}", selected, st);
|
||||
for j in 0..buf[st / 4] {
|
||||
println!("{:x}", buf[st / 4 + 2 + j as usize])
|
||||
}
|
||||
if buf[st / 4 + 1] == 0 {
|
||||
starts[selected] = None;
|
||||
} else {
|
||||
starts[selected] = Some(buf[st / 4 + 1] as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Interpret the output of the coarse raster stage, for diagnostic purposes.
|
||||
#[allow(unused)]
|
||||
fn trace_ptcl(buf: &[u32]) {
|
||||
for y in 0..96 {
|
||||
for x in 0..128 {
|
||||
let tile_ix = y * 128 + x;
|
||||
println!("tile {} @({}, {})", tile_ix, x, y);
|
||||
let mut tile_offset = tile_ix * 1024;
|
||||
loop {
|
||||
let tag = buf[tile_offset / 4];
|
||||
match tag {
|
||||
0 => break,
|
||||
3 => {
|
||||
let backdrop = buf[tile_offset / 4 + 2];
|
||||
let rgba_color = buf[tile_offset / 4 + 3];
|
||||
println!(" {:x}: fill {:x} {}", tile_offset, rgba_color, backdrop);
|
||||
let mut seg_chunk = buf[tile_offset / 4 + 1] as usize;
|
||||
let n = buf[seg_chunk / 4] as usize;
|
||||
let segs = buf[seg_chunk / 4 + 2] as usize;
|
||||
println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
|
||||
for i in 0..n {
|
||||
let x0 = f32::from_bits(buf[segs / 4 + i * 5]);
|
||||
let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]);
|
||||
let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]);
|
||||
let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]);
|
||||
let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]);
|
||||
println!(
|
||||
" ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}",
|
||||
x0, y0, x1, y1, y_edge
|
||||
);
|
||||
}
|
||||
loop {
|
||||
seg_chunk = buf[seg_chunk / 4 + 1] as usize;
|
||||
if seg_chunk == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
4 => {
|
||||
let line_width = f32::from_bits(buf[tile_offset / 4 + 2]);
|
||||
let rgba_color = buf[tile_offset / 4 + 3];
|
||||
println!(
|
||||
" {:x}: stroke {:x} {}",
|
||||
tile_offset, rgba_color, line_width
|
||||
);
|
||||
let mut seg_chunk = buf[tile_offset / 4 + 1] as usize;
|
||||
let n = buf[seg_chunk / 4] as usize;
|
||||
let segs = buf[seg_chunk / 4 + 2] as usize;
|
||||
println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
|
||||
for i in 0..n {
|
||||
let x0 = f32::from_bits(buf[segs / 4 + i * 5]);
|
||||
let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]);
|
||||
let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]);
|
||||
let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]);
|
||||
let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]);
|
||||
println!(
|
||||
" ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}",
|
||||
x0, y0, x1, y1, y_edge
|
||||
);
|
||||
}
|
||||
loop {
|
||||
seg_chunk = buf[seg_chunk / 4 + 1] as usize;
|
||||
if seg_chunk == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
6 => {
|
||||
let backdrop = buf[tile_offset / 4 + 2];
|
||||
println!(" {:x}: begin_clip {}", tile_offset, backdrop);
|
||||
let mut seg_chunk = buf[tile_offset / 4 + 1] as usize;
|
||||
let n = buf[seg_chunk / 4] as usize;
|
||||
let segs = buf[seg_chunk / 4 + 2] as usize;
|
||||
println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
|
||||
for i in 0..n {
|
||||
let x0 = f32::from_bits(buf[segs / 4 + i * 5]);
|
||||
let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]);
|
||||
let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]);
|
||||
let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]);
|
||||
let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]);
|
||||
println!(
|
||||
" ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}",
|
||||
x0, y0, x1, y1, y_edge
|
||||
);
|
||||
}
|
||||
loop {
|
||||
seg_chunk = buf[seg_chunk / 4 + 1] as usize;
|
||||
if seg_chunk == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
7 => {
|
||||
let backdrop = buf[tile_offset / 4 + 1];
|
||||
println!("{:x}: solid_clip {:x}", tile_offset, backdrop);
|
||||
}
|
||||
8 => {
|
||||
println!("{:x}: end_clip", tile_offset);
|
||||
}
|
||||
_ => {
|
||||
println!("{:x}: {}", tile_offset, tag);
|
||||
}
|
||||
}
|
||||
if tag == 0 {
|
||||
break;
|
||||
}
|
||||
if tag == 8 {
|
||||
tile_offset = buf[tile_offset / 4 + 1] as usize;
|
||||
} else {
|
||||
tile_offset += 20;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}

fn main() -> Result<(), Error> {
    let matches = App::new("piet-gpu test")
        .arg(Arg::with_name("INPUT").index(1))
        .arg(Arg::with_name("flip").short('f').long("flip"))
        .arg(
            Arg::with_name("scale")
                .short('s')
                .long("scale")
                .takes_value(true),
        )
        .get_matches();
    let instance = Instance::new(InstanceFlags::default())?;
    let mut scene = Scene::default();
    unsafe {
        let device = instance.device()?;
        let session = Session::new(device);
        let mut builder = SceneBuilder::for_scene(&mut scene);
        if let Some(input) = matches.value_of("INPUT") {
            let mut scale = matches
                .value_of("scale")
                .map(|scale| scale.parse().unwrap())
                .unwrap_or(8.0);
            if matches.is_present("flip") {
                scale = -scale;
            }
            let xml_str = std::fs::read_to_string(input).unwrap();
            let start = std::time::Instant::now();
            let svg = PicoSvg::load(&xml_str, scale).unwrap();
            println!("parsing time: {:?}", start.elapsed());
            samples::render_svg(&mut builder, &svg, true);
        } else {
            //test_scenes::render_scene(&mut ctx);
            samples::render_blend_grid(&mut builder);
        }
        builder.finish();

        let renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?;
        let mut render_driver = RenderDriver::new(&session, 1, renderer);
        let start = std::time::Instant::now();
        render_driver.upload_scene(&session, &scene)?;
        let image_usage = BufferUsage::MAP_READ | BufferUsage::COPY_DST;
        let image_buf = session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?;

        render_driver.run_coarse(&session)?;
        let target = render_driver.record_fine(&session)?;
        target
            .cmd_buf
            .copy_image_to_buffer(target.image, &image_buf);
        render_driver.submit(&session, &[], &[])?;
        render_driver.wait(&session);
        println!("elapsed = {:?}", start.elapsed());
        render_driver.get_timing_stats(&session, 0).print_summary();

        let mut img_data: Vec<u8> = Default::default();
        // Note: because png can use a `&[u8]` slice, we could avoid an extra copy
        // (probably passing a slice into a closure). But for now: keep it simple.
        image_buf.read(&mut img_data).unwrap();

        // Write image as PNG file.
        let path = Path::new("image.png");
        let file = File::create(path).unwrap();
        let ref mut w = BufWriter::new(file);

        let mut encoder = png::Encoder::new(w, WIDTH as u32, HEIGHT as u32);
        encoder.set_color(png::ColorType::Rgba);
        encoder.set_depth(png::BitDepth::Eight);
        let mut writer = encoder.write_header().unwrap();

        writer.write_image_data(&img_data).unwrap();
    }

    Ok(())
}

@@ -1,196 +0,0 @@
use piet_gpu::{samples, PicoSvg, RenderDriver, Renderer, SimpleText};
use piet_gpu_hal::{Error, ImageLayout, Instance, InstanceFlags, Session};
use piet_scene::{Scene, SceneBuilder};

use clap::{App, Arg};

use raw_window_handle::{HasRawDisplayHandle, HasRawWindowHandle};

use winit::{
    event::{Event, WindowEvent},
    event_loop::{ControlFlow, EventLoop},
    window::WindowBuilder,
};

const NUM_FRAMES: usize = 2;

const WIDTH: usize = 2048;
const HEIGHT: usize = 1536;

fn main() -> Result<(), Error> {
    let matches = App::new("piet-gpu test")
        .arg(Arg::with_name("INPUT").index(1))
        .arg(Arg::with_name("flip").short('f').long("flip"))
        .arg(
            Arg::with_name("scale")
                .short('s')
                .long("scale")
                .takes_value(true),
        )
        .get_matches();

    // Collect SVG if input
    let svg = match matches.value_of("INPUT") {
        Some(file) => {
            let mut scale = matches
                .value_of("scale")
                .map(|scale| scale.parse().unwrap())
                .unwrap_or(8.0);
            if matches.is_present("flip") {
                scale = -scale;
            }
            let xml_str = std::fs::read_to_string(file).unwrap();
            let start = std::time::Instant::now();
            let svg = PicoSvg::load(&xml_str, scale).unwrap();
            println!("parsing time: {:?}", start.elapsed());
            Some(svg)
        }
        None => None,
    };

    let event_loop = EventLoop::new();
    let window = WindowBuilder::new()
        .with_inner_size(winit::dpi::LogicalSize {
            width: (WIDTH / 2) as f64,
            height: (HEIGHT / 2) as f64,
        })
        .with_resizable(false) // currently not supported
        .build(&event_loop)?;

    let instance = Instance::new(InstanceFlags::default())?;
    let mut info_string = "info".to_string();
    let mut scene = Scene::default();
    let mut simple_text = piet_gpu::SimpleText::new();
    unsafe {
        let display_handle = window.raw_display_handle();
        let window_handle = window.raw_window_handle();
        let surface = instance.surface(display_handle, window_handle)?;
        let device = instance.device()?;
        let mut swapchain = instance.swapchain(WIDTH / 2, HEIGHT / 2, &device, &surface)?;
        let session = Session::new(device);

        let mut current_frame = 0;
        let present_semaphores = (0..NUM_FRAMES)
            .map(|_| session.create_semaphore())
            .collect::<Result<Vec<_>, Error>>()?;

        let renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
        let mut render_driver = RenderDriver::new(&session, NUM_FRAMES, renderer);
        let mut sample_index = 0usize;

        event_loop.run(move |event, _, control_flow| {
            *control_flow = ControlFlow::Poll; // `ControlFlow::Wait` if only re-render on event

            match event {
                Event::WindowEvent { event, window_id } if window_id == window.id() => {
                    use winit::event::{ElementState, VirtualKeyCode};
                    match event {
                        WindowEvent::CloseRequested => {
                            *control_flow = ControlFlow::Exit;
                        }
                        WindowEvent::KeyboardInput { input, .. } => {
                            if input.state == ElementState::Pressed {
                                match input.virtual_keycode {
                                    Some(VirtualKeyCode::Left) => {
                                        sample_index = sample_index.saturating_sub(1)
                                    }
                                    Some(VirtualKeyCode::Right) => {
                                        sample_index = sample_index.saturating_add(1)
                                    }
                                    _ => {}
                                }
                            }
                        }
                        _ => (),
                    }
                }
                Event::MainEventsCleared => {
                    window.request_redraw();
                }
                Event::RedrawRequested(window_id) if window_id == window.id() => {
                    let frame_idx = current_frame % NUM_FRAMES;

                    if current_frame >= NUM_FRAMES {
                        let stats = render_driver.get_timing_stats(&session, frame_idx);
                        info_string = stats.short_summary();
                    }

                    if let Some(svg) = &svg {
                        let mut builder = SceneBuilder::for_scene(&mut scene);
                        samples::render_svg(&mut builder, svg, false);
                        render_info(&mut simple_text, &mut builder, &info_string);
                        builder.finish();
                        if let Err(e) = render_driver.upload_scene(&session, &scene) {
                            println!("error in uploading: {}", e);
                        }
                    } else {
                        let mut builder = SceneBuilder::for_scene(&mut scene);

                        const N_SAMPLES: usize = 6;
                        match sample_index % N_SAMPLES {
                            0 => samples::render_anim_frame(
                                &mut builder,
                                &mut simple_text,
                                current_frame,
                            ),
                            1 => samples::render_blend_grid(&mut builder),
                            2 => samples::render_tiger(&mut builder, false),
                            3 => samples::render_brush_transform(&mut builder, current_frame),
                            4 => samples::render_funky_paths(&mut builder),
                            _ => samples::render_scene(&mut builder),
                        }
                        render_info(&mut simple_text, &mut builder, &info_string);
                        builder.finish();
                        if let Err(e) = render_driver.upload_scene(&session, &scene) {
                            println!("error in uploading: {}", e);
                        }
                    }

                    let (image_idx, acquisition_semaphore) = swapchain.next().unwrap();
                    let swap_image = swapchain.image(image_idx);
                    render_driver.run_coarse(&session).unwrap();
                    let target = render_driver.record_fine(&session).unwrap();
                    let cmd_buf = target.cmd_buf;

                    // Image -> Swapchain
                    cmd_buf.image_barrier(
                        &swap_image,
                        ImageLayout::Undefined,
                        ImageLayout::BlitDst,
                    );
                    cmd_buf.blit_image(target.image, &swap_image);
                    cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
                    render_driver
                        .submit(
                            &session,
                            &[&acquisition_semaphore],
                            &[&present_semaphores[frame_idx]],
                        )
                        .unwrap();

                    swapchain
                        .present(image_idx, &[&present_semaphores[frame_idx]])
                        .unwrap();

                    render_driver.next_buffer();
                    current_frame += 1;
                }
                Event::LoopDestroyed => {
                    render_driver.wait_all(&session);
                }
                _ => (),
            }
        })
    }
}

fn render_info(simple_text: &mut SimpleText, sb: &mut SceneBuilder, info: &str) {
    simple_text.add(
        sb,
        None,
        40.0,
        None,
        piet_scene::kurbo::Affine::translate((110.0, 50.0)),
        info,
    );
}

@@ -1,5 +0,0 @@
BasedOnStyle: LLVM
IndentWidth: 4
ColumnLimit: 120
AllowShortFunctionsOnASingleLine: None
SortIncludes: false

@@ -1,296 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct AnnoImageRef {
    uint offset;
};

struct AnnoColorRef {
    uint offset;
};

struct AnnoLinGradientRef {
    uint offset;
};

struct AnnoBeginClipRef {
    uint offset;
};

struct AnnoEndClipRef {
    uint offset;
};

struct AnnotatedRef {
    uint offset;
};

struct AnnoImage {
    vec4 bbox;
    float linewidth;
    uint index;
    ivec2 offset;
};

#define AnnoImage_size 28

AnnoImageRef AnnoImage_index(AnnoImageRef ref, uint index) {
    return AnnoImageRef(ref.offset + index * AnnoImage_size);
}

struct AnnoColor {
    vec4 bbox;
    float linewidth;
    uint rgba_color;
};

#define AnnoColor_size 24

AnnoColorRef AnnoColor_index(AnnoColorRef ref, uint index) {
    return AnnoColorRef(ref.offset + index * AnnoColor_size);
}

struct AnnoLinGradient {
    vec4 bbox;
    float linewidth;
    uint index;
    float line_x;
    float line_y;
    float line_c;
};

#define AnnoLinGradient_size 36

AnnoLinGradientRef AnnoLinGradient_index(AnnoLinGradientRef ref, uint index) {
    return AnnoLinGradientRef(ref.offset + index * AnnoLinGradient_size);
}

struct AnnoBeginClip {
    vec4 bbox;
    float linewidth;
    uint blend;
};

#define AnnoBeginClip_size 24

AnnoBeginClipRef AnnoBeginClip_index(AnnoBeginClipRef ref, uint index) {
    return AnnoBeginClipRef(ref.offset + index * AnnoBeginClip_size);
}

struct AnnoEndClip {
    vec4 bbox;
    uint blend;
};

#define AnnoEndClip_size 20

AnnoEndClipRef AnnoEndClip_index(AnnoEndClipRef ref, uint index) {
    return AnnoEndClipRef(ref.offset + index * AnnoEndClip_size);
}

#define Annotated_Nop 0
#define Annotated_Color 1
#define Annotated_LinGradient 2
#define Annotated_Image 3
#define Annotated_BeginClip 4
#define Annotated_EndClip 5
#define Annotated_size 40

AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
    return AnnotatedRef(ref.offset + index * Annotated_size);
}

struct AnnotatedTag {
    uint tag;
    uint flags;
};

AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    AnnoImage s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.linewidth = uintBitsToFloat(raw4);
    s.index = raw5;
    s.offset = ivec2(int(raw6 << 16) >> 16, int(raw6) >> 16);
    return s;
}

void AnnoImage_write(Alloc a, AnnoImageRef ref, AnnoImage s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
    write_mem(a, ix + 5, s.index);
    write_mem(a, ix + 6, (uint(s.offset.x) & 0xffff) | (uint(s.offset.y) << 16));
}
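// Note on the ivec2 offset packing above (a reading of the generated code,
// not new behavior): the write packs offset.x into the low 16 bits and
// offset.y into the high 16 bits, and the read recovers both with sign
// extension. For example, raw6 = 0xffff0003 decodes to ivec2(3, -1).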

AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    AnnoColor s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.linewidth = uintBitsToFloat(raw4);
    s.rgba_color = raw5;
    return s;
}

void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
    write_mem(a, ix + 5, s.rgba_color);
}

AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    uint raw7 = read_mem(a, ix + 7);
    uint raw8 = read_mem(a, ix + 8);
    AnnoLinGradient s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.linewidth = uintBitsToFloat(raw4);
    s.index = raw5;
    s.line_x = uintBitsToFloat(raw6);
    s.line_y = uintBitsToFloat(raw7);
    s.line_c = uintBitsToFloat(raw8);
    return s;
}

void AnnoLinGradient_write(Alloc a, AnnoLinGradientRef ref, AnnoLinGradient s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
    write_mem(a, ix + 5, s.index);
    write_mem(a, ix + 6, floatBitsToUint(s.line_x));
    write_mem(a, ix + 7, floatBitsToUint(s.line_y));
    write_mem(a, ix + 8, floatBitsToUint(s.line_c));
}

AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    AnnoBeginClip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.linewidth = uintBitsToFloat(raw4);
    s.blend = raw5;
    return s;
}

void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
    write_mem(a, ix + 5, s.blend);
}

AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    AnnoEndClip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.blend = raw4;
    return s;
}

void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, s.blend);
}

AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) {
    uint tag_and_flags = read_mem(a, ref.offset >> 2);
    return AnnotatedTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
}
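// The tag word decoded above packs the variant tag into the low 16 bits and
// the flags into the high 16 bits, matching the (flags << 16) | tag writes
// below. For example, Annotated_Color with flags 2 is stored as 0x00020001.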

AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref) {
    return AnnoColor_read(a, AnnoColorRef(ref.offset + 4));
}

AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref) {
    return AnnoLinGradient_read(a, AnnoLinGradientRef(ref.offset + 4));
}

AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref) {
    return AnnoImage_read(a, AnnoImageRef(ref.offset + 4));
}

AnnoBeginClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) {
    return AnnoBeginClip_read(a, AnnoBeginClipRef(ref.offset + 4));
}

AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) {
    return AnnoEndClip_read(a, AnnoEndClipRef(ref.offset + 4));
}

void Annotated_Nop_write(Alloc a, AnnotatedRef ref) {
    write_mem(a, ref.offset >> 2, Annotated_Nop);
}

void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s) {
    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_Color);
    AnnoColor_write(a, AnnoColorRef(ref.offset + 4), s);
}

void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinGradient s) {
    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_LinGradient);
    AnnoLinGradient_write(a, AnnoLinGradientRef(ref.offset + 4), s);
}

void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s) {
    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_Image);
    AnnoImage_write(a, AnnoImageRef(ref.offset + 4), s);
}

void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginClip s) {
    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_BeginClip);
    AnnoBeginClip_write(a, AnnoBeginClipRef(ref.offset + 4), s);
}

void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoEndClip s) {
    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_EndClip);
    AnnoEndClip_write(a, AnnoEndClipRef(ref.offset + 4), s);
}

@@ -1,118 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Propagation of tile backdrop for filling.
//
// Each thread reads one path element and calculates the row and column counts of spanned tiles
// based on the bounding box.
// The row count then goes through a prefix sum to redistribute and load-balance the work across the workgroup.
// In the following step, the workgroup loops over the corresponding tile rows per element in parallel.
// For each row, the per-tile backdrop is read, as calculated in the previous coarse path segment kernel,
// and propagated from left to right (prefix summed).
//
// Output state:
// - Each path element has an array of tiles covering the whole path, based on its bounding box
// - Each tile per path element contains the 'backdrop' and a list of subdivided path segments
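//
// A small worked example of the row propagation (illustrative only): if the
// coarse path kernel left per-tile backdrops of [1, 0, -1, 0] for one row,
// the left-to-right prefix sum yields [1, 1, 0, 0], so only the first two
// tiles of the row start with a nonzero winding number.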

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
#define BACKDROP_WG (1 << LG_BACKDROP_WG)
#ifndef BACKDROP_DIST_FACTOR
// Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR defines how many
// additional threads we spawn for parallel row processing. The additional threads do not participate in the
// earlier stages (calculating the tile counts) but do work in the final prefix sum stage, which has a lot more
// parallelism.

// This feature is opt-in: one variant is compiled with the following default, while the other variant is compiled with
// a larger BACKDROP_DIST_FACTOR, which is used on GPUs supporting a larger workgroup size to improve performance.
#define BACKDROP_DIST_FACTOR 1
#endif

layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "tile.h"

shared uint sh_row_count[BACKDROP_WG];
shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG];

void main() {
    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
        return;
    }

    uint th_ix = gl_LocalInvocationIndex;
    uint element_ix = gl_GlobalInvocationID.x;

    // Work assignment: 1 thread : 1 path element
    uint row_count = 0;
    if (gl_LocalInvocationID.y == 0) {
        if (element_ix < conf.n_elements) {
            // Possible TODO: it's not necessary to process backdrops of stroked paths.
            // We had logic for that but took it out because it used the Annotated struct.
            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
            Path path = Path_read(conf.tile_alloc, path_ref);
            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
            row_count = path.bbox.w - path.bbox.y;
            // Paths that don't cross tile top edges don't have backdrops.
            // Don't apply the optimization to paths that may cross the y = 0
            // top edge, but clipped to 1 row.
            if (row_count == 1 && path.bbox.y > 0) {
                // Note: this can probably be expanded to width = 2 as
                // long as it doesn't cross the left edge.
                row_count = 0;
            }
            Alloc path_alloc = new_alloc(
                path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
            sh_row_alloc[th_ix] = path_alloc;
        }
        sh_row_count[th_ix] = row_count;
    }

    // Prefix sum of sh_row_count
    for (uint i = 0; i < LG_BACKDROP_WG; i++) {
        barrier();
        if (gl_LocalInvocationID.y == 0 && th_ix >= (1u << i)) {
            row_count += sh_row_count[th_ix - (1u << i)];
        }
        barrier();
        if (gl_LocalInvocationID.y == 0) {
            sh_row_count[th_ix] = row_count;
        }
    }
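    // Illustrative example of the scan above: per-thread row counts of
    // [2, 1, 3, ...] become inclusive prefix sums [2, 3, 6, ...], so each
    // element knows where its rows start in the flattened row index space.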
    barrier();
    // Work assignment: 1 thread : 1 path element row
    uint total_rows = sh_row_count[BACKDROP_WG - 1];
    for (uint row = th_ix; row < total_rows; row += BACKDROP_WG * BACKDROP_DIST_FACTOR) {
        // Binary search to find element
        uint el_ix = 0;
        for (uint i = 0; i < LG_BACKDROP_WG; i++) {
            uint probe = el_ix + (uint(BACKDROP_WG / 2) >> i);
            if (row >= sh_row_count[probe - 1]) {
                el_ix = probe;
            }
        }
        uint width = sh_row_width[el_ix];
        if (width > 0) {
            // Process one row sequentially
            // Read backdrop value per tile and prefix sum it
            Alloc tiles_alloc = sh_row_alloc[el_ix];
            uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0);
            uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width;
            uint sum = read_mem(tiles_alloc, tile_el_ix);
            for (uint x = 1; x < width; x++) {
                tile_el_ix += 2;
                sum += read_mem(tiles_alloc, tile_el_ix);
                write_mem(tiles_alloc, tile_el_ix, sum);
            }
        }
    }
}

@@ -1,29 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Clear path bbox to prepare for atomic min/max.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

void main() {
    uint ix = gl_GlobalInvocationID.x;
    if (ix < conf.n_path) {
        uint out_ix = (conf.path_bbox_alloc.offset >> 2) + 6 * ix;
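        // The four words below seed the bbox for the atomic min/max updates
        // mentioned above: the min fields start at 0xffff (the largest value
        // the 16-bit-biased coordinates can take) and the max fields at 0.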
        memory[out_ix] = 0xffff;
        memory[out_ix + 1] = 0xffff;
        memory[out_ix + 2] = 0;
        memory[out_ix + 3] = 0;
    }
}

@@ -1,182 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
// based on the path bounding box to bin the paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "bins.h"
#include "drawtag.h"

// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))

// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)

// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared uint sh_chunk_offset[N_TILE];

DrawMonoid load_draw_monoid(uint element_ix) {
    uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
    uint path_ix = memory[base];
    uint clip_ix = memory[base + 1];
    uint scene_offset = memory[base + 2];
    uint info_offset = memory[base + 3];
    return DrawMonoid(path_ix, clip_ix, scene_offset, info_offset);
}

// Load bounding box computed by clip processing
vec4 load_clip_bbox(uint clip_ix) {
    uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * clip_ix;
    float x0 = uintBitsToFloat(memory[base]);
    float y0 = uintBitsToFloat(memory[base + 1]);
    float x1 = uintBitsToFloat(memory[base + 2]);
    float y1 = uintBitsToFloat(memory[base + 3]);
    vec4 bbox = vec4(x0, y0, x1, y1);
    return bbox;
}

vec4 bbox_intersect(vec4 a, vec4 b) {
    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
}

// Load path's bbox from bbox (as written by pathseg).
vec4 load_path_bbox(uint path_ix) {
    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
    float bbox_l = float(memory[base]) - 32768.0;
    float bbox_t = float(memory[base + 1]) - 32768.0;
    float bbox_r = float(memory[base + 2]) - 32768.0;
    float bbox_b = float(memory[base + 3]) - 32768.0;
    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
    return bbox;
}

void store_draw_bbox(uint draw_ix, vec4 bbox) {
    uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
    memory[base] = floatBitsToUint(bbox.x);
    memory[base + 1] = floatBitsToUint(bbox.y);
    memory[base + 2] = floatBitsToUint(bbox.z);
    memory[base + 3] = floatBitsToUint(bbox.w);
}

void main() {
    uint my_partition = gl_WorkGroupID.x;

    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    if (element_ix < conf.n_elements) {
        DrawMonoid draw_monoid = load_draw_monoid(element_ix);
        uint path_ix = draw_monoid.path_ix;
        vec4 clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9);
        uint clip_ix = draw_monoid.clip_ix;
        if (clip_ix > 0) {
            clip_bbox = load_clip_bbox(clip_ix - 1);
        }
        // For clip elements, clip_bbox is the bbox of the clip path, intersected
        // with enclosing clips.
        // For other elements, it is the bbox of the enclosing clips.

        vec4 path_bbox = load_path_bbox(path_ix);
        vec4 bbox = bbox_intersect(path_bbox, clip_bbox);
        // Avoid negative-size bbox (is this necessary)?
        bbox.zw = max(bbox.xy, bbox.zw);
        // Store clip-intersected bbox for tile_alloc.
        store_draw_bbox(element_ix, bbox);
        x0 = int(floor(bbox.x * SX));
        y0 = int(floor(bbox.y * SY));
        x1 = int(ceil(bbox.z * SX));
        y1 = int(ceil(bbox.w * SY));
    }

    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1) / N_TILE_Y;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
    if (x0 == x1)
        y1 = y0;
    int x = x0, y = y0;
    uint my_slice = gl_LocalInvocationID.x / 32;
    uint my_mask = 1u << (gl_LocalInvocationID.x & 31);
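    // Sketch of the indexing above (illustrative): invocation 37 uses slice
    // 37 / 32 = 1 and mask 1 << (37 & 31) = 1 << 5, i.e. one bit per thread,
    // spread across N_SLICE 32-bit words per bin.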
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }

    barrier();
    // Allocate output segments.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
    uint chunk_offset = 0;
    if (element_count != 0) {
        chunk_offset = malloc_stage(element_count * BinInstance_size, conf.mem_size, STAGE_BINNING);
        sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_offset);

    barrier();

    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            uint chunk_offset = sh_chunk_offset[bin_ix];
            if (chunk_offset != MALLOC_FAILED) {
                memory[(chunk_offset >> 2) + idx] = element_ix;
            }
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}

@@ -1,31 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct BinInstanceRef {
    uint offset;
};

struct BinInstance {
    uint element_ix;
};

#define BinInstance_size 4

BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
    return BinInstanceRef(ref.offset + index * BinInstance_size);
}

BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    BinInstance s;
    s.element_ix = raw0;
    return s;
}

void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.element_ix);
}

@@ -1,291 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Mode definitions and functions for blending and composition.

#define Blend_Normal 0
#define Blend_Multiply 1
#define Blend_Screen 2
#define Blend_Overlay 3
#define Blend_Darken 4
#define Blend_Lighten 5
#define Blend_ColorDodge 6
#define Blend_ColorBurn 7
#define Blend_HardLight 8
#define Blend_SoftLight 9
#define Blend_Difference 10
#define Blend_Exclusion 11
#define Blend_Hue 12
#define Blend_Saturation 13
#define Blend_Color 14
#define Blend_Luminosity 15
#define Blend_Clip 128

vec3 screen(vec3 cb, vec3 cs) {
    return cb + cs - (cb * cs);
}

float color_dodge(float cb, float cs) {
    if (cb == 0.0)
        return 0.0;
    else if (cs == 1.0)
        return 1.0;
    else
        return min(1.0, cb / (1.0 - cs));
}

float color_burn(float cb, float cs) {
    if (cb == 1.0)
        return 1.0;
    else if (cs == 0.0)
        return 0.0;
    else
        return 1.0 - min(1.0, (1.0 - cb) / cs);
}

vec3 hard_light(vec3 cb, vec3 cs) {
    return mix(
        screen(cb, 2.0 * cs - 1.0),
        cb * 2.0 * cs,
        lessThanEqual(cs, vec3(0.5))
    );
}

vec3 soft_light(vec3 cb, vec3 cs) {
    vec3 d = mix(
        sqrt(cb),
        ((16.0 * cb - vec3(12.0)) * cb + vec3(4.0)) * cb,
        lessThanEqual(cb, vec3(0.25))
    );
    return mix(
        cb + (2.0 * cs - vec3(1.0)) * (d - cb),
        cb - (vec3(1.0) - 2.0 * cs) * cb * (vec3(1.0) - cb),
        lessThanEqual(cs, vec3(0.5))
    );
}

float sat(vec3 c) {
    return max(c.r, max(c.g, c.b)) - min(c.r, min(c.g, c.b));
}

float lum(vec3 c) {
    vec3 f = vec3(0.3, 0.59, 0.11);
    return dot(c, f);
}

vec3 clip_color(vec3 c) {
    float L = lum(c);
    float n = min(c.r, min(c.g, c.b));
    float x = max(c.r, max(c.g, c.b));
    if (n < 0.0)
        c = L + (((c - L) * L) / (L - n));
    if (x > 1.0)
        c = L + (((c - L) * (1.0 - L)) / (x - L));
    return c;
}

vec3 set_lum(vec3 c, float l) {
    return clip_color(c + (l - lum(c)));
}

void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) {
    if (cmax > cmin) {
        cmid = (((cmid - cmin) * s) / (cmax - cmin));
        cmax = s;
    } else {
        cmid = 0.0;
        cmax = 0.0;
    }
    cmin = 0.0;
}

vec3 set_sat(vec3 c, float s) {
    if (c.r <= c.g) {
        if (c.g <= c.b) {
            set_sat_inner(c.r, c.g, c.b, s);
        } else {
            if (c.r <= c.b) {
                set_sat_inner(c.r, c.b, c.g, s);
            } else {
                set_sat_inner(c.b, c.r, c.g, s);
            }
        }
    } else {
        if (c.r <= c.b) {
            set_sat_inner(c.g, c.r, c.b, s);
        } else {
            if (c.g <= c.b) {
                set_sat_inner(c.g, c.b, c.r, s);
            } else {
                set_sat_inner(c.b, c.g, c.r, s);
            }
        }
    }
    return c;
}

// Blends two RGB colors together. The colors are assumed to be in sRGB
// color space, and this function does not take alpha into account.
vec3 mix_blend(vec3 cb, vec3 cs, uint mode) {
    vec3 b = vec3(0.0);
    switch (mode) {
    case Blend_Multiply:
        b = cb * cs;
        break;
    case Blend_Screen:
        b = screen(cb, cs);
        break;
    case Blend_Overlay:
        b = hard_light(cs, cb);
        break;
    case Blend_Darken:
        b = min(cb, cs);
        break;
    case Blend_Lighten:
        b = max(cb, cs);
        break;
    case Blend_ColorDodge:
        b = vec3(color_dodge(cb.x, cs.x), color_dodge(cb.y, cs.y), color_dodge(cb.z, cs.z));
        break;
    case Blend_ColorBurn:
        b = vec3(color_burn(cb.x, cs.x), color_burn(cb.y, cs.y), color_burn(cb.z, cs.z));
        break;
    case Blend_HardLight:
        b = hard_light(cb, cs);
        break;
    case Blend_SoftLight:
        b = soft_light(cb, cs);
        break;
    case Blend_Difference:
        b = abs(cb - cs);
        break;
    case Blend_Exclusion:
        b = cb + cs - 2 * cb * cs;
        break;
    case Blend_Hue:
        b = set_lum(set_sat(cs, sat(cb)), lum(cb));
        break;
    case Blend_Saturation:
        b = set_lum(set_sat(cb, sat(cs)), lum(cb));
        break;
    case Blend_Color:
        b = set_lum(cs, lum(cb));
        break;
    case Blend_Luminosity:
        b = set_lum(cb, lum(cs));
        break;
    default:
        b = cs;
        break;
    }
    return b;
}

#define Comp_Clear 0
#define Comp_Copy 1
#define Comp_Dest 2
#define Comp_SrcOver 3
#define Comp_DestOver 4
#define Comp_SrcIn 5
#define Comp_DestIn 6
#define Comp_SrcOut 7
#define Comp_DestOut 8
#define Comp_SrcAtop 9
#define Comp_DestAtop 10
#define Comp_Xor 11
#define Comp_Plus 12
#define Comp_PlusLighter 13

// Apply general compositing operation.
// Inputs are separated colors and alpha, output is premultiplied.
vec4 mix_compose(vec3 cb, vec3 cs, float ab, float as, uint mode) {
    float fa = 0.0;
    float fb = 0.0;
    switch (mode) {
    case Comp_Copy:
        fa = 1.0;
        fb = 0.0;
        break;
    case Comp_Dest:
        fa = 0.0;
        fb = 1.0;
        break;
    case Comp_SrcOver:
        fa = 1.0;
        fb = 1.0 - as;
        break;
    case Comp_DestOver:
        fa = 1.0 - ab;
        fb = 1.0;
        break;
    case Comp_SrcIn:
        fa = ab;
        fb = 0.0;
        break;
    case Comp_DestIn:
        fa = 0.0;
        fb = as;
        break;
    case Comp_SrcOut:
        fa = 1.0 - ab;
        fb = 0.0;
        break;
    case Comp_DestOut:
        fa = 0.0;
        fb = 1.0 - as;
        break;
    case Comp_SrcAtop:
        fa = ab;
        fb = 1.0 - as;
        break;
    case Comp_DestAtop:
        fa = 1.0 - ab;
        fb = as;
        break;
    case Comp_Xor:
        fa = 1.0 - ab;
        fb = 1.0 - as;
        break;
    case Comp_Plus:
        fa = 1.0;
        fb = 1.0;
        break;
    case Comp_PlusLighter:
        return min(vec4(1.0), vec4(as * cs + ab * cb, as + ab));
    default:
        break;
    }
    float as_fa = as * fa;
    float ab_fb = ab * fb;
    vec3 co = as_fa * cs + ab_fb * cb;
    return vec4(co, as_fa + ab_fb);
}
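// Worked example (illustrative): for Comp_SrcOver with as = 0.5 and ab = 1.0,
// fa = 1.0 and fb = 1.0 - as = 0.5, so the result is 0.5 * cs + 0.5 * cb with
// alpha 0.5 + 0.5 = 1.0, the usual source-over formula in premultiplied form.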

#define BlendComp_default (Blend_Normal << 8 | Comp_SrcOver)
#define BlendComp_clip (Blend_Clip << 8 | Comp_SrcOver)

// This is added to alpha to prevent divide-by-zero
#define EPSILON 1e-15

// Apply blending and composition. Both input and output colors are
// premultiplied RGB.
vec4 mix_blend_compose(vec4 backdrop, vec4 src, uint mode) {
    if ((mode & 0x7fff) == BlendComp_default) {
        // Both normal+src_over blend and clip case
        return backdrop * (1.0 - src.a) + src;
    }
    // Un-premultiply colors for blending
    float inv_src_a = 1.0 / (src.a + EPSILON);
    vec3 cs = src.rgb * inv_src_a;
    float inv_backdrop_a = 1.0 / (backdrop.a + EPSILON);
    vec3 cb = backdrop.rgb * inv_backdrop_a;
    uint blend_mode = mode >> 8;
    vec3 blended = mix_blend(cb, cs, blend_mode);
    cs = mix(cs, blended, backdrop.a);
    uint comp_mode = mode & 0xff;
    if (comp_mode == Comp_SrcOver) {
        vec3 co = mix(backdrop.rgb, cs, src.a);
        return vec4(co, src.a + backdrop.a * (1 - src.a));
    } else {
        return mix_compose(cb, cs, backdrop.a, src.a, comp_mode);
    }
}

@@ -1,118 +0,0 @@
# Build file for shaders.

# You must have Vulkan tools in your path, or patch here.

glslang_validator = glslangValidator
spirv_cross = spirv-cross
dxc = dxc

# See https://github.com/KhronosGroup/SPIRV-Cross/issues/1248 for
# why we set this.
msl_flags = --msl-decoration-binding

rule glsl
  command = $glslang_validator $flags -V -o $out $in

rule hlsl
  command = $spirv_cross --hlsl --shader-model 60 $in --output $out

rule dxil
  command = $dxc -T cs_6_0 $in -Fo $out

rule msl
  command = $spirv_cross --msl $in --output $out $msl_flags

build gen/binning.spv: glsl binning.comp | bins.h drawtag.h setup.h mem.h
build gen/binning.hlsl: hlsl gen/binning.spv
build gen/binning.dxil: dxil gen/binning.hlsl
build gen/binning.msl: msl gen/binning.spv

build gen/tile_alloc.spv: glsl tile_alloc.comp | drawtag.h tile.h setup.h mem.h
build gen/tile_alloc.hlsl: hlsl gen/tile_alloc.spv
build gen/tile_alloc.dxil: dxil gen/tile_alloc.hlsl
build gen/tile_alloc.msl: msl gen/tile_alloc.spv

build gen/path_coarse.spv: glsl path_coarse.comp | pathseg.h tile.h setup.h mem.h
build gen/path_coarse.hlsl: hlsl gen/path_coarse.spv
build gen/path_coarse.dxil: dxil gen/path_coarse.hlsl
build gen/path_coarse.msl: msl gen/path_coarse.spv

build gen/backdrop.spv: glsl backdrop.comp | tile.h setup.h mem.h
build gen/backdrop.hlsl: hlsl gen/backdrop.spv
build gen/backdrop.dxil: dxil gen/backdrop.hlsl
build gen/backdrop.msl: msl gen/backdrop.spv

build gen/backdrop_lg.spv: glsl backdrop.comp | tile.h setup.h mem.h
  flags = -DBACKDROP_DIST_FACTOR=4
build gen/backdrop_lg.hlsl: hlsl gen/backdrop_lg.spv
build gen/backdrop_lg.dxil: dxil gen/backdrop_lg.hlsl
build gen/backdrop_lg.msl: msl gen/backdrop_lg.spv

build gen/coarse.spv: glsl coarse.comp | drawtag.h bins.h ptcl.h blend.h setup.h mem.h
build gen/coarse.hlsl: hlsl gen/coarse.spv
build gen/coarse.dxil: dxil gen/coarse.hlsl
build gen/coarse.msl: msl gen/coarse.spv

build gen/kernel4.spv: glsl kernel4.comp | blend.h ptcl.h setup.h mem.h
build gen/kernel4.hlsl: hlsl gen/kernel4.spv
build gen/kernel4.dxil: dxil gen/kernel4.hlsl
build gen/kernel4.msl: msl gen/kernel4.spv

build gen/kernel4_gray.spv: glsl kernel4.comp | blend.h ptcl.h setup.h mem.h
  flags = -DGRAY
build gen/kernel4_gray.hlsl: hlsl gen/kernel4_gray.spv
build gen/kernel4_gray.dxil: dxil gen/kernel4_gray.hlsl
build gen/kernel4_gray.msl: msl gen/kernel4_gray.spv

# New element pipeline follows

build gen/pathtag_reduce.spv: glsl pathtag_reduce.comp | pathtag.h setup.h mem.h
build gen/pathtag_reduce.hlsl: hlsl gen/pathtag_reduce.spv
build gen/pathtag_reduce.dxil: dxil gen/pathtag_reduce.hlsl
build gen/pathtag_reduce.msl: msl gen/pathtag_reduce.spv

build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h setup.h
  flags = -DROOT
build gen/pathtag_root.hlsl: hlsl gen/pathtag_root.spv
build gen/pathtag_root.dxil: dxil gen/pathtag_root.hlsl
build gen/pathtag_root.msl: msl gen/pathtag_root.spv

build gen/bbox_clear.spv: glsl bbox_clear.comp | setup.h mem.h
build gen/bbox_clear.hlsl: hlsl gen/bbox_clear.spv
build gen/bbox_clear.dxil: dxil gen/bbox_clear.hlsl
build gen/bbox_clear.msl: msl gen/bbox_clear.spv

build gen/pathseg.spv: glsl pathseg.comp | scene.h tile.h pathseg.h pathtag.h setup.h mem.h
build gen/pathseg.hlsl: hlsl gen/pathseg.spv
build gen/pathseg.dxil: dxil gen/pathseg.hlsl
build gen/pathseg.msl: msl gen/pathseg.spv

build gen/draw_reduce.spv: glsl draw_reduce.comp | scene.h drawtag.h setup.h mem.h
build gen/draw_reduce.hlsl: hlsl gen/draw_reduce.spv
build gen/draw_reduce.dxil: dxil gen/draw_reduce.hlsl
build gen/draw_reduce.msl: msl gen/draw_reduce.spv

build gen/draw_root.spv: glsl draw_scan.comp | drawtag.h setup.h
  flags = -DROOT
build gen/draw_root.hlsl: hlsl gen/draw_root.spv
build gen/draw_root.dxil: dxil gen/draw_root.hlsl
build gen/draw_root.msl: msl gen/draw_root.spv

build gen/draw_leaf.spv: glsl draw_leaf.comp | blend.h scene.h drawtag.h setup.h mem.h
build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
build gen/draw_leaf.msl: msl gen/draw_leaf.spv

build gen/clip_reduce.spv: glsl clip_reduce.comp | mem.h setup.h
build gen/clip_reduce.hlsl: hlsl gen/clip_reduce.spv
build gen/clip_reduce.dxil: dxil gen/clip_reduce.hlsl
build gen/clip_reduce.msl: msl gen/clip_reduce.spv

build gen/clip_leaf.spv: glsl clip_leaf.comp | mem.h setup.h
build gen/clip_leaf.hlsl: hlsl gen/clip_leaf.spv
build gen/clip_leaf.dxil: dxil gen/clip_leaf.hlsl
build gen/clip_leaf.msl: msl gen/clip_leaf.spv

build spv: phony gen/backdrop_lg.spv gen/backdrop.spv gen/bbox_clear.spv gen/binning.spv gen/clip_leaf.spv gen/clip_reduce.spv gen/coarse.spv gen/draw_leaf.spv gen/draw_reduce.spv gen/draw_root.spv gen/kernel4.spv gen/kernel4_gray.spv gen/path_coarse.spv gen/pathseg.spv gen/pathtag_reduce.spv gen/pathtag_root.spv gen/tile_alloc.spv
build dxil: phony gen/backdrop.hlsl gen/backdrop_lg.hlsl gen/bbox_clear.hlsl gen/binning.hlsl gen/clip_leaf.hlsl gen/clip_reduce.hlsl gen/coarse.hlsl gen/draw_leaf.hlsl gen/draw_reduce.hlsl gen/draw_root.hlsl gen/kernel4.hlsl gen/kernel4_gray.hlsl gen/path_coarse.hlsl gen/pathseg.hlsl gen/pathtag_reduce.hlsl gen/pathtag_root.hlsl gen/tile_alloc.hlsl
build msl: phony gen/backdrop_lg.msl gen/backdrop.msl gen/bbox_clear.msl gen/binning.msl gen/clip_leaf.msl gen/clip_reduce.msl gen/coarse.msl gen/draw_leaf.msl gen/draw_reduce.msl gen/draw_root.msl gen/kernel4.msl gen/kernel4_gray.msl gen/path_coarse.msl gen/pathseg.msl gen/pathtag_reduce.msl gen/pathtag_root.msl gen/tile_alloc.msl
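
# Sketch of the pattern for adding a shader permutation (the "foo" names are
# hypothetical and not part of this build): compile GLSL to SPIR-V once, then
# translate to HLSL/DXIL and MSL.
# build gen/foo.spv: glsl foo.comp | setup.h mem.h
# build gen/foo.hlsl: hlsl gen/foo.spv
# build gen/foo.dxil: dxil gen/foo.hlsl
# build gen/foo.msl: msl gen/foo.spv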

@@ -1,285 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The second dispatch of clip stack processing.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE WG_SIZE

layout(local_size_x = WG_SIZE) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

// Some of this is cut'n'paste duplication with the reduce pass, and
// arguably should be moved to a common .h file.
// The bicyclic monoid

struct ClipEl {
    // index of parent node
    uint parent_ix;
    // bounding box
    vec4 bbox;
};

struct Bic {
    uint a;
    uint b;
};

Bic bic_combine(Bic x, Bic y) {
    uint m = min(x.b, y.a);
    return Bic(x.a + y.a - m, x.b + y.b - m);
}
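// Worked example of the combine above (illustrative): Bic(a, b) counts
// unmatched pops and unmatched pushes. Combining x = Bic(0, 2) with
// y = Bic(1, 0): m = min(2, 1) = 1, so one of x's pushes cancels y's pop,
// giving Bic(0, 1).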

// Load path's bbox from bbox (as written by pathseg).
vec4 load_path_bbox(uint path_ix) {
    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
    float bbox_l = float(memory[base]) - 32768.0;
    float bbox_t = float(memory[base + 1]) - 32768.0;
    float bbox_r = float(memory[base + 2]) - 32768.0;
    float bbox_b = float(memory[base + 3]) - 32768.0;
    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
    return bbox;
}

vec4 bbox_intersect(vec4 a, vec4 b) {
    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
}

shared Bic sh_bic[WG_SIZE * 2 - 2];
shared uint sh_stack[PARTITION_SIZE];
shared vec4 sh_stack_bbox[PARTITION_SIZE];
shared uint sh_link[PARTITION_SIZE];
shared vec4 sh_bbox[PARTITION_SIZE];

// This is adapted directly from the stack monoid impl.
// Return value is reference within partition if >= 0,
// otherwise reference to stack.
uint search_link(inout Bic bic) {
    uint ix = gl_LocalInvocationID.x;
    uint j = 0;
    while (j < LG_WG_SIZE) {
        uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
        if (((ix >> j) & 1) != 0) {
            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
            if (test.b > 0) {
                break;
            }
            bic = test;
            ix -= 1u << j;
        }
        j++;
    }
    if (ix > 0) {
        while (j > 0) {
            j--;
            uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
            if (test.b == 0) {
                bic = test;
                ix -= 1u << j;
            }
        }
    }
    // ix is the smallest value such that reduce(ix..th).b == 0
    if (ix > 0) {
        return ix - 1;
    } else {
        return ~0u - bic.a;
    }
}

Bic load_bic(uint ix) {
    uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
    return Bic(memory[base], memory[base + 1]);
}

ClipEl load_clip_el(uint ix) {
    uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
    uint parent_ix = memory[base];
    float x0 = uintBitsToFloat(memory[base + 1]);
    float y0 = uintBitsToFloat(memory[base + 2]);
    float x1 = uintBitsToFloat(memory[base + 3]);
    float y1 = uintBitsToFloat(memory[base + 4]);
    vec4 bbox = vec4(x0, y0, x1, y1);
    return ClipEl(parent_ix, bbox);
}

uint load_path_ix(uint ix) {
    // This is one approach to a partial final block. Another would be
    // to do a memset to the padding in the command queue.
    if (ix < conf.n_clip) {
        return memory[(conf.clip_alloc.offset >> 2) + ix];
    } else {
        // EndClip tags don't imply further loads.
        return 0x80000000;
    }
}

void store_clip_bbox(uint ix, vec4 bbox) {
    uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
    memory[base] = floatBitsToUint(bbox.x);
    memory[base + 1] = floatBitsToUint(bbox.y);
    memory[base + 2] = floatBitsToUint(bbox.z);
    memory[base + 3] = floatBitsToUint(bbox.w);
}

void main() {
    // materialize stack up to the start of this partition. This
    // is based on the pure stack monoid, but with two additions.

    // First, (this only matters if the stack goes deeper than the
    // partition size, which might be unlikely in practice), the
    // topmost stack element from each partition is picked, then an
    // exclusive scan of those. Also note that if this is skipped,
    // a scan is not needed in the reduce stage.

    // Second, after the stream compaction, do a scan of the retrieved
    // bbox values.
    uint th = gl_LocalInvocationID.x;
    Bic bic = Bic(0, 0);
    if (th < gl_WorkGroupID.x) {
        bic = load_bic(th);
    }
    sh_bic[th] = bic;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (th + (1u << i) < WG_SIZE) {
            Bic other = sh_bic[th + (1u << i)];
            bic = bic_combine(bic, other);
        }
        barrier();
        sh_bic[th] = bic;
    }
    barrier();
    uint stack_size = sh_bic[0].b;

    // TODO: do bbox scan here (to unlock greater stack depth)

    // binary search in stack
    uint sp = PARTITION_SIZE - 1 - th;
    uint ix = 0;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
        if (sp < sh_bic[probe].b) {
            ix = probe;
        }
    }
    // ix is largest value such that sp < sh_bic[ix].b (if any)
    uint b = sh_bic[ix].b;
    vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);
    if (sp < b) {
        // maybe store the index here for future use?
        ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
        sh_stack[th] = el.parent_ix;
        bbox = el.bbox;
        // other element values here?
    }

    // forward scan of bbox values of prefix stack
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        sh_stack_bbox[th] = bbox;
        barrier();
        if (th >= (1u << i)) {
            bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
        }
        barrier();
    }
    sh_stack_bbox[th] = bbox;

    // Read input and compute bicyclic semigroup binary tree
    uint inp = load_path_ix(gl_GlobalInvocationID.x);
    bool is_push = int(inp) >= 0;
    bic = Bic(1 - uint(is_push), uint(is_push));
    sh_bic[th] = bic;
    if (is_push) {
        bbox = load_path_bbox(inp);
    } else {
        bbox = vec4(-1e9, -1e9, 1e9, 1e9);
    }
    uint inbase = 0;
    for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
        uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
        barrier();
        if (th < (1u << (LG_WG_SIZE - 1 - i))) {
            sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
        }
        inbase = outbase;
    }
    barrier();
    // Search for predecessor node
    bic = Bic(0, 0);
    uint link = search_link(bic);
    // we use N_SEQ > 1 convention here:
    // link >= 0 is index within partition
    // link < 0 is reference to stack

    // We want grandparent bbox for pop nodes, so follow those links.
    sh_link[th] = link;
    barrier();
    uint grandparent;
    if (int(link) >= 0) {
        grandparent = sh_link[link];
    } else {
        grandparent = link - 1;
    }

    // Resolve parent
    uint parent;
    if (int(link) >= 0) {
        parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
    } else if (int(link + stack_size) >= 0) {
        parent = sh_stack[PARTITION_SIZE + link];
    } else {
        parent = ~0u;
    }

    // bbox scan along parent links
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        // sh_link was already stored for first iteration
        if (i != 0) {
            sh_link[th] = link;
        }
        sh_bbox[th] = bbox;
        barrier();
        if (int(link) >= 0) {
            bbox = bbox_intersect(sh_bbox[link], bbox);
            link = sh_link[link];
        }
        barrier();
    }
    if (int(link + stack_size) >= 0) {
        bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
    }
    // At this point, bbox is the reduction of bounding boxes along the tree.
    sh_bbox[th] = bbox;
    barrier();

    uint path_ix = inp;
    if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
        // Is this load expensive? If so, it's loaded earlier for in-partition
        // and is in the ClipEl for cross-partition.
        // If not, can probably get rid of it in the stack intermediate buf.
        path_ix = load_path_ix(parent);
        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
        // Fix up drawmonoid so path_ix at EndClip matches BeginClip
        memory[drawmonoid_out_base] = path_ix;

        if (int(grandparent) >= 0) {
            bbox = sh_bbox[grandparent];
        } else if (int(grandparent + stack_size) >= 0) {
            bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
        } else {
            bbox = vec4(-1e9, -1e9, 1e9, 1e9);
        }
    }
    store_clip_bbox(gl_GlobalInvocationID.x, bbox);
}
|
|
@@ -1,146 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The reduce pass for clip stack processing.

// The primary input is a sequence of path ids representing paths to
// push, with a special value of ~0 to represent pop.

// For each path, the bounding box is found in the anno stream
// (anno_alloc), though this may change.

// Output is a stack monoid reduction for the partition. The Bic
// is stored in the BicBuf, and the stack slice in StackBuf.

// Note: for this shader, only pushes are represented in the stack
// monoid reduction output, so we don't have to worry about the
// interpretation of pops.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE WG_SIZE

layout(local_size_x = WG_SIZE) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

// The intermediate state for clip processing.
struct ClipEl {
    // index of parent node
    uint parent_ix;
    // bounding box
    vec4 bbox;
};

// The bicyclic monoid
struct Bic {
    uint a;
    uint b;
};

Bic bic_combine(Bic x, Bic y) {
    uint m = min(x.b, y.a);
    return Bic(x.a + y.a - m, x.b + y.b - m);
}
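
// Added note: Bic is the bicyclic stack monoid, where a counts unmatched
// pops and b counts unmatched pushes. In bic_combine, min(x.b, y.a)
// pushes from x are cancelled by pops from y. Worked example:
// x = Bic(0, 3) (3 pushes) combined with y = Bic(2, 1) (2 pops, 1 push)
// gives m = 2, so the result is Bic(0, 2): two of x's pushes are popped
// by y, leaving one of x's pushes plus y's one push.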

shared Bic sh_bic[WG_SIZE];
shared uint sh_parent[WG_SIZE];
shared uint sh_path_ix[WG_SIZE];
shared vec4 sh_bbox[WG_SIZE];

// Load a path's bbox from the bbox buffer (as written by pathseg).
vec4 load_path_bbox(uint path_ix) {
    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
    float bbox_l = float(memory[base]) - 32768.0;
    float bbox_t = float(memory[base + 1]) - 32768.0;
    float bbox_r = float(memory[base + 2]) - 32768.0;
    float bbox_b = float(memory[base + 3]) - 32768.0;
    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
    return bbox;
}
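
// Added note: the 32768.0 bias above assumes path bboxes are stored as
// biased unsigned integers, so that negative coordinates can still be
// accumulated with unsigned atomic min/max in the pathseg stage.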

vec4 bbox_intersect(vec4 a, vec4 b) {
    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
}
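
// Added note: an empty intersection yields an "inverted" box with
// min > max. That is fine here: such a box should be treated downstream
// as having zero area, and intersecting it further keeps it empty.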

void store_bic(uint ix, Bic bic) {
    uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
    memory[base] = bic.a;
    memory[base + 1] = bic.b;
}

void store_clip_el(uint ix, ClipEl el) {
    uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
    memory[base] = el.parent_ix;
    memory[base + 1] = floatBitsToUint(el.bbox.x);
    memory[base + 2] = floatBitsToUint(el.bbox.y);
    memory[base + 3] = floatBitsToUint(el.bbox.z);
    memory[base + 4] = floatBitsToUint(el.bbox.w);
}

void main() {
    uint th = gl_LocalInvocationID.x;
    uint inp = memory[(conf.clip_alloc.offset >> 2) + gl_GlobalInvocationID.x];
    bool is_push = int(inp) >= 0;
    // reverse scan of bicyclic semigroup
    Bic bic = Bic(1 - uint(is_push), uint(is_push));
    sh_bic[th] = bic;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (th + (1u << i) < WG_SIZE) {
            Bic other = sh_bic[th + (1u << i)];
            bic = bic_combine(bic, other);
        }
        barrier();
        sh_bic[th] = bic;
    }
    if (th == 0) {
        store_bic(gl_WorkGroupID.x, bic);
    }
    barrier();
    uint size = sh_bic[0].b;
    bic = Bic(0, 0);
    if (th + 1 < WG_SIZE) {
        bic = sh_bic[th + 1];
    }
    if (is_push && bic.a == 0) {
        uint local_ix = size - bic.b - 1;
        sh_parent[local_ix] = th;
        sh_path_ix[local_ix] = inp;
    }
    barrier();
    // Do forward scan of bounding box intersection
    vec4 bbox;
    uint path_ix;
    if (th < size) {
        path_ix = sh_path_ix[th];
        bbox = load_path_bbox(path_ix);
    }
    // Not necessary if depth is bounded by wg size
#if 0
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        // We gate so we never access uninit data, but it might
        // be more efficient to avoid the conditionals.
        if (th < size) {
            sh_bbox[th] = bbox;
        }
        barrier();
        if (th < size && th >= (1u << i)) {
            bbox = bbox_intersect(sh_bbox[th - (1u << i)], bbox);
        }
        barrier();
    }
#endif
    if (th < size) {
        uint parent_ix = sh_parent[th] + gl_WorkGroupID.x * PARTITION_SIZE;
        ClipEl el = ClipEl(parent_ix, bbox);
        store_clip_el(gl_GlobalInvocationID.x, el);
    }
}
@@ -1,480 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The coarse rasterizer stage of the pipeline.
//
// As input we have the ordered partitions of paths from the binning phase and
// the annotated tile list of segments and backdrop per path.
//
// Each workgroup operates on one bin, stream compacting the elements
// corresponding to that bin.
//
// As output we have an ordered command stream per tile. Every tile from a
// path (backdrop + segment list) will be encoded.
#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

#include "drawtag.h"
#include "bins.h"
#include "tile.h"
#include "ptcl.h"
#include "blend.h"

#define LG_N_PART_READ (7 + LG_WG_FACTOR)
#define N_PART_READ (1 << LG_N_PART_READ)

shared uint sh_elements[N_TILE];

// Number of elements in the partition; prefix sum.
shared uint sh_part_count[N_PART_READ];
shared Alloc sh_part_elements[N_PART_READ];

shared uint sh_bitmaps[N_SLICE][N_TILE];

shared uint sh_tile_count[N_TILE];
// The width of the tile rect for the element, intersected with this bin
shared uint sh_tile_width[N_TILE];
shared uint sh_tile_x0[N_TILE];
shared uint sh_tile_y0[N_TILE];

// These are set up so base + tile_y * stride + tile_x points to a Tile.
shared uint sh_tile_base[N_TILE];
shared uint sh_tile_stride[N_TILE];

#ifdef MEM_DEBUG
// Store allocs only when MEM_DEBUG to save shared memory traffic.
shared Alloc sh_tile_alloc[N_TILE];

void write_tile_alloc(uint el_ix, Alloc a) {
    sh_tile_alloc[el_ix] = a;
}

Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
    return sh_tile_alloc[el_ix];
}
#else
void write_tile_alloc(uint el_ix, Alloc a) {
    // No-op
}

Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
    // All memory.
    return new_alloc(0, conf.mem_size, mem_ok);
}
#endif

// The maximum number of commands per annotated element.
#define ANNO_COMMANDS 2

// All writes to the output must be gated by mem_ok.
bool mem_ok = true;

// Perhaps cmd allocations should be a global? This is a style question.
void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset < cmd_limit) {
        return;
    }
    uint new_cmd = malloc_stage(PTCL_INITIAL_ALLOC, conf.mem_size, STAGE_COARSE);
    if (new_cmd == MALLOC_FAILED) {
        mem_ok = false;
    }
    if (mem_ok) {
        CmdJump jump = CmdJump(new_cmd);
        Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
    }
    cmd_alloc = new_alloc(new_cmd, PTCL_INITIAL_ALLOC, true);
    cmd_ref = CmdRef(new_cmd);
    // Reserve space for the maximum number of commands and a potential jump.
    cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
}
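
// Added note: the per-tile command list is thus a linked list of
// fixed-size chunks. When the current chunk is about to overflow,
// alloc_cmd writes a CmdJump to a freshly allocated chunk, so the fine
// rasterizer can follow jumps without knowing the chunk size. A typical
// call site looks like:
//
//     alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
//     Cmd_Solid_write(cmd_alloc, cmd_ref);
//     cmd_ref.offset += 4;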

void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
    if (linewidth < 0.0) {
        if (tile.tile.offset != 0) {
            CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
            if (mem_ok) {
                Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
            }
            cmd_ref.offset += 4 + CmdFill_size;
        } else {
            if (mem_ok) {
                Cmd_Solid_write(alloc, cmd_ref);
            }
            cmd_ref.offset += 4;
        }
    } else {
        CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
        if (mem_ok) {
            Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
        }
        cmd_ref.offset += 4 + CmdStroke_size;
    }
}

void main() {
    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
        return;
    }
    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
    uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
    uint partition_ix = 0;
    uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
    uint th_ix = gl_LocalInvocationID.x;

    // Coordinates of top left of bin, in tiles.
    uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
    uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;

    // Per-tile state
    uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
    uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
    uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
    // Reserve space for the maximum number of commands and a potential jump.
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
    // The nesting depth of the clip stack
    uint clip_depth = 0;
    // State for the "clip zero" optimization. If it's nonzero, then we are
    // currently in a clip for which the entire tile has an alpha of zero, and
    // the value is the depth after the "begin clip" of that element.
    uint clip_zero_depth = 0;

    // I'm sure we can figure out how to do this with at least one fewer register...
    // Items up to rd_ix have been read from sh_elements
    uint rd_ix = 0;
    // Items up to wr_ix have been written into sh_elements
    uint wr_ix = 0;
    // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
    uint part_start_ix = 0;
    uint ready_ix = 0;

    Alloc scratch_alloc = slice_mem(cmd_alloc, 0, Alloc_size);
    cmd_ref.offset += 4;
    // Accounting for allocation of blend memory
    uint render_blend_depth = 0;
    uint max_blend_depth = 0;

    uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
    uint drawtag_start = conf.drawtag_offset >> 2;
    uint drawdata_start = conf.drawdata_offset >> 2;
    uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
    while (true) {
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
        }

        // parallel read of input partitions
        do {
            if (ready_ix == wr_ix && partition_ix < n_partitions) {
                part_start_ix = ready_ix;
                uint count = 0;
                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
                    uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
                    count = read_mem(conf.bin_alloc, in_ix);
                    uint offset = read_mem(conf.bin_alloc, in_ix + 1);
                    sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, true);
                }
                // prefix sum of counts
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    if (th_ix < N_PART_READ) {
                        sh_part_count[th_ix] = count;
                    }
                    barrier();
                    if (th_ix < N_PART_READ) {
                        if (th_ix >= (1u << i)) {
                            count += sh_part_count[th_ix - (1u << i)];
                        }
                    }
                    barrier();
                }
                if (th_ix < N_PART_READ) {
                    sh_part_count[th_ix] = part_start_ix + count;
                }
                barrier();
                ready_ix = sh_part_count[N_PART_READ - 1];
                partition_ix += N_PART_READ;
            }
            // use binary search to find element to read
            uint ix = rd_ix + th_ix;
            if (ix >= wr_ix && ix < ready_ix) {
                uint part_ix = 0;
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
                    if (ix >= sh_part_count[probe - 1]) {
                        part_ix = probe;
                    }
                }
                ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
                Alloc bin_alloc = sh_part_elements[part_ix];
                BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
                BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
                sh_elements[th_ix] = inst.element_ix;
            }
            barrier();

            wr_ix = min(rd_ix + N_TILE, ready_ix);
        } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));

        // We've done the merge and filled the buffer.

        // Read one element, compute coverage.
        uint tag = Drawtag_Nop;
        uint element_ix;
        if (th_ix + rd_ix < wr_ix) {
            element_ix = sh_elements[th_ix];
            tag = scene[drawtag_start + element_ix];
        }

        // Bounding box of element in pixel coordinates.
        uint tile_count;
        switch (tag) {
        case Drawtag_FillColor:
        case Drawtag_FillImage:
        case Drawtag_FillLinGradient:
        case Drawtag_FillRadGradient:
        case Drawtag_BeginClip:
        case Drawtag_EndClip:
            uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
            uint path_ix = memory[drawmonoid_base];
            Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
            uint stride = path.bbox.z - path.bbox.x;
            sh_tile_stride[th_ix] = stride;
            int dx = int(path.bbox.x) - int(bin_tile_x);
            int dy = int(path.bbox.y) - int(bin_tile_y);
            int x0 = clamp(dx, 0, N_TILE_X);
            int y0 = clamp(dy, 0, N_TILE_Y);
            int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
            int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
            sh_tile_width[th_ix] = uint(x1 - x0);
            sh_tile_x0[th_ix] = x0;
            sh_tile_y0[th_ix] = y0;
            tile_count = uint(x1 - x0) * uint(y1 - y0);
            // base relative to bin
            uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
            sh_tile_base[th_ix] = base;
            Alloc path_alloc = new_alloc(path.tiles.offset,
                                         (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
            write_tile_alloc(th_ix, path_alloc);
            break;
        default:
            tile_count = 0;
            break;
        }

        // Prefix sum of sh_tile_count
        sh_tile_count[th_ix] = tile_count;
        for (uint i = 0; i < LG_N_TILE; i++) {
            barrier();
            if (th_ix >= (1u << i)) {
                tile_count += sh_tile_count[th_ix - (1u << i)];
            }
            barrier();
            sh_tile_count[th_ix] = tile_count;
        }
        barrier();
        uint total_tile_count = sh_tile_count[N_TILE - 1];
        for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
            // Binary search to find element
            uint el_ix = 0;
            for (uint i = 0; i < LG_N_TILE; i++) {
                uint probe = el_ix + (uint(N_TILE / 2) >> i);
                if (ix >= sh_tile_count[probe - 1]) {
                    el_ix = probe;
                }
            }
            uint element_ix = sh_elements[el_ix];
            uint tag = scene[drawtag_start + element_ix];
            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + seq_ix % width;
            uint y = sh_tile_y0[el_ix] + seq_ix / width;
            bool include_tile = false;
            Tile tile = Tile_read(read_tile_alloc(el_ix, true),
                                  TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
            bool is_clip = (tag & 1) != 0;
            // Always include the tile if it contains a path segment.
            // For draws, include the tile if it is solid.
            // For clips, include the tile if it is empty - this way, logic
            // below will suppress the drawing of inner elements.
            // For blends, include the tile if
            // (blend_mode, composition_mode) != (Normal, SrcOver)
            bool is_blend = false;
            if (is_clip) {
                uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
                uint scene_offset = memory[drawmonoid_base + 2];
                uint dd = drawdata_start + (scene_offset >> 2);
                uint blend = scene[dd];
                is_blend = (blend != BlendComp_clip);
            }
            include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip || is_blend;
            if (include_tile) {
                uint el_slice = el_ix / 32;
                uint el_mask = 1u << (el_ix & 31);
                atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
            }
        }

        barrier();

        // Output draw objects for this tile. The thread does a sequential walk
        // through the draw objects.
        uint slice_ix = 0;
        uint bitmap = sh_bitmaps[0][th_ix];
        while (true) {
            if (bitmap == 0) {
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = sh_bitmaps[slice_ix][th_ix];
                if (bitmap == 0) {
                    continue;
                }
            }
            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
            uint element_ix = sh_elements[element_ref_ix];

            // Clear LSB
            bitmap &= bitmap - 1;

            uint drawtag = scene[drawtag_start + element_ix];

            if (clip_zero_depth == 0) {
                Tile tile = Tile_read(read_tile_alloc(element_ref_ix, true),
                                      TileRef(sh_tile_base[element_ref_ix] +
                                              (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
                uint scene_offset = memory[drawmonoid_base + 2];
                uint info_offset = memory[drawmonoid_base + 3];
                uint dd = drawdata_start + (scene_offset >> 2);
                uint di = drawinfo_start + (info_offset >> 2);
                switch (drawtag) {
                case Drawtag_FillColor:
                    float linewidth = uintBitsToFloat(memory[di]);
                    alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                    uint rgba = scene[dd];
                    if (mem_ok) {
                        Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
                    }
                    cmd_ref.offset += 4 + CmdColor_size;
                    break;
                case Drawtag_FillLinGradient:
                    alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                    linewidth = uintBitsToFloat(memory[di]);
                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                    CmdLinGrad cmd_lin;
                    cmd_lin.index = scene[dd];
                    cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
                    cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
                    cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
                    if (mem_ok) {
                        Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
                    }
                    cmd_ref.offset += 4 + CmdLinGrad_size;
                    break;
                case Drawtag_FillRadGradient:
                    alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                    linewidth = uintBitsToFloat(memory[di]);
                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                    CmdRadGrad cmd_rad;
                    cmd_rad.index = scene[dd];
                    // Given that this is basically a memcpy, we might consider
                    // letting the fine raster read the info itself.
                    cmd_rad.mat = uintBitsToFloat(uvec4(memory[di + 1], memory[di + 2],
                                                        memory[di + 3], memory[di + 4]));
                    cmd_rad.xlat = uintBitsToFloat(uvec2(memory[di + 5], memory[di + 6]));
                    cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
                    cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
                    cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
                    if (mem_ok) {
                        Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
                    }
                    cmd_ref.offset += 4 + CmdRadGrad_size;
                    break;
                case Drawtag_FillImage:
                    alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                    linewidth = uintBitsToFloat(memory[di]);
                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                    uint index = scene[dd];
                    uint raw1 = scene[dd + 1];
                    ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
                    if (mem_ok) {
                        Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
                    }
                    cmd_ref.offset += 4 + CmdImage_size;
                    break;
                case Drawtag_BeginClip:
                    if (tile.tile.offset == 0 && tile.backdrop == 0) {
                        clip_zero_depth = clip_depth + 1;
                    } else {
                        alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                        if (mem_ok) {
                            Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                        }
                        cmd_ref.offset += 4;
                        render_blend_depth++;
                        max_blend_depth = max(max_blend_depth, render_blend_depth);
                    }
                    clip_depth++;
                    break;
                case Drawtag_EndClip:
                    clip_depth--;
                    write_fill(cmd_alloc, cmd_ref, tile, -1.0);
                    uint blend = scene[dd];
                    if (mem_ok) {
                        Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
                    }
                    cmd_ref.offset += 4 + CmdEndClip_size;
                    render_blend_depth--;
                    break;
                }
            } else {
                // In "clip zero" state, suppress all drawing
                switch (drawtag) {
                case Drawtag_BeginClip:
                    clip_depth++;
                    break;
                case Drawtag_EndClip:
                    if (clip_depth == clip_zero_depth) {
                        clip_zero_depth = 0;
                    }
                    clip_depth--;
                    break;
                }
            }
        }
        barrier();

        rd_ix += N_TILE;
        if (rd_ix >= ready_ix && partition_ix >= n_partitions)
            break;
    }
    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
        if (mem_ok) {
            Cmd_End_write(cmd_alloc, cmd_ref);
        }
        if (max_blend_depth > BLEND_STACK_SPLIT) {
            uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
            uint scratch = atomicAdd(blend_offset, scratch_size);
            write_mem(scratch_alloc, scratch_alloc.offset >> 2, scratch);
        }
    }
}
@@ -1,181 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The leaf scan pass for the draw tag scan, implemented as a tree reduction.
// This stage could be fused with its consumer, but is kept separate for now.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define N_ROWS 8
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

#include "scene.h"
#include "tile.h"
#include "drawtag.h"
#include "blend.h"

#define Monoid DrawMonoid

layout(set = 0, binding = 3) readonly buffer ParentBuf {
    Monoid[] parent;
};

shared Monoid sh_scratch[WG_SIZE];

void main() {
    Monoid local[N_ROWS];

    uint ix = gl_GlobalInvocationID.x * N_ROWS;
    uint drawtag_base = conf.drawtag_offset >> 2;
    uint tag_word = scene[drawtag_base + ix];

    Monoid agg = map_tag(tag_word);
    local[0] = agg;
    for (uint i = 1; i < N_ROWS; i++) {
        tag_word = scene[drawtag_base + ix + i];
        agg = combine_draw_monoid(agg, map_tag(tag_word));
        local[i] = agg;
    }
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
            agg = combine_draw_monoid(other, agg);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }

    barrier();
    Monoid row = draw_monoid_identity();
    if (gl_WorkGroupID.x > 0) {
        row = parent[gl_WorkGroupID.x - 1];
    }
    if (gl_LocalInvocationID.x > 0) {
        row = combine_draw_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
    }
    uint drawdata_base = conf.drawdata_offset >> 2;
    uint drawinfo_base = conf.drawinfo_alloc.offset >> 2;
    uint out_ix = gl_GlobalInvocationID.x * N_ROWS;
    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 4;
    uint clip_out_base = conf.clip_alloc.offset >> 2;
    for (uint i = 0; i < N_ROWS; i++) {
        Monoid m = row;
        if (i > 0) {
            m = combine_draw_monoid(m, local[i - 1]);
        }
        // m now holds exclusive scan of draw monoid
        memory[out_base + i * 4] = m.path_ix;
        memory[out_base + i * 4 + 1] = m.clip_ix;
        memory[out_base + i * 4 + 2] = m.scene_offset;
        memory[out_base + i * 4 + 3] = m.info_offset;

        // u32 offset of drawobj data
        uint dd = drawdata_base + (m.scene_offset >> 2);
        uint di = drawinfo_base + (m.info_offset >> 2);

        // For compatibility, we'll generate an Annotated object, the same as
        // the old pipeline. However, going forward we'll get rid of that and
        // have later stages read scene + bbox etc.
        tag_word = scene[drawtag_base + ix + i];
        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient ||
            tag_word == Drawtag_FillImage || tag_word == Drawtag_BeginClip) {
            uint bbox_offset = (conf.path_bbox_alloc.offset >> 2) + 6 * m.path_ix;
            float bbox_l = float(memory[bbox_offset]) - 32768.0;
            float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
            float bbox_r = float(memory[bbox_offset + 2]) - 32768.0;
            float bbox_b = float(memory[bbox_offset + 3]) - 32768.0;
            vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
            float linewidth = uintBitsToFloat(memory[bbox_offset + 4]);
            uint fill_mode = uint(linewidth >= 0.0);
            vec4 mat;
            vec2 translate;
            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient) {
                uint trans_ix = memory[bbox_offset + 5];
                uint t = (conf.trans_offset >> 2) + trans_ix * 6;
                mat = uintBitsToFloat(uvec4(scene[t], scene[t + 1], scene[t + 2], scene[t + 3]));
                if (tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient) {
                    translate = uintBitsToFloat(uvec2(scene[t + 4], scene[t + 5]));
                }
            }
            if (linewidth >= 0.0) {
                // TODO: need to deal with anisotropic case
                linewidth *= sqrt(abs(mat.x * mat.w - mat.y * mat.z));
            }
            switch (tag_word) {
            case Drawtag_FillColor:
            case Drawtag_FillImage:
                memory[di] = floatBitsToUint(linewidth);
                break;
            case Drawtag_FillLinGradient:
                memory[di] = floatBitsToUint(linewidth);
                vec2 p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
                vec2 p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
                p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
                p1 = mat.xy * p1.x + mat.zw * p1.y + translate;
                vec2 dxy = p1 - p0;
                float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y);
                float line_x = dxy.x * scale;
                float line_y = dxy.y * scale;
                float line_c = -(p0.x * line_x + p0.y * line_y);
                memory[di + 1] = floatBitsToUint(line_x);
                memory[di + 2] = floatBitsToUint(line_y);
                memory[di + 3] = floatBitsToUint(line_c);
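                // Added note: with scale = 1 / |p1 - p0|^2, the value
                // t = dot(p, vec2(line_x, line_y)) + line_c evaluates to
                // 0 at p0 and 1 at p1, so the fine rasterizer can recover
                // the gradient parameter with one dot product per pixel.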
                break;
            case Drawtag_FillRadGradient:
                p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
                p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
                float r0 = uintBitsToFloat(scene[dd + 5]);
                float r1 = uintBitsToFloat(scene[dd + 6]);
                float inv_det = 1.0 / (mat.x * mat.w - mat.y * mat.z);
                vec4 inv_mat = inv_det * vec4(mat.w, -mat.y, -mat.z, mat.x);
                vec2 inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
                inv_tr += p0;
                vec2 center1 = p1 - p0;
                float rr = r1 / (r1 - r0);
                float rainv = rr / (r1 * r1 - dot(center1, center1));
                vec2 c1 = center1 * rainv;
                float ra = rr * rainv;
                float roff = rr - 1.0;
                memory[di] = floatBitsToUint(linewidth);
                memory[di + 1] = floatBitsToUint(inv_mat.x);
                memory[di + 2] = floatBitsToUint(inv_mat.y);
                memory[di + 3] = floatBitsToUint(inv_mat.z);
                memory[di + 4] = floatBitsToUint(inv_mat.w);
                memory[di + 5] = floatBitsToUint(inv_tr.x);
                memory[di + 6] = floatBitsToUint(inv_tr.y);
                memory[di + 7] = floatBitsToUint(c1.x);
                memory[di + 8] = floatBitsToUint(c1.y);
                memory[di + 9] = floatBitsToUint(ra);
                memory[di + 10] = floatBitsToUint(roff);
                break;
            case Drawtag_BeginClip:
                break;
            }
        }
        // Generate clip stream.
        if (tag_word == Drawtag_BeginClip || tag_word == Drawtag_EndClip) {
            uint path_ix = ~(out_ix + i);
            if (tag_word == Drawtag_BeginClip) {
                path_ix = m.path_ix;
            }
            memory[clip_out_base + m.clip_ix] = path_ix;
        }
    }
}
@@ -1,61 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The reduction phase for the draw tag scan, implemented as a tree reduction.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define N_ROWS 8
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

#include "scene.h"
#include "drawtag.h"

#define Monoid DrawMonoid

layout(set = 0, binding = 3) buffer OutBuf {
    Monoid[] outbuf;
};

shared Monoid sh_scratch[WG_SIZE];

void main() {
    uint ix = gl_GlobalInvocationID.x * N_ROWS;
    uint drawtag_base = conf.drawtag_offset >> 2;
    uint tag_word = scene[drawtag_base + ix];

    Monoid agg = map_tag(tag_word);
    for (uint i = 1; i < N_ROWS; i++) {
        tag_word = scene[drawtag_base + ix + i];
        agg = combine_draw_monoid(agg, map_tag(tag_word));
    }
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        // We could make this predicate tighter, but would it help?
        if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
            agg = combine_draw_monoid(agg, other);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }
    if (gl_LocalInvocationID.x == 0) {
        outbuf[gl_WorkGroupID.x] = agg;
    }
}
@@ -1,75 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// A scan pass for the draw tag scan, implemented as a tree reduction.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "setup.h"
#include "drawtag.h"

#define N_ROWS 8
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

#define Monoid DrawMonoid
#define combine_monoid combine_draw_monoid
#define monoid_identity draw_monoid_identity

layout(binding = 0) buffer DataBuf {
    Monoid[] data;
};

#ifndef ROOT
layout(binding = 1) readonly buffer ParentBuf {
    Monoid[] parent;
};
#endif

shared Monoid sh_scratch[WG_SIZE];

void main() {
    Monoid local[N_ROWS];

    uint ix = gl_GlobalInvocationID.x * N_ROWS;

    local[0] = data[ix];
    for (uint i = 1; i < N_ROWS; i++) {
        local[i] = combine_monoid(local[i - 1], data[ix + i]);
    }
    Monoid agg = local[N_ROWS - 1];
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
            agg = combine_monoid(other, agg);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }

    barrier();
    // This could be a semigroup instead of a monoid if we reworked the
    // conditional logic, but that might impact performance.
    Monoid row = monoid_identity();
#ifdef ROOT
    if (gl_LocalInvocationID.x > 0) {
        row = sh_scratch[gl_LocalInvocationID.x - 1];
    }
#else
    if (gl_WorkGroupID.x > 0) {
        row = parent[gl_WorkGroupID.x - 1];
    }
    if (gl_LocalInvocationID.x > 0) {
        row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
    }
#endif
    for (uint i = 0; i < N_ROWS; i++) {
        Monoid m = combine_monoid(row, local[i]);
        data[ix + i] = m;
    }
}
@@ -1,41 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Common data structures and functions for the draw tag stream.

// Design of the draw tag: & 0x1c gives the scene size in bytes,
// & 1 gives the clip bit, and (tag >> 4) & 0x3c gives the info
// size in bytes.

#define Drawtag_Nop 0
#define Drawtag_FillColor 0x44
#define Drawtag_FillLinGradient 0x114
#define Drawtag_FillRadGradient 0x2dc
#define Drawtag_FillImage 0x48
#define Drawtag_BeginClip 0x05
#define Drawtag_EndClip 0x25

struct DrawMonoid {
    uint path_ix;
    uint clip_ix;
    uint scene_offset;
    uint info_offset;
};

DrawMonoid draw_monoid_identity() {
    return DrawMonoid(0, 0, 0, 0);
}

DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) {
    DrawMonoid c;
    c.path_ix = a.path_ix + b.path_ix;
    c.clip_ix = a.clip_ix + b.clip_ix;
    c.scene_offset = a.scene_offset + b.scene_offset;
    c.info_offset = a.info_offset + b.info_offset;
    return c;
}

DrawMonoid map_tag(uint tag_word) {
    // TODO: at some point, EndClip should not generate a path
    uint has_path = uint(tag_word != Drawtag_Nop);
    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x3c);
}
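
// Added worked example: Drawtag_FillLinGradient = 0x114 decodes as
// clip bit = 0x114 & 1 = 0, scene size = 0x114 & 0x1c = 0x14 = 20 bytes
// (index plus two points), and info size = (0x114 >> 4) & 0x3c = 0x10 =
// 16 bytes (linewidth plus line_x, line_y, line_c), matching what the
// leaf scan shader writes for that tag.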
Binary file not shown. (Removed image, size 337 KiB.)
@@ -1,301 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
// in the per-tile command list to an image.

// Right now, this kernel stores the image in a buffer, but a better
// plan is to use a texture; that has been avoided so far because of
// limited support on some targets.
#version 450
#extension GL_GOOGLE_include_directive : enable

// We can do rendering either in sRGB colorspace (for compatibility)
// or in a linear colorspace, with conversions to sRGB (which will give
// higher quality antialiasing among other things).
#define DO_SRGB_CONVERSION 0

// TODO: the binding of the main buffer can be readonly
#include "mem.h"
#include "setup.h"

#define CHUNK_X 2
#define CHUNK_Y 4
#define CHUNK (CHUNK_X * CHUNK_Y)
#define CHUNK_DX (TILE_WIDTH_PX / CHUNK_X)
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;

layout(binding = 1) restrict readonly buffer ConfigBuf {
    Config conf;
};

layout(binding = 2) buffer BlendBuf {
    uint blend_mem[];
};

#ifdef GRAY
layout(r8, binding = 3) uniform restrict writeonly image2D image;
#else
layout(rgba8, binding = 3) uniform restrict writeonly image2D image;
#endif

layout(rgba8, binding = 4) uniform restrict readonly image2D image_atlas;

layout(rgba8, binding = 5) uniform restrict readonly image2D gradients;

#include "ptcl.h"
#include "tile.h"
#include "blend.h"

#define MAX_BLEND_STACK 128

mediump vec3 tosRGB(mediump vec3 rgb) {
#if DO_SRGB_CONVERSION
    bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308));
    mediump vec3 below = vec3(12.92) * rgb;
    mediump vec3 above = vec3(1.055) * pow(rgb, vec3(0.41666)) - vec3(0.055);
    return mix(below, above, cutoff);
#else
    return rgb;
#endif
}

mediump vec3 fromsRGB(mediump vec3 srgb) {
#if DO_SRGB_CONVERSION
    // Formula from EXT_sRGB.
    bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045));
    mediump vec3 below = srgb / vec3(12.92);
    mediump vec3 above = pow((srgb + vec3(0.055)) / vec3(1.055), vec3(2.4));
    return mix(below, above, cutoff);
#else
    return srgb;
#endif
}
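
// Added note: these are the standard sRGB transfer functions (piecewise
// linear below the cutoff, power law above). For example, fromsRGB maps
// 0.5 to approximately 0.214, and tosRGB inverts that. With
// DO_SRGB_CONVERSION set to 0, both are the identity.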

// unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear
// color space.
mediump vec4 unpacksRGB(uint srgba) {
    mediump vec4 color = unpackUnorm4x8(srgba).wzyx;
    return vec4(fromsRGB(color.rgb), color.a);
}

// packsRGB packs a color in the linear color space into its 8-bit sRGB equivalent.
uint packsRGB(mediump vec4 rgba) {
    rgba = vec4(tosRGB(rgba.rgb), rgba.a);
    return packUnorm4x8(rgba.wzyx);
}

uvec2 chunk_offset(uint i) {
    return uvec2(i % CHUNK_X * CHUNK_DX, i / CHUNK_X * CHUNK_DY);
}
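
// Added worked example (assuming 16 x 16 pixel tiles): each thread
// handles CHUNK = 8 pixels in a 2 x 4 grid, with CHUNK_DX = 8 and
// CHUNK_DY = 4, so chunk_offset maps i = 0..7 to (0,0), (8,0), (0,4),
// (8,4), (0,8), (8,8), (0,12), (8,12).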

mediump vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) {
    mediump vec4 rgba[CHUNK];
    for (uint i = 0; i < CHUNK; i++) {
        ivec2 uv = ivec2(xy + chunk_offset(i)) + cmd_img.offset;
        mediump vec4 fg_rgba;
        fg_rgba = imageLoad(image_atlas, uv);
        fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
        rgba[i] = fg_rgba;
    }
    return rgba;
}

void main() {
    uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);

    uint blend_offset = memory[cmd_ref.offset >> 2];
    cmd_ref.offset += 4;

    uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x,
                          gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
    vec2 xy = vec2(xy_uint);
    mediump vec4 rgba[CHUNK];
    uint blend_stack[BLEND_STACK_SPLIT][CHUNK];
    for (uint i = 0; i < CHUNK; i++) {
        rgba[i] = vec4(0.0);
    }

    mediump float area[CHUNK];
    uint clip_depth = 0;
    // Previously we would early-out if there was a memory failure, so we wouldn't try to read corrupt
    // tiles. But now we assume this is checked CPU-side before launching fine rasterization.
    while (true) {
        uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
        if (tag == Cmd_End) {
            break;
        }
        switch (tag) {
        case Cmd_Stroke:
            // Calculate distance field from all the line segments in this tile.
            CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
            mediump float df[CHUNK];
            for (uint k = 0; k < CHUNK; k++)
                df[k] = 1e9;
            TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
            do {
                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
                vec2 line_vec = seg.vector;
                for (uint k = 0; k < CHUNK; k++) {
                    vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
                    dpos += vec2(chunk_offset(k));
                    float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
                    df[k] = min(df[k], length(line_vec * t - dpos));
                }
                tile_seg_ref = seg.next;
            } while (tile_seg_ref.offset != 0);
            for (uint k = 0; k < CHUNK; k++) {
                area[k] = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
            }
            cmd_ref.offset += 4 + CmdStroke_size;
            break;
        case Cmd_Fill:
            CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
            for (uint k = 0; k < CHUNK; k++)
                area[k] = float(fill.backdrop);
            tile_seg_ref = TileSegRef(fill.tile_ref);
            // Calculate coverage based on backdrop + coverage of each line segment
            do {
                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
                for (uint k = 0; k < CHUNK; k++) {
                    vec2 my_xy = xy + vec2(chunk_offset(k));
                    vec2 start = seg.origin - my_xy;
                    vec2 end = start + seg.vector;
                    vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
                    if (window.x != window.y) {
                        vec2 t = (window - start.y) / seg.vector.y;
                        vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
                        float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
                        float xmax = max(xs.x, xs.y);
                        float b = min(xmax, 1.0);
                        float c = max(b, 0.0);
                        float d = max(xmin, 0.0);
                        float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
                        area[k] += a * (window.x - window.y);
                    }
                    area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
                }
                tile_seg_ref = seg.next;
            } while (tile_seg_ref.offset != 0);
            for (uint k = 0; k < CHUNK; k++) {
                area[k] = min(abs(area[k]), 1.0);
            }
            cmd_ref.offset += 4 + CmdFill_size;
            break;
        case Cmd_Solid:
            for (uint k = 0; k < CHUNK; k++) {
                area[k] = 1.0;
            }
            cmd_ref.offset += 4;
            break;
        case Cmd_Alpha:
            CmdAlpha alpha = Cmd_Alpha_read(cmd_alloc, cmd_ref);
            for (uint k = 0; k < CHUNK; k++) {
                area[k] = alpha.alpha;
            }
            cmd_ref.offset += 4 + CmdAlpha_size;
            break;
        case Cmd_Color:
            CmdColor color = Cmd_Color_read(cmd_alloc, cmd_ref);
            mediump vec4 fg = unpacksRGB(color.rgba_color);
            for (uint k = 0; k < CHUNK; k++) {
                mediump vec4 fg_k = fg * area[k];
                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
            }
            cmd_ref.offset += 4 + CmdColor_size;
            break;
        case Cmd_LinGrad:
            CmdLinGrad lin = Cmd_LinGrad_read(cmd_alloc, cmd_ref);
            float d = lin.line_x * xy.x + lin.line_y * xy.y + lin.line_c;
            for (uint k = 0; k < CHUNK; k++) {
                vec2 chunk_xy = vec2(chunk_offset(k));
                float my_d = d + lin.line_x * chunk_xy.x + lin.line_y * chunk_xy.y;
                int x = int(round(clamp(my_d, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
                mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(lin.index)));
                fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
                mediump vec4 fg_k = fg_rgba * area[k];
                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
            }
            cmd_ref.offset += 4 + CmdLinGrad_size;
            break;
        case Cmd_RadGrad:
            CmdRadGrad rad = Cmd_RadGrad_read(cmd_alloc, cmd_ref);
            for (uint k = 0; k < CHUNK; k++) {
                vec2 my_xy = xy + vec2(chunk_offset(k));
                my_xy = rad.mat.xz * my_xy.x + rad.mat.yw * my_xy.y - rad.xlat;
                float ba = dot(my_xy, rad.c1);
                float ca = rad.ra * dot(my_xy, my_xy);
                float t = sqrt(ba * ba + ca) - ba - rad.roff;
                int x = int(round(clamp(t, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
                mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(rad.index)));
                fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
                mediump vec4 fg_k = fg_rgba * area[k];
                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
            }
            cmd_ref.offset += 4 + CmdRadGrad_size;
            break;
        case Cmd_Image:
            CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref);
            mediump vec4 img[CHUNK] = fillImage(xy_uint, fill_img);
            for (uint k = 0; k < CHUNK; k++) {
                mediump vec4 fg_k = img[k] * area[k];
                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
            }
            cmd_ref.offset += 4 + CmdImage_size;
            break;
        case Cmd_BeginClip:
            if (clip_depth < BLEND_STACK_SPLIT) {
                for (uint k = 0; k < CHUNK; k++) {
                    blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k]));
                    rgba[k] = vec4(0.0);
                }
            } else {
                uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
                               CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
                for (uint k = 0; k < CHUNK; k++) {
                    blend_mem[base_ix + k] = packsRGB(vec4(rgba[k]));
                    rgba[k] = vec4(0.0);
                }
            }
            clip_depth++;
            cmd_ref.offset += 4;
            break;
        case Cmd_EndClip:
            CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
            clip_depth--;
            uint base_ix;
            if (clip_depth >= BLEND_STACK_SPLIT) {
                base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
                          CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
            }
            for (uint k = 0; k < CHUNK; k++) {
                uint bg_rgba;
                if (clip_depth < BLEND_STACK_SPLIT) {
                    bg_rgba = blend_stack[clip_depth][k];
                } else {
                    bg_rgba = blend_mem[base_ix + k];
                }
                mediump vec4 bg = unpacksRGB(bg_rgba);
                mediump vec4 fg = rgba[k] * area[k];
                rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
            }
            cmd_ref.offset += 4 + CmdEndClip_size;
            break;
        case Cmd_Jump:
            cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
            cmd_alloc.offset = cmd_ref.offset;
            break;
        }
    }

    for (uint i = 0; i < CHUNK; i++) {
#ifdef GRAY
        // Just store the alpha value; later we can specialize this kernel more to avoid
        // computing unneeded RGB colors.
        imageStore(image, ivec2(xy_uint + chunk_offset(i)), vec4(rgba[i].a));
#else
        imageStore(image, ivec2(xy_uint + chunk_offset(i)), vec4(tosRGB(rgba[i].rgb), rgba[i].a));
#endif
    }
}
@@ -1,145 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

layout(set = 0, binding = 0) buffer Memory {
    // offset into memory of the next allocation, initialized by the user.
    uint mem_offset;
    // mem_error is a bitmask of stages that have failed allocation.
    uint mem_error;
    // offset into blend memory of allocations for blend stack.
    uint blend_offset;
    uint[] memory;
};

// Uncomment this line to add the size field to Alloc and enable memory checks.
// Note that the Config struct in setup.h also grows size fields.
//
// This setting is currently not working, and the mechanism will be removed.
//#define MEM_DEBUG

#ifdef MEM_DEBUG
#define Alloc_size 16
#else
// TODO: this seems wrong
#define Alloc_size 8
#endif

// Alloc represents a memory allocation.
struct Alloc {
    // offset in bytes into memory.
    uint offset;
#ifdef MEM_DEBUG
    // size in bytes of the allocation.
    uint size;
#endif
};

// new_alloc synthesizes an Alloc from an offset and size.
Alloc new_alloc(uint offset, uint size, bool mem_ok) {
    Alloc a;
    a.offset = offset;
#ifdef MEM_DEBUG
    if (mem_ok) {
        a.size = size;
    } else {
        a.size = 0;
    }
#endif
    return a;
}

#define STAGE_BINNING (1u << 0)
#define STAGE_TILE_ALLOC (1u << 1)
#define STAGE_PATH_COARSE (1u << 2)
#define STAGE_COARSE (1u << 3)

// Allocations in main memory will never be at offset 0, and 0 might be
// slightly faster to test against than some other sentinel value.
#define MALLOC_FAILED 0

// Check that previous dependent stages have succeeded.
bool check_deps(uint dep_stage) {
    // TODO: this should be an atomic relaxed load, but that involves
    // bringing in "memory scope semantics"
    return (atomicOr(mem_error, 0) & dep_stage) == 0;
}

// Allocate size bytes of memory; the returned offset is in bytes.
// Note: with a bit of rearrangement of header files, we could make the
// mem_size argument go away (it comes from the config binding).
uint malloc_stage(uint size, uint mem_size, uint stage) {
    uint offset = atomicAdd(mem_offset, size);
    if (offset + size > mem_size) {
        atomicOr(mem_error, stage);
        offset = MALLOC_FAILED;
    }
    return offset;
}
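
// Added usage sketch: callers bump-allocate and must gate writes on
// success, e.g.
//
//     uint tile_offset = malloc_stage(malloc_size, conf.mem_size, STAGE_PATH_COARSE);
//     if (tile_offset == MALLOC_FAILED) {
//         mem_ok = false;
//     }
//
// A failed stage is recorded in mem_error, so later stages (and the CPU)
// can detect the failure via check_deps and skip dependent work.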

// touch_mem checks whether access to the memory word at offset is valid.
// If MEM_DEBUG is defined, touch_mem returns false if offset is out of bounds.
// Offset is in words.
bool touch_mem(Alloc alloc, uint offset) {
#ifdef MEM_DEBUG
    if (offset < alloc.offset / 4 || offset >= (alloc.offset + alloc.size) / 4) {
        atomicMax(mem_error, ERR_OUT_OF_BOUNDS);
        return false;
    }
#endif
    return true;
}

// write_mem writes val to memory at offset.
// Offset is in words.
void write_mem(Alloc alloc, uint offset, uint val) {
    if (!touch_mem(alloc, offset)) {
        return;
    }
    memory[offset] = val;
}

// read_mem reads the value from memory at offset.
// Offset is in words.
uint read_mem(Alloc alloc, uint offset) {
    if (!touch_mem(alloc, offset)) {
        return 0;
    }
    uint v = memory[offset];
    return v;
}

// slice_mem returns a sub-allocation inside another. Offset and size are in
// bytes, relative to a.offset.
Alloc slice_mem(Alloc a, uint offset, uint size) {
#ifdef MEM_DEBUG
    if ((offset & 3) != 0 || (size & 3) != 0) {
        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
        return Alloc(0, 0);
    }
    if (offset + size > a.size) {
        // slice_mem is sometimes used to create slices outside bounds,
        // but those slices are never written to.
        return Alloc(0, 0);
    }
    return Alloc(a.offset + offset, size);
#else
    return Alloc(a.offset + offset);
#endif
}

// alloc_write writes alloc to memory at offset bytes.
void alloc_write(Alloc a, uint offset, Alloc alloc) {
    write_mem(a, offset >> 2, alloc.offset);
#ifdef MEM_DEBUG
    write_mem(a, (offset >> 2) + 1, alloc.size);
#endif
}

// alloc_read reads an Alloc from memory at offset bytes.
Alloc alloc_read(Alloc a, uint offset) {
    Alloc alloc;
    alloc.offset = read_mem(a, offset >> 2);
#ifdef MEM_DEBUG
    alloc.size = read_mem(a, (offset >> 2) + 1);
#endif
    return alloc;
}
@@ -1,289 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Coarse rasterization of path segments.

// Allocation and initialization of tiles for paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_COARSE_WG 5
#define COARSE_WG (1 << LG_COARSE_WG)

layout(local_size_x = COARSE_WG, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "pathseg.h"
#include "tile.h"

// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))

#define ACCURACY 0.25
#define Q_ACCURACY (ACCURACY * 0.1)
#define REM_ACCURACY (ACCURACY - Q_ACCURACY)
#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
#define MAX_QUADS 16

vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
    float mt = 1.0 - t;
    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
}

vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
    float mt = 1.0 - t;
    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
}

struct SubdivResult {
    float val;
    float a0;
    float a2;
};

/// An approximation to $\int (1 + 4x^2)^{-0.25} \, dx$
///
/// This is used for flattening curves.
#define D 0.67
float approx_parabola_integral(float x) {
    return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
}

/// An approximation to the inverse parabola integral.
#define B 0.39
float approx_parabola_inv_integral(float x) {
    return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
}
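
// Added note: these two functions are approximate inverses of each
// other, so approx_parabola_inv_integral(approx_parabola_integral(x))
// is close to x over the range of interest. The subdivision loop in
// main relies on this when mapping evenly spaced arc-length targets
// back to the curve parameter t.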
|
||||
|
||||
SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
|
||||
vec2 d01 = p1 - p0;
|
||||
vec2 d12 = p2 - p1;
|
||||
vec2 dd = d01 - d12;
|
||||
float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
|
||||
float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
|
||||
float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
|
||||
float scale = abs(cross / (length(dd) * (x2 - x0)));
|
||||
|
||||
float a0 = approx_parabola_integral(x0);
|
||||
float a2 = approx_parabola_integral(x2);
|
||||
float val = 0.0;
|
||||
if (scale < 1e9) {
|
||||
float da = abs(a2 - a0);
|
||||
float sqrt_scale = sqrt(scale);
|
||||
if (sign(x0) == sign(x2)) {
|
||||
val = da * sqrt_scale;
|
||||
} else {
|
||||
float xmin = sqrt_tol / sqrt_scale;
|
||||
val = sqrt_tol * da / approx_parabola_integral(xmin);
|
||||
}
|
||||
}
|
||||
return SubdivResult(val, a0, a2);
|
||||
}
|
||||
|
||||
// All writes to the output must be gated by mem_ok.
bool mem_ok = true;

void main() {
    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
        return;
    }
    uint element_ix = gl_GlobalInvocationID.x;
    PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);

    PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
    if (element_ix < conf.n_pathseg) {
        tag = PathSeg_tag(conf.pathseg_alloc, ref);
    }
    switch (tag.tag) {
    case PathSeg_Cubic:
        PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);

        vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
        float err = err_v.x * err_v.x + err_v.y * err_v.y;
        // The number of quadratics.
        uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
        n_quads = min(n_quads, MAX_QUADS);
        SubdivResult keep_params[MAX_QUADS];
        // Iterate over quadratics and tote up the estimated number of segments.
        float val = 0.0;
        vec2 qp0 = cubic.p0;
        float step = 1.0 / float(n_quads);
        for (uint i = 0; i < n_quads; i++) {
            float t = float(i + 1) * step;
            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
            keep_params[i] = params;
            val += params.val;

            qp0 = qp2;
        }
        uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);

        bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
        uint path_ix = cubic.path_ix;
        Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
        Alloc path_alloc =
            new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
        ivec4 bbox = ivec4(path.bbox);
        vec2 p0 = cubic.p0;
        qp0 = cubic.p0;
        float v_step = val / float(n);
        int n_out = 1;
        float val_sum = 0.0;
        for (uint i = 0; i < n_quads; i++) {
            float t = float(i + 1) * step;
            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
            SubdivResult params = keep_params[i];
            float u0 = approx_parabola_inv_integral(params.a0);
            float u2 = approx_parabola_inv_integral(params.a2);
            float uscale = 1.0 / (u2 - u0);
            float target = float(n_out) * v_step;
            while (n_out == n || target < val_sum + params.val) {
                vec2 p1;
                if (n_out == n) {
                    p1 = cubic.p3;
                } else {
                    float u = (target - val_sum) / params.val;
                    float a = mix(params.a0, params.a2, u);
                    float au = approx_parabola_inv_integral(a);
                    float t = (au - u0) * uscale;
                    p1 = eval_quad(qp0, qp1, qp2, t);
                }

                // Output line segment

                // Bounding box of element in pixel coordinates.
                float xmin = min(p0.x, p1.x) - cubic.stroke.x;
                float xmax = max(p0.x, p1.x) + cubic.stroke.x;
                float ymin = min(p0.y, p1.y) - cubic.stroke.y;
                float ymax = max(p0.y, p1.y) + cubic.stroke.y;
                float dx = p1.x - p0.x;
                float dy = p1.y - p0.y;
                // Set up for per-scanline coverage formula, below.
                float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
                float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
                float b = invslope; // Note: assumes square tiles, otherwise scale.
                float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
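                // With these coefficients, the tiles this segment can touch
                // on tile row y lie in the band [xc - c, xc + c] (in tile
                // units), where xc = a + b * y: b is the inverse slope, a
                // anchors the line at the row's vertical center, and c pads
                // for the stroke radius plus half a tile height. The row loop
                // below clamps this band and visits only those tiles.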
                int x0 = int(floor(xmin * SX));
                int x1 = int(floor(xmax * SX) + 1);
                int y0 = int(floor(ymin * SY));
                int y1 = int(floor(ymax * SY) + 1);

                x0 = clamp(x0, bbox.x, bbox.z);
                y0 = clamp(y0, bbox.y, bbox.w);
                x1 = clamp(x1, bbox.x, bbox.z);
                y1 = clamp(y1, bbox.y, bbox.w);
                float xc = a + b * float(y0);
                int stride = bbox.z - bbox.x;
                int base = (y0 - bbox.y) * stride - bbox.x;
                // TODO: can be tighter, use c to bound width
                uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                // Consider using subgroups to aggregate atomic add.
                uint malloc_size = n_tile_alloc * TileSeg_size;
                uint tile_offset = malloc_stage(malloc_size, conf.mem_size, STAGE_PATH_COARSE);
                if (tile_offset == MALLOC_FAILED) {
                    mem_ok = false;
                }
                Alloc tile_alloc = new_alloc(tile_offset, malloc_size, true);

                TileSeg tile_seg;

                int xray = int(floor(p0.x * SX));
                int last_xray = int(floor(p1.x * SX));
                if (p0.y > p1.y) {
                    int tmp = xray;
                    xray = last_xray;
                    last_xray = tmp;
                }
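                // Walk tile rows top to bottom. xray is the tile column where
                // the segment crosses the top of the current row (endpoints
                // were swapped above so this holds for either direction);
                // crossings record a winding-number (backdrop) increment at
                // the first tile to their right.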
                for (int y = y0; y < y1; y++) {
                    float tile_y0 = float(y * TILE_HEIGHT_PX);
                    int xbackdrop = max(xray + 1, bbox.x);
                    if (!is_stroke && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
                        int backdrop = p1.y < p0.y ? 1 : -1;
                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
                        uint tile_el = tile_ref.offset >> 2;
                        atomicAdd(memory[tile_el + 1], backdrop);
                    }

                    // next_xray is the xray for the next scanline; the line segment intersects
                    // all tiles between xray and next_xray.
                    int next_xray = last_xray;
                    if (y < y1 - 1) {
                        float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
                        float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
                        next_xray = int(floor(x_edge * SX));
                    }

                    int min_xray = min(xray, next_xray);
                    int max_xray = max(xray, next_xray);
                    int xx0 = min(int(floor(xc - c)), min_xray);
                    int xx1 = max(int(ceil(xc + c)), max_xray + 1);
                    xx0 = clamp(xx0, x0, x1);
                    xx1 = clamp(xx1, x0, x1);

                    for (int x = xx0; x < xx1; x++) {
                        float tile_x0 = float(x * TILE_WIDTH_PX);
                        TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
                        uint tile_el = tile_ref.offset >> 2;
                        uint old = 0;
                        old = atomicExchange(memory[tile_el], tile_offset);
                        tile_seg.origin = p0;
                        tile_seg.vector = p1 - p0;
                        float y_edge = 0.0;
                        if (!is_stroke) {
                            y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
                            if (min(p0.x, p1.x) < tile_x0) {
                                vec2 p = vec2(tile_x0, y_edge);
                                if (p0.x > p1.x) {
                                    tile_seg.vector = p - p0;
                                } else {
                                    tile_seg.origin = p;
                                    tile_seg.vector = p1 - p;
                                }
                                // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
                                // Nudge zeroes towards the intended sign.
                                if (tile_seg.vector.x == 0) {
                                    tile_seg.vector.x = sign(p1.x - p0.x) * 1e-9;
                                }
                            }
                            if (x <= min_xray || max_xray < x) {
                                // Reject inconsistent intersections.
                                y_edge = 1e9;
                            }
                        }
                        tile_seg.y_edge = y_edge;
                        tile_seg.next.offset = old;
                        if (mem_ok) {
                            TileSeg_write(tile_alloc, TileSegRef(tile_offset), tile_seg);
                        }
                        tile_offset += TileSeg_size;
                    }
                    xc += b;
                    base += stride;
                    xray = next_xray;
                }

                n_out += 1;
                target += v_step;
                p0 = p1;
            }
            val_sum += params.val;

            qp0 = qp2;
        }

        break;
    }
}

@@ -1,291 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Processing of the path stream, after the tag scan.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"
#include "pathtag.h"

#define N_SEQ 4
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_SEQ)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

#include "tile.h"
#include "pathseg.h"
#include "scene.h"

layout(binding = 3) readonly buffer ParentBuf {
    TagMonoid[] parent;
};

struct Monoid {
    vec4 bbox;
    uint flags;
};

#define FLAG_RESET_BBOX 1
#define FLAG_SET_BBOX 2

Monoid combine_monoid(Monoid a, Monoid b) {
    Monoid c;
    c.bbox = b.bbox;
    // TODO: I think this should be gated on b & SET_BBOX == false also.
    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
        c.bbox = a.bbox;
    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
               (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
    }
    c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
    c.flags |= ((a.flags & FLAG_RESET_BBOX) << 1);
    return c;
}
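// Informally: FLAG_RESET_BBOX marks a path boundary that starts a fresh bbox,
// and FLAG_SET_BBOX means the right operand's bbox is already final. For
// example, combining two unflagged segment monoids with non-empty bboxes
// takes the union of the bboxes, while combining across a reset keeps only
// the right-hand bbox.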
Monoid monoid_identity() {
    return Monoid(vec4(0.0, 0.0, 0.0, 0.0), 0);
}

// These are not both live at the same time. A very smart shader compiler
// would be able to figure that out, but I suspect many won't.
shared TagMonoid sh_tag[WG_SIZE];
shared Monoid sh_scratch[WG_SIZE];

vec2 read_f32_point(uint ix) {
    float x = uintBitsToFloat(scene[ix]);
    float y = uintBitsToFloat(scene[ix + 1]);
    return vec2(x, y);
}

vec2 read_i16_point(uint ix) {
    uint raw = scene[ix];
    float x = float(int(raw << 16) >> 16);
    float y = float(int(raw) >> 16);
    return vec2(x, y);
}

// Note: these are 16 bit, which is adequate, but we could use 32 bits.

// Round down and saturate to minimum integer; add bias
uint round_down(float x) {
    return uint(max(0.0, floor(x) + 32768.0));
}

// Round up and saturate to maximum integer; add bias
uint round_up(float x) {
    return uint(min(65535.0, ceil(x) + 32768.0));
}
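// Bbox coordinates are stored as biased u16 values, e.g.
// round_down(-3.5) = floor(-3.5) + 32768 = 32764 and round_up(3.5) = 32772.
// The bias keeps negative coordinates representable while still allowing the
// unsigned atomicMin/atomicMax used at the end of main().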
void main() {
    Monoid local[N_SEQ];
    float linewidth[N_SEQ];
    uint save_trans_ix[N_SEQ];

    uint ix = gl_GlobalInvocationID.x * N_SEQ;

    uint tag_word = scene[(conf.pathtag_offset >> 2) + (ix >> 2)];

    // Scan the tag monoid
    TagMonoid local_tm = reduce_tag(tag_word);
    sh_tag[gl_LocalInvocationID.x] = local_tm;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)];
            local_tm = combine_tag_monoid(other, local_tm);
        }
        barrier();
        sh_tag[gl_LocalInvocationID.x] = local_tm;
    }
    barrier();
    // sh_tag is now the partition-wide inclusive scan of the tag monoid.
    TagMonoid tm = tag_monoid_identity();
    if (gl_WorkGroupID.x > 0) {
        tm = parent[gl_WorkGroupID.x - 1];
    }
    if (gl_LocalInvocationID.x > 0) {
        tm = combine_tag_monoid(tm, sh_tag[gl_LocalInvocationID.x - 1]);
    }
    // tm is now the full exclusive scan of the tag monoid.

    // Indices to scene buffer in u32 units.
    uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset;
    uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix;
    uint save_path_ix = tm.path_ix;
    uint trans_ix = tm.trans_ix;
    TransformRef trans_ref = TransformRef(conf.trans_offset + trans_ix * Transform_size);
    PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size);
    for (uint i = 0; i < N_SEQ; i++) {
        linewidth[i] = uintBitsToFloat(scene[lw_ix]);
        save_trans_ix[i] = trans_ix;
        // if N_SEQ > 4, need to reload tag_word from local when i % 4 == 0
        uint tag_byte = tag_word >> (i * 8);
        uint seg_type = tag_byte & 3;
        if (seg_type != 0) {
            // 1 = line, 2 = quad, 3 = cubic
            // Unpack path segment from input
            vec2 p0;
            vec2 p1;
            vec2 p2;
            vec2 p3;
            if ((tag_byte & 8) != 0) {
                // 32 bit encoding
                p0 = read_f32_point(ps_ix);
                p1 = read_f32_point(ps_ix + 2);
                if (seg_type >= 2) {
                    p2 = read_f32_point(ps_ix + 4);
                    if (seg_type == 3) {
                        p3 = read_f32_point(ps_ix + 6);
                    }
                }
            } else {
                // 16 bit encoding
                p0 = read_i16_point(ps_ix);
                p1 = read_i16_point(ps_ix + 1);
                if (seg_type >= 2) {
                    p2 = read_i16_point(ps_ix + 2);
                    if (seg_type == 3) {
                        p3 = read_i16_point(ps_ix + 3);
                    }
                }
            }
            Transform transform = Transform_read(trans_ref);
            p0 = transform.mat.xy * p0.x + transform.mat.zw * p0.y + transform.translate;
            p1 = transform.mat.xy * p1.x + transform.mat.zw * p1.y + transform.translate;
            vec4 bbox = vec4(min(p0, p1), max(p0, p1));
            // Degree-raise and compute bbox
            if (seg_type >= 2) {
                p2 = transform.mat.xy * p2.x + transform.mat.zw * p2.y + transform.translate;
                bbox.xy = min(bbox.xy, p2);
                bbox.zw = max(bbox.zw, p2);
                if (seg_type == 3) {
                    p3 = transform.mat.xy * p3.x + transform.mat.zw * p3.y + transform.translate;
                    bbox.xy = min(bbox.xy, p3);
                    bbox.zw = max(bbox.zw, p3);
                } else {
                    p3 = p2;
                    p2 = mix(p1, p2, 1.0 / 3.0);
                    p1 = mix(p1, p0, 1.0 / 3.0);
                }
            } else {
                p3 = p1;
                p2 = mix(p3, p0, 1.0 / 3.0);
                p1 = mix(p0, p3, 1.0 / 3.0);
            }
            vec2 stroke = vec2(0.0, 0.0);
            if (linewidth[i] >= 0.0) {
                // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
                stroke = 0.5 * linewidth[i] * vec2(length(transform.mat.xz), length(transform.mat.yw));
                bbox += vec4(-stroke, stroke);
            }
            local[i].bbox = bbox;
            local[i].flags = 0;

            // Write path segment to output
            PathCubic cubic;
            cubic.p0 = p0;
            cubic.p1 = p1;
            cubic.p2 = p2;
            cubic.p3 = p3;
            cubic.path_ix = tm.path_ix;
            // Not needed, TODO remove from struct
            cubic.trans_ix = gl_GlobalInvocationID.x * 4 + i;
            cubic.stroke = stroke;
            uint fill_mode = uint(linewidth[i] >= 0.0);
            PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic);

            ps_ref.offset += PathSeg_size;
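            // n_points is the number of points this tag byte consumes (the
            // low 2 bits, plus one more when bit 2 is set). With bit 3
            // (32-bit encoding) set, the mask below is 15 and, as
            // n_points <= 4, n_points & 15 == n_points, so the word count
            // doubles; with bit 3 clear each point packs into one word.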
            uint n_points = (tag_byte & 3) + ((tag_byte >> 2) & 1);
            uint n_words = n_points + (n_points & (((tag_byte >> 3) & 1) * 15));
            ps_ix += n_words;
        } else {
            local[i].bbox = vec4(0.0, 0.0, 0.0, 0.0);
            // These shifts need to be kept in sync with setup.h
            uint is_path = (tag_byte >> 4) & 1;
            // Relies on the fact that RESET_BBOX == 1
            local[i].flags = is_path;
            tm.path_ix += is_path;
            trans_ix += (tag_byte >> 5) & 1;
            trans_ref.offset += ((tag_byte >> 5) & 1) * Transform_size;
            lw_ix += (tag_byte >> 6) & 1;
        }
    }

    // Partition-wide monoid scan for bbox monoid
    Monoid agg = local[0];
    for (uint i = 1; i < N_SEQ; i++) {
        // Note: this could be fused with the map above, but probably
        // a thin performance gain not worth the complexity.
        agg = combine_monoid(agg, local[i]);
        local[i] = agg;
    }
    // local is N_SEQ sub-partition inclusive scan of bbox monoid.
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
            agg = combine_monoid(other, agg);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }
    // sh_scratch is the partition-wide inclusive scan of the bbox monoid,
    // sampled at the end of the N_SEQ sub-partition.

    barrier();
    uint path_ix = save_path_ix;
    uint bbox_out_ix = (conf.path_bbox_alloc.offset >> 2) + path_ix * 6;
    // Write bboxes to paths; do atomic min/max if partial
    Monoid row = monoid_identity();
    if (gl_LocalInvocationID.x > 0) {
        row = sh_scratch[gl_LocalInvocationID.x - 1];
    }
    for (uint i = 0; i < N_SEQ; i++) {
        Monoid m = combine_monoid(row, local[i]);
        // m is partition-wide inclusive scan of bbox monoid.
        bool do_atomic = false;
        if (i == N_SEQ - 1 && gl_LocalInvocationID.x == WG_SIZE - 1) {
            // last element
            do_atomic = true;
        }
        if ((m.flags & FLAG_RESET_BBOX) != 0) {
            memory[bbox_out_ix + 4] = floatBitsToUint(linewidth[i]);
            memory[bbox_out_ix + 5] = save_trans_ix[i];
            if ((m.flags & FLAG_SET_BBOX) == 0) {
                do_atomic = true;
            } else {
                memory[bbox_out_ix] = round_down(m.bbox.x);
                memory[bbox_out_ix + 1] = round_down(m.bbox.y);
                memory[bbox_out_ix + 2] = round_up(m.bbox.z);
                memory[bbox_out_ix + 3] = round_up(m.bbox.w);
                bbox_out_ix += 6;
                do_atomic = false;
            }
        }
        if (do_atomic) {
            if (m.bbox.z > m.bbox.x || m.bbox.w > m.bbox.y) {
                // atomic min/max
                atomicMin(memory[bbox_out_ix], round_down(m.bbox.x));
                atomicMin(memory[bbox_out_ix + 1], round_down(m.bbox.y));
                atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z));
                atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w));
            }
            bbox_out_ix += 6;
        }
    }
}

@@ -1,100 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct PathCubicRef {
    uint offset;
};

struct PathSegRef {
    uint offset;
};

struct PathCubic {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 p3;
    uint path_ix;
    uint trans_ix;
    vec2 stroke;
};

#define PathCubic_size 48

PathCubicRef PathCubic_index(PathCubicRef ref, uint index) {
    return PathCubicRef(ref.offset + index * PathCubic_size);
}

#define PathSeg_Nop 0
#define PathSeg_Cubic 1
#define PathSeg_size 52

PathSegRef PathSeg_index(PathSegRef ref, uint index) {
    return PathSegRef(ref.offset + index * PathSeg_size);
}

struct PathSegTag {
    uint tag;
    uint flags;
};

PathCubic PathCubic_read(Alloc a, PathCubicRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    uint raw7 = read_mem(a, ix + 7);
    uint raw8 = read_mem(a, ix + 8);
    uint raw9 = read_mem(a, ix + 9);
    uint raw10 = read_mem(a, ix + 10);
    uint raw11 = read_mem(a, ix + 11);
    PathCubic s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    s.path_ix = raw8;
    s.trans_ix = raw9;
    s.stroke = vec2(uintBitsToFloat(raw10), uintBitsToFloat(raw11));
    return s;
}

void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
    write_mem(a, ix + 4, floatBitsToUint(s.p2.x));
    write_mem(a, ix + 5, floatBitsToUint(s.p2.y));
    write_mem(a, ix + 6, floatBitsToUint(s.p3.x));
    write_mem(a, ix + 7, floatBitsToUint(s.p3.y));
    write_mem(a, ix + 8, s.path_ix);
    write_mem(a, ix + 9, s.trans_ix);
    write_mem(a, ix + 10, floatBitsToUint(s.stroke.x));
    write_mem(a, ix + 11, floatBitsToUint(s.stroke.y));
}

PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) {
    uint tag_and_flags = read_mem(a, ref.offset >> 2);
    return PathSegTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
}

PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) {
    return PathCubic_read(a, PathCubicRef(ref.offset + 4));
}

void PathSeg_Nop_write(Alloc a, PathSegRef ref) {
    write_mem(a, ref.offset >> 2, PathSeg_Nop);
}

void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) {
    write_mem(a, ref.offset >> 2, (flags << 16) | PathSeg_Cubic);
    PathCubic_write(a, PathCubicRef(ref.offset + 4), s);
}

@@ -1,49 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Common data structures and functions for the path tag stream.

// This is the layout for tag bytes in the path stream. See
// doc/pathseg.md for an explanation.

#define PATH_TAG_PATHSEG_BITS 0xf
#define PATH_TAG_PATH 0x10
#define PATH_TAG_TRANSFORM 0x20
#define PATH_TAG_LINEWIDTH 0x40

struct TagMonoid {
    uint trans_ix;
    uint linewidth_ix;
    uint pathseg_ix;
    uint path_ix;
    uint pathseg_offset;
};

TagMonoid tag_monoid_identity() {
    return TagMonoid(0, 0, 0, 0, 0);
}

TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) {
    TagMonoid c;
    c.trans_ix = a.trans_ix + b.trans_ix;
    c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
    c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
    c.path_ix = a.path_ix + b.path_ix;
    c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
    return c;
}

TagMonoid reduce_tag(uint tag_word) {
    TagMonoid c;
    // Some fun bit magic here, see doc/pathseg.md for explanation.
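    // Worked example (annotation, not in the original): each byte of tag_word
    // is one tag. point_count keeps the low 2 bits (the segment type) of each
    // byte; multiplying a per-byte value v in {0,1,2,3} by 7 yields
    // {0, 7, 14, 21}, and bit 2 of the product is set exactly when v != 0, so
    // bitCount((point_count * 7) & 0x4040404) counts the bytes that encode an
    // actual segment.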
    uint point_count = tag_word & 0x3030303;
    c.pathseg_ix = bitCount((point_count * 7) & 0x4040404);
    c.linewidth_ix = bitCount(tag_word & (PATH_TAG_LINEWIDTH * 0x1010101));
    c.path_ix = bitCount(tag_word & (PATH_TAG_PATH * 0x1010101));
    c.trans_ix = bitCount(tag_word & (PATH_TAG_TRANSFORM * 0x1010101));
    uint n_points = point_count + ((tag_word >> 2) & 0x1010101);
    uint a = n_points + (n_points & (((tag_word >> 3) & 0x1010101) * 15));
    a += a >> 8;
    a += a >> 16;
    c.pathseg_offset = a & 0xff;
    return c;
}

@@ -1,61 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The reduction phase for the path tag scan, implemented as a tree reduction.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"
#include "pathtag.h"

// Note: the partition size is smaller than pathseg by a factor
// of 4, as there are 4 tag bytes to a tag word.
#define N_ROWS 2
#define LG_WG_SIZE (6 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

#define Monoid TagMonoid

layout(set = 0, binding = 3) buffer OutBuf {
    Monoid[] outbuf;
};

shared Monoid sh_scratch[WG_SIZE];

void main() {
    uint ix = gl_GlobalInvocationID.x * N_ROWS;
    uint scene_ix = (conf.pathtag_offset >> 2) + ix;
    uint tag_word = scene[scene_ix];

    Monoid agg = reduce_tag(tag_word);
    for (uint i = 1; i < N_ROWS; i++) {
        tag_word = scene[scene_ix + i];
        agg = combine_tag_monoid(agg, reduce_tag(tag_word));
    }
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        // We could make this predicate tighter, but would it help?
        if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
            agg = combine_tag_monoid(agg, other);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }
    if (gl_LocalInvocationID.x == 0) {
        outbuf[gl_WorkGroupID.x] = agg;
    }
}

@@ -1,75 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The scan phase for the path tag scan, implemented as a tree reduction.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "setup.h"
#include "pathtag.h"

#define N_ROWS 8
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

#define Monoid TagMonoid
#define combine_monoid combine_tag_monoid
#define monoid_identity tag_monoid_identity

layout(binding = 0) buffer DataBuf {
    Monoid[] data;
};

#ifndef ROOT
layout(binding = 1) readonly buffer ParentBuf {
    Monoid[] parent;
};
#endif

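// This source is compiled in two variants: with ROOT defined it scans the top
// level of the partition tree in place, with no parent prefix; without ROOT,
// main() below additionally folds in the exclusive prefix delivered by the
// parent level through ParentBuf.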
shared Monoid sh_scratch[WG_SIZE];

void main() {
    Monoid local[N_ROWS];

    uint ix = gl_GlobalInvocationID.x * N_ROWS;

    local[0] = data[ix];
    for (uint i = 1; i < N_ROWS; i++) {
        local[i] = combine_monoid(local[i - 1], data[ix + i]);
    }
    Monoid agg = local[N_ROWS - 1];
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
            agg = combine_monoid(other, agg);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }

    barrier();
    // This could be a semigroup instead of a monoid if we reworked the
    // conditional logic, but that might impact performance.
    Monoid row = monoid_identity();
#ifdef ROOT
    if (gl_LocalInvocationID.x > 0) {
        row = sh_scratch[gl_LocalInvocationID.x - 1];
    }
#else
    if (gl_WorkGroupID.x > 0) {
        row = parent[gl_WorkGroupID.x - 1];
    }
    if (gl_LocalInvocationID.x > 0) {
        row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
    }
#endif
    for (uint i = 0; i < N_ROWS; i++) {
        Monoid m = combine_monoid(row, local[i]);
        data[ix + i] = m;
    }
}

@@ -1,426 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct CmdStrokeRef {
    uint offset;
};

struct CmdFillRef {
    uint offset;
};

struct CmdColorRef {
    uint offset;
};

struct CmdLinGradRef {
    uint offset;
};

struct CmdRadGradRef {
    uint offset;
};

struct CmdImageRef {
    uint offset;
};

struct CmdAlphaRef {
    uint offset;
};

struct CmdEndClipRef {
    uint offset;
};

struct CmdJumpRef {
    uint offset;
};

struct CmdRef {
    uint offset;
};

struct CmdStroke {
    uint tile_ref;
    float half_width;
};

#define CmdStroke_size 8

CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
    return CmdStrokeRef(ref.offset + index * CmdStroke_size);
}

struct CmdFill {
    uint tile_ref;
    int backdrop;
};

#define CmdFill_size 8

CmdFillRef CmdFill_index(CmdFillRef ref, uint index) {
    return CmdFillRef(ref.offset + index * CmdFill_size);
}

struct CmdColor {
    uint rgba_color;
};

#define CmdColor_size 4

CmdColorRef CmdColor_index(CmdColorRef ref, uint index) {
    return CmdColorRef(ref.offset + index * CmdColor_size);
}

struct CmdLinGrad {
    uint index;
    float line_x;
    float line_y;
    float line_c;
};

#define CmdLinGrad_size 16

CmdLinGradRef CmdLinGrad_index(CmdLinGradRef ref, uint index) {
    return CmdLinGradRef(ref.offset + index * CmdLinGrad_size);
}

struct CmdRadGrad {
    uint index;
    vec4 mat;
    vec2 xlat;
    vec2 c1;
    float ra;
    float roff;
};

#define CmdRadGrad_size 44

CmdRadGradRef CmdRadGrad_index(CmdRadGradRef ref, uint index) {
    return CmdRadGradRef(ref.offset + index * CmdRadGrad_size);
}

struct CmdImage {
    uint index;
    ivec2 offset;
};

#define CmdImage_size 8

CmdImageRef CmdImage_index(CmdImageRef ref, uint index) {
    return CmdImageRef(ref.offset + index * CmdImage_size);
}

struct CmdAlpha {
    float alpha;
};

#define CmdAlpha_size 4

CmdAlphaRef CmdAlpha_index(CmdAlphaRef ref, uint index) {
    return CmdAlphaRef(ref.offset + index * CmdAlpha_size);
}

struct CmdEndClip {
    uint blend;
};

#define CmdEndClip_size 4

CmdEndClipRef CmdEndClip_index(CmdEndClipRef ref, uint index) {
    return CmdEndClipRef(ref.offset + index * CmdEndClip_size);
}

struct CmdJump {
    uint new_ref;
};

#define CmdJump_size 4

CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
    return CmdJumpRef(ref.offset + index * CmdJump_size);
}

#define Cmd_End 0
#define Cmd_Fill 1
#define Cmd_Stroke 2
#define Cmd_Solid 3
#define Cmd_Alpha 4
#define Cmd_Color 5
#define Cmd_LinGrad 6
#define Cmd_RadGrad 7
#define Cmd_Image 8
#define Cmd_BeginClip 9
#define Cmd_EndClip 10
#define Cmd_Jump 11
#define Cmd_size 48
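// A command is a 4-byte tag word with its payload at offset + 4 (see the
// read/write helpers below); Cmd_size 48 covers the worst case, 4 bytes of
// tag plus the largest payload, CmdRadGrad_size 44.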
CmdRef Cmd_index(CmdRef ref, uint index) {
    return CmdRef(ref.offset + index * Cmd_size);
}

struct CmdTag {
    uint tag;
    uint flags;
};

CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    CmdStroke s;
    s.tile_ref = raw0;
    s.half_width = uintBitsToFloat(raw1);
    return s;
}

void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.tile_ref);
    write_mem(a, ix + 1, floatBitsToUint(s.half_width));
}

CmdFill CmdFill_read(Alloc a, CmdFillRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    CmdFill s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
    return s;
}

void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.tile_ref);
    write_mem(a, ix + 1, uint(s.backdrop));
}

CmdColor CmdColor_read(Alloc a, CmdColorRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdColor s;
    s.rgba_color = raw0;
    return s;
}

void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.rgba_color);
}

CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    CmdLinGrad s;
    s.index = raw0;
    s.line_x = uintBitsToFloat(raw1);
    s.line_y = uintBitsToFloat(raw2);
    s.line_c = uintBitsToFloat(raw3);
    return s;
}

void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.index);
    write_mem(a, ix + 1, floatBitsToUint(s.line_x));
    write_mem(a, ix + 2, floatBitsToUint(s.line_y));
    write_mem(a, ix + 3, floatBitsToUint(s.line_c));
}

CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    uint raw7 = read_mem(a, ix + 7);
    uint raw8 = read_mem(a, ix + 8);
    uint raw9 = read_mem(a, ix + 9);
    uint raw10 = read_mem(a, ix + 10);
    CmdRadGrad s;
    s.index = raw0;
    s.mat = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
    s.xlat = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
    s.c1 = vec2(uintBitsToFloat(raw7), uintBitsToFloat(raw8));
    s.ra = uintBitsToFloat(raw9);
    s.roff = uintBitsToFloat(raw10);
    return s;
}

void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.index);
    write_mem(a, ix + 1, floatBitsToUint(s.mat.x));
    write_mem(a, ix + 2, floatBitsToUint(s.mat.y));
    write_mem(a, ix + 3, floatBitsToUint(s.mat.z));
    write_mem(a, ix + 4, floatBitsToUint(s.mat.w));
    write_mem(a, ix + 5, floatBitsToUint(s.xlat.x));
    write_mem(a, ix + 6, floatBitsToUint(s.xlat.y));
    write_mem(a, ix + 7, floatBitsToUint(s.c1.x));
    write_mem(a, ix + 8, floatBitsToUint(s.c1.y));
    write_mem(a, ix + 9, floatBitsToUint(s.ra));
    write_mem(a, ix + 10, floatBitsToUint(s.roff));
}

CmdImage CmdImage_read(Alloc a, CmdImageRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    CmdImage s;
    s.index = raw0;
    s.offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
    return s;
}

void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.index);
    write_mem(a, ix + 1, (uint(s.offset.x) & 0xffff) | (uint(s.offset.y) << 16));
}

CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdAlpha s;
    s.alpha = uintBitsToFloat(raw0);
    return s;
}

void CmdAlpha_write(Alloc a, CmdAlphaRef ref, CmdAlpha s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.alpha));
}

CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdEndClip s;
    s.blend = raw0;
    return s;
}

void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.blend);
}

CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdJump s;
    s.new_ref = raw0;
    return s;
}

void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.new_ref);
}

CmdTag Cmd_tag(Alloc a, CmdRef ref) {
    uint tag_and_flags = read_mem(a, ref.offset >> 2);
    return CmdTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
}

CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) {
    return CmdFill_read(a, CmdFillRef(ref.offset + 4));
}

CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) {
    return CmdStroke_read(a, CmdStrokeRef(ref.offset + 4));
}

CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) {
    return CmdAlpha_read(a, CmdAlphaRef(ref.offset + 4));
}

CmdColor Cmd_Color_read(Alloc a, CmdRef ref) {
    return CmdColor_read(a, CmdColorRef(ref.offset + 4));
}

CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) {
    return CmdLinGrad_read(a, CmdLinGradRef(ref.offset + 4));
}

CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref) {
    return CmdRadGrad_read(a, CmdRadGradRef(ref.offset + 4));
}

CmdImage Cmd_Image_read(Alloc a, CmdRef ref) {
    return CmdImage_read(a, CmdImageRef(ref.offset + 4));
}

CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) {
    return CmdEndClip_read(a, CmdEndClipRef(ref.offset + 4));
}

CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) {
    return CmdJump_read(a, CmdJumpRef(ref.offset + 4));
}

void Cmd_End_write(Alloc a, CmdRef ref) {
    write_mem(a, ref.offset >> 2, Cmd_End);
}

void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) {
    write_mem(a, ref.offset >> 2, Cmd_Fill);
    CmdFill_write(a, CmdFillRef(ref.offset + 4), s);
}

void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) {
    write_mem(a, ref.offset >> 2, Cmd_Stroke);
    CmdStroke_write(a, CmdStrokeRef(ref.offset + 4), s);
}

void Cmd_Solid_write(Alloc a, CmdRef ref) {
    write_mem(a, ref.offset >> 2, Cmd_Solid);
}

void Cmd_Alpha_write(Alloc a, CmdRef ref, CmdAlpha s) {
    write_mem(a, ref.offset >> 2, Cmd_Alpha);
    CmdAlpha_write(a, CmdAlphaRef(ref.offset + 4), s);
}

void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) {
    write_mem(a, ref.offset >> 2, Cmd_Color);
    CmdColor_write(a, CmdColorRef(ref.offset + 4), s);
}

void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) {
    write_mem(a, ref.offset >> 2, Cmd_LinGrad);
    CmdLinGrad_write(a, CmdLinGradRef(ref.offset + 4), s);
}

void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) {
    write_mem(a, ref.offset >> 2, Cmd_RadGrad);
    CmdRadGrad_write(a, CmdRadGradRef(ref.offset + 4), s);
}

void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) {
    write_mem(a, ref.offset >> 2, Cmd_Image);
    CmdImage_write(a, CmdImageRef(ref.offset + 4), s);
}

void Cmd_BeginClip_write(Alloc a, CmdRef ref) {
    write_mem(a, ref.offset >> 2, Cmd_BeginClip);
}

void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) {
    write_mem(a, ref.offset >> 2, Cmd_EndClip);
    CmdEndClip_write(a, CmdEndClipRef(ref.offset + 4), s);
}

void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
    write_mem(a, ref.offset >> 2, Cmd_Jump);
    CmdJump_write(a, CmdJumpRef(ref.offset + 4), s);
}

@@ -1,350 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct LineSegRef {
    uint offset;
};

struct QuadSegRef {
    uint offset;
};

struct CubicSegRef {
    uint offset;
};

struct FillColorRef {
    uint offset;
};

struct FillLinGradientRef {
    uint offset;
};

struct FillImageRef {
    uint offset;
};

struct SetLineWidthRef {
    uint offset;
};

struct TransformRef {
    uint offset;
};

struct ClipRef {
    uint offset;
};

struct SetFillModeRef {
    uint offset;
};

struct ElementRef {
    uint offset;
};

struct LineSeg {
    vec2 p0;
    vec2 p1;
};

#define LineSeg_size 16

LineSegRef LineSeg_index(LineSegRef ref, uint index) {
    return LineSegRef(ref.offset + index * LineSeg_size);
}

struct QuadSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
};

#define QuadSeg_size 24

QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) {
    return QuadSegRef(ref.offset + index * QuadSeg_size);
}

struct CubicSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 p3;
};

#define CubicSeg_size 32

CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
    return CubicSegRef(ref.offset + index * CubicSeg_size);
}

struct FillColor {
    uint rgba_color;
};

#define FillColor_size 4

FillColorRef FillColor_index(FillColorRef ref, uint index) {
    return FillColorRef(ref.offset + index * FillColor_size);
}

struct FillLinGradient {
    uint index;
    vec2 p0;
    vec2 p1;
};

#define FillLinGradient_size 20

FillLinGradientRef FillLinGradient_index(FillLinGradientRef ref, uint index) {
    return FillLinGradientRef(ref.offset + index * FillLinGradient_size);
}

struct FillImage {
    uint index;
    ivec2 offset;
};

#define FillImage_size 8

FillImageRef FillImage_index(FillImageRef ref, uint index) {
    return FillImageRef(ref.offset + index * FillImage_size);
}

struct SetLineWidth {
    float width;
};

#define SetLineWidth_size 4

SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) {
    return SetLineWidthRef(ref.offset + index * SetLineWidth_size);
}

struct Transform {
    vec4 mat;
    vec2 translate;
};

#define Transform_size 24

TransformRef Transform_index(TransformRef ref, uint index) {
    return TransformRef(ref.offset + index * Transform_size);
}

struct Clip {
    vec4 bbox;
    uint blend;
};

#define Clip_size 20

ClipRef Clip_index(ClipRef ref, uint index) {
    return ClipRef(ref.offset + index * Clip_size);
}

struct SetFillMode {
    uint fill_mode;
};

#define SetFillMode_size 4

SetFillModeRef SetFillMode_index(SetFillModeRef ref, uint index) {
    return SetFillModeRef(ref.offset + index * SetFillMode_size);
}

#define Element_Nop 0
#define Element_Line 1
#define Element_Quad 2
#define Element_Cubic 3
#define Element_FillColor 4
#define Element_FillLinGradient 5
#define Element_FillImage 6
#define Element_SetLineWidth 7
#define Element_Transform 8
#define Element_BeginClip 9
#define Element_EndClip 10
#define Element_SetFillMode 11
#define Element_size 36
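// As in the ptcl encoding, an element is a 4-byte tag word with its payload
// at offset + 4; Element_size 36 is 4 bytes of tag plus the largest payload,
// CubicSeg_size 32, giving the element stream a fixed stride.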
ElementRef Element_index(ElementRef ref, uint index) {
    return ElementRef(ref.offset + index * Element_size);
}

struct ElementTag {
    uint tag;
    uint flags;
};

LineSeg LineSeg_read(LineSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    LineSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
}

QuadSeg QuadSeg_read(QuadSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    uint raw4 = scene[ix + 4];
    uint raw5 = scene[ix + 5];
    QuadSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    return s;
}

CubicSeg CubicSeg_read(CubicSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    uint raw4 = scene[ix + 4];
    uint raw5 = scene[ix + 5];
    uint raw6 = scene[ix + 6];
    uint raw7 = scene[ix + 7];
    CubicSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    return s;
}

FillColor FillColor_read(FillColorRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    FillColor s;
    s.rgba_color = raw0;
    return s;
}

FillLinGradient FillLinGradient_read(FillLinGradientRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    uint raw4 = scene[ix + 4];
    FillLinGradient s;
    s.index = raw0;
    s.p0 = vec2(uintBitsToFloat(raw1), uintBitsToFloat(raw2));
    s.p1 = vec2(uintBitsToFloat(raw3), uintBitsToFloat(raw4));
    return s;
}

FillImage FillImage_read(FillImageRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    FillImage s;
    s.index = raw0;
    s.offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
    return s;
}

SetLineWidth SetLineWidth_read(SetLineWidthRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    SetLineWidth s;
    s.width = uintBitsToFloat(raw0);
    return s;
}

Transform Transform_read(TransformRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    uint raw4 = scene[ix + 4];
    uint raw5 = scene[ix + 5];
    Transform s;
    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    return s;
}

Clip Clip_read(ClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    Clip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.blend = scene[ix + 4];
    return s;
}

SetFillMode SetFillMode_read(SetFillModeRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    SetFillMode s;
    s.fill_mode = raw0;
    return s;
}

ElementTag Element_tag(ElementRef ref) {
    uint tag_and_flags = scene[ref.offset >> 2];
    return ElementTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
}

LineSeg Element_Line_read(ElementRef ref) {
    return LineSeg_read(LineSegRef(ref.offset + 4));
}

QuadSeg Element_Quad_read(ElementRef ref) {
    return QuadSeg_read(QuadSegRef(ref.offset + 4));
}

CubicSeg Element_Cubic_read(ElementRef ref) {
    return CubicSeg_read(CubicSegRef(ref.offset + 4));
}

FillColor Element_FillColor_read(ElementRef ref) {
    return FillColor_read(FillColorRef(ref.offset + 4));
}

FillLinGradient Element_FillLinGradient_read(ElementRef ref) {
    return FillLinGradient_read(FillLinGradientRef(ref.offset + 4));
}

FillImage Element_FillImage_read(ElementRef ref) {
    return FillImage_read(FillImageRef(ref.offset + 4));
}

SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
    return SetLineWidth_read(SetLineWidthRef(ref.offset + 4));
}

Transform Element_Transform_read(ElementRef ref) {
    return Transform_read(TransformRef(ref.offset + 4));
}

Clip Element_BeginClip_read(ElementRef ref) {
    return Clip_read(ClipRef(ref.offset + 4));
}

Clip Element_EndClip_read(ElementRef ref) {
    return Clip_read(ClipRef(ref.offset + 4));
}

SetFillMode Element_SetFillMode_read(ElementRef ref) {
    return SetFillMode_read(SetFillModeRef(ref.offset + 4));
}

@@ -1,103 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Various constants for the sizes of groups and tiles.

// Much of this will be made dynamic in various ways, but for now it's easiest
// to hardcode and keep all in one place.

// A LG_WG_FACTOR of n scales workgroup sizes by 2^n. Use 0 for a
// maximum workgroup size of 128, or 1 for a maximum size of 256.
#define LG_WG_FACTOR 1
#define WG_FACTOR (1<<LG_WG_FACTOR)

#define TILE_WIDTH_PX 16
#define TILE_HEIGHT_PX 16

#define PTCL_INITIAL_ALLOC 1024

// These should probably be renamed and/or reworked. In the binning
// kernel, they represent the number of bins. Also, the workgroup size
// of that kernel is equal to the number of bins, but should probably
// be more flexible (it's 512 in the K&L paper).
#define N_TILE_X 16
#define N_TILE_Y (8 * WG_FACTOR)
#define N_TILE (N_TILE_X * N_TILE_Y)
#define LG_N_TILE (7 + LG_WG_FACTOR)
#define N_SLICE (N_TILE / 32)

#define GRADIENT_WIDTH 512

// We allocate this many blend stack entries in registers, and spill
// to memory for the overflow.
#define BLEND_STACK_SPLIT 4

#ifdef MALLOC_FAILED
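// The guard above appears to key off mem.h, which defines MALLOC_FAILED:
// Config embeds Alloc fields, so the struct is only usable when the memory
// definitions have been included first.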
struct Config {
    uint mem_size; // in bytes
    uint n_elements; // paths
    uint n_pathseg;
    uint width_in_tiles;
    uint height_in_tiles;
    Alloc tile_alloc;
    Alloc bin_alloc;
    Alloc ptcl_alloc;
    Alloc pathseg_alloc;
    Alloc anno_alloc;
    // new element pipeline stuff follows

    // Bounding boxes of paths, stored as int (so atomics work)
    Alloc path_bbox_alloc;
    // Monoid for draw objects
    Alloc drawmonoid_alloc;

    // BeginClip(path_ix) / EndClip
    Alloc clip_alloc;
    // Intermediate bicyclic semigroup
    Alloc clip_bic_alloc;
    // Intermediate stack
    Alloc clip_stack_alloc;
    // Clip processing results (path_ix + bbox)
    Alloc clip_bbox_alloc;
    // Bounding box per draw object
    Alloc draw_bbox_alloc;
    // Info computed in draw stage, per draw object
    Alloc drawinfo_alloc;

    // Number of transforms in scene
    // This is probably not needed.
    uint n_trans;
    // This *should* count only actual paths, but in the current
    // implementation is redundant with n_elements.
    uint n_path;
    // Total number of BeginClip and EndClip draw objects.
    uint n_clip;

    // Note: one of these offsets *could* be hardcoded to zero (as was the
    // original element stream), but for now retain flexibility.

    // Offset (in bytes) of transform stream in scene buffer
    uint trans_offset;
    // Offset (in bytes) of linewidth stream in scene
    uint linewidth_offset;
    // Offset (in bytes) of path tag stream in scene
    uint pathtag_offset;
    // Offset (in bytes) of path segment stream in scene
    uint pathseg_offset;
    // Offset (in bytes) of draw object tag stream in scene; see drawtag.h
    uint drawtag_offset;
    // Offset (in bytes) of draw payload stream in scene
    uint drawdata_offset;
};
#endif

// Fill modes.
#define MODE_NONZERO 0
#define MODE_STROKE 1

// Size of kernel4 clip state, in words.
#define CLIP_STATE_SIZE 1

// fill_mode_from_flags extracts the fill mode from tag flags.
uint fill_mode_from_flags(uint flags) {
    return flags & 0x1;
}

@@ -1,73 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct StateRef {
    uint offset;
};

struct State {
    vec4 mat;
    vec2 translate;
    vec4 bbox;
    float linewidth;
    uint flags;
    uint path_count;
    uint pathseg_count;
    uint trans_count;
};

#define State_size 60
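// State_size 60 is the field tally in 4-byte words: 4 (mat) + 2 (translate) +
// 4 (bbox) + 1 each for linewidth, flags, path_count, pathseg_count and
// trans_count = 15 words, matching the raw0..raw14 accesses below.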
StateRef State_index(StateRef ref, uint index) {
    return StateRef(ref.offset + index * State_size);
}

State State_read(StateRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = state[ix + 0];
    uint raw1 = state[ix + 1];
    uint raw2 = state[ix + 2];
    uint raw3 = state[ix + 3];
    uint raw4 = state[ix + 4];
    uint raw5 = state[ix + 5];
    uint raw6 = state[ix + 6];
    uint raw7 = state[ix + 7];
    uint raw8 = state[ix + 8];
    uint raw9 = state[ix + 9];
    uint raw10 = state[ix + 10];
    uint raw11 = state[ix + 11];
    uint raw12 = state[ix + 12];
    uint raw13 = state[ix + 13];
    uint raw14 = state[ix + 14];
    State s;
    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
    s.linewidth = uintBitsToFloat(raw10);
    s.flags = raw11;
    s.path_count = raw12;
    s.pathseg_count = raw13;
    s.trans_count = raw14;
    return s;
}

void State_write(StateRef ref, State s) {
    uint ix = ref.offset >> 2;
    state[ix + 0] = floatBitsToUint(s.mat.x);
    state[ix + 1] = floatBitsToUint(s.mat.y);
    state[ix + 2] = floatBitsToUint(s.mat.z);
    state[ix + 3] = floatBitsToUint(s.mat.w);
    state[ix + 4] = floatBitsToUint(s.translate.x);
    state[ix + 5] = floatBitsToUint(s.translate.y);
    state[ix + 6] = floatBitsToUint(s.bbox.x);
    state[ix + 7] = floatBitsToUint(s.bbox.y);
    state[ix + 8] = floatBitsToUint(s.bbox.z);
    state[ix + 9] = floatBitsToUint(s.bbox.w);
    state[ix + 10] = floatBitsToUint(s.linewidth);
    state[ix + 11] = s.flags;
    state[ix + 12] = s.path_count;
    state[ix + 13] = s.pathseg_count;
    state[ix + 14] = s.trans_count;
}

@@ -1,150 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct PathRef {
    uint offset;
};

struct TileRef {
    uint offset;
};

struct TileSegRef {
    uint offset;
};

struct TransformSegRef {
    uint offset;
};

struct Path {
    uvec4 bbox;
    TileRef tiles;
};

#define Path_size 12
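// Path packs its bbox as four u16 tile coordinates in two words, plus one
// word for the tile list pointer, hence Path_size 12; Path_read and
// Path_write below do the half-word packing with masks and shifts.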
PathRef Path_index(PathRef ref, uint index) {
    return PathRef(ref.offset + index * Path_size);
}

struct Tile {
    TileSegRef tile;
    int backdrop;
};

#define Tile_size 8

TileRef Tile_index(TileRef ref, uint index) {
    return TileRef(ref.offset + index * Tile_size);
}

struct TileSeg {
    vec2 origin;
    vec2 vector;
    float y_edge;
    TileSegRef next;
};

#define TileSeg_size 24

TileSegRef TileSeg_index(TileSegRef ref, uint index) {
    return TileSegRef(ref.offset + index * TileSeg_size);
}

struct TransformSeg {
    vec4 mat;
    vec2 translate;
};

#define TransformSeg_size 24

TransformSegRef TransformSeg_index(TransformSegRef ref, uint index) {
    return TransformSegRef(ref.offset + index * TransformSeg_size);
}

Path Path_read(Alloc a, PathRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    Path s;
    s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
    s.tiles = TileRef(raw2);
    return s;
}

void Path_write(Alloc a, PathRef ref, Path s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.bbox.x | (s.bbox.y << 16));
    write_mem(a, ix + 1, s.bbox.z | (s.bbox.w << 16));
    write_mem(a, ix + 2, s.tiles.offset);
}

Tile Tile_read(Alloc a, TileRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    Tile s;
    s.tile = TileSegRef(raw0);
    s.backdrop = int(raw1);
    return s;
}

void Tile_write(Alloc a, TileRef ref, Tile s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.tile.offset);
    write_mem(a, ix + 1, uint(s.backdrop));
}

TileSeg TileSeg_read(Alloc a, TileSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    TileSeg s;
    s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.y_edge = uintBitsToFloat(raw4);
    s.next = TileSegRef(raw5);
    return s;
}

void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.origin.x));
    write_mem(a, ix + 1, floatBitsToUint(s.origin.y));
    write_mem(a, ix + 2, floatBitsToUint(s.vector.x));
    write_mem(a, ix + 3, floatBitsToUint(s.vector.y));
    write_mem(a, ix + 4, floatBitsToUint(s.y_edge));
    write_mem(a, ix + 5, s.next.offset);
}

TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    TransformSeg s;
    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    return s;
}

void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.mat.x));
    write_mem(a, ix + 1, floatBitsToUint(s.mat.y));
    write_mem(a, ix + 2, floatBitsToUint(s.mat.z));
    write_mem(a, ix + 3, floatBitsToUint(s.mat.w));
    write_mem(a, ix + 4, floatBitsToUint(s.translate.x));
    write_mem(a, ix + 5, floatBitsToUint(s.translate.y));
}


@@ -1,112 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Allocation and initialization of tiles for paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)

layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

#include "drawtag.h"
#include "tile.h"

// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))

shared uint sh_tile_count[TILE_ALLOC_WG];
shared uint sh_tile_offset;

vec4 load_draw_bbox(uint draw_ix) {
    uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
    float x0 = uintBitsToFloat(memory[base]);
    float y0 = uintBitsToFloat(memory[base + 1]);
    float x1 = uintBitsToFloat(memory[base + 2]);
    float y1 = uintBitsToFloat(memory[base + 3]);
    vec4 bbox = vec4(x0, y0, x1, y1);
    return bbox;
}

void main() {
    if (!check_deps(STAGE_BINNING)) {
        return;
    }
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    // At the moment, element_ix == path_ix. The clip-intersected bounding boxes
    // for elements (draw objects) are computed in the binning stage, but at some
    // point we'll probably want to break that correspondence. Tiles should be
    // allocated for paths, not draw objs. EndClip doesn't need an allocation.
    PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
    uint drawtag_base = conf.drawtag_offset >> 2;

    uint drawtag = Drawtag_Nop;
    if (element_ix < conf.n_elements) {
        drawtag = scene[drawtag_base + element_ix];
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    // Allocate an empty path for EndClip; at some point we'll change
    // this to be per path rather than per draw object.
    if (drawtag != Drawtag_Nop && drawtag != Drawtag_EndClip) {
        vec4 bbox = load_draw_bbox(element_ix);
        x0 = int(floor(bbox.x * SX));
        y0 = int(floor(bbox.y * SY));
        x1 = int(ceil(bbox.z * SX));
        y1 = int(ceil(bbox.w * SY));
    }
    x0 = clamp(x0, 0, int(conf.width_in_tiles));
    y0 = clamp(y0, 0, int(conf.height_in_tiles));
    x1 = clamp(x1, 0, int(conf.width_in_tiles));
    y1 = clamp(y1, 0, int(conf.height_in_tiles));

    Path path;
    path.bbox = uvec4(x0, y0, x1, y1);
    uint tile_count = (x1 - x0) * (y1 - y0);

    sh_tile_count[th_ix] = tile_count;
    uint total_tile_count = tile_count;
    // Prefix sum of sh_tile_count
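    // This is a Hillis-Steele style inclusive scan in shared memory: after
    // pass i, each thread holds the sum of the up-to-2^(i+1) counts ending at
    // its slot, so LG_TILE_ALLOC_WG passes yield the full prefix sum.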
    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
        barrier();
        if (th_ix >= (1u << i)) {
            total_tile_count += sh_tile_count[th_ix - (1u << i)];
        }
        barrier();
        sh_tile_count[th_ix] = total_tile_count;
    }
    if (th_ix == TILE_ALLOC_WG - 1) {
        sh_tile_offset = malloc_stage(total_tile_count * Tile_size, conf.mem_size, STAGE_TILE_ALLOC);
    }
    barrier();
    uint offset_start = sh_tile_offset;
    if (offset_start == MALLOC_FAILED) {
        return;
    }

    if (element_ix < conf.n_elements) {
        uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
        path.tiles = TileRef(offset_start + Tile_size * tile_subix);
        Path_write(conf.tile_alloc, path_ref, path);
    }

    // Zero out allocated tiles efficiently
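    // Each invocation strides by the workgroup size, so consecutive threads
    // clear consecutive words and the stores coalesce.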
    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
    uint start_ix = offset_start >> 2;
    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
        memory[start_ix + i] = 0;
    }
}

@@ -1,928 +0,0 @@
mod pico_svg;
mod ramp;
mod render_driver;
pub mod samples;
mod simple_text;
pub mod stages;

pub use piet_scene as scene;

use bytemuck::{Pod, Zeroable};
use scene::ResourcePatch;
use std::convert::TryInto;

pub use render_driver::RenderDriver;
pub use simple_text::SimpleText;

use piet_gpu_hal::{
    include_shader, BindType, BufWrite, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor,
    DescriptorSet, Error, Image, ImageLayout, Pipeline, QueryPool, Session,
};

use piet_scene::Scene;

pub use pico_svg::PicoSvg;
use stages::{ClipBinding, ElementBinding, ElementCode, DRAW_PART_SIZE, PATHSEG_PART_SIZE};

use crate::stages::{ClipCode, Config, ElementStage, CLIP_PART_SIZE};

const TILE_W: usize = 16;
const TILE_H: usize = 16;

const PTCL_INITIAL_ALLOC: usize = 1024;

const N_GRADIENT_SAMPLES: usize = 512;
// TODO: make this dynamic
const N_GRADIENTS: usize = 256;

#[allow(unused)]
fn dump_scene(buf: &[u8]) {
    for i in 0..(buf.len() / 4) {
        let mut buf_u32 = [0u8; 4];
        buf_u32.copy_from_slice(&buf[i * 4..i * 4 + 4]);
        println!("{:4x}: {:8x}", i * 4, u32::from_le_bytes(buf_u32));
    }
}

#[allow(unused)]
pub fn dump_k1_data(k1_buf: &[u32]) {
    for i in 0..k1_buf.len() {
        if k1_buf[i] != 0 {
            println!("{:4x}: {:8x}", i * 4, k1_buf[i]);
        }
    }
}

pub struct RenderConfig {
    width: usize,
    height: usize,
    format: PixelFormat,
}

// Should we just use the enum from piet-gpu-hal?
pub enum PixelFormat {
    A8,
    Rgba8,
}

#[repr(C)]
#[derive(Clone, Copy, Debug, Zeroable, Pod)]
pub(crate) struct MemoryHeader {
    mem_offset: u32,
    mem_error: u32,
    blend_offset: u32,
}
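
// Readback protocol: the coarse pipeline reports the dynamic-memory high-water
// mark in mem_offset, a nonzero value in mem_error on allocation failure, and
// the blend buffer space it needs in blend_offset (see RenderDriver::run_coarse).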

/// The sizes of various objects in the encoded scene, needed for memory layout.
#[derive(Default)]
pub(crate) struct SceneStats {
    // Slices of scene encoding, in order
    pub n_drawobj: usize,
    pub drawdata_len: usize,
    pub n_transform: usize,
    pub linewidth_len: usize,
    pub pathseg_len: usize,
    pub n_pathtag: usize,

    // Additional stats needed for memory layout & dispatch
    pub n_path: u32,
    pub n_pathseg: u32,
    pub n_clip: u32,
}

pub struct Renderer {
    // These sizes are aligned to tile boundaries, though at some point
    // we'll want to have a good strategy for dealing with odd sizes.
    width: usize,
    height: usize,

    pub image_dev: Image, // resulting image

    // TODO: two changes needed here. First, if we're fencing on the coarse
    // pipeline, then we only need one copy (this changes if we also bind the
    // scene buffer in fine rasterization, which might be a good idea to reduce
    // copying). Second, there should be a staging buffer for discrete cards.
    scene_bufs: Vec<Buffer>,

    memory_buf_host: Vec<Buffer>,
    memory_buf_dev: Buffer,
    memory_buf_readback: Buffer,

    // Staging buffers
    config_bufs: Vec<Buffer>,
    // Device config buf
    config_buf: Buffer,

    blend_buf: Buffer,

    // New element pipeline
    element_code: ElementCode,
    element_stage: ElementStage,
    element_bindings: Vec<ElementBinding>,

    clip_code: ClipCode,
    clip_binding: ClipBinding,

    tile_pipeline: Pipeline,
    tile_ds: Vec<DescriptorSet>,

    path_pipeline: Pipeline,
    path_ds: DescriptorSet,

    backdrop_pipeline: Pipeline,
    backdrop_ds: DescriptorSet,
    backdrop_y: u32,

    bin_pipeline: Pipeline,
    bin_ds: DescriptorSet,

    coarse_pipeline: Pipeline,
    coarse_ds: Vec<DescriptorSet>,

    k4_pipeline: Pipeline,
    k4_ds: DescriptorSet,

    scene_stats: SceneStats,
    // TODO: the following stats are now redundant and can be removed.
    n_transform: usize,
    n_drawobj: usize,
    n_paths: usize,
    n_pathseg: usize,
    n_pathtag: usize,
    n_clip: u32,

    // Keep a reference to the image so that it is not destroyed.
    _bg_image: Image,

    gradient_bufs: Vec<Buffer>,
    gradients: Image,

    ramps: ramp::RampCache,
    drawdata_patches: Vec<(usize, u32)>,
}

impl RenderConfig {
    pub fn new(width: usize, height: usize) -> RenderConfig {
        RenderConfig {
            width,
            height,
            format: PixelFormat::Rgba8,
        }
    }

    pub fn pixel_format(mut self, format: PixelFormat) -> Self {
        self.format = format;
        self
    }
}

impl Renderer {
    /// The number of query pool entries needed to run the renderer.
    pub const QUERY_POOL_SIZE: u32 = Self::COARSE_QUERY_POOL_SIZE + Self::FINE_QUERY_POOL_SIZE;

    /// The number of query pool entries needed to run the coarse pipeline.
    pub const COARSE_QUERY_POOL_SIZE: u32 = 10;

    /// The number of query pool entries needed to run the fine pipeline.
    pub const FINE_QUERY_POOL_SIZE: u32 = 2;

    pub unsafe fn new(
        session: &Session,
        width: usize,
        height: usize,
        n_bufs: usize,
    ) -> Result<Self, Error> {
        let config = RenderConfig::new(width, height);
        Self::new_from_config(session, config, n_bufs)
    }

    /// Create a new renderer.
    pub unsafe fn new_from_config(
        session: &Session,
        config: RenderConfig,
        n_bufs: usize,
    ) -> Result<Self, Error> {
        // For now, round up to tile alignment
        let width = config.width;
        let height = config.height;
        let width = width + (width.wrapping_neg() & (TILE_W - 1));
        let height = height + (height.wrapping_neg() & (TILE_H - 1));
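        // wrapping_neg() & (tile size - 1) is the distance to the next multiple
        // of the tile size; e.g. a width of 1000 rounds up to 1008 (63 tiles).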
        let dev = BufferUsage::STORAGE | BufferUsage::COPY_DST;
        let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC;
        let usage_blend = BufferUsage::STORAGE;
        let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
        let usage_readback = BufferUsage::MAP_READ | BufferUsage::COPY_DST;

        // TODO: separate staging buffer (if needed)
        let scene_bufs = (0..n_bufs)
            .map(|_| {
                session
                    .create_buffer(8 * 1024 * 1024, usage_upload)
                    .unwrap()
            })
            .collect::<Vec<_>>();

        let image_format = match config.format {
            PixelFormat::A8 => piet_gpu_hal::ImageFormat::A8,
            PixelFormat::Rgba8 => piet_gpu_hal::ImageFormat::Surface,
        };
        let image_dev = session.create_image2d(width as u32, height as u32, image_format)?;

        const CONFIG_BUFFER_SIZE: u64 = std::mem::size_of::<Config>() as u64;
        let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap();
        let config_bufs = (0..n_bufs)
            .map(|_| {
                session
                    .create_buffer(CONFIG_BUFFER_SIZE, usage_upload)
                    .unwrap()
            })
            .collect();

        let memory_buf_host = (0..n_bufs)
            .map(|_| {
                session
                    .create_buffer(std::mem::size_of::<MemoryHeader>() as u64, usage_upload)
                    .unwrap()
            })
            .collect();
        let target_dependent_size =
            (width / TILE_W) as u64 * (height / TILE_H) as u64 * PTCL_INITIAL_ALLOC as u64;
        let memory_buf_dev =
            session.create_buffer(target_dependent_size + 8 * 1024 * 1024, usage_mem_dev)?;
        let memory_buf_readback =
            session.create_buffer(std::mem::size_of::<MemoryHeader>() as u64, usage_readback)?;
        let blend_buf = session.create_buffer(16 * 1024 * 1024, usage_blend)?;

        let element_code = ElementCode::new(session);
        let element_stage = ElementStage::new(session, &element_code);
        let element_bindings = scene_bufs
            .iter()
            .map(|scene_buf| {
                element_stage.bind(
                    session,
                    &element_code,
                    &config_buf,
                    scene_buf,
                    &memory_buf_dev,
                )
            })
            .collect();

        let clip_code = ClipCode::new(session);
        let clip_binding = ClipBinding::new(session, &clip_code, &config_buf, &memory_buf_dev);

        let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc");
        let tile_pipeline = session.create_compute_pipeline(
            tile_alloc_code,
            &[
                BindType::Buffer,
                BindType::BufReadOnly,
                BindType::BufReadOnly,
            ],
        )?;
        let tile_ds = scene_bufs
            .iter()
            .map(|scene_buf| {
                session.create_simple_descriptor_set(
                    &tile_pipeline,
                    &[&memory_buf_dev, &config_buf, scene_buf],
                )
            })
            .collect::<Result<Vec<_>, _>>()?;

        let path_alloc_code = include_shader!(session, "../shader/gen/path_coarse");
        let path_pipeline = session
            .create_compute_pipeline(path_alloc_code, &[BindType::Buffer, BindType::BufReadOnly])?;
        let path_ds = session
            .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;

        let (backdrop_code, backdrop_y) =
            if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
                (include_shader!(session, "../shader/gen/backdrop_lg"), 4)
            } else {
                println!("using small workgroup backdrop kernel");
                (include_shader!(session, "../shader/gen/backdrop"), 1)
            };
        let backdrop_pipeline = session
            .create_compute_pipeline(backdrop_code, &[BindType::Buffer, BindType::BufReadOnly])?;
        let backdrop_ds = session
            .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;

        // TODO: constants
        let bin_code = include_shader!(session, "../shader/gen/binning");
        let bin_pipeline = session
            .create_compute_pipeline(bin_code, &[BindType::Buffer, BindType::BufReadOnly])?;
        let bin_ds =
            session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?;

        let coarse_code = include_shader!(session, "../shader/gen/coarse");
        let coarse_pipeline = session.create_compute_pipeline(
            coarse_code,
            &[
                BindType::Buffer,
                BindType::BufReadOnly,
                BindType::BufReadOnly,
            ],
        )?;
        let coarse_ds = scene_bufs
            .iter()
            .map(|scene_buf| {
                session.create_simple_descriptor_set(
                    &coarse_pipeline,
                    &[&memory_buf_dev, &config_buf, scene_buf],
                )
            })
            .collect::<Result<Vec<_>, _>>()?;
        let bg_image = Self::make_test_bg_image(&session);

        const GRADIENT_BUF_SIZE: usize = N_GRADIENTS * N_GRADIENT_SAMPLES * 4;

        let gradient_bufs = (0..n_bufs)
            .map(|_| {
                session
                    .create_buffer(GRADIENT_BUF_SIZE as u64, usage_upload)
                    .unwrap()
            })
            .collect();
        let gradients = Self::make_gradient_image(&session);

        let k4_code = match config.format {
            PixelFormat::A8 => include_shader!(session, "../shader/gen/kernel4_gray"),
            PixelFormat::Rgba8 => include_shader!(session, "../shader/gen/kernel4"),
        };
        let k4_pipeline = session.create_compute_pipeline(
            k4_code,
            &[
                BindType::Buffer,
                BindType::BufReadOnly,
                BindType::Buffer,
                BindType::Image,
                BindType::ImageRead,
                BindType::ImageRead,
            ],
        )?;
        let k4_ds = session
            .descriptor_set_builder()
            .add_buffers(&[&memory_buf_dev, &config_buf, &blend_buf])
            .add_images(&[&image_dev])
            .add_textures(&[&bg_image, &gradients])
            .build(&session, &k4_pipeline)?;

        let scene_stats = Default::default();
        let ramps = ramp::RampCache::default();
        let drawdata_patches = vec![];

        Ok(Renderer {
            width,
            height,
            scene_bufs,
            memory_buf_host,
            memory_buf_dev,
            memory_buf_readback,
            config_buf,
            config_bufs,
            blend_buf,
            image_dev,
            element_code,
            element_stage,
            element_bindings,
            clip_code,
            clip_binding,
            tile_pipeline,
            tile_ds,
            path_pipeline,
            path_ds,
            backdrop_pipeline,
            backdrop_ds,
            backdrop_y,
            bin_pipeline,
            bin_ds,
            coarse_pipeline,
            coarse_ds,
            k4_pipeline,
            k4_ds,
            scene_stats,
            n_transform: 0,
            n_drawobj: 0,
            n_paths: 0,
            n_pathseg: 0,
            n_pathtag: 0,
            n_clip: 0,
            _bg_image: bg_image,
            gradient_bufs,
            gradients,
            ramps,
            drawdata_patches,
        })
    }

    pub fn upload_scene(&mut self, scene: &Scene, buf_ix: usize) -> Result<(), Error> {
        self.drawdata_patches.clear();
        self.scene_stats = SceneStats::from_scene(scene);
        self.ramps.advance();
        let data = scene.data();
        let stop_data = &data.resources.stops;
        for patch in &data.resources.patches {
            match patch {
                ResourcePatch::Ramp { offset, stops } => {
                    let ramp_id = self.ramps.add(&stop_data[stops.clone()]);
                    self.drawdata_patches.push((*offset, ramp_id));
                }
            }
        }
        unsafe {
            self.upload_config(buf_ix)?;
            {
                let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
                write_scene(scene, &self.drawdata_patches, &mut mapped_scene);
            }

            // Upload gradient data.
            let ramp_data = self.ramps.data();
            if !ramp_data.is_empty() {
                assert!(
                    self.gradient_bufs[buf_ix].size() as usize
                        >= std::mem::size_of_val(&*ramp_data)
                );
                self.gradient_bufs[buf_ix].write(ramp_data)?;
            }
        }
        Ok(())
    }

    // Note: configuration has to be re-uploaded when memory buffer is resized
    pub(crate) unsafe fn upload_config(&mut self, buf_ix: usize) -> Result<(), Error> {
        let stats = &self.scene_stats;
        let n_path = stats.n_path as usize;
        self.n_paths = n_path;
        self.n_transform = stats.n_transform;
        self.n_drawobj = stats.n_drawobj;
        self.n_pathseg = stats.n_pathseg as usize;
        self.n_pathtag = stats.n_pathtag;
        self.n_clip = stats.n_clip;
        let (mut config, alloc) = stats.config(self.width, self.height);
        config.mem_size = self.memory_buf_size() as u32;
        self.config_bufs[buf_ix].write(&[config])?;
        let mem_header = MemoryHeader {
            mem_offset: alloc as u32,
            mem_error: 0,
            blend_offset: 0,
        };
        // Note: we could skip doing this on realloc, but probably not worth the bother
        self.memory_buf_host[buf_ix].write(&[mem_header])?;
        Ok(())
    }

    /// Get the size of memory for the allocations known in advance.
    pub(crate) fn memory_size(&self, stats: &SceneStats) -> usize {
        stats.config(self.width, self.height).1
    }

    /// Record the coarse part of a render pipeline.
    pub unsafe fn record_coarse(
        &self,
        cmd_buf: &mut CmdBuf,
        query_pool: &QueryPool,
        buf_ix: usize,
    ) {
        cmd_buf.copy_buffer(&self.config_bufs[buf_ix], &self.config_buf);
        cmd_buf.copy_buffer(&self.memory_buf_host[buf_ix], &self.memory_buf_dev);
        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(
            &self.image_dev,
            ImageLayout::Undefined,
            ImageLayout::General,
        );
        // TODO: make gradient upload optional, only if it's changed
        cmd_buf.image_barrier(
            &self.gradients,
            ImageLayout::Undefined,
            ImageLayout::BlitDst,
        );
        cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
        cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
        cmd_buf.reset_query_pool(&query_pool);
        cmd_buf.begin_debug_label("Element bounding box calculation");
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
        self.element_stage.record(
            &mut pass,
            &self.element_code,
            &self.element_bindings[buf_ix],
            self.n_paths as u32,
            self.n_pathtag as u32,
            self.n_drawobj as u64,
        );
        pass.end();
        cmd_buf.end_debug_label();
        cmd_buf.memory_barrier();
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
        pass.begin_debug_label("Clip bounding box calculation");
        self.clip_binding
            .record(&mut pass, &self.clip_code, self.n_clip as u32);
        pass.end_debug_label();
        pass.begin_debug_label("Element binning");
        pass.dispatch(
            &self.bin_pipeline,
            &self.bin_ds,
            (((self.n_paths + 255) / 256) as u32, 1, 1),
            (256, 1, 1),
        );
        pass.end_debug_label();
        pass.memory_barrier();
        pass.begin_debug_label("Tile allocation");
        pass.dispatch(
            &self.tile_pipeline,
            &self.tile_ds[buf_ix],
            (((self.n_paths + 255) / 256) as u32, 1, 1),
            (256, 1, 1),
        );
        pass.end_debug_label();
        pass.end();
        cmd_buf.begin_debug_label("Path flattening");
        cmd_buf.memory_barrier();
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
        pass.dispatch(
            &self.path_pipeline,
            &self.path_ds,
            (((self.n_pathseg + 31) / 32) as u32, 1, 1),
            (32, 1, 1),
        );
        pass.end();
        cmd_buf.end_debug_label();
        cmd_buf.memory_barrier();
        cmd_buf.begin_debug_label("Backdrop propagation");
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
        pass.dispatch(
            &self.backdrop_pipeline,
            &self.backdrop_ds,
            (((self.n_paths + 255) / 256) as u32, 1, 1),
            (256, self.backdrop_y, 1),
        );
        pass.end();
        cmd_buf.end_debug_label();
        // TODO: redo query accounting
        cmd_buf.memory_barrier();
        cmd_buf.begin_debug_label("Coarse raster");
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
        pass.dispatch(
            &self.coarse_pipeline,
            &self.coarse_ds[buf_ix],
            (
                (self.width as u32 + 255) / 256,
                (self.height as u32 + 255) / 256,
                1,
            ),
            (256, 1, 1),
        );
        pass.end();
        cmd_buf.end_debug_label();
        cmd_buf.memory_barrier();
    }

    pub unsafe fn record_fine(
        &self,
        cmd_buf: &mut CmdBuf,
        query_pool: &QueryPool,
        query_start: u32,
    ) {
        if query_start == 0 {
            cmd_buf.reset_query_pool(&query_pool);
        }
        cmd_buf.begin_debug_label("Fine raster");
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(
            &query_pool,
            query_start,
            query_start + 1,
        ));
        pass.dispatch(
            &self.k4_pipeline,
            &self.k4_ds,
            (
                (self.width / TILE_W) as u32,
                (self.height / TILE_H) as u32,
                1,
            ),
            (8, 4, 1),
        );
        pass.end();
        cmd_buf.end_debug_label();
        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
    }

    pub unsafe fn record_readback(&self, cmd_buf: &mut CmdBuf) {
        cmd_buf.copy_buffer(&self.memory_buf_dev, &self.memory_buf_readback);
        cmd_buf.memory_barrier();
    }

    /// Record a render pipeline.
    ///
    /// This *assumes* the buffers are adequately sized.
    pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) {
        self.record_coarse(cmd_buf, query_pool, buf_ix);
        self.record_fine(cmd_buf, query_pool, 10);
    }

    pub fn make_image(
        session: &Session,
        width: usize,
        height: usize,
        buf: &[u8],
    ) -> Result<Image, Error> {
        unsafe {
            let buffer = session.create_buffer_init(&buf, BufferUsage::COPY_SRC)?;
            const RGBA: piet_gpu_hal::ImageFormat = piet_gpu_hal::ImageFormat::Rgba8;
            let image = session.create_image2d(width.try_into()?, height.try_into()?, RGBA)?;
            let mut cmd_buf = session.cmd_buf()?;
            cmd_buf.begin();
            cmd_buf.image_barrier(&image, ImageLayout::Undefined, ImageLayout::BlitDst);
            cmd_buf.copy_buffer_to_image(&buffer, &image);
            cmd_buf.image_barrier(&image, ImageLayout::BlitDst, ImageLayout::General);
            cmd_buf.finish();
            // Make sure not to drop the buffer and image until the command buffer completes.
            cmd_buf.add_resource(&buffer);
            cmd_buf.add_resource(&image);
            let _ = session.run_cmd_buf(cmd_buf, &[], &[]);
            // We let the session reclaim the fence.
            Ok(image)
        }
    }

    /// Make a test image.
    fn make_test_bg_image(session: &Session) -> Image {
        const WIDTH: usize = 256;
        const HEIGHT: usize = 256;
        let mut buf = vec![255u8; WIDTH * HEIGHT * 4];
        for y in 0..HEIGHT {
            for x in 0..WIDTH {
                let r = x as u8;
                let g = y as u8;
                let b = r ^ g;
                buf[(y * WIDTH + x) * 4] = r;
                buf[(y * WIDTH + x) * 4 + 1] = g;
                buf[(y * WIDTH + x) * 4 + 2] = b;
            }
        }
        Self::make_image(session, WIDTH, HEIGHT, &buf).unwrap()
    }

    fn make_gradient_image(session: &Session) -> Image {
        unsafe {
            const RGBA: piet_gpu_hal::ImageFormat = piet_gpu_hal::ImageFormat::Rgba8;
            session
                .create_image2d(N_GRADIENT_SAMPLES as u32, N_GRADIENTS as u32, RGBA)
                .unwrap()
        }
    }

    pub(crate) unsafe fn realloc_scene_if_needed(
        &mut self,
        session: &Session,
        new_size: u64,
        buf_ix: usize,
    ) -> Result<(), Error> {
        if new_size <= self.scene_bufs[buf_ix].size() {
            return Ok(());
        }
        const ALIGN: u64 = 0x10000;
        let new_size = (new_size + ALIGN - 1) & ALIGN.wrapping_neg();
        println!(
            "reallocating scene buf[{}] {} -> {}",
            buf_ix,
            self.scene_bufs[buf_ix].size(),
            new_size
        );
        let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
        let scene_buf = session.create_buffer(new_size, usage_upload)?;
        self.element_bindings[buf_ix].rebind_scene(session, &scene_buf);
        session.update_buffer_descriptor(&mut self.tile_ds[buf_ix], 2, &scene_buf);
        session.update_buffer_descriptor(&mut self.coarse_ds[buf_ix], 2, &scene_buf);
        self.scene_bufs[buf_ix] = scene_buf;
        Ok(())
    }

    /// Get the size of the memory buffer.
    ///
    /// This is the usable size (not including the header).
    pub(crate) fn memory_buf_size(&self) -> u64 {
        self.memory_buf_dev.size() - std::mem::size_of::<MemoryHeader>() as u64
    }

    pub(crate) unsafe fn realloc_memory(
        &mut self,
        session: &Session,
        new_size: u64,
    ) -> Result<(), Error> {
        println!(
            "reallocating memory buf {} -> {}",
            self.memory_buf_dev.size(),
            new_size
        );
        let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC;
        let memory_buf_dev = session.create_buffer(new_size, usage_mem_dev)?;
        for element_binding in &mut self.element_bindings {
            element_binding.rebind_memory(session, &memory_buf_dev);
        }
        self.clip_binding.rebind_memory(session, &memory_buf_dev);
        for tile_ds in &mut self.tile_ds {
            session.update_buffer_descriptor(tile_ds, 0, &memory_buf_dev);
        }
        session.update_buffer_descriptor(&mut self.path_ds, 0, &memory_buf_dev);
        session.update_buffer_descriptor(&mut self.backdrop_ds, 0, &memory_buf_dev);
        session.update_buffer_descriptor(&mut self.bin_ds, 0, &memory_buf_dev);
        for coarse_ds in &mut self.coarse_ds {
            session.update_buffer_descriptor(coarse_ds, 0, &memory_buf_dev);
        }
        session.update_buffer_descriptor(&mut self.k4_ds, 0, &memory_buf_dev);
        self.memory_buf_dev = memory_buf_dev;
        Ok(())
    }

    pub(crate) fn blend_size(&self) -> u64 {
        self.blend_buf.size()
    }

    pub(crate) unsafe fn realloc_blend(
        &mut self,
        session: &Session,
        new_size: u64,
    ) -> Result<(), Error> {
        println!(
            "reallocating blend buf {} -> {}",
            self.blend_size(),
            new_size
        );
        let usage_blend = BufferUsage::STORAGE;
        let blend_buf = session.create_buffer(new_size, usage_blend)?;
        session.update_buffer_descriptor(&mut self.k4_ds, 2, &blend_buf);
        self.blend_buf = blend_buf;
        Ok(())
    }
}

const TRANSFORM_SIZE: usize = 24;
const PATHSEG_SIZE: usize = 52;
const PATH_BBOX_SIZE: usize = 24;
const DRAWMONOID_SIZE: usize = 16;
const DRAW_BBOX_SIZE: usize = 16;
const DRAWTAG_SIZE: usize = 4;
const ANNOTATED_SIZE: usize = 40;

impl SceneStats {
    pub fn from_scene(scene: &piet_scene::Scene) -> Self {
        let data = scene.data();
        Self {
            n_drawobj: data.drawtag_stream.len(),
            drawdata_len: data.drawdata_stream.len(),
            n_transform: data.transform_stream.len(),
            linewidth_len: std::mem::size_of_val(&*data.linewidth_stream),
            pathseg_len: data.pathseg_stream.len(),
            n_pathtag: data.tag_stream.len(),
            n_path: data.n_path,
            n_pathseg: data.n_pathseg,
            n_clip: data.n_clip,
        }
    }

    pub(crate) fn scene_size(&self) -> usize {
        align_up(self.n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE
            + self.drawdata_len
            + self.n_transform * TRANSFORM_SIZE
            + self.linewidth_len
            + align_up(self.n_pathtag, PATHSEG_PART_SIZE as usize)
            + self.pathseg_len
    }

    /// Return a config for a scene with these stats.
    ///
    /// Also returns the beginning of free (dynamic) memory.
    fn config(&self, width: usize, height: usize) -> (Config, usize) {
        // Layout of scene buffer
        let drawtag_offset = 0;
        let n_drawobj = self.n_drawobj;
        let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
        let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
        let trans_offset = drawdata_offset + self.drawdata_len;
        let n_trans = self.n_transform;
        let linewidth_offset = trans_offset + n_trans * TRANSFORM_SIZE;
        let pathtag_offset = linewidth_offset + self.linewidth_len;
        let n_pathtag = self.n_pathtag;
        let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
        let pathseg_offset = pathtag_offset + n_pathtag_padded;

        // Layout of memory
        let mut alloc = 0;
        let pathseg_alloc = alloc;
        alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
        let path_bbox_alloc = alloc;
        let n_path = self.n_path as usize;
        alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
        let drawmonoid_alloc = alloc;
        alloc += n_drawobj_padded * DRAWMONOID_SIZE;
        let anno_alloc = alloc;
        alloc += n_drawobj * ANNOTATED_SIZE;
        let clip_alloc = alloc;
        let n_clip = self.n_clip as usize;
        const CLIP_SIZE: usize = 4;
        alloc += n_clip * CLIP_SIZE;
        let clip_bic_alloc = alloc;
        const CLIP_BIC_SIZE: usize = 8;
        // This can round down, as we only reduce the prefix
        alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
        let clip_stack_alloc = alloc;
        const CLIP_EL_SIZE: usize = 20;
        alloc += n_clip * CLIP_EL_SIZE;
        let clip_bbox_alloc = alloc;
        const CLIP_BBOX_SIZE: usize = 16;
        alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
        let draw_bbox_alloc = alloc;
        alloc += n_drawobj * DRAW_BBOX_SIZE;
        let drawinfo_alloc = alloc;
        // TODO: not optimized; it can be accumulated during encoding or summed from drawtags
        const MAX_DRAWINFO_SIZE: usize = 44;
        alloc += n_drawobj * MAX_DRAWINFO_SIZE;

        // These constants depend on encoding and may need to be updated.
        const PATH_SIZE: usize = 12;
        const BIN_SIZE: usize = 8;
        let width_in_tiles = width / TILE_W;
        let height_in_tiles = height / TILE_H;
        let tile_base = alloc;
        alloc += ((n_path + 3) & !3) * PATH_SIZE;
        let bin_base = alloc;
        alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
        let ptcl_base = alloc;
        alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;

        let config = Config {
            mem_size: 0, // to be filled in later
            n_elements: n_drawobj as u32,
            n_pathseg: self.n_pathseg,
            pathseg_alloc: pathseg_alloc as u32,
            anno_alloc: anno_alloc as u32,
            path_bbox_alloc: path_bbox_alloc as u32,
            drawmonoid_alloc: drawmonoid_alloc as u32,
            clip_alloc: clip_alloc as u32,
            clip_bic_alloc: clip_bic_alloc as u32,
            clip_stack_alloc: clip_stack_alloc as u32,
            clip_bbox_alloc: clip_bbox_alloc as u32,
            draw_bbox_alloc: draw_bbox_alloc as u32,
            drawinfo_alloc: drawinfo_alloc as u32,
            n_trans: n_trans as u32,
            n_path: self.n_path,
            n_clip: self.n_clip,
            trans_offset: trans_offset as u32,
            linewidth_offset: linewidth_offset as u32,
            pathtag_offset: pathtag_offset as u32,
            pathseg_offset: pathseg_offset as u32,
            drawtag_offset: drawtag_offset as u32,
            drawdata_offset: drawdata_offset as u32,
            width_in_tiles: width_in_tiles as u32,
            height_in_tiles: height_in_tiles as u32,
            tile_alloc: tile_base as u32,
            bin_alloc: bin_base as u32,
            ptcl_alloc: ptcl_base as u32,
        };

        (config, alloc)
    }
}

fn write_scene(scene: &Scene, drawdata_patches: &[(usize, u32)], buf: &mut BufWrite) {
    let data = scene.data();
    buf.extend_slice(&data.drawtag_stream);
    let n_drawobj = data.drawtag_stream.len();
    buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE);
    if !drawdata_patches.is_empty() {
        let mut pos = 0;
        for patch in drawdata_patches {
            let offset = patch.0;
            let value = patch.1;
            if pos < offset {
                buf.extend_slice(&data.drawdata_stream[pos..offset]);
            }
            buf.push(value);
            pos = offset + 4;
        }
        if pos < data.drawdata_stream.len() {
            buf.extend_slice(&data.drawdata_stream[pos..])
        }
    } else {
        buf.extend_slice(&data.drawdata_stream);
    }
    buf.extend_slice(&data.transform_stream);
    buf.extend_slice(&data.linewidth_stream);
    buf.extend_slice(&data.tag_stream);
    let n_pathtag = data.tag_stream.len();
    buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
    buf.extend_slice(&data.pathseg_stream);
}

fn padding(x: usize, align: usize) -> usize {
    x.wrapping_neg() & (align - 1)
}

fn align_up(x: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    (x + align - 1) & !(align - 1)
}
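
// Example: padding(5, 4) == 3 and align_up(5, 4) == 8; both tricks rely on
// `align` being a power of two, so that `align - 1` is a mask of the low bits.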

@@ -1,139 +0,0 @@
//! A loader for a tiny fragment of SVG

use std::str::FromStr;

use roxmltree::{Document, Node};

use piet_scene::kurbo::{Affine, BezPath};
use piet_scene::Color;

pub struct PicoSvg {
    pub items: Vec<Item>,
}

pub enum Item {
    Fill(FillItem),
    Stroke(StrokeItem),
}

pub struct StrokeItem {
    pub width: f64,
    pub color: Color,
    pub path: BezPath,
}

pub struct FillItem {
    pub color: Color,
    pub path: BezPath,
}

struct Parser<'a> {
    scale: f64,
    items: &'a mut Vec<Item>,
}

impl PicoSvg {
    pub fn load(xml_string: &str, scale: f64) -> Result<PicoSvg, Box<dyn std::error::Error>> {
        let doc = Document::parse(xml_string)?;
        let root = doc.root_element();
        let mut items = Vec::new();
        let mut parser = Parser::new(&mut items, scale);
        for node in root.children() {
            parser.rec_parse(node)?;
        }
        Ok(PicoSvg { items })
    }
}

impl<'a> Parser<'a> {
    fn new(items: &'a mut Vec<Item>, scale: f64) -> Parser<'a> {
        Parser { scale, items }
    }

    fn rec_parse(&mut self, node: Node) -> Result<(), Box<dyn std::error::Error>> {
        let transform = if self.scale >= 0.0 {
            Affine::scale(self.scale)
        } else {
            Affine::new([-self.scale, 0.0, 0.0, self.scale, 0.0, 1536.0])
        };
        if node.is_element() {
            match node.tag_name().name() {
                "g" => {
                    for child in node.children() {
                        self.rec_parse(child)?;
                    }
                }
                "path" => {
                    let d = node.attribute("d").ok_or("missing 'd' attribute")?;
                    let bp = BezPath::from_svg(d)?;
                    let path = transform * bp;
                    // TODO: default fill color is black, but this is overridden in tiger to this logic.
                    if let Some(fill_color) = node.attribute("fill") {
                        if fill_color != "none" {
                            let color = parse_color(fill_color);
                            let color = modify_opacity(color, "fill-opacity", node);
                            self.items.push(Item::Fill(FillItem {
                                color,
                                path: path.clone(),
                            }));
                        }
                    }
                    if let Some(stroke_color) = node.attribute("stroke") {
                        if stroke_color != "none" {
                            let width = self.scale.abs()
                                * f64::from_str(
                                    node.attribute("stroke-width").ok_or("missing width")?,
                                )?;
                            let color = parse_color(stroke_color);
                            let color = modify_opacity(color, "stroke-opacity", node);
                            self.items
                                .push(Item::Stroke(StrokeItem { width, color, path }));
                        }
                    }
                }
                _ => (),
            }
        }
        Ok(())
    }
}

fn parse_color(color: &str) -> Color {
    if color.as_bytes()[0] == b'#' {
        let mut hex = u32::from_str_radix(&color[1..], 16).unwrap();
        if color.len() == 4 {
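            // Expand 3-digit hex by duplicating each nibble (#abc -> #aabbcc):
            // multiplying a nibble by 0x11 doubles it, e.g. 0xa * 0x11 = 0xaa.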
            hex = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11;
        }
        let rgba = (hex << 8) + 0xff;
        let (r, g, b, a) = (
            ((rgba >> 24) & 255) as u8,
            ((rgba >> 16) & 255) as u8,
            ((rgba >> 8) & 255) as u8,
            (rgba & 255) as u8,
        );
        Color::rgba8(r, g, b, a)
    } else if color.starts_with("rgb(") {
        let mut iter = color[4..color.len() - 1].split(',');
        let r = u8::from_str(iter.next().unwrap()).unwrap();
        let g = u8::from_str(iter.next().unwrap()).unwrap();
        let b = u8::from_str(iter.next().unwrap()).unwrap();
        Color::rgb8(r, g, b)
    } else {
        Color::rgba8(255, 0, 255, 0x80)
    }
}

fn modify_opacity(mut color: Color, attr_name: &str, node: Node) -> Color {
    if let Some(opacity) = node.attribute(attr_name) {
        let alpha = if opacity.ends_with("%") {
            let pctg = opacity[..opacity.len() - 1].parse().unwrap_or(100.0);
            pctg * 0.01
        } else {
            opacity.parse().unwrap_or(1.0)
        } as f64;
        color.a = (alpha.min(1.0).max(0.0) * 255.0).round() as u8;
        color
    } else {
        color
    }
}

@@ -1,129 +0,0 @@
use piet_scene::{Color, ColorStop, ColorStops};

use std::collections::HashMap;

const N_SAMPLES: usize = 512;
const RETAINED_COUNT: usize = 64;

#[derive(Default)]
pub struct RampCache {
    epoch: u64,
    map: HashMap<ColorStops, (u32, u64)>,
    data: Vec<u32>,
}

impl RampCache {
    pub fn advance(&mut self) {
        self.epoch += 1;
        if self.map.len() > RETAINED_COUNT {
            self.map
                .retain(|_key, value| value.0 < RETAINED_COUNT as u32);
            self.data.truncate(RETAINED_COUNT * N_SAMPLES);
        }
    }
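
    /// Add a ramp, returning its id. When the retained budget is full, a slot
    /// whose entry has not been touched for more than two epochs is reused, old
    /// enough that an in-flight frame should no longer reference it.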
    pub fn add(&mut self, stops: &[ColorStop]) -> u32 {
        if let Some(entry) = self.map.get_mut(stops) {
            entry.1 = self.epoch;
            entry.0
        } else if self.map.len() < RETAINED_COUNT {
            let id = (self.data.len() / N_SAMPLES) as u32;
            self.data.extend(make_ramp(stops));
            self.map.insert(stops.into(), (id, self.epoch));
            id
        } else {
            let mut reuse = None;
            for (stops, (id, epoch)) in &self.map {
                if *epoch + 2 < self.epoch {
                    reuse = Some((stops.to_owned(), *id));
                    break;
                }
            }
            if let Some((old_stops, id)) = reuse {
                self.map.remove(&old_stops);
                let start = id as usize * N_SAMPLES;
                for (dst, src) in self.data[start..start + N_SAMPLES]
                    .iter_mut()
                    .zip(make_ramp(stops))
                {
                    *dst = src;
                }
                self.map.insert(stops.into(), (id, self.epoch));
                id
            } else {
                let id = (self.data.len() / N_SAMPLES) as u32;
                self.data.extend(make_ramp(stops));
                self.map.insert(stops.into(), (id, self.epoch));
                id
            }
        }
    }

    pub fn data(&self) -> &[u32] {
        &self.data
    }
}

fn make_ramp<'a>(stops: &'a [ColorStop]) -> impl Iterator<Item = u32> + 'a {
    let mut last_u = 0.0;
    let mut last_c = ColorF64::from_color(stops[0].color);
    let mut this_u = last_u;
    let mut this_c = last_c;
    let mut j = 0;
    (0..N_SAMPLES).map(move |i| {
        let u = (i as f64) / (N_SAMPLES - 1) as f64;
        while u > this_u {
            last_u = this_u;
            last_c = this_c;
            if let Some(s) = stops.get(j + 1) {
                this_u = s.offset as f64;
                this_c = ColorF64::from_color(s.color);
                j += 1;
            } else {
                break;
            }
        }
        let du = this_u - last_u;
        let c = if du < 1e-9 {
            this_c
        } else {
            last_c.lerp(&this_c, (u - last_u) / du)
        };
        c.to_premul_u32()
    })
}

#[derive(Copy, Clone, Debug)]
struct ColorF64([f64; 4]);

impl ColorF64 {
    fn from_color(color: Color) -> Self {
        Self([
            color.r as f64 / 255.0,
            color.g as f64 / 255.0,
            color.b as f64 / 255.0,
            color.a as f64 / 255.0,
        ])
    }

    fn lerp(&self, other: &Self, a: f64) -> Self {
        fn l(x: f64, y: f64, a: f64) -> f64 {
            x * (1.0 - a) + y * a
        }
        Self([
            l(self.0[0], other.0[0], a),
            l(self.0[1], other.0[1], a),
            l(self.0[2], other.0[2], a),
            l(self.0[3], other.0[3], a),
        ])
    }

    fn to_premul_u32(&self) -> u32 {
        let a = self.0[3].min(1.0).max(0.0);
        let r = ((self.0[0] * a).min(1.0).max(0.0) * 255.0) as u32;
        let g = ((self.0[1] * a).min(1.0).max(0.0) * 255.0) as u32;
        let b = ((self.0[2] * a).min(1.0).max(0.0) * 255.0) as u32;
        let a = (a * 255.0) as u32;
        r | (g << 8) | (b << 16) | (a << 24)
    }
}

@@ -1,318 +0,0 @@
// Copyright 2022 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

use piet_gpu_hal::{CmdBuf, Error, Image, QueryPool, Semaphore, Session, SubmittedCmdBuf};
use piet_scene::Scene;

use crate::{MemoryHeader, Renderer, SceneStats};

/// Additional logic for sequencing rendering operations, specifically
/// for handling failure and reallocation.
///
/// It may be this shouldn't be a separate object from Renderer.
pub struct RenderDriver {
    frames: Vec<RenderFrame>,
    renderer: Renderer,
    buf_ix: usize,
    /// The index of a pending fine rasterization submission.
    pending: Option<usize>,
}

pub struct TargetState<'a> {
    pub cmd_buf: &'a mut CmdBuf,
    pub image: &'a Image,
}

#[derive(Default, Debug)]
pub struct TimingStats {
    coarse: Vec<f64>,
    fine: Vec<f64>,
}

struct RenderFrame {
    cmd_buf: CmdBufState,
    coarse_query_pool: QueryPool,
    fine_query_pool: QueryPool,
    timing_stats: TimingStats,
}
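
/// State machine for a per-frame command buffer: Start (not yet created) ->
/// Ready (recordable) -> Submitted (in flight), returning to Ready when the
/// wait on a submission completes.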
enum CmdBufState {
    Start,
    Submitted(SubmittedCmdBuf),
    Ready(CmdBuf),
}

impl RenderDriver {
    /// Create new render driver.
    ///
    /// Should probably be fallible.
    ///
    /// We can get n from the renderer as well.
    pub fn new(session: &Session, n: usize, renderer: Renderer) -> RenderDriver {
        let frames = (0..n)
            .map(|_| {
                // Maybe should allocate here so it doesn't happen on first frame?
                let cmd_buf = CmdBufState::default();
                let coarse_query_pool =
                    session.create_query_pool(Renderer::COARSE_QUERY_POOL_SIZE)?;
                let fine_query_pool = session.create_query_pool(Renderer::FINE_QUERY_POOL_SIZE)?;
                Ok(RenderFrame {
                    cmd_buf,
                    coarse_query_pool,
                    fine_query_pool,
                    timing_stats: TimingStats::default(),
                })
            })
            .collect::<Result<_, Error>>()
            .unwrap();
        RenderDriver {
            frames,
            renderer,
            buf_ix: 0,
            pending: None,
        }
    }

    pub fn upload_scene(&mut self, session: &Session, scene: &Scene) -> Result<(), Error> {
        let stats = SceneStats::from_scene(scene);
        self.ensure_scene_buffers(session, &stats)?;
        self.renderer.upload_scene(scene, self.buf_ix)
    }

    fn ensure_scene_buffers(&mut self, session: &Session, stats: &SceneStats) -> Result<(), Error> {
        let scene_size = stats.scene_size();
        unsafe {
            self.renderer
                .realloc_scene_if_needed(session, scene_size as u64, self.buf_ix)?;
        }
        let memory_size = self.renderer.memory_size(&stats);
        // TODO: better estimate of additional memory needed
        // Note: if we were to cover the worst-case binning output, we could make the
        // binning stage infallible and cut checking logic. It also may not be a bad
        // estimate for the rest.
        let estimated_needed = memory_size as u64 + (1 << 20);
        if estimated_needed > self.renderer.memory_buf_size() {
            if let Some(pending) = self.pending.take() {
                // There might be a fine rasterization task that binds the memory buffer
                // still in flight.
                self.frames[pending].cmd_buf.wait();
            }
            unsafe {
                self.renderer.realloc_memory(session, estimated_needed)?;
            }
        }
        Ok(())
    }

    /// Run one try of the coarse rendering pipeline.
    pub(crate) fn try_run_coarse(&mut self, session: &Session) -> Result<MemoryHeader, Error> {
        let frame = &mut self.frames[self.buf_ix];
        let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
        unsafe {
            cmd_buf.begin();
            // TODO: probably want to return query results as well
            self.renderer
                .record_coarse(cmd_buf, &frame.coarse_query_pool, self.buf_ix);
            self.renderer.record_readback(cmd_buf);
            let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
            cmd_buf.finish_timestamps(&frame.coarse_query_pool);
            cmd_buf.host_barrier();
            cmd_buf.finish();
            frame.cmd_buf.submit(session, &[], &[])?;
            frame.cmd_buf.wait();
            frame.timing_stats.coarse = session.fetch_query_pool(&frame.coarse_query_pool)?;
            let mut result = Vec::new();
            // TODO: consider read method for single POD value
            self.renderer.memory_buf_readback.read(&mut result)?;
            Ok(result[0])
        }
    }

    /// Run the coarse render pipeline, ensuring enough memory for intermediate buffers.
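    ///
    /// If the readback header reports an allocation failure, the memory buffer
    /// is grown, the config is re-uploaded, and the coarse pass is retried.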
pub fn run_coarse(&mut self, session: &Session) -> Result<(), Error> {
|
||||
loop {
|
||||
let mem_header = self.try_run_coarse(session)?;
|
||||
//println!("{:?}", mem_header);
|
||||
if mem_header.mem_error == 0 {
|
||||
let blend_needed = mem_header.blend_offset as u64;
|
||||
if blend_needed > self.renderer.blend_size() {
|
||||
unsafe {
|
||||
self.renderer.realloc_blend(session, blend_needed)?;
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
// Not enough memory, reallocate and retry.
|
||||
// TODO: be smarter (multiplier for early stages)
|
||||
let mem_size = mem_header.mem_offset + 4096;
|
||||
// Safety rationalization: no command buffers containing the buffer are
|
||||
// in flight.
|
||||
unsafe {
|
||||
self.renderer.realloc_memory(session, mem_size.into())?;
|
||||
self.renderer.upload_config(self.buf_ix)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Record the fine rasterizer, leaving the command buffer open.
|
||||
pub fn record_fine(&mut self, session: &Session) -> Result<TargetState, Error> {
|
||||
let frame = &mut self.frames[self.buf_ix];
|
||||
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
|
||||
unsafe {
|
||||
cmd_buf.begin();
|
||||
self.renderer
|
||||
.record_fine(cmd_buf, &frame.fine_query_pool, 0);
|
||||
}
|
||||
let image = &self.renderer.image_dev;
|
||||
Ok(TargetState { cmd_buf, image })
|
||||
}
|
||||
|
||||
/// Submit the current command buffer.
|
||||
pub fn submit(
|
||||
&mut self,
|
||||
session: &Session,
|
||||
wait_semaphores: &[&Semaphore],
|
||||
signal_semaphores: &[&Semaphore],
|
||||
) -> Result<(), Error> {
|
||||
let frame = &mut self.frames[self.buf_ix];
|
||||
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
|
||||
unsafe {
|
||||
cmd_buf.finish_timestamps(&frame.fine_query_pool);
|
||||
cmd_buf.host_barrier();
|
||||
cmd_buf.finish();
|
||||
frame
|
||||
.cmd_buf
|
||||
.submit(session, wait_semaphores, signal_semaphores)?
|
||||
}
|
||||
self.pending = Some(self.buf_ix);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
unsafe fn wait_frame(&mut self, session: &Session, buf_ix: usize) {
|
||||
let frame = &mut self.frames[buf_ix];
|
||||
frame.cmd_buf.wait();
|
||||
if let Ok(stats) = session.fetch_query_pool(&frame.fine_query_pool) {
|
||||
frame.timing_stats.fine = stats;
|
||||
}
|
||||
if self.pending == Some(buf_ix) {
|
||||
self.pending = None;
|
||||
}
|
||||
}
|
||||
|
||||
pub unsafe fn wait(&mut self, session: &Session) {
|
||||
self.wait_frame(session, self.buf_ix);
|
||||
}
|
||||
|
||||
/// Move to the next buffer.
|
||||
pub fn next_buffer(&mut self) {
|
||||
self.buf_ix = (self.buf_ix + 1) % self.frames.len()
|
||||
}
|
||||
|
||||
pub unsafe fn get_timing_stats(&mut self, session: &Session, buf_ix: usize) -> &TimingStats {
|
||||
self.wait_frame(session, buf_ix);
|
||||
&self.frames[buf_ix].timing_stats
|
||||
}
|
||||
|
||||
pub fn wait_all(&mut self, session: &Session) {
|
||||
for buf_ix in 0..self.frames.len() {
|
||||
unsafe {
|
||||
self.wait_frame(session, buf_ix);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CmdBufState {
|
||||
fn default() -> Self {
|
||||
CmdBufState::Start
|
||||
}
|
||||
}
|
||||
|
||||
impl CmdBufState {
|
||||
/// Get a command buffer suitable for recording.
|
||||
///
|
||||
/// If the command buffer is submitted, wait.
|
||||
fn cmd_buf(&mut self, session: &Session) -> Result<&mut CmdBuf, Error> {
|
||||
if let CmdBufState::Ready(cmd_buf) = self {
|
||||
return Ok(cmd_buf);
|
||||
}
|
||||
if let CmdBufState::Submitted(submitted) = std::mem::take(self) {
|
||||
if let Ok(Some(cmd_buf)) = submitted.wait() {
|
||||
*self = CmdBufState::Ready(cmd_buf);
|
||||
}
|
||||
}
|
||||
if matches!(self, CmdBufState::Start) {
|
||||
*self = CmdBufState::Ready(session.cmd_buf()?);
|
||||
}
|
||||
if let CmdBufState::Ready(cmd_buf) = self {
|
||||
Ok(cmd_buf)
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn submit(
|
||||
&mut self,
|
||||
session: &Session,
|
||||
wait_semaphores: &[&Semaphore],
|
||||
signal_semaphores: &[&Semaphore],
|
||||
) -> Result<(), Error> {
|
||||
if let CmdBufState::Ready(cmd_buf) = std::mem::take(self) {
|
||||
let submitted = session.run_cmd_buf(cmd_buf, wait_semaphores, signal_semaphores)?;
|
||||
*self = CmdBufState::Submitted(submitted);
|
||||
Ok(())
|
||||
} else {
|
||||
Err("Tried to submit CmdBufState not in ready state".into())
|
||||
}
|
||||
}
|
||||
|
||||
fn wait(&mut self) {
|
||||
if matches!(self, CmdBufState::Submitted(_)) {
|
||||
if let CmdBufState::Submitted(submitted) = std::mem::take(self) {
|
||||
if let Ok(Some(cmd_buf)) = submitted.wait() {
|
||||
*self = CmdBufState::Ready(cmd_buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}

impl TimingStats {
    pub fn print_summary(&self) {
        let ts = &self.coarse;
        println!("Element time: {:.3}ms", ts[0] * 1e3);
        println!("Clip + bin + tile time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
        println!("Coarse path time: {:.3}ms", (ts[4] - ts[2]) * 1e3);
        println!("Backdrop time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
        println!("Coarse raster kernel time: {:.3}ms", (ts[8] - ts[7]) * 1e3);
        println!("Fine kernel time: {:.3}ms", self.fine[0] * 1e3);
    }

    pub fn short_summary(&self) -> String {
        let ts = &self.coarse;
        let el = ts[0] * 1e3;
        let cl = (ts[2] - ts[1]) * 1e3;
        let cp = (ts[4] - ts[3]) * 1e3;
        let bd = (ts[6] - ts[5]) * 1e3;
        let cr = (ts[8] - ts[7]) * 1e3;
        let fr = self.fine[0] * 1e3;
        let total = el + cl + cp + bd + cr + fr;
        format!(
            "{:.3}ms :: el:{:.3}ms|cl:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|cr:{:.3}ms|fr:{:.3}ms",
            total, el, cl, cp, bd, cr, fr
        )
    }
}

@@ -1,355 +0,0 @@
use crate::PicoSvg;
use piet_scene::kurbo::{Affine, BezPath, Ellipse, PathEl, Point, Rect};
use piet_scene::*;

use crate::SimpleText;

pub fn render_funky_paths(sb: &mut SceneBuilder) {
    use PathEl::*;
    let missing_movetos = [
        LineTo((100.0, 100.0).into()),
        LineTo((100.0, 200.0).into()),
        ClosePath,
        LineTo((0.0, 400.0).into()),
        LineTo((100.0, 400.0).into()),
    ];
    let only_movetos = [MoveTo((0.0, 0.0).into()), MoveTo((100.0, 100.0).into())];
    let empty: [PathEl; 0] = [];
    sb.fill(
        Fill::NonZero,
        Affine::translate((100.0, 100.0)),
        Color::rgb8(0, 0, 255),
        None,
        &missing_movetos,
    );
    sb.fill(
        Fill::NonZero,
        Affine::IDENTITY,
        Color::rgb8(0, 0, 255),
        None,
        &empty,
    );
    sb.fill(
        Fill::NonZero,
        Affine::IDENTITY,
        Color::rgb8(0, 0, 255),
        None,
        &only_movetos,
    );
    sb.stroke(
        &Stroke::new(8.0),
        Affine::translate((100.0, 100.0)),
        Color::rgb8(0, 255, 255),
        None,
        &missing_movetos,
    );
}

#[allow(unused)]
const N_CIRCLES: usize = 0;

#[allow(unused)]
pub fn render_svg(sb: &mut SceneBuilder, svg: &PicoSvg, print_stats: bool) {
    use crate::pico_svg::*;
    let start = std::time::Instant::now();
    for item in &svg.items {
        match item {
            Item::Fill(fill) => {
                sb.fill(
                    Fill::NonZero,
                    Affine::IDENTITY,
                    fill.color,
                    None,
                    &fill.path,
                );
            }
            Item::Stroke(stroke) => {
                sb.stroke(
                    &Stroke::new(stroke.width as f32),
                    Affine::IDENTITY,
                    stroke.color,
                    None,
                    &stroke.path,
                );
            }
        }
    }
    if print_stats {
        println!("flattening and encoding time: {:?}", start.elapsed());
    }
}

#[allow(unused)]
pub fn render_tiger(sb: &mut SceneBuilder, print_stats: bool) {
    use super::pico_svg::*;
    let xml_str = std::str::from_utf8(include_bytes!(
        "../../piet-wgsl/examples/assets/Ghostscript_Tiger.svg"
    ))
    .unwrap();
    let start = std::time::Instant::now();
    let svg = PicoSvg::load(xml_str, 8.0).unwrap();
    if print_stats {
        println!("parsing time: {:?}", start.elapsed());
    }
    render_svg(sb, &svg, print_stats);
}

pub fn render_scene(sb: &mut SceneBuilder) {
    render_cardioid(sb);
    render_clip_test(sb);
    render_alpha_test(sb);
    //render_tiger(sb, false);
}

#[allow(unused)]
fn render_cardioid(sb: &mut SceneBuilder) {
    let n = 601;
    let dth = std::f64::consts::PI * 2.0 / (n as f64);
    let center = Point::new(1024.0, 768.0);
    let r = 750.0;
    let mut path = BezPath::new();
    for i in 1..n {
        let mut p0 = center;
        let a0 = i as f64 * dth;
        p0.x += a0.cos() * r;
        p0.y += a0.sin() * r;
        let mut p1 = center;
        let a1 = ((i * 2) % n) as f64 * dth;
        p1.x += a1.cos() * r;
        p1.y += a1.sin() * r;
        path.push(PathEl::MoveTo(p0));
        path.push(PathEl::LineTo(p1));
    }
    sb.stroke(
        &Stroke::new(2.0),
        Affine::IDENTITY,
        Color::rgb8(0, 0, 0),
        None,
        &path,
    );
}

#[allow(unused)]
fn render_clip_test(sb: &mut SceneBuilder) {
    const N: usize = 16;
    const X0: f64 = 50.0;
    const Y0: f64 = 450.0;
    // Note: if it gets much larger, it will exceed the 1MB scratch buffer.
    // But this is a pretty demanding test.
    const X1: f64 = 550.0;
    const Y1: f64 = 950.0;
    let step = 1.0 / ((N + 1) as f64);
    for i in 0..N {
        let t = ((i + 1) as f64) * step;
        let path = [
            PathEl::MoveTo((X0, Y0).into()),
            PathEl::LineTo((X1, Y0).into()),
            PathEl::LineTo((X1, Y0 + t * (Y1 - Y0)).into()),
            PathEl::LineTo((X1 + t * (X0 - X1), Y1).into()),
            PathEl::LineTo((X0, Y1).into()),
            PathEl::ClosePath,
        ];
        sb.push_layer(Mix::Clip, Affine::IDENTITY, &path);
    }
    let rect = Rect::new(X0, Y0, X1, Y1);
    sb.fill(
        Fill::NonZero,
        Affine::IDENTITY,
        &Brush::Solid(Color::rgb8(0, 0, 0)),
        None,
        &rect,
    );
    for _ in 0..N {
        sb.pop_layer();
    }
}

#[allow(unused)]
fn render_alpha_test(sb: &mut SceneBuilder) {
    // Alpha compositing tests.
    sb.fill(
        Fill::NonZero,
        Affine::IDENTITY,
        Color::rgb8(255, 0, 0),
        None,
        &make_diamond(1024.0, 100.0),
    );
    sb.fill(
        Fill::NonZero,
        Affine::IDENTITY,
        Color::rgba8(0, 255, 0, 0x80),
        None,
        &make_diamond(1024.0, 125.0),
    );
    sb.push_layer(Mix::Clip, Affine::IDENTITY, &make_diamond(1024.0, 150.0));
    sb.fill(
        Fill::NonZero,
        Affine::IDENTITY,
        Color::rgba8(0, 0, 255, 0x80),
        None,
        &make_diamond(1024.0, 175.0),
    );
    sb.pop_layer();
}

#[allow(unused)]
pub fn render_blend_grid(sb: &mut SceneBuilder) {
    const BLEND_MODES: &[Mix] = &[
        Mix::Normal,
        Mix::Multiply,
        Mix::Darken,
        Mix::Screen,
        Mix::Lighten,
        Mix::Overlay,
        Mix::ColorDodge,
        Mix::ColorBurn,
        Mix::HardLight,
        Mix::SoftLight,
        Mix::Difference,
        Mix::Exclusion,
        Mix::Hue,
        Mix::Saturation,
        Mix::Color,
        Mix::Luminosity,
    ];
    for (ix, &blend) in BLEND_MODES.iter().enumerate() {
        let i = ix % 4;
        let j = ix / 4;
        let transform = Affine::translate((i as f64 * 225., j as f64 * 225.));
        let square = blend_square(blend.into());
        sb.append(&square, Some(transform));
    }
}

#[allow(unused)]
fn render_blend_square(sb: &mut SceneBuilder, blend: BlendMode, transform: Affine) {
    // Inspired by https://developer.mozilla.org/en-US/docs/Web/CSS/mix-blend-mode
    let rect = Rect::from_origin_size(Point::new(0., 0.), (200., 200.));
    let linear = LinearGradient::new((0.0, 0.0), (200.0, 0.0)).stops([Color::BLACK, Color::WHITE]);
    sb.fill(Fill::NonZero, transform, &linear, None, &rect);
    const GRADIENTS: &[(f64, f64, Color)] = &[
        (150., 0., Color::rgb8(255, 240, 64)),
        (175., 100., Color::rgb8(255, 96, 240)),
        (125., 200., Color::rgb8(64, 192, 255)),
    ];
    for (x, y, c) in GRADIENTS {
        let mut color2 = c.clone();
        color2.a = 0;
        let radial = RadialGradient::new((*x, *y), 100.0).stops([*c, color2]);
        sb.fill(Fill::NonZero, transform, &radial, None, &rect);
    }
    const COLORS: &[Color] = &[
        Color::rgb8(255, 0, 0),
        Color::rgb8(0, 255, 0),
        Color::rgb8(0, 0, 255),
    ];
    sb.push_layer(Mix::Normal, transform, &rect);
    for (i, c) in COLORS.iter().enumerate() {
        let linear = LinearGradient::new((0.0, 0.0), (0.0, 200.0)).stops([Color::WHITE, *c]);
        sb.push_layer(blend, transform, &rect);
        // squash the ellipse
        let a = transform
            * Affine::translate((100., 100.))
            * Affine::rotate(std::f64::consts::FRAC_PI_3 * (i * 2 + 1) as f64)
            * Affine::scale_non_uniform(1.0, 0.357)
            * Affine::translate((-100., -100.));
        sb.fill(
            Fill::NonZero,
            a,
            &linear,
            None,
            &Ellipse::new((100., 100.), (90., 90.), 0.),
        );
        sb.pop_layer();
    }
    sb.pop_layer();
}

#[allow(unused)]
fn blend_square(blend: BlendMode) -> SceneFragment {
    let mut fragment = SceneFragment::default();
    let mut sb = SceneBuilder::for_fragment(&mut fragment);
    render_blend_square(&mut sb, blend, Affine::IDENTITY);
    sb.finish();
    fragment
}

#[allow(unused)]
pub fn render_anim_frame(sb: &mut SceneBuilder, text: &mut SimpleText, i: usize) {
    sb.fill(
        Fill::NonZero,
        Affine::IDENTITY,
        &Brush::Solid(Color::rgb8(128, 128, 128)),
        None,
        &Rect::from_origin_size(Point::new(0.0, 0.0), (1000.0, 1000.0)),
    );
    let text_size = 60.0 + 40.0 * (0.01 * i as f32).sin();
    let s = "\u{1f600}hello piet-gpu text!";
    text.add(
        sb,
        None,
        text_size,
        None,
        Affine::translate((110.0, 600.0)),
        s,
    );
    text.add(
        sb,
        None,
        text_size,
        None,
        Affine::translate((110.0, 700.0)),
        s,
    );
    let th = (std::f64::consts::PI / 180.0) * (i as f64);
    let center = Point::new(500.0, 500.0);
    let mut p1 = center;
    p1.x += 400.0 * th.cos();
    p1.y += 400.0 * th.sin();
    sb.stroke(
        &Stroke::new(5.0),
        Affine::IDENTITY,
        &Brush::Solid(Color::rgb8(128, 0, 0)),
        None,
        &&[PathEl::MoveTo(center), PathEl::LineTo(p1)][..],
    );
}

#[allow(unused)]
pub fn render_brush_transform(sb: &mut SceneBuilder, i: usize) {
    let th = (std::f64::consts::PI / 180.0) * (i as f64);
    let linear = LinearGradient::new((0.0, 0.0), (0.0, 200.0)).stops([
        Color::RED,
        Color::GREEN,
        Color::BLUE,
    ]);
    sb.fill(
        Fill::NonZero,
        Affine::translate((200.0, 200.0)),
        &linear,
        Some(around_center(Affine::rotate(th), Point::new(200.0, 100.0))),
        &Rect::from_origin_size(Point::default(), (400.0, 200.0)),
    );
    sb.stroke(
        &Stroke::new(40.0),
        Affine::translate((800.0, 200.0)),
        &linear,
        Some(around_center(Affine::rotate(th), Point::new(200.0, 100.0))),
        &Rect::from_origin_size(Point::default(), (400.0, 200.0)),
    );
}

fn around_center(xform: Affine, center: Point) -> Affine {
    Affine::translate(center.to_vec2()) * xform * Affine::translate(-center.to_vec2())
}

fn make_diamond(cx: f64, cy: f64) -> [PathEl; 5] {
    const SIZE: f64 = 50.0;
    [
        PathEl::MoveTo(Point::new(cx, cy - SIZE)),
        PathEl::LineTo(Point::new(cx + SIZE, cy)),
        PathEl::LineTo(Point::new(cx, cy + SIZE)),
        PathEl::LineTo(Point::new(cx - SIZE, cy)),
        PathEl::ClosePath,
    ]
}

@@ -1,82 +0,0 @@
// Copyright 2022 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

use piet_scene::glyph::{pinot, pinot::TableProvider, GlyphContext};
use piet_scene::kurbo::Affine;
use piet_scene::{Brush, SceneBuilder};

pub use pinot::FontRef;

// This is very much a hack to get things working.
// On Windows, can set this to "c:\\Windows\\Fonts\\seguiemj.ttf" to get color emoji
const FONT_DATA: &[u8] =
    include_bytes!("../../piet-wgsl/examples/assets/third-party/Roboto-Regular.ttf");

pub struct SimpleText {
    gcx: GlyphContext,
}

impl SimpleText {
    pub fn new() -> Self {
        Self {
            gcx: GlyphContext::new(),
        }
    }

    pub fn add(
        &mut self,
        builder: &mut SceneBuilder,
        font: Option<&FontRef>,
        size: f32,
        brush: Option<&Brush>,
        transform: Affine,
        text: &str,
    ) {
        let font = font.unwrap_or(&FontRef {
            data: FONT_DATA,
            offset: 0,
        });
        if let Some(cmap) = font.cmap() {
            if let Some(hmtx) = font.hmtx() {
                let upem = font.head().map(|head| head.units_per_em()).unwrap_or(1000) as f64;
                let scale = size as f64 / upem;
                let vars: [(pinot::types::Tag, f32); 0] = [];
                let mut provider = self.gcx.new_provider(font, None, size, false, vars);
                let hmetrics = hmtx.hmetrics();
                let default_advance = hmetrics
                    .get(hmetrics.len().saturating_sub(1))
                    .map(|h| h.advance_width)
                    .unwrap_or(0);
                let mut pen_x = 0f64;
                for ch in text.chars() {
                    let gid = cmap.map(ch as u32).unwrap_or(0);
                    let advance = hmetrics
                        .get(gid as usize)
                        .map(|h| h.advance_width)
                        .unwrap_or(default_advance) as f64
                        * scale;
                    if let Some(glyph) = provider.get(gid, brush) {
                        let xform = transform
                            * Affine::translate((pen_x, 0.0))
                            * Affine::scale_non_uniform(1.0, -1.0);
                        builder.append(&glyph, Some(xform));
                    }
                    pen_x += advance;
                }
            }
        }
    }
}
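
// --- Editor's illustrative sketch (not part of the original diff) ---
// Typical use of the API above: draw a line of text with the bundled default
// font at 32px, given an existing SceneBuilder.
#[allow(unused)]
fn hello(sb: &mut SceneBuilder, text: &mut SimpleText) {
    text.add(sb, None, 32.0, None, Affine::translate((10.0, 50.0)), "hello");
}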

@@ -1,160 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! Stages for new element pipeline, exposed for testing.

mod clip;
mod draw;
mod path;
mod transform;

use bytemuck::{Pod, Zeroable};

pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
use piet_gpu_hal::{Buffer, ComputePass, Session};
pub use transform::Transform;

/// The configuration block passed to piet-gpu shaders.
///
/// Note: this should be kept in sync with the version in setup.h.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, Zeroable, Pod)]
pub struct Config {
    pub mem_size: u32,
    pub n_elements: u32, // paths
    pub n_pathseg: u32,
    pub width_in_tiles: u32,
    pub height_in_tiles: u32,
    pub tile_alloc: u32,
    pub bin_alloc: u32,
    pub ptcl_alloc: u32,
    pub pathseg_alloc: u32,
    pub anno_alloc: u32,
    pub path_bbox_alloc: u32,
    pub drawmonoid_alloc: u32,
    pub clip_alloc: u32,
    pub clip_bic_alloc: u32,
    pub clip_stack_alloc: u32,
    pub clip_bbox_alloc: u32,
    pub draw_bbox_alloc: u32,
    pub drawinfo_alloc: u32,
    pub n_trans: u32,
    pub n_path: u32,
    pub n_clip: u32,
    pub trans_offset: u32,
    pub linewidth_offset: u32,
    pub pathtag_offset: u32,
    pub pathseg_offset: u32,
    pub drawtag_offset: u32,
    pub drawdata_offset: u32,
}
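
// --- Editor's illustrative sketch (not part of the original diff) ---
// Because `Config` derives `Zeroable` and `Pod`, the whole block can be
// reinterpreted as plain bytes for upload into the config buffer; no manual
// field-by-field serialization is needed. The helper name is hypothetical:
#[allow(unused)]
fn config_bytes(config: &Config) -> &[u8] {
    bytemuck::bytes_of(config)
}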

// The "element" stage combines a number of stages for parts of the pipeline.

pub struct ElementCode {
    path_code: PathCode,
    draw_code: DrawCode,
}

pub struct ElementStage {
    path_stage: PathStage,
    draw_stage: DrawStage,
}

pub struct ElementBinding {
    path_binding: PathBinding,
    draw_binding: DrawBinding,
}

impl ElementCode {
    pub unsafe fn new(session: &Session) -> ElementCode {
        ElementCode {
            path_code: PathCode::new(session),
            draw_code: DrawCode::new(session),
        }
    }
}

impl ElementStage {
    pub unsafe fn new(session: &Session, code: &ElementCode) -> ElementStage {
        ElementStage {
            path_stage: PathStage::new(session, &code.path_code),
            draw_stage: DrawStage::new(session, &code.draw_code),
        }
    }

    pub unsafe fn bind(
        &self,
        session: &Session,
        code: &ElementCode,
        config_buf: &Buffer,
        scene_buf: &Buffer,
        memory_buf: &Buffer,
    ) -> ElementBinding {
        ElementBinding {
            path_binding: self.path_stage.bind(
                session,
                &code.path_code,
                config_buf,
                scene_buf,
                memory_buf,
            ),
            draw_binding: self.draw_stage.bind(
                session,
                &code.draw_code,
                config_buf,
                scene_buf,
                memory_buf,
            ),
        }
    }

    pub unsafe fn record(
        &self,
        pass: &mut ComputePass,
        code: &ElementCode,
        binding: &ElementBinding,
        n_paths: u32,
        n_tags: u32,
        n_drawobj: u64,
    ) {
        // No memory barrier needed here; path has at least one before pathseg
        self.path_stage.record(
            pass,
            &code.path_code,
            &binding.path_binding,
            n_paths,
            n_tags,
        );
        // No memory barrier needed here; draw has at least one before draw_leaf
        self.draw_stage
            .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
    }
}

impl ElementBinding {
    pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
        self.path_binding.rebind_memory(session, memory);
        self.draw_binding.rebind_memory(session, memory);
    }

    pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
        self.path_binding.rebind_scene(session, scene);
        self.draw_binding.rebind_scene(session, scene);
    }
}

@@ -1,101 +0,0 @@
// Copyright 2022 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! The clip processing stage (includes substages).

use piet_gpu_hal::{
    include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session,
};

// Note that this isn't the code/stage/binding pattern of most of the other stages
// in the new element processing pipeline. We want to move those temporary buffers
// into common memory and converge on this pattern.
pub struct ClipCode {
    reduce_pipeline: Pipeline,
    leaf_pipeline: Pipeline,
}

pub struct ClipBinding {
    reduce_ds: DescriptorSet,
    leaf_ds: DescriptorSet,
}

pub const CLIP_PART_SIZE: u32 = 256;

impl ClipCode {
    pub unsafe fn new(session: &Session) -> ClipCode {
        let reduce_code = include_shader!(session, "../../shader/gen/clip_reduce");
        let reduce_pipeline = session
            .create_compute_pipeline(reduce_code, &[BindType::Buffer, BindType::BufReadOnly])
            .unwrap();
        let leaf_code = include_shader!(session, "../../shader/gen/clip_leaf");
        let leaf_pipeline = session
            .create_compute_pipeline(leaf_code, &[BindType::Buffer, BindType::BufReadOnly])
            .unwrap();
        ClipCode {
            reduce_pipeline,
            leaf_pipeline,
        }
    }
}

impl ClipBinding {
    pub unsafe fn new(
        session: &Session,
        code: &ClipCode,
        config: &Buffer,
        memory: &Buffer,
    ) -> ClipBinding {
        let reduce_ds = session
            .create_simple_descriptor_set(&code.reduce_pipeline, &[memory, config])
            .unwrap();
        let leaf_ds = session
            .create_simple_descriptor_set(&code.leaf_pipeline, &[memory, config])
            .unwrap();
        ClipBinding { reduce_ds, leaf_ds }
    }

    /// Record the clip dispatches.
    ///
    /// Assumes memory barrier on entry. Provides memory barrier on exit.
    pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
        let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
        if n_wg_reduce > 0 {
            pass.dispatch(
                &code.reduce_pipeline,
                &self.reduce_ds,
                (n_wg_reduce, 1, 1),
                (CLIP_PART_SIZE, 1, 1),
            );
            pass.memory_barrier();
        }
        let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
        if n_wg > 0 {
            pass.dispatch(
                &code.leaf_pipeline,
                &self.leaf_ds,
                (n_wg, 1, 1),
                (CLIP_PART_SIZE, 1, 1),
            );
            pass.memory_barrier();
        }
    }

    pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
        session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
        session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
    }
}
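
// --- Editor's note (not part of the original diff) ---
// The dispatch arithmetic above: `(n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE`
// is the usual ceiling division, while `n_clip.saturating_sub(1) / CLIP_PART_SIZE`
// equals that value minus one (for n_clip > 0). The reduce pass therefore runs
// over every partition except the last, apparently because the final
// partition's aggregate is never consumed by a later partition.
#[allow(unused)]
fn ceil_div(n: u32, part_size: u32) -> u32 {
    (n + part_size - 1) / part_size
}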

@@ -1,177 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! The draw object stage of the element processing pipeline.

use bytemuck::{Pod, Zeroable};

use piet_gpu_hal::{
    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};

/// The output element of the draw object stage.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Zeroable, Pod)]
pub struct DrawMonoid {
    pub path_ix: u32,
    pub clip_ix: u32,
    pub scene_offset: u32,
    pub info_offset: u32,
}

const DRAW_WG: u64 = 256;
const DRAW_N_ROWS: u64 = 8;
pub const DRAW_PART_SIZE: u64 = DRAW_WG * DRAW_N_ROWS;

pub struct DrawCode {
    reduce_pipeline: Pipeline,
    root_pipeline: Pipeline,
    leaf_pipeline: Pipeline,
}

pub struct DrawStage {
    // Right now we're limited to partition^2 (~16M) elements. This can be
    // expanded but is tedious.
    root_buf: Buffer,
    root_ds: DescriptorSet,
}

pub struct DrawBinding {
    reduce_ds: DescriptorSet,
    leaf_ds: DescriptorSet,
}

impl DrawCode {
    pub unsafe fn new(session: &Session) -> DrawCode {
        let reduce_code = include_shader!(session, "../../shader/gen/draw_reduce");
        let reduce_pipeline = session
            .create_compute_pipeline(
                reduce_code,
                &[
                    BindType::Buffer,
                    BindType::BufReadOnly,
                    BindType::BufReadOnly,
                    BindType::Buffer,
                ],
            )
            .unwrap();
        let root_code = include_shader!(session, "../../shader/gen/draw_root");
        let root_pipeline = session
            .create_compute_pipeline(root_code, &[BindType::Buffer])
            .unwrap();
        let leaf_code = include_shader!(session, "../../shader/gen/draw_leaf");
        let leaf_pipeline = session
            .create_compute_pipeline(
                leaf_code,
                &[
                    BindType::Buffer,
                    BindType::BufReadOnly,
                    BindType::BufReadOnly,
                    BindType::BufReadOnly,
                ],
            )
            .unwrap();
        DrawCode {
            reduce_pipeline,
            root_pipeline,
            leaf_pipeline,
        }
    }
}

impl DrawStage {
    pub unsafe fn new(session: &Session, code: &DrawCode) -> DrawStage {
        // We're limited to DRAW_PART_SIZE^2
        // Also note: size here allows padding
        let root_buf_size = DRAW_PART_SIZE * 16;
        let root_buf = session
            .create_buffer(root_buf_size, BufferUsage::STORAGE)
            .unwrap();
        let root_ds = session
            .create_simple_descriptor_set(&code.root_pipeline, &[&root_buf])
            .unwrap();
        DrawStage { root_buf, root_ds }
    }

    pub unsafe fn bind(
        &self,
        session: &Session,
        code: &DrawCode,
        config_buf: &Buffer,
        scene_buf: &Buffer,
        memory_buf: &Buffer,
    ) -> DrawBinding {
        let reduce_ds = session
            .create_simple_descriptor_set(
                &code.reduce_pipeline,
                &[memory_buf, config_buf, scene_buf, &self.root_buf],
            )
            .unwrap();
        let leaf_ds = session
            .create_simple_descriptor_set(
                &code.leaf_pipeline,
                &[memory_buf, config_buf, scene_buf, &self.root_buf],
            )
            .unwrap();
        DrawBinding { reduce_ds, leaf_ds }
    }

    pub unsafe fn record(
        &self,
        pass: &mut ComputePass,
        code: &DrawCode,
        binding: &DrawBinding,
        size: u64,
    ) {
        if size > DRAW_PART_SIZE.pow(2) {
            panic!("very large scan not yet implemented");
        }
        let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
        if n_workgroups > 1 {
            pass.dispatch(
                &code.reduce_pipeline,
                &binding.reduce_ds,
                (n_workgroups as u32, 1, 1),
                (DRAW_WG as u32, 1, 1),
            );
            pass.memory_barrier();
            pass.dispatch(
                &code.root_pipeline,
                &self.root_ds,
                (1, 1, 1),
                (DRAW_WG as u32, 1, 1),
            );
        }
        pass.memory_barrier();
        pass.dispatch(
            &code.leaf_pipeline,
            &binding.leaf_ds,
            (n_workgroups as u32, 1, 1),
            (DRAW_WG as u32, 1, 1),
        );
    }
}

impl DrawBinding {
    pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
        session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
        session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
    }

    pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
        session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene);
        session.update_buffer_descriptor(&mut self.leaf_ds, 2, scene);
    }
}
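
// --- Editor's note (not part of the original diff) ---
// `DrawStage::record` above is a classic two-level scan: a per-partition
// `reduce` dispatch writes partial monoids into `root_buf`, a single-workgroup
// `root` dispatch scans those partials, and `leaf` combines each partition's
// local scan with its root prefix. When everything fits in one partition
// (n_workgroups == 1) the first two dispatches are skipped, and the hard limit
// of DRAW_PART_SIZE^2 elements follows from the root pass being a single
// workgroup over at most DRAW_PART_SIZE partials.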

@@ -1,356 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! The path stage (includes substages).

use piet_gpu_hal::{
    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};

pub struct PathCode {
    reduce_pipeline: Pipeline,
    tag_root_pipeline: Pipeline,
    clear_pipeline: Pipeline,
    pathseg_pipeline: Pipeline,
}

pub struct PathStage {
    tag_root_buf: Buffer,
    tag_root_ds: DescriptorSet,
}

pub struct PathBinding {
    reduce_ds: DescriptorSet,
    clear_ds: DescriptorSet,
    path_ds: DescriptorSet,
}

const REDUCE_WG: u32 = 128;
const REDUCE_N_ROWS: u32 = 2;
const REDUCE_PART_SIZE: u32 = REDUCE_WG * REDUCE_N_ROWS;

const ROOT_WG: u32 = 256;
const ROOT_N_ROWS: u32 = 8;
const ROOT_PART_SIZE: u32 = ROOT_WG * ROOT_N_ROWS;

const SCAN_WG: u32 = 256;
const SCAN_N_ROWS: u32 = 4;
const SCAN_PART_SIZE: u32 = SCAN_WG * SCAN_N_ROWS;

pub const PATHSEG_PART_SIZE: u32 = SCAN_PART_SIZE;

const CLEAR_WG: u32 = 256;

impl PathCode {
    pub unsafe fn new(session: &Session) -> PathCode {
        let reduce_code = include_shader!(session, "../../shader/gen/pathtag_reduce");
        let reduce_pipeline = session
            .create_compute_pipeline(
                reduce_code,
                &[
                    BindType::Buffer,
                    BindType::BufReadOnly,
                    BindType::BufReadOnly,
                    BindType::Buffer,
                ],
            )
            .unwrap();
        let tag_root_code = include_shader!(session, "../../shader/gen/pathtag_root");
        let tag_root_pipeline = session
            .create_compute_pipeline(tag_root_code, &[BindType::Buffer])
            .unwrap();
        let clear_code = include_shader!(session, "../../shader/gen/bbox_clear");
        let clear_pipeline = session
            .create_compute_pipeline(clear_code, &[BindType::Buffer, BindType::BufReadOnly])
            .unwrap();
        let pathseg_code = include_shader!(session, "../../shader/gen/pathseg");
        let pathseg_pipeline = session
            .create_compute_pipeline(
                pathseg_code,
                &[
                    BindType::Buffer,
                    BindType::BufReadOnly,
                    BindType::BufReadOnly,
                    BindType::BufReadOnly,
                ],
            )
            .unwrap();
        PathCode {
            reduce_pipeline,
            tag_root_pipeline,
            clear_pipeline,
            pathseg_pipeline,
        }
    }
}

impl PathStage {
    pub unsafe fn new(session: &Session, code: &PathCode) -> PathStage {
        let tag_root_buf_size = (ROOT_PART_SIZE * 20) as u64;
        let tag_root_buf = session
            .create_buffer(tag_root_buf_size, BufferUsage::STORAGE)
            .unwrap();
        let tag_root_ds = session
            .create_simple_descriptor_set(&code.tag_root_pipeline, &[&tag_root_buf])
            .unwrap();
        PathStage {
            tag_root_buf,
            tag_root_ds,
        }
    }

    pub unsafe fn bind(
        &self,
        session: &Session,
        code: &PathCode,
        config_buf: &Buffer,
        scene_buf: &Buffer,
        memory_buf: &Buffer,
    ) -> PathBinding {
        let reduce_ds = session
            .create_simple_descriptor_set(
                &code.reduce_pipeline,
                &[memory_buf, config_buf, scene_buf, &self.tag_root_buf],
            )
            .unwrap();
        let clear_ds = session
            .create_simple_descriptor_set(&code.clear_pipeline, &[memory_buf, config_buf])
            .unwrap();
        let path_ds = session
            .create_simple_descriptor_set(
                &code.pathseg_pipeline,
                &[memory_buf, config_buf, scene_buf, &self.tag_root_buf],
            )
            .unwrap();
        PathBinding {
            reduce_ds,
            clear_ds,
            path_ds,
        }
    }

    /// Record the path stage.
    ///
    /// Note: no barrier is needed for transform output; we have a barrier before
    /// those are consumed. Result is written without barrier.
    pub unsafe fn record(
        &self,
        pass: &mut ComputePass,
        code: &PathCode,
        binding: &PathBinding,
        n_paths: u32,
        n_tags: u32,
    ) {
        if n_tags > ROOT_PART_SIZE * SCAN_PART_SIZE {
            println!(
                "number of pathsegs exceeded {} > {}",
                n_tags,
                ROOT_PART_SIZE * SCAN_PART_SIZE
            );
        }

        // Number of tags consumed in a tag reduce workgroup
        let reduce_part_tags = REDUCE_PART_SIZE * 4;
        let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
        if n_wg_tag_reduce > 1 {
            pass.dispatch(
                &code.reduce_pipeline,
                &binding.reduce_ds,
                (n_wg_tag_reduce, 1, 1),
                (REDUCE_WG, 1, 1),
            );
            // I think we can skip root if n_wg_tag_reduce == 2
            pass.memory_barrier();
            pass.dispatch(
                &code.tag_root_pipeline,
                &self.tag_root_ds,
                (1, 1, 1),
                (ROOT_WG, 1, 1),
            );
            // No barrier needed here; clear doesn't depend on path tags
        }
        let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
        pass.dispatch(
            &code.clear_pipeline,
            &binding.clear_ds,
            (n_wg_clear, 1, 1),
            (CLEAR_WG, 1, 1),
        );
        pass.memory_barrier();
        let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
        pass.dispatch(
            &code.pathseg_pipeline,
            &binding.path_ds,
            (n_wg_pathseg, 1, 1),
            (SCAN_WG, 1, 1),
        );
    }
}

impl PathBinding {
    pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
        session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
        session.update_buffer_descriptor(&mut self.clear_ds, 0, memory);
        session.update_buffer_descriptor(&mut self.path_ds, 0, memory);
    }

    pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
        session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene);
        session.update_buffer_descriptor(&mut self.path_ds, 2, scene);
    }
}

pub struct PathEncoder<'a> {
    tag_stream: &'a mut Vec<u8>,
    // If we're never going to use the i16 encoding, it might be
    // slightly faster to store this as Vec<u32>, we'd get aligned
    // stores on ARM etc.
    pathseg_stream: &'a mut Vec<u8>,
    first_pt: [f32; 2],
    state: State,
    n_pathseg: u32,
}

#[derive(PartialEq)]
enum State {
    Start,
    MoveTo,
    NonemptySubpath,
}

impl<'a> PathEncoder<'a> {
    pub fn new(tags: &'a mut Vec<u8>, pathsegs: &'a mut Vec<u8>) -> PathEncoder<'a> {
        PathEncoder {
            tag_stream: tags,
            pathseg_stream: pathsegs,
            first_pt: [0.0, 0.0],
            state: State::Start,
            n_pathseg: 0,
        }
    }

    pub fn move_to(&mut self, x: f32, y: f32) {
        let buf = [x, y];
        let bytes = bytemuck::bytes_of(&buf);
        self.first_pt = buf;
        if self.state == State::MoveTo {
            let new_len = self.pathseg_stream.len() - 8;
            self.pathseg_stream.truncate(new_len);
        }
        if self.state == State::NonemptySubpath {
            if let Some(tag) = self.tag_stream.last_mut() {
                *tag |= 4;
            }
        }
        self.pathseg_stream.extend_from_slice(bytes);
        self.state = State::MoveTo;
    }

    pub fn line_to(&mut self, x: f32, y: f32) {
        if self.state == State::Start {
            // should warn or error
            return;
        }
        let buf = [x, y];
        let bytes = bytemuck::bytes_of(&buf);
        self.pathseg_stream.extend_from_slice(bytes);
        self.tag_stream.push(9);
        self.state = State::NonemptySubpath;
        self.n_pathseg += 1;
    }

    pub fn quad_to(&mut self, x1: f32, y1: f32, x2: f32, y2: f32) {
        if self.state == State::Start {
            return;
        }
        let buf = [x1, y1, x2, y2];
        let bytes = bytemuck::bytes_of(&buf);
        self.pathseg_stream.extend_from_slice(bytes);
        self.tag_stream.push(10);
        self.state = State::NonemptySubpath;
        self.n_pathseg += 1;
    }

    pub fn cubic_to(&mut self, x1: f32, y1: f32, x2: f32, y2: f32, x3: f32, y3: f32) {
        if self.state == State::Start {
            return;
        }
        let buf = [x1, y1, x2, y2, x3, y3];
        let bytes = bytemuck::bytes_of(&buf);
        self.pathseg_stream.extend_from_slice(bytes);
        self.tag_stream.push(11);
        self.state = State::NonemptySubpath;
        self.n_pathseg += 1;
    }

    pub fn close_path(&mut self) {
        match self.state {
            State::Start => return,
            State::MoveTo => {
                let new_len = self.pathseg_stream.len() - 8;
                self.pathseg_stream.truncate(new_len);
                self.state = State::Start;
                return;
            }
            State::NonemptySubpath => (),
        }
        let len = self.pathseg_stream.len();
        if len < 8 {
            // can't happen
            return;
        }
        let first_bytes = bytemuck::bytes_of(&self.first_pt);
        if &self.pathseg_stream[len - 8..len] != first_bytes {
            self.pathseg_stream.extend_from_slice(first_bytes);
            self.tag_stream.push(13);
            self.n_pathseg += 1;
        } else if let Some(tag) = self.tag_stream.last_mut() {
            *tag |= 4;
        }
        self.state = State::Start;
    }

    fn finish(&mut self) {
        if self.state == State::MoveTo {
            let new_len = self.pathseg_stream.len() - 8;
            self.pathseg_stream.truncate(new_len);
        }
        if let Some(tag) = self.tag_stream.last_mut() {
            *tag |= 4;
        }
    }

    /// Finish encoding a path.
    ///
    /// Encode this after encoding path segments.
    pub fn path(&mut self) {
        self.finish();
        // maybe don't encode if path is empty? might throw off sync though
        self.tag_stream.push(0x10);
    }

    /// Get the number of path segments.
    ///
    /// This is the number of path segments that will be written by the
    /// path stage; use this for allocating the output buffer.
    ///
    /// Also note: it takes `self` for lifetime reasons.
    pub fn n_pathseg(self) -> u32 {
        self.n_pathseg
    }
}
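
// --- Editor's illustrative sketch (not part of the original diff) ---
// Typical use of `PathEncoder`: encode the segments, terminate with `path()`,
// then read back the segment count for allocating the output buffer.
#[allow(unused)]
fn encode_triangle(tags: &mut Vec<u8>, pathsegs: &mut Vec<u8>) -> u32 {
    let mut encoder = PathEncoder::new(tags, pathsegs);
    encoder.move_to(0.0, 0.0);
    encoder.line_to(100.0, 0.0); // tag 9: line segment
    encoder.line_to(100.0, 100.0);
    encoder.close_path(); // synthesizes a closing line back to (0.0, 0.0), tag 13
    encoder.path(); // end-of-path tag (0x10)
    encoder.n_pathseg() // 3: two lines plus the synthesized close
}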

@@ -1,36 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! The transform stage of the element processing pipeline.

use bytemuck::{Pod, Zeroable};

/// An affine transform.
// This is equivalent to the version in piet-gpu-types, but the bytemuck
// representation will likely be faster.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
pub struct Transform {
    pub mat: [f32; 4],
    pub translate: [f32; 2],
}

impl Transform {
    pub const IDENTITY: Transform = Transform {
        mat: [1.0, 0.0, 0.0, 1.0],
        translate: [0.0, 0.0],
    };
}
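
// --- Editor's illustrative sketch (not part of the original diff) ---
// Assuming the same column-major 2x2 convention as kurbo::Affine (consistent
// with the [1, 0, 0, 1] identity above), a point maps as mat * [x, y] + translate:
#[allow(unused)]
fn apply(t: &Transform, p: [f32; 2]) -> [f32; 2] {
    [
        t.mat[0] * p[0] + t.mat[2] * p[1] + t.translate[0],
        t.mat[1] * p[0] + t.mat[3] * p[1] + t.translate[1],
    ]
}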

@@ -2,6 +2,7 @@
name = "winit"
version = "0.1.0"
edition = "2021"
publish = false

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

@@ -1,23 +0,0 @@
[package]
name = "piet-gpu-tests"
version = "0.1.0"
authors = ["Raph Levien <raph.levien@gmail.com>"]
description = "Tests for piet-gpu shaders and generic GPU capabilities."
license = "MIT/Apache-2.0"
edition = "2021"

[features]
default = ["piet-gpu"]

[dependencies]
clap = "3.2.22"
bytemuck = "1.7.2"
kurbo = "0.7.1"
rand = "0.7.3"

[dependencies.piet-gpu-hal]
path = "../piet-gpu-hal"

[dependencies.piet-gpu]
path = "../piet-gpu"
optional = true

@@ -1,47 +0,0 @@
# piet-gpu-tests

This subdirectory contains a curated set of tests for GPU issues likely to affect piet-gpu compatibility or performance. To run, cd to the tests directory and do `cargo run --release`. There are a number of additional options, including:

* `--dx12` Prefer the DX12 backend on Windows.
* `--size {s,m,l}` Size of test to run.
* `--n_iter n` Number of iterations.
* `--verbose` Verbose output.

As usual, run `cargo run -- -h` for the current list.

Below is a description of the individual tests.

## clear buffers

This is as simple as it sounds: it uses a compute shader to clear buffers. It's run first as a warmup, and is a simple test of raw memory bandwidth (reported as 4-byte elements/s).

## Prefix sum tests

There are several variations of the prefix sum test: first the [decoupled look-back] variant, then a more conservative tree reduction version. The decoupled look-back implementation exercises advanced atomic features and depends on their correctness, including atomic coherence and correct scope of memory barriers.
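
For reference, every variant computes the same result as this sequential inclusive scan (a sketch added here for illustration; it is not part of the test code):

```rust
fn inclusive_scan(xs: &mut [u32]) {
    let mut acc = 0u32;
    for x in xs.iter_mut() {
        acc = acc.wrapping_add(*x);
        *x = acc; // each element becomes the sum of itself and everything before it
    }
}
```

The GPU variants differ only in how they parallelize this inherently sequential dependency.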

None of the decoupled look-back tests are expected to pass on Metal, as that back-end lacks the appropriate barrier; the spirv-cross translation silently translates the GLSL version to a weaker one. All tests are expected to pass on both Vulkan and DX12.

The compatibility variant does all manipulation of the state buffer using non-atomic operations, with the buffer marked "volatile" and barriers to ensure acquire/release ordering.

The atomic variant is similar, but uses atomicLoad and atomicStore (from the [memory scope semantics] extension to GLSL).

Finally, the vkmm (Vulkan memory model) variant uses explicit acquire and release semantics on the atomics instead of barriers, and only runs when the device reports that the memory model extension is available.

The tree reduction version of this test does not rely on advanced atomics and can be considered a baseline for both correctness and performance. The current implementation lacks configuration settings to handle odd-size buffers. On well-tuned hardware, the decoupled look-back implementation is expected to be 1.5x faster.

Note that the workgroup sizes and sequential iteration count parameters are hard-coded (and tuned for a desktop card I had handy). A useful future extension of this test suite would be iteration over several combinations of those parameters. (The main reason this is not done yet is that it would put a lot of strain on the shader build pipeline; at the moment, hand-editing the ninja file is adequate.)

## Atomic tests

Decoupled look-back relies on the atomic message passing idiom; these tests exercise that in isolation.

The message passing tests run a large number of the basic message passing operations in parallel, and the "special sauce" is that the memory locations for both flags and data are permuted. That seems to do a much better job of finding violations than existing versions of the test; the idiom itself is sketched below.
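
On the CPU, the idiom looks like this (a Rust sketch for illustration; the GPU tests do the analogous thing in GLSL, with the added twist of permuted locations):

```rust
use std::sync::atomic::{AtomicU32, Ordering};

static DATA: AtomicU32 = AtomicU32::new(0);
static FLAG: AtomicU32 = AtomicU32::new(0);

// One thread publishes the data, then raises the flag with release semantics.
fn producer() {
    DATA.store(42, Ordering::Relaxed);
    FLAG.store(1, Ordering::Release);
}

// Another thread observes the flag with acquire semantics; if it sees the
// flag set, the data write is guaranteed to be visible.
fn consumer() -> Option<u32> {
    if FLAG.load(Ordering::Acquire) == 1 {
        Some(DATA.load(Ordering::Relaxed))
    } else {
        None
    }
}
```

A violation is a consumer that sees the flag set but reads stale data; that is exactly what the test counts.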

The linked list test is mostly a bandwidth test of atomicExchange, and is a simplified version of what the coarse path rasterizer does in piet-gpu to build per-tile lists of path segments. The verification of the resulting lists is also a pretty good test of device-scoped modification order (not that this is likely to fail). The core per-element operation is sketched below.
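
Each thread prepends its element to a shared list head with a single atomic exchange (again a Rust sketch with hypothetical names, not the test's GLSL):

```rust
use std::sync::atomic::{AtomicU32, Ordering};

// Empty-list sentinel; heads are initialized to NIL before the test runs.
const NIL: u32 = u32::MAX;

/// Prepend `elem_ix` to the list rooted at `head`; the returned old head
/// (possibly NIL) becomes the new element's `next` link.
fn list_push(head: &AtomicU32, elem_ix: u32) -> u32 {
    head.swap(elem_ix, Ordering::AcqRel)
}
```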

## More tests

I'll be adding more tests specific to piet-gpu. I'm also open to tests being added here; feel free to file an issue.

[decoupled look-back]: https://raphlinus.github.io/gpu/2020/04/30/prefix-sum.html
[memory scope semantics]: https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_memory_scope_semantics.txt

@@ -1,72 +0,0 @@
# Build file for shaders.

# You must have Vulkan tools in your path, or patch here.

glslang_validator = glslangValidator
spirv_cross = spirv-cross
dxc = dxc

# See https://github.com/KhronosGroup/SPIRV-Cross/issues/1248 for
# why we set this.
msl_flags = --msl-decoration-binding

rule glsl
  command = $glslang_validator $flags -V -o $out $in

rule hlsl
  command = $spirv_cross --hlsl $in --output $out

rule dxil
  command = $dxc -T cs_6_0 $in -Fo $out

rule msl
  command = $spirv_cross --msl $in --output $out $msl_flags

build gen/clear.spv: glsl clear.comp
build gen/clear.hlsl: hlsl gen/clear.spv
build gen/clear.dxil: dxil gen/clear.hlsl
build gen/clear.msl: msl gen/clear.spv

build gen/prefix.spv: glsl prefix.comp
build gen/prefix.hlsl: hlsl gen/prefix.spv
build gen/prefix.dxil: dxil gen/prefix.hlsl
build gen/prefix.msl: msl gen/prefix.spv

build gen/prefix_atomic.spv: glsl prefix.comp
  flags = -DATOMIC
build gen/prefix_atomic.hlsl: hlsl gen/prefix_atomic.spv
build gen/prefix_atomic.dxil: dxil gen/prefix_atomic.hlsl
build gen/prefix_atomic.msl: msl gen/prefix_atomic.spv

build gen/prefix_vkmm.spv: glsl prefix.comp
  flags = -DATOMIC -DVKMM
# Vulkan memory model doesn't translate

build gen/prefix_reduce.spv: glsl prefix_reduce.comp
build gen/prefix_reduce.hlsl: hlsl gen/prefix_reduce.spv
build gen/prefix_reduce.dxil: dxil gen/prefix_reduce.hlsl
build gen/prefix_reduce.msl: msl gen/prefix_reduce.spv

build gen/prefix_root.spv: glsl prefix_scan.comp
  flags = -DROOT
build gen/prefix_root.hlsl: hlsl gen/prefix_root.spv
build gen/prefix_root.dxil: dxil gen/prefix_root.hlsl
build gen/prefix_root.msl: msl gen/prefix_root.spv

build gen/prefix_scan.spv: glsl prefix_scan.comp
build gen/prefix_scan.hlsl: hlsl gen/prefix_scan.spv
build gen/prefix_scan.dxil: dxil gen/prefix_scan.hlsl
build gen/prefix_scan.msl: msl gen/prefix_scan.spv

build gen/message_passing.spv: glsl message_passing.comp
build gen/message_passing.hlsl: hlsl gen/message_passing.spv
build gen/message_passing.dxil: dxil gen/message_passing.hlsl
build gen/message_passing.msl: msl gen/message_passing.spv

build gen/message_passing_vkmm.spv: glsl message_passing.comp
  flags = -DVKMM

build gen/linkedlist.spv: glsl linkedlist.comp
build gen/linkedlist.hlsl: hlsl gen/linkedlist.spv
build gen/linkedlist.dxil: dxil gen/linkedlist.hlsl
build gen/linkedlist.msl: msl gen/linkedlist.spv

@@ -1,26 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Clear a buffer.

#version 450

layout(local_size_x = 256) in;

// This should probably be uniform rather than readonly,
// but we haven't done the binding work yet.
layout(binding = 0) readonly buffer ConfigBuf {
    // size is in uint (4 byte) units
    uint size;
    uint value;
};

layout(binding = 1) buffer TargetBuf {
    uint[] data;
};

void main() {
    uint ix = gl_GlobalInvocationID.x;
    if (ix < size) {
        data[ix] = value;
    }
}

Some files were not shown because too many files have changed in this diff.