diff --git a/piet-gpu-derive/src/glsl.rs b/piet-gpu-derive/src/glsl.rs index 2409637..ce862b0 100644 --- a/piet-gpu-derive/src/glsl.rs +++ b/piet-gpu-derive/src/glsl.rs @@ -31,22 +31,18 @@ pub fn gen_glsl(module: &LayoutModule) -> String { for name in &module.def_names { let def = module.defs.get(name).unwrap(); - let mem = &"memory".to_owned(); - let mut buf_name = &module.name; - if !module.name.eq(&"state") && !module.name.eq(&"scene") { - buf_name = mem; - } + let is_mem = !module.name.eq(&"state") && !module.name.eq(&"scene"); match def { (_size, LayoutTypeDef::Struct(fields)) => { - gen_struct_read(&mut r, buf_name, &name, fields); + gen_struct_read(&mut r, &module.name, &name, is_mem, fields); if module.gpu_write { - gen_struct_write(&mut r, buf_name, &name, fields); + gen_struct_write(&mut r, &module.name, &name, is_mem, fields); } } (_size, LayoutTypeDef::Enum(en)) => { - gen_enum_read(&mut r, buf_name, &name, en); + gen_enum_read(&mut r, &module.name, &name, is_mem, en); if module.gpu_write { - gen_enum_write(&mut r, buf_name, &name, en); + gen_enum_write(&mut r, &module.name, &name, is_mem, en); } } } @@ -96,14 +92,23 @@ fn gen_struct_read( r: &mut String, bufname: &str, name: &str, + is_mem: bool, fields: &[(String, usize, LayoutType)], ) { - writeln!(r, "{} {}_read({}Ref ref) {{", name, name, name).unwrap(); + write!(r, "{} {}_read(", name, name).unwrap(); + if is_mem { + write!(r, "Alloc a, ").unwrap(); + } + writeln!(r, "{}Ref ref) {{", name).unwrap(); writeln!(r, " uint ix = ref.offset >> 2;").unwrap(); let coverage = crate::layout::struct_coverage(fields, false); for (i, fields) in coverage.iter().enumerate() { if !fields.is_empty() { - writeln!(r, " uint raw{} = {}[ix + {}];", i, bufname, i).unwrap(); + if is_mem { + writeln!(r, " uint raw{} = read_mem(a, ix + {});", i, i).unwrap(); + } else { + writeln!(r, " uint raw{} = {}[ix + {}];", i, bufname, i).unwrap(); + } } } writeln!(r, " {} s;", name).unwrap(); @@ -130,26 +135,47 @@ fn gen_enum_read( r: &mut String, bufname: &str, name: &str, + is_mem: bool, variants: &[(String, Vec<(usize, LayoutType)>)], ) { - writeln!(r, "uint {}_tag({}Ref ref) {{", name, name).unwrap(); - writeln!(r, " return {}[ref.offset >> 2];", bufname).unwrap(); + if is_mem { + writeln!(r, "uint {}_tag(Alloc a, {}Ref ref) {{", name, name).unwrap(); + writeln!(r, " return read_mem(a, ref.offset >> 2);").unwrap(); + } else { + writeln!(r, "uint {}_tag({}Ref ref) {{", name, name).unwrap(); + writeln!(r, " return {}[ref.offset >> 2];", bufname).unwrap(); + } writeln!(r, "}}\n").unwrap(); for (var_name, payload) in variants { if payload.len() == 1 { if let GpuType::InlineStruct(structname) = &payload[0].1.ty { - writeln!( - r, - "{} {}_{}_read({}Ref ref) {{", - structname, name, var_name, name - ) - .unwrap(); - writeln!( - r, - " return {}_read({}Ref(ref.offset + {}));", - structname, structname, payload[0].0 - ) - .unwrap(); + if is_mem { + writeln!( + r, + "{} {}_{}_read(Alloc a, {}Ref ref) {{", + structname, name, var_name, name + ) + .unwrap(); + writeln!( + r, + " return {}_read(a, {}Ref(ref.offset + {}));", + structname, structname, payload[0].0 + ) + .unwrap(); + } else { + writeln!( + r, + "{} {}_{}_read({}Ref ref) {{", + structname, name, var_name, name + ) + .unwrap(); + writeln!( + r, + " return {}_read({}Ref(ref.offset + {}));", + structname, structname, payload[0].0 + ) + .unwrap(); + } writeln!(r, "}}\n").unwrap(); } } @@ -303,9 +329,14 @@ fn gen_struct_write( r: &mut String, bufname: &str, name: &str, + is_mem: bool, fields: &[(String, usize, LayoutType)], ) { - writeln!(r, "void {}_write({}Ref ref, {} s) {{", name, name, name).unwrap(); + write!(r, "void {}_write(", name).unwrap(); + if is_mem { + write!(r, "Alloc a, ").unwrap(); + } + writeln!(r, "{}Ref ref, {} s) {{", name, name).unwrap(); writeln!(r, " uint ix = ref.offset >> 2;").unwrap(); let coverage = crate::layout::struct_coverage(fields, true); @@ -381,13 +412,20 @@ fn gen_struct_write( } if !pieces.is_empty() { - write!(r, " {}[ix + {}] = ", bufname, i).unwrap(); + if is_mem { + write!(r, " write_mem(a, ix + {}, ", i).unwrap(); + } else { + write!(r, " {}[ix + {}] = ", bufname, i).unwrap(); + } for (j, piece) in pieces.iter().enumerate() { if j != 0 { write!(r, " | ").unwrap(); } write!(r, "{}", piece).unwrap(); } + if is_mem { + write!(r, ")").unwrap(); + } writeln!(r, ";").unwrap(); } } @@ -429,38 +467,70 @@ fn gen_enum_write( r: &mut String, bufname: &str, name: &str, + is_mem: bool, variants: &[(String, Vec<(usize, LayoutType)>)], ) { for (var_name, payload) in variants { if payload.is_empty() { - writeln!(r, "void {}_{}_write({}Ref ref) {{", name, var_name, name).unwrap(); - writeln!( - r, - " {}[ref.offset >> 2] = {}_{};", - bufname, name, var_name - ) - .unwrap(); - writeln!(r, "}}\n").unwrap(); - } else if payload.len() == 1 { - if let GpuType::InlineStruct(structname) = &payload[0].1.ty { + if is_mem { + writeln!(r, "void {}_{}_write(Alloc a, {}Ref ref) {{", name, var_name, name).unwrap(); writeln!( r, - "void {}_{}_write({}Ref ref, {} s) {{", - name, var_name, name, structname + " write_mem(a, ref.offset >> 2, {}_{});", + name, var_name ) .unwrap(); + } else { + writeln!(r, "void {}_{}_write({}Ref ref) {{", name, var_name, name).unwrap(); writeln!( r, " {}[ref.offset >> 2] = {}_{};", bufname, name, var_name ) .unwrap(); - writeln!( - r, - " {}_write({}Ref(ref.offset + {}), s);", - structname, structname, payload[0].0 - ) - .unwrap(); + } + writeln!(r, "}}\n").unwrap(); + } else if payload.len() == 1 { + if let GpuType::InlineStruct(structname) = &payload[0].1.ty { + if is_mem { + writeln!( + r, + "void {}_{}_write(Alloc a, {}Ref ref, {} s) {{", + name, var_name, name, structname + ) + .unwrap(); + writeln!( + r, + " write_mem(a, ref.offset >> 2, {}_{});", + name, var_name + ) + .unwrap(); + writeln!( + r, + " {}_write(a, {}Ref(ref.offset + {}), s);", + structname, structname, payload[0].0 + ) + .unwrap(); + } else { + writeln!( + r, + "void {}_{}_write(Alloc a, {}Ref ref, {} s) {{", + name, var_name, name, structname + ) + .unwrap(); + writeln!( + r, + " {}[ref.offset >> 2] = {}_{};", + bufname, name, var_name + ) + .unwrap(); + writeln!( + r, + " {}_write({}Ref(ref.offset + {}), s);", + structname, structname, payload[0].0 + ) + .unwrap(); + } writeln!(r, "}}\n").unwrap(); } } diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h index 8a757ef..291496f 100644 --- a/piet-gpu/shader/annotated.h +++ b/piet-gpu/shader/annotated.h @@ -62,36 +62,36 @@ AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) { return AnnotatedRef(ref.offset + index * Annotated_size); } -AnnoFill AnnoFill_read(AnnoFillRef ref) { +AnnoFill AnnoFill_read(Alloc a, AnnoFillRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; - uint raw4 = memory[ix + 4]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); AnnoFill s; s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.rgba_color = raw4; return s; } -void AnnoFill_write(AnnoFillRef ref, AnnoFill s) { +void AnnoFill_write(Alloc a, AnnoFillRef ref, AnnoFill s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.bbox.x); - memory[ix + 1] = floatBitsToUint(s.bbox.y); - memory[ix + 2] = floatBitsToUint(s.bbox.z); - memory[ix + 3] = floatBitsToUint(s.bbox.w); - memory[ix + 4] = s.rgba_color; + write_mem(a, ix + 0, floatBitsToUint(s.bbox.x)); + write_mem(a, ix + 1, floatBitsToUint(s.bbox.y)); + write_mem(a, ix + 2, floatBitsToUint(s.bbox.z)); + write_mem(a, ix + 3, floatBitsToUint(s.bbox.w)); + write_mem(a, ix + 4, s.rgba_color); } -AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) { +AnnoStroke AnnoStroke_read(Alloc a, AnnoStrokeRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; - uint raw4 = memory[ix + 4]; - uint raw5 = memory[ix + 5]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); AnnoStroke s; s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.rgba_color = raw4; @@ -99,76 +99,76 @@ AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) { return s; } -void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) { +void AnnoStroke_write(Alloc a, AnnoStrokeRef ref, AnnoStroke s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.bbox.x); - memory[ix + 1] = floatBitsToUint(s.bbox.y); - memory[ix + 2] = floatBitsToUint(s.bbox.z); - memory[ix + 3] = floatBitsToUint(s.bbox.w); - memory[ix + 4] = s.rgba_color; - memory[ix + 5] = floatBitsToUint(s.linewidth); + write_mem(a, ix + 0, floatBitsToUint(s.bbox.x)); + write_mem(a, ix + 1, floatBitsToUint(s.bbox.y)); + write_mem(a, ix + 2, floatBitsToUint(s.bbox.z)); + write_mem(a, ix + 3, floatBitsToUint(s.bbox.w)); + write_mem(a, ix + 4, s.rgba_color); + write_mem(a, ix + 5, floatBitsToUint(s.linewidth)); } -AnnoClip AnnoClip_read(AnnoClipRef ref) { +AnnoClip AnnoClip_read(Alloc a, AnnoClipRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); AnnoClip s; s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); return s; } -void AnnoClip_write(AnnoClipRef ref, AnnoClip s) { +void AnnoClip_write(Alloc a, AnnoClipRef ref, AnnoClip s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.bbox.x); - memory[ix + 1] = floatBitsToUint(s.bbox.y); - memory[ix + 2] = floatBitsToUint(s.bbox.z); - memory[ix + 3] = floatBitsToUint(s.bbox.w); + write_mem(a, ix + 0, floatBitsToUint(s.bbox.x)); + write_mem(a, ix + 1, floatBitsToUint(s.bbox.y)); + write_mem(a, ix + 2, floatBitsToUint(s.bbox.z)); + write_mem(a, ix + 3, floatBitsToUint(s.bbox.w)); } -uint Annotated_tag(AnnotatedRef ref) { - return memory[ref.offset >> 2]; +uint Annotated_tag(Alloc a, AnnotatedRef ref) { + return read_mem(a, ref.offset >> 2); } -AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) { - return AnnoStroke_read(AnnoStrokeRef(ref.offset + 4)); +AnnoStroke Annotated_Stroke_read(Alloc a, AnnotatedRef ref) { + return AnnoStroke_read(a, AnnoStrokeRef(ref.offset + 4)); } -AnnoFill Annotated_Fill_read(AnnotatedRef ref) { - return AnnoFill_read(AnnoFillRef(ref.offset + 4)); +AnnoFill Annotated_Fill_read(Alloc a, AnnotatedRef ref) { + return AnnoFill_read(a, AnnoFillRef(ref.offset + 4)); } -AnnoClip Annotated_BeginClip_read(AnnotatedRef ref) { - return AnnoClip_read(AnnoClipRef(ref.offset + 4)); +AnnoClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) { + return AnnoClip_read(a, AnnoClipRef(ref.offset + 4)); } -AnnoClip Annotated_EndClip_read(AnnotatedRef ref) { - return AnnoClip_read(AnnoClipRef(ref.offset + 4)); +AnnoClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) { + return AnnoClip_read(a, AnnoClipRef(ref.offset + 4)); } -void Annotated_Nop_write(AnnotatedRef ref) { - memory[ref.offset >> 2] = Annotated_Nop; +void Annotated_Nop_write(Alloc a, AnnotatedRef ref) { + write_mem(a, ref.offset >> 2, Annotated_Nop); } -void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) { - memory[ref.offset >> 2] = Annotated_Stroke; - AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s); +void Annotated_Stroke_write(Alloc a, AnnotatedRef ref, AnnoStroke s) { + write_mem(a, ref.offset >> 2, Annotated_Stroke); + AnnoStroke_write(a, AnnoStrokeRef(ref.offset + 4), s); } -void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) { - memory[ref.offset >> 2] = Annotated_Fill; - AnnoFill_write(AnnoFillRef(ref.offset + 4), s); +void Annotated_Fill_write(Alloc a, AnnotatedRef ref, AnnoFill s) { + write_mem(a, ref.offset >> 2, Annotated_Fill); + AnnoFill_write(a, AnnoFillRef(ref.offset + 4), s); } -void Annotated_BeginClip_write(AnnotatedRef ref, AnnoClip s) { - memory[ref.offset >> 2] = Annotated_BeginClip; - AnnoClip_write(AnnoClipRef(ref.offset + 4), s); +void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) { + write_mem(a, ref.offset >> 2, Annotated_BeginClip); + AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s); } -void Annotated_EndClip_write(AnnotatedRef ref, AnnoClip s) { - memory[ref.offset >> 2] = Annotated_EndClip; - AnnoClip_write(AnnoClipRef(ref.offset + 4), s); +void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) { + write_mem(a, ref.offset >> 2, Annotated_EndClip); + AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s); } diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp index 6828ac1..85e54e8 100644 --- a/piet-gpu/shader/backdrop.comp +++ b/piet-gpu/shader/backdrop.comp @@ -15,8 +15,8 @@ #version 450 #extension GL_GOOGLE_include_directive : enable -#include "setup.h" #include "mem.h" +#include "setup.h" #define LG_BACKDROP_WG (7 + LG_WG_FACTOR) #define BACKDROP_WG (1 << LG_BACKDROP_WG) @@ -31,27 +31,27 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf { #include "tile.h" shared uint sh_row_count[BACKDROP_WG]; -shared uint sh_row_base[BACKDROP_WG]; +shared Alloc sh_row_alloc[BACKDROP_WG]; shared uint sh_row_width[BACKDROP_WG]; void main() { - if (mem_overflow) { + if (mem_error != NO_ERROR) { return; } uint th_ix = gl_LocalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x; - AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); // Work assignment: 1 thread : 1 path element uint row_count = 0; if (element_ix < conf.n_elements) { - uint tag = Annotated_tag(ref); + uint tag = Annotated_tag(conf.anno_alloc, ref); switch (tag) { case Annotated_Fill: case Annotated_BeginClip: - PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size); - Path path = Path_read(path_ref); + PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size); + Path path = Path_read(conf.tile_alloc, path_ref); sh_row_width[th_ix] = path.bbox.z - path.bbox.x; row_count = path.bbox.w - path.bbox.y; // Paths that don't cross tile top edges don't have backdrops. @@ -62,7 +62,8 @@ void main() { // long as it doesn't cross the left edge. row_count = 0; } - sh_row_base[th_ix] = (path.tiles.offset >> 2) + 1; + Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size); + sh_row_alloc[th_ix] = path_alloc; } } @@ -92,13 +93,14 @@ void main() { if (width > 0) { // Process one row sequentially // Read backdrop value per tile and prefix sum it + Alloc tiles_alloc = sh_row_alloc[el_ix]; uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0); - uint tile_el_ix = sh_row_base[el_ix] + seq_ix * 2 * width; - uint sum = memory[tile_el_ix]; + uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width; + uint sum = read_mem(tiles_alloc, tile_el_ix); for (uint x = 1; x < width; x++) { tile_el_ix += 2; - sum += memory[tile_el_ix]; - memory[tile_el_ix] = sum; + sum += read_mem(tiles_alloc, tile_el_ix); + write_mem(tiles_alloc, tile_el_ix, sum); } } } diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv index dc1cbc4..81d26c0 100644 Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index 6812fb6..8ad72c7 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -9,8 +9,8 @@ #version 450 #extension GL_GOOGLE_include_directive : enable -#include "setup.h" #include "mem.h" +#include "setup.h" layout(local_size_x = N_TILE, local_size_y = 1) in; @@ -32,11 +32,11 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf { // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps) shared uint bitmaps[N_SLICE][N_TILE]; shared uint count[N_SLICE][N_TILE]; -shared uint sh_chunk_start[N_TILE]; +shared Alloc sh_chunk_alloc[N_TILE]; shared bool sh_alloc_failed; void main() { - if (mem_overflow) { + if (mem_error != NO_ERROR) { return; } @@ -53,10 +53,10 @@ void main() { // Read inputs and determine coverage of bins uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; - AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); uint tag = Annotated_Nop; if (element_ix < my_n_elements) { - tag = Annotated_tag(ref); + tag = Annotated_tag(conf.anno_alloc, ref); } int x0 = 0, y0 = 0, x1 = 0, y1 = 0; switch (tag) { @@ -66,7 +66,7 @@ void main() { case Annotated_EndClip: // Note: we take advantage of the fact that these drawing elements // have the bbox at the same place in their layout. - AnnoFill fill = Annotated_Fill_read(ref); + AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref); x0 = int(floor(fill.bbox.x * SX)); y0 = int(floor(fill.bbox.y * SY)); x1 = int(ceil(fill.bbox.z * SX)); @@ -105,20 +105,21 @@ void main() { count[i][gl_LocalInvocationID.x] = element_count; } // element_count is number of elements covering bin for this invocation. - Alloc chunk_alloc = Alloc(0, false); + Alloc chunk_alloc = new_alloc(0, 0); if (element_count != 0) { // TODO: aggregate atomic adds (subgroup is probably fastest) - chunk_alloc = malloc(element_count * BinInstance_size); - sh_chunk_start[gl_LocalInvocationID.x] = chunk_alloc.offset; - if (chunk_alloc.failed) { + MallocResult chunk = malloc(element_count * BinInstance_size); + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) { sh_alloc_failed = true; } } // Note: it might be more efficient for reading to do this in the // other order (each bin is a contiguous sequence of partitions) - uint out_ix = (conf.bin_base >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; - memory[out_ix] = element_count; - memory[out_ix + 1] = chunk_alloc.offset; + uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; + write_mem(conf.bin_alloc, out_ix, element_count); + write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset); barrier(); if (sh_alloc_failed) { @@ -137,8 +138,9 @@ void main() { if (my_slice > 0) { idx += count[my_slice - 1][bin_ix]; } - uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size; - BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix)); + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + idx * BinInstance_size; + BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix)); } x++; if (x == x1) { diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index 7974dac..a9a05f5 100644 Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h index 4364278..853adab 100644 --- a/piet-gpu/shader/bins.h +++ b/piet-gpu/shader/bins.h @@ -16,16 +16,16 @@ BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { return BinInstanceRef(ref.offset + index * BinInstance_size); } -BinInstance BinInstance_read(BinInstanceRef ref) { +BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; + uint raw0 = read_mem(a, ix + 0); BinInstance s; s.element_ix = raw0; return s; } -void BinInstance_write(BinInstanceRef ref, BinInstance s) { +void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) { uint ix = ref.offset >> 2; - memory[ix + 0] = s.element_ix; + write_mem(a, ix + 0, s.element_ix); } diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 112a57d..3f4e460 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -13,8 +13,8 @@ #version 450 #extension GL_GOOGLE_include_directive : enable -#include "setup.h" #include "mem.h" +#include "setup.h" layout(local_size_x = N_TILE, local_size_y = 1) in; @@ -34,7 +34,7 @@ shared uint sh_elements[N_TILE]; // Number of elements in the partition; prefix sum. shared uint sh_part_count[N_PART_READ]; -shared uint sh_part_elements[N_PART_READ]; +shared Alloc sh_part_elements[N_PART_READ]; shared uint sh_bitmaps[N_SLICE][N_TILE]; @@ -48,24 +48,47 @@ shared uint sh_tile_y0[N_TILE]; shared uint sh_tile_base[N_TILE]; shared uint sh_tile_stride[N_TILE]; -// Perhaps cmd_limit should be a global? This is a style question. -bool alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) { +#ifdef MEM_DEBUG +// Store allocs only when MEM_DEBUG to save shared memory traffic. +shared Alloc sh_tile_alloc[N_TILE]; + +void write_tile_alloc(uint el_ix, Alloc a) { + sh_tile_alloc[el_ix] = a; +} + +Alloc read_tile_alloc(uint el_ix) { + return sh_tile_alloc[el_ix]; +} +#else +void write_tile_alloc(uint el_ix, Alloc a) { + // No-op +} + +Alloc read_tile_alloc(uint el_ix) { + // All memory. + return new_alloc(0, memory.length()*4); +} +#endif + +// Perhaps cmd_alloc should be a global? This is a style question. +bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) { if (cmd_ref.offset < cmd_limit) { return true; } - Alloc new_cmd = malloc(PTCL_INITIAL_ALLOC); + MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC); if (new_cmd.failed) { return false; } - CmdJump jump = CmdJump(new_cmd.offset); - Cmd_Jump_write(cmd_ref, jump); - cmd_ref = CmdRef(new_cmd.offset); - cmd_limit = new_cmd.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + CmdJump jump = CmdJump(new_cmd.alloc.offset); + Cmd_Jump_write(cmd_alloc, cmd_ref, jump); + cmd_alloc = new_cmd.alloc; + cmd_ref = CmdRef(cmd_alloc.offset); + cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; return true; } void main() { - if (mem_overflow) { + if (mem_error != NO_ERROR) { return; } @@ -85,7 +108,8 @@ void main() { uint tile_x = gl_LocalInvocationID.x % N_TILE_X; uint tile_y = gl_LocalInvocationID.x / N_TILE_X; uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x; - CmdRef cmd_ref = CmdRef(conf.ptcl_base + this_tile_ix * PTCL_INITIAL_ALLOC); + Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC); + CmdRef cmd_ref = CmdRef(cmd_alloc.offset); uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; // The nesting depth of the clip stack uint clip_depth = 0; @@ -117,9 +141,10 @@ void main() { part_start_ix = ready_ix; uint count = 0; if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) { - uint in_ix = (conf.bin_base >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; - count = memory[in_ix]; - sh_part_elements[th_ix] = memory[in_ix + 1]; + uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; + count = read_mem(conf.bin_alloc, in_ix); + uint offset = read_mem(conf.bin_alloc, in_ix + 1); + sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size); } // prefix sum of counts for (uint i = 0; i < LG_N_PART_READ; i++) { @@ -152,8 +177,9 @@ void main() { } } ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix; - BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]); - BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix)); + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset); + BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix)); sh_elements[th_ix] = inst.element_ix; } barrier(); @@ -169,8 +195,8 @@ void main() { AnnotatedRef ref; if (th_ix + rd_ix < wr_ix) { element_ix = sh_elements[th_ix]; - ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); - tag = Annotated_tag(ref); + ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); + tag = Annotated_tag(conf.anno_alloc, ref); } // Bounding box of element in pixel coordinates. @@ -183,7 +209,7 @@ void main() { // We have one "path" for each element, even if the element isn't // actually a path (currently EndClip, but images etc in the future). uint path_ix = element_ix; - Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size)); + Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size)); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; int dx = int(path.bbox.x) - int(bin_tile_x); @@ -199,6 +225,8 @@ void main() { // base relative to bin uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size; sh_tile_base[th_ix] = base; + Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size); + write_tile_alloc(th_ix, path_alloc); break; default: tile_count = 0; @@ -226,8 +254,8 @@ void main() { el_ix = probe; } } - AnnotatedRef ref = AnnotatedRef(conf.anno_base + sh_elements[el_ix] * Annotated_size); - uint tag = Annotated_tag(ref); + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size); + uint tag = Annotated_tag(conf.anno_alloc, ref); uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0); uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + seq_ix % width; @@ -236,7 +264,7 @@ void main() { if (tag == Annotated_BeginClip || tag == Annotated_EndClip) { include_tile = true; } else { - Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); + Tile tile = Tile_read(read_tile_alloc(el_ix), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); // Include the path in the tile if // - the tile contains at least a segment (tile offset non-zero) // - the tile is completely covered (backdrop non-zero) @@ -275,16 +303,16 @@ void main() { // At this point, we read the element again from global memory. // If that turns out to be expensive, maybe we can pack it into // shared memory (or perhaps just the tag). - ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); - tag = Annotated_tag(ref); + ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); + tag = Annotated_tag(conf.anno_alloc, ref); if (clip_zero_depth == 0) { switch (tag) { case Annotated_Fill: - Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix] + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); - AnnoFill fill = Annotated_Fill_read(ref); - if (!alloc_cmd(cmd_ref, cmd_limit)) { + AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref); + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; } if (tile.tile.offset != 0) { @@ -292,32 +320,32 @@ void main() { cmd_fill.tile_ref = tile.tile.offset; cmd_fill.backdrop = tile.backdrop; cmd_fill.rgba_color = fill.rgba_color; - Cmd_Fill_write(cmd_ref, cmd_fill); + Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill); } else { - Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); + Cmd_Solid_write(cmd_alloc, cmd_ref, CmdSolid(fill.rgba_color)); } cmd_ref.offset += Cmd_size; break; case Annotated_BeginClip: - tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix] + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); if (tile.tile.offset == 0 && tile.backdrop == 0) { clip_zero_depth = clip_depth + 1; } else if (tile.tile.offset == 0 && clip_depth < 32) { clip_one_mask |= (1 << clip_depth); } else { - if (!alloc_cmd(cmd_ref, cmd_limit)) { + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; } if (tile.tile.offset != 0) { CmdBeginClip cmd_begin_clip; cmd_begin_clip.tile_ref = tile.tile.offset; cmd_begin_clip.backdrop = tile.backdrop; - Cmd_BeginClip_write(cmd_ref, cmd_begin_clip); + Cmd_BeginClip_write(cmd_alloc, cmd_ref, cmd_begin_clip); } else { // TODO: here is where a bunch of optimization magic should happen float alpha = tile.backdrop == 0 ? 0.0 : 1.0; - Cmd_BeginSolidClip_write(cmd_ref, CmdBeginSolidClip(alpha)); + Cmd_BeginSolidClip_write(cmd_alloc, cmd_ref, CmdBeginSolidClip(alpha)); } cmd_ref.offset += Cmd_size; if (clip_depth < 32) { @@ -329,25 +357,25 @@ void main() { case Annotated_EndClip: clip_depth--; if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) { - if (!alloc_cmd(cmd_ref, cmd_limit)) { + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; } - Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0)); + Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(1.0)); cmd_ref.offset += Cmd_size; } break; case Annotated_Stroke: - tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix] + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); - AnnoStroke stroke = Annotated_Stroke_read(ref); + AnnoStroke stroke = Annotated_Stroke_read(conf.anno_alloc, ref); CmdStroke cmd_stroke; cmd_stroke.tile_ref = tile.tile.offset; cmd_stroke.half_width = 0.5 * stroke.linewidth; cmd_stroke.rgba_color = stroke.rgba_color; - if (!alloc_cmd(cmd_ref, cmd_limit)) { + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; } - Cmd_Stroke_write(cmd_ref, cmd_stroke); + Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke); cmd_ref.offset += Cmd_size; break; } @@ -372,6 +400,6 @@ void main() { if (rd_ix >= ready_ix && partition_ix >= n_partitions) break; } if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) { - Cmd_End_write(cmd_ref); + Cmd_End_write(cmd_alloc, cmd_ref); } } diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index 8991094..505c4f4 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp index a0e5011..255dd13 100644 --- a/piet-gpu/shader/elements.comp +++ b/piet-gpu/shader/elements.comp @@ -9,8 +9,8 @@ #version 450 #extension GL_GOOGLE_include_directive : enable -#include "setup.h" #include "mem.h" +#include "setup.h" #define N_ROWS 4 #define WG_SIZE 32 @@ -172,7 +172,7 @@ shared uint sh_part_ix; shared State sh_prefix; void main() { - if (mem_overflow) { + if (mem_error != NO_ERROR) { return; } @@ -342,10 +342,10 @@ void main() { } // We do encoding a bit by hand to minimize divergence. Another approach // would be to have a fill/stroke bool. - PathSegRef path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size); + PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic; - memory[path_out_ref.offset >> 2] = out_tag; - PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); + write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag); + PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); break; case Element_FillQuad: case Element_StrokeQuad: @@ -366,10 +366,10 @@ void main() { } // We do encoding a bit by hand to minimize divergence. Another approach // would be to have a fill/stroke bool. - path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size); + path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic; - memory[path_out_ref.offset >> 2] = out_tag; - PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); + write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag); + PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); break; case Element_FillCubic: case Element_StrokeCubic: @@ -387,10 +387,10 @@ void main() { } // We do encoding a bit by hand to minimize divergence. Another approach // would be to have a fill/stroke bool. - path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size); + path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic; - memory[path_out_ref.offset >> 2] = out_tag; - PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); + write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag); + PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); break; case Element_Stroke: Stroke stroke = Element_Stroke_read(this_ref); @@ -399,31 +399,31 @@ void main() { vec2 lw = get_linewidth(st); anno_stroke.bbox = st.bbox + vec4(-lw, lw); anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - AnnotatedRef out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size); - Annotated_Stroke_write(out_ref, anno_stroke); + AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_Stroke_write(conf.anno_alloc, out_ref, anno_stroke); break; case Element_Fill: Fill fill = Element_Fill_read(this_ref); AnnoFill anno_fill; anno_fill.rgba_color = fill.rgba_color; anno_fill.bbox = st.bbox; - out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size); - Annotated_Fill_write(out_ref, anno_fill); + out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_Fill_write(conf.anno_alloc, out_ref, anno_fill); break; case Element_BeginClip: Clip begin_clip = Element_BeginClip_read(this_ref); AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox); // This is the absolute bbox, it's been transformed during encoding. anno_begin_clip.bbox = begin_clip.bbox; - out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size); - Annotated_BeginClip_write(out_ref, anno_begin_clip); + out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_BeginClip_write(conf.anno_alloc, out_ref, anno_begin_clip); break; case Element_EndClip: Clip end_clip = Element_EndClip_read(this_ref); // This bbox is expected to be the same as the begin one. AnnoClip anno_end_clip = AnnoClip(end_clip.bbox); - out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size); - Annotated_EndClip_write(out_ref, anno_end_clip); + out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip); break; } } diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv index 7475f04..287aa4e 100644 Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index fe55ff9..395ac80 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -10,8 +10,8 @@ #extension GL_GOOGLE_include_directive : enable #extension GL_EXT_nonuniform_qualifier : enable -#include "setup.h" #include "mem.h" +#include "setup.h" #define CHUNK 8 #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK) @@ -37,16 +37,16 @@ layout(set = 0, binding = 3) uniform sampler2D textures[]; #define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX) #define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1) -shared Alloc sh_clip_alloc; +shared MallocResult sh_clip_alloc; // Allocate a scratch buffer for clipping. -Alloc alloc_clip_buf(uint link) { +MallocResult alloc_clip_buf(uint link) { if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) { - Alloc alloc = malloc(CLIP_BUF_SIZE * 4); - if (!alloc.failed) { - memory[(alloc.offset >> 2) + CLIP_LINK_OFFSET] = link; + MallocResult m = malloc(CLIP_BUF_SIZE * 4); + if (!m.failed) { + write_mem(m.alloc, (m.alloc.offset >> 2) + CLIP_LINK_OFFSET, link); } - sh_clip_alloc = alloc; + sh_clip_alloc = m; } barrier(); return sh_clip_alloc; @@ -59,7 +59,7 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { for (uint k = 0; k < CHUNK; k++) area[k] = float(backdrop); TileSegRef tile_seg_ref = TileSegRef(tile_ref); do { - TileSeg seg = TileSeg_read(tile_seg_ref); + TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref); for (uint k = 0; k < CHUNK; k++) { vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY)); vec2 start = seg.origin - my_xy; @@ -87,12 +87,13 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { } void main() { - if (mem_overflow) { + if (mem_error != NO_ERROR) { return; } uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x; - CmdRef cmd_ref = CmdRef(conf.ptcl_base + tile_ix * PTCL_INITIAL_ALLOC); + Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC); + CmdRef cmd_ref = CmdRef(cmd_alloc.offset); uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); vec2 xy = vec2(xy_uint); @@ -101,7 +102,7 @@ void main() { uint blend_stack[BLEND_STACK_SIZE][CHUNK]; uint blend_spill = 0; uint blend_sp = 0; - uint clip_tos = 0; + Alloc clip_tos = new_alloc(0, 0); for (uint i = 0; i < CHUNK; i++) { rgb[i] = vec3(0.5); if (xy_uint.x < 1024 && xy_uint.y < 1024) { @@ -111,13 +112,13 @@ void main() { } while (true) { - uint tag = Cmd_tag(cmd_ref); + uint tag = Cmd_tag(cmd_alloc, cmd_ref); if (tag == Cmd_End) { break; } switch (tag) { case Cmd_Circle: - CmdCircle circle = Cmd_Circle_read(cmd_ref); + CmdCircle circle = Cmd_Circle_read(cmd_alloc, cmd_ref); vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx; for (uint i = 0; i < CHUNK; i++) { float dy = float(i * CHUNK_DY); @@ -129,12 +130,12 @@ void main() { break; case Cmd_Stroke: // Calculate distance field from all the line segments in this tile. - CmdStroke stroke = Cmd_Stroke_read(cmd_ref); + CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref); float df[CHUNK]; for (uint k = 0; k < CHUNK; k++) df[k] = 1e9; TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); do { - TileSeg seg = TileSeg_read(tile_seg_ref); + TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref); vec2 line_vec = seg.vector; for (uint k = 0; k < CHUNK; k++) { vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin; @@ -151,7 +152,7 @@ void main() { } break; case Cmd_Fill: - CmdFill fill = Cmd_Fill_read(cmd_ref); + CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref); float area[CHUNK]; area = computeArea(xy, fill.backdrop, fill.tile_ref); fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx; @@ -164,25 +165,25 @@ void main() { uint blend_slot = blend_sp % BLEND_STACK_SIZE; if (blend_sp == blend_spill + BLEND_STACK_SIZE) { // spill to scratch buffer - Alloc alloc = alloc_clip_buf(clip_tos); - if (alloc.failed) { + MallocResult m = alloc_clip_buf(clip_tos.offset); + if (m.failed) { return; } - clip_tos = alloc.offset; - uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; + clip_tos = m.alloc; + uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; for (uint k = 0; k < CHUNK; k++) { - memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k]; + write_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY, blend_stack[blend_slot][k]); } blend_spill++; } if (tag == Cmd_BeginClip) { - CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref); + CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_alloc, cmd_ref); area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref); for (uint k = 0; k < CHUNK; k++) { blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0))); } } else { - CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref); + CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_alloc, cmd_ref); float solid_alpha = begin_solid_clip.alpha; for (uint k = 0; k < CHUNK; k++) { blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], solid_alpha)); @@ -191,14 +192,14 @@ void main() { blend_sp++; break; case Cmd_EndClip: - CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref); + CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref); blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE; if (blend_sp == blend_spill) { - uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; + uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; for (uint k = 0; k < CHUNK; k++) { - blend_stack[blend_slot][k] = memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY]; + blend_stack[blend_slot][k] = read_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY); } - clip_tos = memory[(clip_tos >> 2) + CLIP_LINK_OFFSET]; + clip_tos.offset = read_mem(clip_tos, (clip_tos.offset >> 2) + CLIP_LINK_OFFSET); blend_spill--; } blend_sp--; @@ -208,20 +209,21 @@ void main() { } break; case Cmd_Solid: - CmdSolid solid = Cmd_Solid_read(cmd_ref); + CmdSolid solid = Cmd_Solid_read(cmd_alloc, cmd_ref); fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx; for (uint k = 0; k < CHUNK; k++) { rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * fg_rgba.a); } break; case Cmd_SolidMask: - CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_ref); + CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_alloc, cmd_ref); for (uint k = 0; k < CHUNK; k++) { mask[k] = solid_mask.mask; } break; case Cmd_Jump: - cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref); + cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref); + cmd_alloc.offset = cmd_ref.offset; continue; } cmd_ref.offset += Cmd_size; diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index b384340..3ccf21d 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/mem.h b/piet-gpu/shader/mem.h index 9373cbf..7b2a02a 100644 --- a/piet-gpu/shader/mem.h +++ b/piet-gpu/shader/mem.h @@ -3,27 +3,118 @@ layout(set = 0, binding = 0) buffer Memory { // offset into memory of the next allocation, initialized by the user. uint mem_offset; - bool mem_overflow; + // mem_error tracks the status of memory accesses, initialized to NO_ERROR + // by the user. ERR_MALLOC_FAILED is reported for insufficient memory. + // If MEM_DEBUG is defined the following errors are reported: + // - ERR_OUT_OF_BOUNDS is reported for out of bounds writes. + // - ERR_UNALIGNED_ACCESS for memory access not aligned to 32-bit words. + uint mem_error; uint[] memory; }; +// Uncomment this line to add the size field to Alloc and enable memory checks. +// Note that the Config struct in setup.h grows size fields as well. +//#define MEM_DEBUG + +#define NO_ERROR 0 +#define ERR_MALLOC_FAILED 1 +#define ERR_OUT_OF_BOUNDS 2 +#define ERR_UNALIGNED_ACCESS 3 + +#define Alloc_size 8 + // Alloc represents a memory allocation. struct Alloc { // offset in bytes into memory. uint offset; +#ifdef MEM_DEBUG + // size in bytes of the allocation. + uint size; +#endif +}; + +struct MallocResult { + Alloc alloc; // failed is true if the allocation overflowed memory. bool failed; }; -// malloc allocates size bytes of memory. -Alloc malloc(uint size) { +// new_alloc synthesizes an Alloc when its offset and size are derived. +Alloc new_alloc(uint offset, uint size) { Alloc a; - // Round up to nearest 32-bit word. - size = (size + 3) & ~3; - a.offset = atomicAdd(mem_offset, size); - a.failed = a.offset + size > memory.length() * 4; - if (a.failed) { - mem_overflow = true; - } + a.offset = offset; +#ifdef MEM_DEBUG + a.size = size; +#endif return a; } + +// malloc allocates size bytes of memory. +MallocResult malloc(uint size) { + MallocResult r; + r.failed = false; + uint offset = atomicAdd(mem_offset, size); + r.alloc = new_alloc(offset, size); + if (offset + size > memory.length() * 4) { + r.failed = true; + atomicMax(mem_error, ERR_MALLOC_FAILED); + return r; + } +#ifdef MEM_DEBUG + if ((size & 3) != 0) { + r.failed = true; + atomicMax(mem_error, ERR_UNALIGNED_ACCESS); + return r; + } +#endif + return r; +} + +// touch_mem checks whether access to the memory word at offset is valid. +// If MEM_DEBUG is defined, touch_mem returns false if offset is out of bounds. +// Offset is in words. +bool touch_mem(Alloc alloc, uint offset) { +#ifdef MEM_DEBUG + if (offset < alloc.offset/4 || offset >= (alloc.offset + alloc.size)/4) { + atomicMax(mem_error, ERR_OUT_OF_BOUNDS); + return false; + } +#endif + return true; +} + +// write_mem writes val to memory at offset. +// Offset is in words. +void write_mem(Alloc alloc, uint offset, uint val) { + if (!touch_mem(alloc, offset)) { + return; + } + memory[offset] = val; +} + +// read_mem reads the value from memory at offset. +// Offset is in words. +uint read_mem(Alloc alloc, uint offset) { + if (!touch_mem(alloc, offset)) { + return 0; + } + uint v = memory[offset]; + return v; +} + +// slice_mem returns a sub-allocation inside another. Offset and size are in +// bytes, relative to a.offset. +Alloc slice_mem(Alloc a, uint offset, uint size) { +#ifdef MEM_DEBUG + if ((offset & 3) != 0 || (size & 3) != 0) { + atomicMax(mem_error, ERR_UNALIGNED_ACCESS); + return Alloc(0, 0); + } + if (offset + size > a.size) { + // slice_mem is sometimes used for slices outside bounds, + // but never written. + return Alloc(0, 0); + } +#endif + return new_alloc(a.offset + offset, size); +} diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index 20c3586..4f77ff9 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -7,8 +7,8 @@ #version 450 #extension GL_GOOGLE_include_directive : enable -#include "setup.h" #include "mem.h" +#include "setup.h" #define LG_COARSE_WG 5 #define COARSE_WG (1 << LG_COARSE_WG) @@ -87,21 +87,21 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) { } void main() { - if (mem_overflow) { + if (mem_error != NO_ERROR) { return; } uint element_ix = gl_GlobalInvocationID.x; - PathSegRef ref = PathSegRef(conf.pathseg_base + element_ix * PathSeg_size); + PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size); uint tag = PathSeg_Nop; if (element_ix < conf.n_pathseg) { - tag = PathSeg_tag(ref); + tag = PathSeg_tag(conf.pathseg_alloc, ref); } switch (tag) { case PathSeg_FillCubic: case PathSeg_StrokeCubic: - PathStrokeCubic cubic = PathSeg_StrokeCubic_read(ref); + PathStrokeCubic cubic = PathSeg_StrokeCubic_read(conf.pathseg_alloc, ref); vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3; float err = err_v.x * err_v.x + err_v.y * err_v.y; // The number of quadratics. @@ -123,7 +123,8 @@ void main() { uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1); uint path_ix = cubic.path_ix; - Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size)); + Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size)); + Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size); ivec4 bbox = ivec4(path.bbox); vec2 p0 = cubic.p0; qp0 = cubic.p0; @@ -182,11 +183,11 @@ void main() { // TODO: can be tighter, use c to bound width uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); // Consider using subgroups to aggregate atomic add. - Alloc tile_alloc = malloc(n_tile_alloc * TileSeg_size); + MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size); if (tile_alloc.failed) { return; } - uint tile_offset = tile_alloc.offset; + uint tile_offset = tile_alloc.alloc.offset; TileSeg tile_seg; @@ -204,7 +205,9 @@ void main() { int backdrop = p1.y < p0.y ? 1 : -1; TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop)); uint tile_el = tile_ref.offset >> 2; - atomicAdd(memory[tile_el + 1], backdrop); + if (touch_mem(path_alloc, tile_el + 1)) { + atomicAdd(memory[tile_el + 1], backdrop); + } } // next_xray is the xray for the next scanline; the line segment intersects @@ -225,9 +228,12 @@ void main() { for (int x = xx0; x < xx1; x++) { float tile_x0 = float(x * TILE_WIDTH_PX); - TileRef tile_ref = Tile_index(path.tiles, uint(base + x)); + TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x)); uint tile_el = tile_ref.offset >> 2; - uint old = atomicExchange(memory[tile_el], tile_offset); + uint old = 0; + if (touch_mem(path_alloc, tile_el)) { + old = atomicExchange(memory[tile_el], tile_offset); + } tile_seg.origin = p0; tile_seg.vector = p1 - p0; float y_edge = 0.0; @@ -254,7 +260,7 @@ void main() { } tile_seg.y_edge = y_edge; tile_seg.next.offset = old; - TileSeg_write(TileSegRef(tile_offset), tile_seg); + TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg); tile_offset += TileSeg_size; } xc += b; diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index 3e4392c..1854604 100644 Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/pathseg.h b/piet-gpu/shader/pathseg.h index ecba9c5..00509fb 100644 --- a/piet-gpu/shader/pathseg.h +++ b/piet-gpu/shader/pathseg.h @@ -87,13 +87,13 @@ PathSegRef PathSeg_index(PathSegRef ref, uint index) { return PathSegRef(ref.offset + index * PathSeg_size); } -PathFillLine PathFillLine_read(PathFillLineRef ref) { +PathFillLine PathFillLine_read(Alloc a, PathFillLineRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; - uint raw4 = memory[ix + 4]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); PathFillLine s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -101,24 +101,24 @@ PathFillLine PathFillLine_read(PathFillLineRef ref) { return s; } -void PathFillLine_write(PathFillLineRef ref, PathFillLine s) { +void PathFillLine_write(Alloc a, PathFillLineRef ref, PathFillLine s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.p0.x); - memory[ix + 1] = floatBitsToUint(s.p0.y); - memory[ix + 2] = floatBitsToUint(s.p1.x); - memory[ix + 3] = floatBitsToUint(s.p1.y); - memory[ix + 4] = s.path_ix; + write_mem(a, ix + 0, floatBitsToUint(s.p0.x)); + write_mem(a, ix + 1, floatBitsToUint(s.p0.y)); + write_mem(a, ix + 2, floatBitsToUint(s.p1.x)); + write_mem(a, ix + 3, floatBitsToUint(s.p1.y)); + write_mem(a, ix + 4, s.path_ix); } -PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) { +PathStrokeLine PathStrokeLine_read(Alloc a, PathStrokeLineRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; - uint raw4 = memory[ix + 4]; - uint raw5 = memory[ix + 5]; - uint raw6 = memory[ix + 6]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); PathStrokeLine s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -127,28 +127,28 @@ PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) { return s; } -void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) { +void PathStrokeLine_write(Alloc a, PathStrokeLineRef ref, PathStrokeLine s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.p0.x); - memory[ix + 1] = floatBitsToUint(s.p0.y); - memory[ix + 2] = floatBitsToUint(s.p1.x); - memory[ix + 3] = floatBitsToUint(s.p1.y); - memory[ix + 4] = s.path_ix; - memory[ix + 5] = floatBitsToUint(s.stroke.x); - memory[ix + 6] = floatBitsToUint(s.stroke.y); + write_mem(a, ix + 0, floatBitsToUint(s.p0.x)); + write_mem(a, ix + 1, floatBitsToUint(s.p0.y)); + write_mem(a, ix + 2, floatBitsToUint(s.p1.x)); + write_mem(a, ix + 3, floatBitsToUint(s.p1.y)); + write_mem(a, ix + 4, s.path_ix); + write_mem(a, ix + 5, floatBitsToUint(s.stroke.x)); + write_mem(a, ix + 6, floatBitsToUint(s.stroke.y)); } -PathFillCubic PathFillCubic_read(PathFillCubicRef ref) { +PathFillCubic PathFillCubic_read(Alloc a, PathFillCubicRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; - uint raw4 = memory[ix + 4]; - uint raw5 = memory[ix + 5]; - uint raw6 = memory[ix + 6]; - uint raw7 = memory[ix + 7]; - uint raw8 = memory[ix + 8]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); + uint raw7 = read_mem(a, ix + 7); + uint raw8 = read_mem(a, ix + 8); PathFillCubic s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -158,32 +158,32 @@ PathFillCubic PathFillCubic_read(PathFillCubicRef ref) { return s; } -void PathFillCubic_write(PathFillCubicRef ref, PathFillCubic s) { +void PathFillCubic_write(Alloc a, PathFillCubicRef ref, PathFillCubic s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.p0.x); - memory[ix + 1] = floatBitsToUint(s.p0.y); - memory[ix + 2] = floatBitsToUint(s.p1.x); - memory[ix + 3] = floatBitsToUint(s.p1.y); - memory[ix + 4] = floatBitsToUint(s.p2.x); - memory[ix + 5] = floatBitsToUint(s.p2.y); - memory[ix + 6] = floatBitsToUint(s.p3.x); - memory[ix + 7] = floatBitsToUint(s.p3.y); - memory[ix + 8] = s.path_ix; + write_mem(a, ix + 0, floatBitsToUint(s.p0.x)); + write_mem(a, ix + 1, floatBitsToUint(s.p0.y)); + write_mem(a, ix + 2, floatBitsToUint(s.p1.x)); + write_mem(a, ix + 3, floatBitsToUint(s.p1.y)); + write_mem(a, ix + 4, floatBitsToUint(s.p2.x)); + write_mem(a, ix + 5, floatBitsToUint(s.p2.y)); + write_mem(a, ix + 6, floatBitsToUint(s.p3.x)); + write_mem(a, ix + 7, floatBitsToUint(s.p3.y)); + write_mem(a, ix + 8, s.path_ix); } -PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) { +PathStrokeCubic PathStrokeCubic_read(Alloc a, PathStrokeCubicRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; - uint raw4 = memory[ix + 4]; - uint raw5 = memory[ix + 5]; - uint raw6 = memory[ix + 6]; - uint raw7 = memory[ix + 7]; - uint raw8 = memory[ix + 8]; - uint raw9 = memory[ix + 9]; - uint raw10 = memory[ix + 10]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); + uint raw7 = read_mem(a, ix + 7); + uint raw8 = read_mem(a, ix + 8); + uint raw9 = read_mem(a, ix + 9); + uint raw10 = read_mem(a, ix + 10); PathStrokeCubic s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -194,62 +194,62 @@ PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) { return s; } -void PathStrokeCubic_write(PathStrokeCubicRef ref, PathStrokeCubic s) { +void PathStrokeCubic_write(Alloc a, PathStrokeCubicRef ref, PathStrokeCubic s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.p0.x); - memory[ix + 1] = floatBitsToUint(s.p0.y); - memory[ix + 2] = floatBitsToUint(s.p1.x); - memory[ix + 3] = floatBitsToUint(s.p1.y); - memory[ix + 4] = floatBitsToUint(s.p2.x); - memory[ix + 5] = floatBitsToUint(s.p2.y); - memory[ix + 6] = floatBitsToUint(s.p3.x); - memory[ix + 7] = floatBitsToUint(s.p3.y); - memory[ix + 8] = s.path_ix; - memory[ix + 9] = floatBitsToUint(s.stroke.x); - memory[ix + 10] = floatBitsToUint(s.stroke.y); + write_mem(a, ix + 0, floatBitsToUint(s.p0.x)); + write_mem(a, ix + 1, floatBitsToUint(s.p0.y)); + write_mem(a, ix + 2, floatBitsToUint(s.p1.x)); + write_mem(a, ix + 3, floatBitsToUint(s.p1.y)); + write_mem(a, ix + 4, floatBitsToUint(s.p2.x)); + write_mem(a, ix + 5, floatBitsToUint(s.p2.y)); + write_mem(a, ix + 6, floatBitsToUint(s.p3.x)); + write_mem(a, ix + 7, floatBitsToUint(s.p3.y)); + write_mem(a, ix + 8, s.path_ix); + write_mem(a, ix + 9, floatBitsToUint(s.stroke.x)); + write_mem(a, ix + 10, floatBitsToUint(s.stroke.y)); } -uint PathSeg_tag(PathSegRef ref) { - return memory[ref.offset >> 2]; +uint PathSeg_tag(Alloc a, PathSegRef ref) { + return read_mem(a, ref.offset >> 2); } -PathFillLine PathSeg_FillLine_read(PathSegRef ref) { - return PathFillLine_read(PathFillLineRef(ref.offset + 4)); +PathFillLine PathSeg_FillLine_read(Alloc a, PathSegRef ref) { + return PathFillLine_read(a, PathFillLineRef(ref.offset + 4)); } -PathStrokeLine PathSeg_StrokeLine_read(PathSegRef ref) { - return PathStrokeLine_read(PathStrokeLineRef(ref.offset + 4)); +PathStrokeLine PathSeg_StrokeLine_read(Alloc a, PathSegRef ref) { + return PathStrokeLine_read(a, PathStrokeLineRef(ref.offset + 4)); } -PathFillCubic PathSeg_FillCubic_read(PathSegRef ref) { - return PathFillCubic_read(PathFillCubicRef(ref.offset + 4)); +PathFillCubic PathSeg_FillCubic_read(Alloc a, PathSegRef ref) { + return PathFillCubic_read(a, PathFillCubicRef(ref.offset + 4)); } -PathStrokeCubic PathSeg_StrokeCubic_read(PathSegRef ref) { - return PathStrokeCubic_read(PathStrokeCubicRef(ref.offset + 4)); +PathStrokeCubic PathSeg_StrokeCubic_read(Alloc a, PathSegRef ref) { + return PathStrokeCubic_read(a, PathStrokeCubicRef(ref.offset + 4)); } -void PathSeg_Nop_write(PathSegRef ref) { - memory[ref.offset >> 2] = PathSeg_Nop; +void PathSeg_Nop_write(Alloc a, PathSegRef ref) { + write_mem(a, ref.offset >> 2, PathSeg_Nop); } -void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) { - memory[ref.offset >> 2] = PathSeg_FillLine; - PathFillLine_write(PathFillLineRef(ref.offset + 4), s); +void PathSeg_FillLine_write(Alloc a, PathSegRef ref, PathFillLine s) { + write_mem(a, ref.offset >> 2, PathSeg_FillLine); + PathFillLine_write(a, PathFillLineRef(ref.offset + 4), s); } -void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) { - memory[ref.offset >> 2] = PathSeg_StrokeLine; - PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s); +void PathSeg_StrokeLine_write(Alloc a, PathSegRef ref, PathStrokeLine s) { + write_mem(a, ref.offset >> 2, PathSeg_StrokeLine); + PathStrokeLine_write(a, PathStrokeLineRef(ref.offset + 4), s); } -void PathSeg_FillCubic_write(PathSegRef ref, PathFillCubic s) { - memory[ref.offset >> 2] = PathSeg_FillCubic; - PathFillCubic_write(PathFillCubicRef(ref.offset + 4), s); +void PathSeg_FillCubic_write(Alloc a, PathSegRef ref, PathFillCubic s) { + write_mem(a, ref.offset >> 2, PathSeg_FillCubic); + PathFillCubic_write(a, PathFillCubicRef(ref.offset + 4), s); } -void PathSeg_StrokeCubic_write(PathSegRef ref, PathStrokeCubic s) { - memory[ref.offset >> 2] = PathSeg_StrokeCubic; - PathStrokeCubic_write(PathStrokeCubicRef(ref.offset + 4), s); +void PathSeg_StrokeCubic_write(Alloc a, PathSegRef ref, PathStrokeCubic s) { + write_mem(a, ref.offset >> 2, PathSeg_StrokeCubic); + PathStrokeCubic_write(a, PathStrokeCubicRef(ref.offset + 4), s); } diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h index eb21eac..4587f8f 100644 --- a/piet-gpu/shader/ptcl.h +++ b/piet-gpu/shader/ptcl.h @@ -171,12 +171,12 @@ CmdRef Cmd_index(CmdRef ref, uint index) { return CmdRef(ref.offset + index * Cmd_size); } -CmdCircle CmdCircle_read(CmdCircleRef ref) { +CmdCircle CmdCircle_read(Alloc a, CmdCircleRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); CmdCircle s; s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.radius = uintBitsToFloat(raw2); @@ -184,39 +184,39 @@ CmdCircle CmdCircle_read(CmdCircleRef ref) { return s; } -void CmdCircle_write(CmdCircleRef ref, CmdCircle s) { +void CmdCircle_write(Alloc a, CmdCircleRef ref, CmdCircle s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.center.x); - memory[ix + 1] = floatBitsToUint(s.center.y); - memory[ix + 2] = floatBitsToUint(s.radius); - memory[ix + 3] = s.rgba_color; + write_mem(a, ix + 0, floatBitsToUint(s.center.x)); + write_mem(a, ix + 1, floatBitsToUint(s.center.y)); + write_mem(a, ix + 2, floatBitsToUint(s.radius)); + write_mem(a, ix + 3, s.rgba_color); } -CmdLine CmdLine_read(CmdLineRef ref) { +CmdLine CmdLine_read(Alloc a, CmdLineRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); CmdLine s; s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); return s; } -void CmdLine_write(CmdLineRef ref, CmdLine s) { +void CmdLine_write(Alloc a, CmdLineRef ref, CmdLine s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.start.x); - memory[ix + 1] = floatBitsToUint(s.start.y); - memory[ix + 2] = floatBitsToUint(s.end.x); - memory[ix + 3] = floatBitsToUint(s.end.y); + write_mem(a, ix + 0, floatBitsToUint(s.start.x)); + write_mem(a, ix + 1, floatBitsToUint(s.start.y)); + write_mem(a, ix + 2, floatBitsToUint(s.end.x)); + write_mem(a, ix + 3, floatBitsToUint(s.end.y)); } -CmdStroke CmdStroke_read(CmdStrokeRef ref) { +CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); CmdStroke s; s.tile_ref = raw0; s.half_width = uintBitsToFloat(raw1); @@ -224,18 +224,18 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) { return s; } -void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { +void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) { uint ix = ref.offset >> 2; - memory[ix + 0] = s.tile_ref; - memory[ix + 1] = floatBitsToUint(s.half_width); - memory[ix + 2] = s.rgba_color; + write_mem(a, ix + 0, s.tile_ref); + write_mem(a, ix + 1, floatBitsToUint(s.half_width)); + write_mem(a, ix + 2, s.rgba_color); } -CmdFill CmdFill_read(CmdFillRef ref) { +CmdFill CmdFill_read(Alloc a, CmdFillRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); CmdFill s; s.tile_ref = raw0; s.backdrop = int(raw1); @@ -243,189 +243,189 @@ CmdFill CmdFill_read(CmdFillRef ref) { return s; } -void CmdFill_write(CmdFillRef ref, CmdFill s) { +void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) { uint ix = ref.offset >> 2; - memory[ix + 0] = s.tile_ref; - memory[ix + 1] = uint(s.backdrop); - memory[ix + 2] = s.rgba_color; + write_mem(a, ix + 0, s.tile_ref); + write_mem(a, ix + 1, uint(s.backdrop)); + write_mem(a, ix + 2, s.rgba_color); } -CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) { +CmdBeginClip CmdBeginClip_read(Alloc a, CmdBeginClipRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); CmdBeginClip s; s.tile_ref = raw0; s.backdrop = int(raw1); return s; } -void CmdBeginClip_write(CmdBeginClipRef ref, CmdBeginClip s) { +void CmdBeginClip_write(Alloc a, CmdBeginClipRef ref, CmdBeginClip s) { uint ix = ref.offset >> 2; - memory[ix + 0] = s.tile_ref; - memory[ix + 1] = uint(s.backdrop); + write_mem(a, ix + 0, s.tile_ref); + write_mem(a, ix + 1, uint(s.backdrop)); } -CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) { +CmdBeginSolidClip CmdBeginSolidClip_read(Alloc a, CmdBeginSolidClipRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; + uint raw0 = read_mem(a, ix + 0); CmdBeginSolidClip s; s.alpha = uintBitsToFloat(raw0); return s; } -void CmdBeginSolidClip_write(CmdBeginSolidClipRef ref, CmdBeginSolidClip s) { +void CmdBeginSolidClip_write(Alloc a, CmdBeginSolidClipRef ref, CmdBeginSolidClip s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.alpha); + write_mem(a, ix + 0, floatBitsToUint(s.alpha)); } -CmdEndClip CmdEndClip_read(CmdEndClipRef ref) { +CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; + uint raw0 = read_mem(a, ix + 0); CmdEndClip s; s.alpha = uintBitsToFloat(raw0); return s; } -void CmdEndClip_write(CmdEndClipRef ref, CmdEndClip s) { +void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.alpha); + write_mem(a, ix + 0, floatBitsToUint(s.alpha)); } -CmdSolid CmdSolid_read(CmdSolidRef ref) { +CmdSolid CmdSolid_read(Alloc a, CmdSolidRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; + uint raw0 = read_mem(a, ix + 0); CmdSolid s; s.rgba_color = raw0; return s; } -void CmdSolid_write(CmdSolidRef ref, CmdSolid s) { +void CmdSolid_write(Alloc a, CmdSolidRef ref, CmdSolid s) { uint ix = ref.offset >> 2; - memory[ix + 0] = s.rgba_color; + write_mem(a, ix + 0, s.rgba_color); } -CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) { +CmdSolidMask CmdSolidMask_read(Alloc a, CmdSolidMaskRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; + uint raw0 = read_mem(a, ix + 0); CmdSolidMask s; s.mask = uintBitsToFloat(raw0); return s; } -void CmdSolidMask_write(CmdSolidMaskRef ref, CmdSolidMask s) { +void CmdSolidMask_write(Alloc a, CmdSolidMaskRef ref, CmdSolidMask s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.mask); + write_mem(a, ix + 0, floatBitsToUint(s.mask)); } -CmdJump CmdJump_read(CmdJumpRef ref) { +CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; + uint raw0 = read_mem(a, ix + 0); CmdJump s; s.new_ref = raw0; return s; } -void CmdJump_write(CmdJumpRef ref, CmdJump s) { +void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) { uint ix = ref.offset >> 2; - memory[ix + 0] = s.new_ref; + write_mem(a, ix + 0, s.new_ref); } -uint Cmd_tag(CmdRef ref) { - return memory[ref.offset >> 2]; +uint Cmd_tag(Alloc a, CmdRef ref) { + return read_mem(a, ref.offset >> 2); } -CmdCircle Cmd_Circle_read(CmdRef ref) { - return CmdCircle_read(CmdCircleRef(ref.offset + 4)); +CmdCircle Cmd_Circle_read(Alloc a, CmdRef ref) { + return CmdCircle_read(a, CmdCircleRef(ref.offset + 4)); } -CmdLine Cmd_Line_read(CmdRef ref) { - return CmdLine_read(CmdLineRef(ref.offset + 4)); +CmdLine Cmd_Line_read(Alloc a, CmdRef ref) { + return CmdLine_read(a, CmdLineRef(ref.offset + 4)); } -CmdFill Cmd_Fill_read(CmdRef ref) { - return CmdFill_read(CmdFillRef(ref.offset + 4)); +CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) { + return CmdFill_read(a, CmdFillRef(ref.offset + 4)); } -CmdBeginClip Cmd_BeginClip_read(CmdRef ref) { - return CmdBeginClip_read(CmdBeginClipRef(ref.offset + 4)); +CmdBeginClip Cmd_BeginClip_read(Alloc a, CmdRef ref) { + return CmdBeginClip_read(a, CmdBeginClipRef(ref.offset + 4)); } -CmdBeginSolidClip Cmd_BeginSolidClip_read(CmdRef ref) { - return CmdBeginSolidClip_read(CmdBeginSolidClipRef(ref.offset + 4)); +CmdBeginSolidClip Cmd_BeginSolidClip_read(Alloc a, CmdRef ref) { + return CmdBeginSolidClip_read(a, CmdBeginSolidClipRef(ref.offset + 4)); } -CmdEndClip Cmd_EndClip_read(CmdRef ref) { - return CmdEndClip_read(CmdEndClipRef(ref.offset + 4)); +CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) { + return CmdEndClip_read(a, CmdEndClipRef(ref.offset + 4)); } -CmdStroke Cmd_Stroke_read(CmdRef ref) { - return CmdStroke_read(CmdStrokeRef(ref.offset + 4)); +CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) { + return CmdStroke_read(a, CmdStrokeRef(ref.offset + 4)); } -CmdSolid Cmd_Solid_read(CmdRef ref) { - return CmdSolid_read(CmdSolidRef(ref.offset + 4)); +CmdSolid Cmd_Solid_read(Alloc a, CmdRef ref) { + return CmdSolid_read(a, CmdSolidRef(ref.offset + 4)); } -CmdSolidMask Cmd_SolidMask_read(CmdRef ref) { - return CmdSolidMask_read(CmdSolidMaskRef(ref.offset + 4)); +CmdSolidMask Cmd_SolidMask_read(Alloc a, CmdRef ref) { + return CmdSolidMask_read(a, CmdSolidMaskRef(ref.offset + 4)); } -CmdJump Cmd_Jump_read(CmdRef ref) { - return CmdJump_read(CmdJumpRef(ref.offset + 4)); +CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) { + return CmdJump_read(a, CmdJumpRef(ref.offset + 4)); } -void Cmd_End_write(CmdRef ref) { - memory[ref.offset >> 2] = Cmd_End; +void Cmd_End_write(Alloc a, CmdRef ref) { + write_mem(a, ref.offset >> 2, Cmd_End); } -void Cmd_Circle_write(CmdRef ref, CmdCircle s) { - memory[ref.offset >> 2] = Cmd_Circle; - CmdCircle_write(CmdCircleRef(ref.offset + 4), s); +void Cmd_Circle_write(Alloc a, CmdRef ref, CmdCircle s) { + write_mem(a, ref.offset >> 2, Cmd_Circle); + CmdCircle_write(a, CmdCircleRef(ref.offset + 4), s); } -void Cmd_Line_write(CmdRef ref, CmdLine s) { - memory[ref.offset >> 2] = Cmd_Line; - CmdLine_write(CmdLineRef(ref.offset + 4), s); +void Cmd_Line_write(Alloc a, CmdRef ref, CmdLine s) { + write_mem(a, ref.offset >> 2, Cmd_Line); + CmdLine_write(a, CmdLineRef(ref.offset + 4), s); } -void Cmd_Fill_write(CmdRef ref, CmdFill s) { - memory[ref.offset >> 2] = Cmd_Fill; - CmdFill_write(CmdFillRef(ref.offset + 4), s); +void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) { + write_mem(a, ref.offset >> 2, Cmd_Fill); + CmdFill_write(a, CmdFillRef(ref.offset + 4), s); } -void Cmd_BeginClip_write(CmdRef ref, CmdBeginClip s) { - memory[ref.offset >> 2] = Cmd_BeginClip; - CmdBeginClip_write(CmdBeginClipRef(ref.offset + 4), s); +void Cmd_BeginClip_write(Alloc a, CmdRef ref, CmdBeginClip s) { + write_mem(a, ref.offset >> 2, Cmd_BeginClip); + CmdBeginClip_write(a, CmdBeginClipRef(ref.offset + 4), s); } -void Cmd_BeginSolidClip_write(CmdRef ref, CmdBeginSolidClip s) { - memory[ref.offset >> 2] = Cmd_BeginSolidClip; - CmdBeginSolidClip_write(CmdBeginSolidClipRef(ref.offset + 4), s); +void Cmd_BeginSolidClip_write(Alloc a, CmdRef ref, CmdBeginSolidClip s) { + write_mem(a, ref.offset >> 2, Cmd_BeginSolidClip); + CmdBeginSolidClip_write(a, CmdBeginSolidClipRef(ref.offset + 4), s); } -void Cmd_EndClip_write(CmdRef ref, CmdEndClip s) { - memory[ref.offset >> 2] = Cmd_EndClip; - CmdEndClip_write(CmdEndClipRef(ref.offset + 4), s); +void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) { + write_mem(a, ref.offset >> 2, Cmd_EndClip); + CmdEndClip_write(a, CmdEndClipRef(ref.offset + 4), s); } -void Cmd_Stroke_write(CmdRef ref, CmdStroke s) { - memory[ref.offset >> 2] = Cmd_Stroke; - CmdStroke_write(CmdStrokeRef(ref.offset + 4), s); +void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) { + write_mem(a, ref.offset >> 2, Cmd_Stroke); + CmdStroke_write(a, CmdStrokeRef(ref.offset + 4), s); } -void Cmd_Solid_write(CmdRef ref, CmdSolid s) { - memory[ref.offset >> 2] = Cmd_Solid; - CmdSolid_write(CmdSolidRef(ref.offset + 4), s); +void Cmd_Solid_write(Alloc a, CmdRef ref, CmdSolid s) { + write_mem(a, ref.offset >> 2, Cmd_Solid); + CmdSolid_write(a, CmdSolidRef(ref.offset + 4), s); } -void Cmd_SolidMask_write(CmdRef ref, CmdSolidMask s) { - memory[ref.offset >> 2] = Cmd_SolidMask; - CmdSolidMask_write(CmdSolidMaskRef(ref.offset + 4), s); +void Cmd_SolidMask_write(Alloc a, CmdRef ref, CmdSolidMask s) { + write_mem(a, ref.offset >> 2, Cmd_SolidMask); + CmdSolidMask_write(a, CmdSolidMaskRef(ref.offset + 4), s); } -void Cmd_Jump_write(CmdRef ref, CmdJump s) { - memory[ref.offset >> 2] = Cmd_Jump; - CmdJump_write(CmdJumpRef(ref.offset + 4), s); +void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) { + write_mem(a, ref.offset >> 2, Cmd_Jump); + CmdJump_write(a, CmdJumpRef(ref.offset + 4), s); } diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index bcfa510..5a4935c 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -30,9 +30,9 @@ struct Config { uint n_pathseg; uint width_in_tiles; uint height_in_tiles; - uint tile_base; - uint bin_base; - uint ptcl_base; - uint pathseg_base; - uint anno_base; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; }; diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h index 133ff53..500277b 100644 --- a/piet-gpu/shader/tile.h +++ b/piet-gpu/shader/tile.h @@ -49,48 +49,48 @@ TileSegRef TileSeg_index(TileSegRef ref, uint index) { return TileSegRef(ref.offset + index * TileSeg_size); } -Path Path_read(PathRef ref) { +Path Path_read(Alloc a, PathRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); Path s; s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16); s.tiles = TileRef(raw2); return s; } -void Path_write(PathRef ref, Path s) { +void Path_write(Alloc a, PathRef ref, Path s) { uint ix = ref.offset >> 2; - memory[ix + 0] = s.bbox.x | (s.bbox.y << 16); - memory[ix + 1] = s.bbox.z | (s.bbox.w << 16); - memory[ix + 2] = s.tiles.offset; + write_mem(a, ix + 0, s.bbox.x | (s.bbox.y << 16)); + write_mem(a, ix + 1, s.bbox.z | (s.bbox.w << 16)); + write_mem(a, ix + 2, s.tiles.offset); } -Tile Tile_read(TileRef ref) { +Tile Tile_read(Alloc a, TileRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); Tile s; s.tile = TileSegRef(raw0); s.backdrop = int(raw1); return s; } -void Tile_write(TileRef ref, Tile s) { +void Tile_write(Alloc a, TileRef ref, Tile s) { uint ix = ref.offset >> 2; - memory[ix + 0] = s.tile.offset; - memory[ix + 1] = uint(s.backdrop); + write_mem(a, ix + 0, s.tile.offset); + write_mem(a, ix + 1, uint(s.backdrop)); } -TileSeg TileSeg_read(TileSegRef ref) { +TileSeg TileSeg_read(Alloc a, TileSegRef ref) { uint ix = ref.offset >> 2; - uint raw0 = memory[ix + 0]; - uint raw1 = memory[ix + 1]; - uint raw2 = memory[ix + 2]; - uint raw3 = memory[ix + 3]; - uint raw4 = memory[ix + 4]; - uint raw5 = memory[ix + 5]; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); TileSeg s; s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -99,13 +99,13 @@ TileSeg TileSeg_read(TileSegRef ref) { return s; } -void TileSeg_write(TileSegRef ref, TileSeg s) { +void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) { uint ix = ref.offset >> 2; - memory[ix + 0] = floatBitsToUint(s.origin.x); - memory[ix + 1] = floatBitsToUint(s.origin.y); - memory[ix + 2] = floatBitsToUint(s.vector.x); - memory[ix + 3] = floatBitsToUint(s.vector.y); - memory[ix + 4] = floatBitsToUint(s.y_edge); - memory[ix + 5] = s.next.offset; + write_mem(a, ix + 0, floatBitsToUint(s.origin.x)); + write_mem(a, ix + 1, floatBitsToUint(s.origin.y)); + write_mem(a, ix + 2, floatBitsToUint(s.vector.x)); + write_mem(a, ix + 3, floatBitsToUint(s.vector.y)); + write_mem(a, ix + 4, floatBitsToUint(s.y_edge)); + write_mem(a, ix + 5, s.next.offset); } diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp index 6588227..f0d42da 100644 --- a/piet-gpu/shader/tile_alloc.comp +++ b/piet-gpu/shader/tile_alloc.comp @@ -5,8 +5,8 @@ #version 450 #extension GL_GOOGLE_include_directive : enable -#include "setup.h" #include "mem.h" +#include "setup.h" #define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR) #define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG) @@ -25,21 +25,21 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf { #define SY (1.0 / float(TILE_HEIGHT_PX)) shared uint sh_tile_count[TILE_ALLOC_WG]; -shared Alloc sh_tile_alloc; +shared MallocResult sh_tile_alloc; void main() { - if (mem_overflow) { + if (mem_error != NO_ERROR) { return; } uint th_ix = gl_LocalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x; - PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size); - AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); + PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size); + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); uint tag = Annotated_Nop; if (element_ix < conf.n_elements) { - tag = Annotated_tag(ref); + tag = Annotated_tag(conf.anno_alloc, ref); } int x0 = 0, y0 = 0, x1 = 0, y1 = 0; switch (tag) { @@ -49,7 +49,7 @@ void main() { case Annotated_EndClip: // Note: we take advantage of the fact that fills, strokes, and // clips have compatible layout. - AnnoFill fill = Annotated_Fill_read(ref); + AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref); x0 = int(floor(fill.bbox.x * SX)); y0 = int(floor(fill.bbox.y * SY)); x1 = int(ceil(fill.bbox.z * SX)); @@ -71,36 +71,38 @@ void main() { } sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; // Prefix sum of sh_tile_count for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) { barrier(); if (th_ix >= (1 << i)) { - tile_count += sh_tile_count[th_ix - (1 << i)]; + total_tile_count += sh_tile_count[th_ix - (1 << i)]; } barrier(); - sh_tile_count[th_ix] = tile_count; + sh_tile_count[th_ix] = total_tile_count; } if (th_ix == TILE_ALLOC_WG - 1) { - sh_tile_alloc = malloc(tile_count * Tile_size); + sh_tile_alloc = malloc(total_tile_count * Tile_size); } barrier(); - Alloc alloc_start = sh_tile_alloc; + MallocResult alloc_start = sh_tile_alloc; if (alloc_start.failed) { return; } if (element_ix < conf.n_elements) { uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0; - path.tiles = TileRef(alloc_start.offset + Tile_size * tile_subix); - Path_write(path_ref, path); + Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count); + path.tiles = TileRef(tiles_alloc.offset); + Path_write(conf.tile_alloc, path_ref, path); } // Zero out allocated tiles efficiently uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4); - uint start_ix = alloc_start.offset >> 2; + uint start_ix = alloc_start.alloc.offset >> 2; for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) { // Note: this interleaving is faster than using Tile_write // by a significant amount. - memory[start_ix + i] = 0; + write_mem(alloc_start.alloc, start_ix + i, 0); } } diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv index 7a80ad7..b256392 100644 Binary files a/piet-gpu/shader/tile_alloc.spv and b/piet-gpu/shader/tile_alloc.spv differ