From 6976f877e02fe0103095bc556a1b85f821d5c430 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 21 Apr 2020 17:55:17 -0700 Subject: [PATCH] Add first draft of kernel 3 A fairly simple approach, but it adds the translation (not tested yet in scene encoding) and does bounding box culling. --- piet-gpu-types/src/ptcl.rs | 5 +- piet-gpu/shader/build.ninja | 2 + piet-gpu/shader/image.comp | 2 +- piet-gpu/shader/image.spv | Bin 8096 -> 8116 bytes piet-gpu/shader/kernel1.comp | 27 ++- piet-gpu/shader/kernel3.comp | 72 ++++++++ piet-gpu/shader/kernel3.spv | Bin 0 -> 9964 bytes piet-gpu/shader/ptcl.h | 323 +++++++++++++++++++++++++++++++++++ piet-gpu/src/main.rs | 32 +++- 9 files changed, 449 insertions(+), 14 deletions(-) create mode 100644 piet-gpu/shader/kernel3.comp create mode 100644 piet-gpu/shader/kernel3.spv create mode 100644 piet-gpu/shader/ptcl.h diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs index f5e42af..b6df77d 100644 --- a/piet-gpu-types/src/ptcl.rs +++ b/piet-gpu-types/src/ptcl.rs @@ -4,8 +4,9 @@ piet_gpu! { #[gpu_write] mod ptcl { struct CmdCircle { - // In existing code, this is packed; we might need an annotation for this. - bbox: [u16; 4], + center: [f32; 2], + radius: f32, + rgba_color: u32, } struct CmdLine { start: [f32; 2], diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 5befa7f..ada8694 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -10,3 +10,5 @@ rule glsl build image.spv: glsl image.comp | scene.h build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h + +build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h diff --git a/piet-gpu/shader/image.comp b/piet-gpu/shader/image.comp index 60739d5..6d84eb5 100644 --- a/piet-gpu/shader/image.comp +++ b/piet-gpu/shader/image.comp @@ -40,7 +40,7 @@ void main() { if (tag == PietItem_Circle) { PietCircle circle = PietItem_Circle_read(item_ref); float r = length(xy + vec2(0.5, 0.5) - circle.center.xy); - float alpha = clamp(circle.radius - r, 0.0, 1.0); + float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0); vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color); // TODO: sRGB rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); diff --git a/piet-gpu/shader/image.spv b/piet-gpu/shader/image.spv index 527c9ae2b14983aaa723ab098d6d68b77092cab6..097add185fc152cd7131529812585e68b2eebfd6 100644 GIT binary patch delta 1542 zcmYk6xldG46vpo`Fam-i;EL?xE^a6)vJAK|jz%o42yVm>64aQOSUe2c+E7@T*iss7 z?KIjF6JzJ!U?+vq-|z8moaY?x_kCx%=iK`q??yk2Hs-5(H-(T3wfwvBzHzQ_ds{Aq z2eZ?UXYb5}nvf4Y@FyXFYr;{mj_HM&yL0#NrP?T3X?|w*{`k`egoOgCn$JUZR_hd( z*9y;Xjo>MuRAZABk-c!At3;ZePjPi9f_qYrz75_DM^5?m)wi3z<&y?E&k;hlu{NfB zadanqD_G4fyZFBe&djNAw($;Je`M zOcH1A$Yztf(HLtLG0r=bapzUDuuWZpyHn(4xceN%`<3-Nyqx2-%BznW4&?eA&Kq<< zYdVZK7jdhj3Cr7UJM+&J|H`Y~n;m>;c$px-<2 z+(#I*z^cDUzFRhf*I5TJ_h7Unv>@tefO<)-7{4%v>9dftsGIa1eLMIEtK6I&m@zkL zO8?GGQ}=Q4JmX-y5Nxg(lA5rBV>iSCZz_WJz|~)6)3G$$Vb8lY<>?i6XJ9*umaJ6S(GxT*X|7vbadQr{4mG!g_uI3Z9 zzD!e&+Wt)Q5T5Rn&z;3uuQS1y{ zJr)}TtNFRcTIayl(-&)<2U{z?6@^KPUO>hAs>Qcq=1om2Ez?TPNA}Xn(y}tUS9|#g(qW^pQV9{P2!e80bCh+z#6xgX6`P`-b=MhXyc1B^RuIm=Lic0R5hRb>dfXT zt$r;$ogT(1plo4`ZHnxI`&=i|)%g@}2_gbhu2OK%&lWVWHe#J-wsxD%Xa?X0jK9w*FOM{CF!?kzIx1e7#u=!S>6xogI6(UJPBtIcK#Q_1$Z;O zl}=*MZCP(}H!3n$5M#eX882m;i7o0f+?^tyfV)pG?ic1s_(YD=npk^KdpOr^bKam0 zn$u>y!N{x8d)7YIt>lwoY|o_{`!Dkf*CDTgS25IU*;}m+%#Sk4zUS>3U!tV~v;GL{ zO;z&hix>Vsn^21k6pPiktQrwM6s4JUKUh5uk3hWzjbAvyurhZy*2<}} zBDAc2Xy{2=k$w`g%ocXu23LCmHdIe-gB8EPf0%zW+oaoc;@R|cR$>=i%_nLHGfg>a z-DNFF->50>+Y>v)Fo6pU@XB72Snmi!-&Z?c%TX}Dbm%Wql%K&dhMo=hHp%04_kz{N zSvMS#@_ipi(1<}NvY@Fns2p<=O?^E(sZ(IJKE|g^I}J9MUd+`GcEhDjxd!0s>q)lo zEJV@oGA0`Yn@}$%I|nvdd`rBt^KkW8*brFFZ#w3>2sWRdr{Z5mwIn7UfqW*}=8i|v z)SqWXUIDB5k;ipc!LBPGXZfVAp{d7r>N?o9>hTL315-pNi3(xT1PCRDpP>t^Qf+MQ IS>s^MKkoc(nE(I) diff --git a/piet-gpu/shader/kernel1.comp b/piet-gpu/shader/kernel1.comp index 436b8bd..3a4156c 100644 --- a/piet-gpu/shader/kernel1.comp +++ b/piet-gpu/shader/kernel1.comp @@ -1,3 +1,16 @@ +// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph +// and outputs "instances" (references to item + translation) for each item +// that intersects the tilegroup. +// +// This implementation is simplistic and leaves a lot of performance on the +// table. A fancier implementation would use threadgroup shared memory or +// subgroups (or possibly both) to parallelize the reading of the input and +// the computation of tilegroup intersection. +// +// In addition, there are some features currently missing. One is the use of +// a bump allocator to extend the current fixed allocation. Another is support +// for clipping. + #version 450 #extension GL_GOOGLE_include_directive : enable @@ -18,10 +31,10 @@ layout(set = 0, binding = 1) buffer TilegroupBuf { // TODO: compute this #define WIDTH_IN_TILEGROUPS 4 -#define TILEGROUP_WIDTH 512 -#define TILEGROUP_HEIGHT 16 +#define TILEGROUP_WIDTH_PX 512 +#define TILEGROUP_HEIGHT_PX 16 -#define INITIAL_ALLOC 1024 +#define TILEGROUP_INITIAL_ALLOC 1024 #define MAX_STACK 8 @@ -35,8 +48,8 @@ void main() { StackElement stack[MAX_STACK]; uint stack_ix = 0; uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x; - TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * INITIAL_ALLOC); - vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH, TILEGROUP_HEIGHT); + TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC); + vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX); PietItemRef root = PietItemRef(0); SimpleGroup group = PietItem_Group_read(root); StackElement tos = StackElement(root, 0, group.offset.xy); @@ -45,8 +58,8 @@ void main() { if (tos.index < group.n_items) { Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index)); vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy; - bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH)) - && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT)); + bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX)) + && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX)); bool is_group = false; if (hit) { PietItemRef item_ref = PietItem_index(group.items, tos.index); diff --git a/piet-gpu/shader/kernel3.comp b/piet-gpu/shader/kernel3.comp new file mode 100644 index 0000000..f9f9362 --- /dev/null +++ b/piet-gpu/shader/kernel3.comp @@ -0,0 +1,72 @@ +#version 450 +#extension GL_GOOGLE_include_directive : enable + +layout(local_size_x = 32, local_size_y = 1) in; + +layout(set = 0, binding = 0) readonly buffer SceneBuf { + uint[] scene; +}; + +// TODO: this should have a `readonly` qualifier, but then inclusion +// of ptcl.h would fail because of the writers. +layout(set = 0, binding = 1) buffer TilegroupBuf { + uint[] tilegroup; +}; + +layout(set = 0, binding = 2) buffer PtclBuf { + uint[] ptcl; +}; + +#include "scene.h" +#include "tilegroup.h" +#include "ptcl.h" + +// TODO: compute all these + +#define WIDTH_IN_TILEGROUPS 4 +#define WIDTH_IN_TILES 128 +#define TILEGROUP_WIDTH_TILES 32 +#define TILE_WIDTH_PX 16 +#define TILE_HEIGHT_PX 16 + +// Must be the same as kernel1. Might be a good idea to move these particular +// constants to their own .h file. +#define TILEGROUP_INITIAL_ALLOC 1024 + +#define PTCL_INITIAL_ALLOC 4096 + +void main() { + uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x; + uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES); + vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); + TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC); + CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); + + while (true) { + uint tg_tag = TileGroup_tag(tg_ref); + if (tg_tag == TileGroup_End) { + break; + } + // Assume tg_tag is `Instance`, though there will be more cases. + Instance ins = TileGroup_Instance_read(tg_ref); + PietItemRef item_ref = PietItemRef(ins.item_ref); + uint item_tag = PietItem_tag(item_ref); + switch (item_tag) { + case PietItem_Circle: + PietCircle circle = PietItem_Circle_read(item_ref); + vec2 center = ins.offset + circle.center.xy; + float r = circle.radius; + if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX)) + && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX))) + { + CmdCircle cmd = CmdCircle(center, r, circle.rgba_color); + Cmd_Circle_write(cmd_ref, cmd); + cmd_ref.offset += Cmd_size; + } + break; + } + tg_ref.offset += TileGroup_size; + } + Cmd_End_write(cmd_ref); +} diff --git a/piet-gpu/shader/kernel3.spv b/piet-gpu/shader/kernel3.spv new file mode 100644 index 0000000000000000000000000000000000000000..23a7c3efd84eec32a179cc6d5b1047f2ecd6af2e GIT binary patch literal 9964 zcmaKx2ar|e8HNvRk*0JD2vQWKN)-@6`nrmOfE9IFw&2EPcip{9u^r9UO@xR~u)xX_ycdhC_`G_Rxmki*) zxAjZXXL!;dCP@Y-wdq*5ZqvG1J*DPZ^B2t4;i#l4_0wl4J~fQh$d1Og4h>ZphQZ4X zMSRz-x2~aX{rb9fYZ{umx*B>)d)iwX+S@u>8k*a>TbfF3J6rS{ zQ1EN(Xln0mE;`iUQLJA>M@%Z&(Am~e+St+xOuLlx>1=K7X(=UmRWrxjkI|0ReRl8B zK8U&I_J;13#^zIcO5MFprJ0d=X78LiOYo~oj>NBDRq|TVq30XM1ON*1x{4VN**-sinZVp52YjZM{AAd}J~jy`1y# z#*1xb-5XPtj50^se@${s9$#N-*5nSKVQ)=vJN>|xF<$TJ0)+CeB!%v%QEa%f^?>5C_ zyu52%jvtatL67*unhdfqQP=V${Ihx1CZ~Wmx3#y_b$9l5;fI>mk@qRqnStKtTU^jS zvRueI)A7&h3{AY4eQpiW*kRrFnaf$Sy+5@tPi-<6ylO{t@#v=g%KMc2tx4vi_xXZd zKRKV`8I^NQrJu`w?rlKd#NxNC?Cm>iay~usjON~$ zGFgsGz~q=CrEM!?)Vc~BwK~D&TGdG@)8{dyan~}&`W^&#HFh`dNNv^0J%{O4$-U^2 z_dakrZ*B4bx`%bUccyP18ILI`g$-=K4@&a5uMaVn=Te&3 zw%2#;>}+Z*wRLvXdv<(>49We|H83dUtCG?1yZ6jaYmU=rd-myN50lWFIIr0`tWIWS zc0QT29#2It_gIxIL~rZpNpmea%vO^u(N_3u^lY6OT$(YSQf?Q~KV^>~$6P zv$f~%FV}DbdUrmz-P(3Er~T}$h}&18-%-%(nEQHi)FOTlIE#Pd%paTT*}VYveeTy; z8@&oK-uo~K@dW|!Xlw%+ zCm_cVH;7T)JjPj9ZX&iJjOxZ~8^#zp{Vr%ex$*4sh}@>X`HUaQD6g;ae(#8Q_tJPd zeVv;&dG&FO#;O}TkuhQ?fsK_LkFV>jE&4tYET^BbzMms@D%e;#>rBgS^6Ik~ja4^x zE@Q;{;b^R!bN0O#>lZ!?avyCI@i~prwKZ0oIekBc{~2I&%2|IYqyF;7uPEroPrzp_ zqcP@i@6Ss4>^x4-^?Ep;(AR;@JC6EszwE_(qi){wb04`SxxO*y%wxUHh>!jkGMZC9 z>|4P75bX_&_UV~WZ_RbrEc|zZJ;&Ow$?fiY^t~5szVP1<_H#l1LyYDhh&ZqN8STY+ zJ;120-}@QuQ_gv*dxzyB?<2Xc|D#~@dJfF{G-KrT9;@s3Lq>DRnOEJrEa%pT`+PKg{R z=))q0exf=Tc@&vufTuEN$C*g@8~~5PE%aN_z56rhz9`WXc_pEmpPtjO8F=kFNAyz5yZ@5Qk> zC-39Xg|BzwxZHjd_0*3?m-DWe+jA!uYcn2fy_nYou$=d313A-o30U5E&*?-Y;wOV` zAmUE~%X#mNUxIlGSl;+ajOs6OUruK9-Z(yw?^MQVh>!N^x!v()?B3Z^5dDl3+lM{Q z2HTr8yc_C^^EICX_IFl!?cOE1n8RGK+(Y>L*)R{0^X$EV{{pa&J!qSc$k~I~UZW@Z z=xH%Hdh$N1+moNYr-NrR%G;CoSI)EK{X7FKR}YRpm!iwNu70;z29`74`o9~SN5?5PCoj$3~V3!^FA&I>mwg|t^il|(EukOaa+N1 zW02^r5zJ5ecO&!a^Eh?9Guse**T-|y%;@8t(bj~>d1l1XOAFXJ#pk?h(26b}bJ`9r zuI-iR`p8F~Hn4Ll=6MskeDr@6n4ffQ&7nQw+rgE6?0}PxK03g~we3XLM?Ug&fh+sC z8cu$Hz9-#ax#HUPp!1Wit$Fo{wJm||T^~PdVjUfy^|&b^l=~b z*@?)x4`S!$o!J9+{xR2Uz|Pfq`FV6L*gI1X)~A;7I&}FBX!Fsp2b;%OZQiBbjPk}T z!RH3>q};ZbQC*wy`@#Afw~tYMF}eM&bt8B+qcPf|uY+Lo75lmgT|VBgH-qI4AQ5*9 zIO43QuAl20>*J`thSBwMjQHEY`Jd_+O-tk6pZ-QNkWE$E(~gSj4i-;Qpb;@Au)ICE z?&{wA!HhQ{M^-xCm@aY*zu9&Aq&7`+q28Sg>76Vc<_!TJ?@dAN%n>uw0xAdytDg_yAZg*5-p?x#Io(5c-3Nb+kw9hr!xhGv}fnwvT{| zc|VFSw*`rF@iB1Z)gG}Q2WvC0YoZ>uPk@VgKZ!2)2oiZe1&+MhBlgo^ZRT}t)x-7~ zuycsDeH`q1J7@jX^$Gvag6%EtsP??ab~^>mh;TSntlx& zYpOkBzYf-BPvYpwerF>-vykZNRIokGVe}q&SDr+?7t!N4!1@(?{3g15oYQZC<>C%c z1b-WmckRrn?j5$5?;ukUd3#llUi*U=BR;1g_Bt;eGrR!22(jVm*HZmWz4*7%aCLaW4)r{sgff>&P2BknyL8HqV@A;%7M* zd-!woXAtAeYkfKWqW&+y#rn^p%WXlT{x88%Uq0&p3al;a{~9b8^?w77`p<%`FQ;GB z|1G#!|99wek0728=l^@e`rebM{fC^7+J8isi`stzM{RRjTTZ{I{bz8o_FvHDb|P^$ z{|dIY`xkM41ItIPzk{QeIjkk8U-5gs=k83zXDQ;jJA-i<;#phH7> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + uint raw3 = ptcl[ix + 3]; + CmdCircle s; + s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.radius = uintBitsToFloat(raw2); + s.rgba_color = raw3; + return s; +} + +void CmdCircle_write(CmdCircleRef ref, CmdCircle s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.center.x); + ptcl[ix + 1] = floatBitsToUint(s.center.y); + ptcl[ix + 2] = floatBitsToUint(s.radius); + ptcl[ix + 3] = s.rgba_color; +} + +CmdLine CmdLine_read(CmdLineRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + uint raw3 = ptcl[ix + 3]; + CmdLine s; + s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void CmdLine_write(CmdLineRef ref, CmdLine s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.start.x); + ptcl[ix + 1] = floatBitsToUint(s.start.y); + ptcl[ix + 2] = floatBitsToUint(s.end.x); + ptcl[ix + 3] = floatBitsToUint(s.end.y); +} + +CmdStroke CmdStroke_read(CmdStrokeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + CmdStroke s; + s.halfWidth = uintBitsToFloat(raw0); + s.rgba_color = raw1; + return s; +} + +void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.halfWidth); + ptcl[ix + 1] = s.rgba_color; +} + +CmdFill CmdFill_read(CmdFillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + uint raw3 = ptcl[ix + 3]; + CmdFill s; + s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void CmdFill_write(CmdFillRef ref, CmdFill s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.start.x); + ptcl[ix + 1] = floatBitsToUint(s.start.y); + ptcl[ix + 2] = floatBitsToUint(s.end.x); + ptcl[ix + 3] = floatBitsToUint(s.end.y); +} + +CmdFillEdge CmdFillEdge_read(CmdFillEdgeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + CmdFillEdge s; + s.sign = int(raw0); + s.y = uintBitsToFloat(raw1); + return s; +} + +void CmdFillEdge_write(CmdFillEdgeRef ref, CmdFillEdge s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = uint(s.sign); + ptcl[ix + 1] = floatBitsToUint(s.y); +} + +CmdDrawFill CmdDrawFill_read(CmdDrawFillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + CmdDrawFill s; + s.backdrop = int(raw0); + s.rgba_color = raw1; + return s; +} + +void CmdDrawFill_write(CmdDrawFillRef ref, CmdDrawFill s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = uint(s.backdrop); + ptcl[ix + 1] = s.rgba_color; +} + +CmdSolid CmdSolid_read(CmdSolidRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + CmdSolid s; + s.rgba_color = raw0; + return s; +} + +void CmdSolid_write(CmdSolidRef ref, CmdSolid s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = s.rgba_color; +} + +uint Cmd_tag(CmdRef ref) { + return ptcl[ref.offset >> 2]; +} + +CmdCircle Cmd_Circle_read(CmdRef ref) { + return CmdCircle_read(CmdCircleRef(ref.offset + 4)); +} + +CmdLine Cmd_Line_read(CmdRef ref) { + return CmdLine_read(CmdLineRef(ref.offset + 4)); +} + +CmdFill Cmd_Fill_read(CmdRef ref) { + return CmdFill_read(CmdFillRef(ref.offset + 4)); +} + +CmdStroke Cmd_Stroke_read(CmdRef ref) { + return CmdStroke_read(CmdStrokeRef(ref.offset + 4)); +} + +CmdFillEdge Cmd_FillEdge_read(CmdRef ref) { + return CmdFillEdge_read(CmdFillEdgeRef(ref.offset + 4)); +} + +CmdDrawFill Cmd_DrawFill_read(CmdRef ref) { + return CmdDrawFill_read(CmdDrawFillRef(ref.offset + 4)); +} + +CmdSolid Cmd_Solid_read(CmdRef ref) { + return CmdSolid_read(CmdSolidRef(ref.offset + 4)); +} + +void Cmd_End_write(CmdRef ref) { + ptcl[ref.offset >> 2] = Cmd_End; +} + +void Cmd_Circle_write(CmdRef ref, CmdCircle s) { + ptcl[ref.offset >> 2] = Cmd_Circle; + CmdCircle_write(CmdCircleRef(ref.offset + 4), s); +} + +void Cmd_Line_write(CmdRef ref, CmdLine s) { + ptcl[ref.offset >> 2] = Cmd_Line; + CmdLine_write(CmdLineRef(ref.offset + 4), s); +} + +void Cmd_Fill_write(CmdRef ref, CmdFill s) { + ptcl[ref.offset >> 2] = Cmd_Fill; + CmdFill_write(CmdFillRef(ref.offset + 4), s); +} + +void Cmd_Stroke_write(CmdRef ref, CmdStroke s) { + ptcl[ref.offset >> 2] = Cmd_Stroke; + CmdStroke_write(CmdStrokeRef(ref.offset + 4), s); +} + +void Cmd_FillEdge_write(CmdRef ref, CmdFillEdge s) { + ptcl[ref.offset >> 2] = Cmd_FillEdge; + CmdFillEdge_write(CmdFillEdgeRef(ref.offset + 4), s); +} + +void Cmd_DrawFill_write(CmdRef ref, CmdDrawFill s) { + ptcl[ref.offset >> 2] = Cmd_DrawFill; + CmdDrawFill_write(CmdDrawFillRef(ref.offset + 4), s); +} + +void Cmd_Solid_write(CmdRef ref, CmdSolid s) { + ptcl[ref.offset >> 2] = Cmd_Solid; + CmdSolid_write(CmdSolidRef(ref.offset + 4), s); +} + +void Cmd_Bail_write(CmdRef ref) { + ptcl[ref.offset >> 2] = Cmd_Bail; +} + diff --git a/piet-gpu/src/main.rs b/piet-gpu/src/main.rs index 72f0d3c..56b73ca 100644 --- a/piet-gpu/src/main.rs +++ b/piet-gpu/src/main.rs @@ -73,6 +73,7 @@ fn dump_scene(buf: &[u8]) { } } +#[allow(unused)] fn dump_k1_data(k1_buf: &[u32]) { for i in 0..k1_buf.len() { if k1_buf[i] != 0 { @@ -96,7 +97,9 @@ fn main() { .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev) .unwrap(); device.write_buffer(&scene_buf, &scene).unwrap(); + // These should only be on the host if we're going to examine them from Rust. let tilegroup_buf = device.create_buffer(384 * 1024, host).unwrap(); + let ptcl_buf = device.create_buffer(12 * 1024 * 4096, host).unwrap(); let image_buf = device .create_buffer((WIDTH * HEIGHT * 4) as u64, host) .unwrap(); @@ -110,16 +113,23 @@ fn main() { .create_descriptor_set(&k1_pipeline, &[&scene_dev, &tilegroup_buf]) .unwrap(); + let k3_code = include_bytes!("../shader/kernel3.spv"); + let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 3).unwrap(); + let k3_ds = device + .create_descriptor_set(&k3_pipeline, &[&scene_dev, &tilegroup_buf, &ptcl_buf]) + .unwrap(); + let code = include_bytes!("../shader/image.spv"); let pipeline = device.create_simple_compute_pipeline(code, 2).unwrap(); let descriptor_set = device .create_descriptor_set(&pipeline, &[&scene_dev, &image_dev]) .unwrap(); - let query_pool = device.create_query_pool(3).unwrap(); + let query_pool = device.create_query_pool(4).unwrap(); let mut cmd_buf = device.create_cmd_buf().unwrap(); cmd_buf.begin(); cmd_buf.copy_buffer(&scene_buf, &scene_dev); cmd_buf.clear_buffer(&tilegroup_buf); + cmd_buf.clear_buffer(&ptcl_buf); cmd_buf.memory_barrier(); cmd_buf.write_timestamp(&query_pool, 0); cmd_buf.dispatch( @@ -129,22 +139,36 @@ fn main() { ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &k3_pipeline, + &k3_ds, + ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1), + ); + cmd_buf.write_timestamp(&query_pool, 2); + cmd_buf.memory_barrier(); cmd_buf.dispatch( &pipeline, &descriptor_set, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ); - cmd_buf.write_timestamp(&query_pool, 2); + cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); cmd_buf.copy_buffer(&image_dev, &image_buf); cmd_buf.finish(); device.run_cmd_buf(&cmd_buf).unwrap(); let timestamps = device.reap_query_pool(query_pool).unwrap(); println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3); - println!("Render time: {:.3}ms", (timestamps[1] - timestamps[0]) * 1e3); + println!( + "Kernel 3 time: {:.3}ms", + (timestamps[1] - timestamps[0]) * 1e3 + ); + println!( + "Render time: {:.3}ms", + (timestamps[2] - timestamps[1]) * 1e3 + ); let mut k1_data: Vec = Default::default(); - device.read_buffer(&tilegroup_buf, &mut k1_data).unwrap(); + device.read_buffer(&ptcl_buf, &mut k1_data).unwrap(); dump_k1_data(&k1_data); let mut img_data: Vec = Default::default();