From 0091a6b0a81a299402a1c02cdf51027ef93b54b9 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Tue, 5 May 2026 10:49:57 -0700
Subject: [PATCH 01/21] initial prototype for WebGPU back-end

---
 public/common/TracyQueue.hpp |   3 +-
 public/tracy/TracyWebGPU.hpp | 867 +++++++++++++++++++++++++++++++++++
 2 files changed, 869 insertions(+), 1 deletion(-)
 create mode 100644 public/tracy/TracyWebGPU.hpp
diff --git a/public/common/TracyQueue.hpp b/public/common/TracyQueue.hpp
index 995769878b..11e57b0887 100644
--- a/public/common/TracyQueue.hpp
+++ b/public/common/TracyQueue.hpp
@@ -492,7 +492,8 @@ enum class GpuContextType : uint8_t
     Metal,
     Custom,
     CUDA,
-    Rocprof
+    Rocprof,
+    WebGPU
 };
 
 enum GpuContextFlags : uint8_t
diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
new file mode 100644
index 0000000000..b90a859159
--- /dev/null
+++ b/public/tracy/TracyWebGPU.hpp
@@ -0,0 +1,867 @@
+#ifndef __TRACYWEBGPU_HPP__
+#define __TRACYWEBGPU_HPP__
+
+#ifndef TRACY_ENABLE
+// Profiling disabled: the macros below expand to nothing (the context macro yields nullptr).
+#define TracyWebGPUContext(instance, device, queue) nullptr
+#define TracyWebGPUDestroy(ctx)
+#define TracyWebGPUContextName(ctx, name, size)
+
+#define TracyWebGPUNewFrame(ctx)
+
+#define TracyWebGPUZone(ctx, encoder, name)
+#define TracyWebGPUZoneC(ctx, encoder, name, color)
+#define TracyWebGPUNamedZone(ctx, varname, encoder, name, active)
+#define TracyWebGPUNamedZoneC(ctx, varname, encoder, name, color, active)
+#define TracyWebGPUZoneTransient(ctx, varname, encoder, name, active)
+// Same zone macros with an additional 'depth' argument.
+#define TracyWebGPUZoneS(ctx, encoder, name, depth)
+#define TracyWebGPUZoneCS(ctx, encoder, name, color, depth)
+#define TracyWebGPUNamedZoneS(ctx, varname, encoder, name, depth, active)
+#define TracyWebGPUNamedZoneCS(ctx, varname, encoder, name, color, depth, active)
+#define TracyWebGPUZoneTransientS(ctx, varname, encoder, name, depth, active)
+
+#define TracyWebGPUCollect(ctx)
+// Empty placeholder so tracy::WebGPUZoneScope still names a type when profiling is off.
+namespace tracy
+{
+    class WebGPUZoneScope {};
+}
+// Context handle type when profiling is off: an untyped pointer.
+using TracyWebGPUCtx = void*;
+
+#else
+
+#include "Tracy.hpp"
+#include "client/TracyProfiler.hpp"
+#include "client/TracyCallstack.hpp"
+#include "common/TracyAlign.hpp"
+#include "common/TracyAlloc.hpp"
+
+#include <atomic>
+#include <mutex>
+#include <vector>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+#include <chrono>
+#include <thread>
+
+#include <webgpu/webgpu.h>
+
+#ifndef TRACY_WEBGPU_DEBUG_LEVEL
+#define TRACY_WEBGPU_DEBUG_LEVEL (0)
+#endif//TRACY_WEBGPU_DEBUG_LEVEL
+// Non-zero level: TracyWebGPUDebug() emits its arguments; a failed TracyWebGPUAssert() runs its optional handler, then breaks.
+#if TRACY_WEBGPU_DEBUG_LEVEL
+#define TracyWebGPUDebug(...) __VA_ARGS__;
+#if defined(_MSC_VER)
+#define TracyWebGPUBreak() if (IsDebuggerPresent()) __debugbreak()
+#else
+#define TracyWebGPUBreak() ((void)0)
+#endif
+#define TracyWebGPUAssert(predicate, ...) if (predicate) {} else { __VA_ARGS__; TracyWebGPUBreak(); }
+#else  // level 0: debug helpers vanish and TracyWebGPUAssert() falls back to plain assert()
+#define TracyWebGPUDebug(...)
+#define TracyWebGPUBreak()
+#define TracyWebGPUAssert(predicate, ...) assert(predicate);
+#endif
+// TracyWebGPULog forwards 'msg' to Profiler::LogString; TracyWebGPUPanic logs an Error, asserts, then runs the trailing statement(s), e.g. 'return'.
+#define TracyWebGPULog(severity, msg) tracy::Profiler::LogString( tracy::MessageSourceType::Tracy, tracy::MessageSeverity::severity, tracy::Color::Red4, 0, msg );
+#define TracyWebGPUPanic(msg, ...) do { TracyWebGPULog(Error, msg); TracyWebGPUAssert(false && "TracyWebGPU: " msg); __VA_ARGS__; } while(false);
+
+namespace tracy
+{
+
+    class WebGPUQueueCtx
+    {
+        friend class WebGPUZoneScope;
+
+        uint8_t m_contextId = 255;  // 255 represents "invalid id"
+
+        std::mutex m_collectionMutex;  // held while Collect(lock, ...) runs; taken by Collect(), Wait() and Drain()
+        // Client-supplied handles; the constructor adds a reference and the destructor releases it.
+        WGPUInstance m_instance = nullptr;
+        WGPUDevice   m_device   = nullptr;
+        WGPUQueue    m_queue    = nullptr;
+
+        WGPUQuerySet m_querySet       = nullptr;
+        WGPUBuffer   m_resolveBuffer  = nullptr;  // QueryResolve | CopySrc
+        WGPUBuffer   m_readbackBuffer = nullptr;  // CopyDst | MapRead
+        // Tickets are monotonic counters; each zone consumes two (begin, end).
+        using atomic_counter = std::atomic<uint64_t>;
+        atomic_counter m_queryCounter      = 0;   // next ticket handed out by NextQueryId()
+        atomic_counter m_previousCheckpoint = 0;  // first ticket not retired yet (advanced by RetireTicket())
+
+        uint32_t m_queryLimit = 0;  // slot count of the query set and of both buffers (ring capacity)
+
+        std::vector<uint64_t> m_shadowBuffer;          // last timestamp emitted per slot; baseline for spotting unresolved values
+        uint64_t m_latestKnownGpuTimestamp = 0;        // newest GPU timestamp seen; used when a zone has to be dropped
+
+        // Map-state machine for the readback buffer.
+        enum class MapState : uint8_t
+        {
+            Idle,       // not mapped; GPU may write to it
+            Pending,    // MapAsync in flight
+            Ready,      // callback has fired, buffer is mapped for read
+            Failed      // last map attempt failed
+        };
+        std::atomic<MapState> m_mapState = MapState::Idle;
+
+        tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item)  // finishes an item obtained from Profiler::QueueSerial()
+        {
+#ifdef TRACY_ON_DEMAND
+            GetProfiler().DeferItem(*item);  // on-demand builds only; presumably kept for clients that connect later — confirm
+#endif
+            Profiler::QueueSerialFinish();
+        }
+
+        // Pumps pending WebGPU callbacks. Some implementations (e.g. Dawn) also
+        // offer wgpuDeviceTick(), but only the canonical webgpu.h entry point
+        // wgpuInstanceProcessEvents() is relied upon. No-op without an instance.
+        void ProcessEvents()
+        {
+            if (m_instance == nullptr) return;
+            wgpuInstanceProcessEvents(m_instance);
+        }
+
+        bool Anchor(uint64_t& outCpuTime, uint64_t& outGpuTime)
+        {
+            // Establishes a (cpuTime, gpuTime) anchor pair: writes a single GPU timestamp,
+            // then resolves and reads it back synchronously. Returns false on any failure.
+            WGPUCommandEncoderDescriptor encDesc = {};
+            WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, &encDesc);
+            if (!enc) return false;
+
+            // NOTE: m_querySet slot 0 is used by Anchor(), but it can be immediately
+            // reclaimed/reused since Anchor() operates synchronously
+            wgpuCommandEncoderWriteTimestamp(enc, m_querySet, 0);
+            wgpuCommandEncoderResolveQuerySet(enc, m_querySet, 0, 1, m_resolveBuffer, 0);
+            wgpuCommandEncoderCopyBufferToBuffer(enc, m_resolveBuffer, 0, m_readbackBuffer, 0, sizeof(uint64_t));
+
+            WGPUCommandBufferDescriptor cmdDesc = {};
+            WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cmdDesc);
+            wgpuCommandEncoderRelease(enc);
+            if (!cmd) return false;
+
+            // Snapshot CPU time right at submission, i.e. as close to the GPU work as
+            // possible (the command encoding above is kept out of the measurement).
+            outCpuTime = static_cast<uint64_t>(Profiler::GetTime());
+            wgpuQueueSubmit(m_queue, 1, &cmd);
+            wgpuCommandBufferRelease(cmd);
+
+            // Map and pump. The callback writes to 'mctx' on this stack frame, so it must have fired before returning.
+            struct MapCtx { std::atomic<int> status{-1}; };
+            MapCtx mctx;
+
+            WGPUBufferMapCallbackInfo cbInfo = {};
+            cbInfo.mode = WGPUCallbackMode_AllowProcessEvents;
+            cbInfo.callback = [](WGPUMapAsyncStatus status, WGPUStringView /*msg*/, void* userdata1, void* /*userdata2*/) {
+                auto* c = static_cast<MapCtx*>(userdata1);
+                c->status.store(static_cast<int>(status), std::memory_order_release);
+            };
+            cbInfo.userdata1 = &mctx;
+
+            wgpuBufferMapAsync(m_readbackBuffer, WGPUMapMode_Read, 0, sizeof(uint64_t), cbInfo);
+            // Pump until the callback fires (with a generous timeout).
+            const auto t0 = std::chrono::steady_clock::now();
+            while (mctx.status.load(std::memory_order_acquire) < 0)
+            {
+                ProcessEvents();
+                if (std::chrono::steady_clock::now() - t0 > std::chrono::seconds(2))
+                {
+                    // Cancel the pending map and pump once more so its callback cannot run later against the dead 'mctx'.
+                    TracyWebGPUPanic("Timed out waiting for anchor timestamp readback.", wgpuBufferUnmap(m_readbackBuffer); ProcessEvents(); return false);
+                }
+                std::this_thread::sleep_for(std::chrono::microseconds(100));
+            }
+
+            if (mctx.status.load(std::memory_order_acquire) != static_cast<int>(WGPUMapAsyncStatus_Success))
+            {
+                TracyWebGPUPanic("Failed to map anchor readback buffer.", return false);
+            }
+
+            const void* mapped = wgpuBufferGetConstMappedRange(m_readbackBuffer, 0, sizeof(uint64_t));
+            if (!mapped)
+            {
+                wgpuBufferUnmap(m_readbackBuffer);
+                return false;
+            }
+            uint64_t gpuTs;
+            std::memcpy(&gpuTs, mapped, sizeof(uint64_t));
+            wgpuBufferUnmap(m_readbackBuffer);
+
+            outGpuTime = gpuTs;
+            return true;
+        }
+
+    public:
+        WebGPUQueueCtx(WGPUInstance instance, WGPUDevice device, WGPUQueue queue)
+            : m_instance(instance)
+            , m_device(device)
+            , m_queue(queue)
+        {
+            ZoneScopedC(Color::Red4);
+            // On any setup failure below, the constructor panics and returns early, leaving m_contextId at 255 (invalid).
+            // The canonical webgpu.h uses AddRef/Release for refcounting.
+            if (m_instance) wgpuInstanceAddRef(m_instance);
+            wgpuDeviceAddRef(m_device);
+            wgpuQueueAddRef(m_queue);
+
+            // Pick a query budget: start at 64K slots and halve whenever query set
+            // creation returns null, mirroring D3D12; give up below 64 slots.
+            // Queries are issued in (begin, end) pairs, so the count is always even.
+            // NOTE(review): implementations may cap the query set size and may report the
+            // failure through a non-null error object — confirm this loop actually triggers.
+            static constexpr uint32_t MaxQueries = 64 * 1024;
+            m_queryLimit = MaxQueries;
+
+            WGPUQuerySetDescriptor qsDesc = {};
+            qsDesc.type  = WGPUQueryType_Timestamp;
+            qsDesc.count = m_queryLimit;
+
+            for (;;)
+            {
+                m_querySet = wgpuDeviceCreateQuerySet(m_device, &qsDesc);
+                if (m_querySet) break;
+                m_queryLimit /= 2;
+                qsDesc.count = m_queryLimit;
+                if (m_queryLimit < 64)
+                {
+                    TracyWebGPUPanic("Failed to create timestamp query set (timestamp-query feature missing?).", return);
+                }
+            }
+
+            // Resolve buffer: the GPU resolves query results into this buffer (one uint64_t per slot).
+            WGPUBufferDescriptor resolveDesc = {};
+            resolveDesc.usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc;
+            resolveDesc.size  = static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t);
+            m_resolveBuffer = wgpuDeviceCreateBuffer(m_device, &resolveDesc);
+            if (!m_resolveBuffer)
+            {
+                TracyWebGPUPanic("Failed to create timestamp resolve buffer.", return);
+            }
+
+            // Readback buffer: target of CopyBufferToBuffer; mappable for read. Same size as the resolve buffer.
+            WGPUBufferDescriptor readbackDesc = {};
+            readbackDesc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
+            readbackDesc.size  = static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t);
+            m_readbackBuffer = wgpuDeviceCreateBuffer(m_device, &readbackDesc);
+            if (!m_readbackBuffer)
+            {
+                TracyWebGPUPanic("Failed to create timestamp readback buffer.", return);
+            }
+
+            // Establish the (cpuTime, gpuTime) anchor for Tracy's GpuNewContext.
+            // WebGPU has no "clock calibration API", so we use a one-shot anchor
+            // to estimate a correlation for the CPU and the GPU timestamps.
+            uint64_t cpuTimestamp = 0;
+            uint64_t gpuTimestamp = 0;
+            if (!Anchor(cpuTimestamp, gpuTimestamp))
+            {
+                TracyWebGPUPanic("Failed to establish CPU/GPU timestamp anchor.", return);
+            }
+            // Seed every shadow slot with the anchor time: only later timestamps count as resolved.
+            m_shadowBuffer.resize(m_queryLimit, gpuTimestamp);
+            m_latestKnownGpuTimestamp = gpuTimestamp;
+
+            // WebGPU timestamps are in nanoseconds, as per the spec.
+            const float period = 1.0f;  // 1ns/tick
+
+            // All setup completed: register the context.
+            m_contextId = GetGpuCtxCounter().fetch_add(1);
+            ZoneValue(m_contextId);
+
+            auto* item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuNewContext);
+            MemWrite(&item->gpuNewContext.cpuTime, static_cast<int64_t>(cpuTimestamp));
+            MemWrite(&item->gpuNewContext.gpuTime, static_cast<int64_t>(gpuTimestamp));
+            MemWrite(&item->gpuNewContext.thread, static_cast<uint32_t>(0));
+            MemWrite(&item->gpuNewContext.period, period);
+            MemWrite(&item->gpuNewContext.context, static_cast<uint8_t>(GetId()));
+            MemWrite(&item->gpuNewContext.flags, static_cast<uint8_t>(0));  // no calibration available
+            MemWrite(&item->gpuNewContext.type, static_cast<uint8_t>(GpuContextType::WebGPU));
+            SubmitQueueItem(item);
+        }
+
+        ~WebGPUQueueCtx()
+        {
+            ZoneScopedC(Color::Red4);
+            ZoneValue(m_contextId);
+
+            // Drain pending queries; skipped when no zone was ever issued (ticket 0 would be waited on for the whole grace period).
+            const uint64_t endTicket = m_queryCounter;
+            if (endTicket >= 2) Drain(endTicket - 2, 200);
+
+            if (Distance(endTicket, m_queryCounter) > 0)
+                TracyWebGPUPanic("client is still pushing queries.");
+
+            // Unmap a mapped buffer, and cancel a map that is still in flight: its callback holds 'this'.
+            if (const MapState st = m_mapState.load(); m_readbackBuffer && (st == MapState::Ready || st == MapState::Pending))
+            {
+                wgpuBufferUnmap(m_readbackBuffer);
+                ProcessEvents();  // let the (possibly aborted) map callback run while 'this' is still alive
+                m_mapState.store(MapState::Idle);
+            }
+
+            if (m_readbackBuffer) { wgpuBufferRelease(m_readbackBuffer); m_readbackBuffer = nullptr; }
+            if (m_resolveBuffer)  { wgpuBufferRelease(m_resolveBuffer);  m_resolveBuffer  = nullptr; }
+            if (m_querySet)       { wgpuQuerySetRelease(m_querySet);     m_querySet       = nullptr; }
+            if (m_queue)          { wgpuQueueRelease(m_queue);           m_queue          = nullptr; }
+            if (m_device)         { wgpuDeviceRelease(m_device);         m_device         = nullptr; }
+            if (m_instance)       { wgpuInstanceRelease(m_instance);     m_instance       = nullptr; }
+        }
+
+        tracy_force_inline uint8_t GetId() const  // Tracy GPU context id; stays 255 until the constructor has registered the context
+        {
+            return m_contextId;
+        }
+
+        void Name(const char* name, uint16_t len)
+        {
+            // The queue item carries a private copy of the name: exactly 'len' bytes, no terminator added.
+            char* const nameCopy = static_cast<char*>(tracy_malloc(len));
+            std::memcpy(nameCopy, name, len);
+            QueueItem* const queueItem = Profiler::QueueSerial();
+            MemWrite(&queueItem->hdr.type, QueueType::GpuContextName);
+            MemWrite(&queueItem->gpuContextNameFat.context, GetId());
+            MemWrite(&queueItem->gpuContextNameFat.ptr, reinterpret_cast<uint64_t>(nameCopy));
+            MemWrite(&queueItem->gpuContextNameFat.size, len);
+            SubmitQueueItem(queueItem);
+        }
+
+        void Collect()  // non-blocking: does nothing if a collection is already in progress
+        {
+#ifdef TRACY_ON_DEMAND
+            if (!GetProfiler().IsConnected()) return;
+#endif
+            std::unique_lock<std::mutex> lock(m_collectionMutex, std::try_to_lock);
+            if (!lock.owns_lock()) return;
+            Collect(lock, m_queryCounter, false);
+        }
+
+    private:
+        // Advances the readback state machine for the unread tickets in
+        // [m_previousCheckpoint, m_queryCounter). The caller must hold
+        // m_collectionMutex ('lock'). 'targetTicket' is only forwarded to
+        // HarvestMappedRange(); 'urgent' makes the call block (for up to 1 s).
+        //
+        // Strategy:
+        //   * If MapState::Idle, kick off a CopyBufferToBuffer + MapAsync for
+        //     the unread range (IssueReadback pumps events once afterwards).
+        //   * If MapState::Pending, pump events (once, or repeatedly if urgent).
+        //   * If MapState::Ready, read the timestamps, unmap, mark Idle.
+        //   * If MapState::Failed, reset to Idle and bail.
+        //   * Returns immediately when there are no unread tickets.
+        void Collect(std::unique_lock<std::mutex>& lock, uint64_t targetTicket, bool urgent)
+        {
+            ZoneScopedC(Color::Red4);
+            TracyWebGPUAssert(lock.owns_lock());
+            TracyWebGPUDebug(ZoneValue(m_contextId));
+
+            uint64_t earliestTicket = m_previousCheckpoint;
+            uint64_t endTicket = m_queryCounter;
+            if (Distance(earliestTicket, endTicket) <= 0)
+                return;
+
+            // Drive the state machine. If the buffer is already mapped, harvest
+            // it. Otherwise, kick off a new map for the current unread range.
+            MapState state = m_mapState.load(std::memory_order_acquire);
+
+            if (state == MapState::Failed)
+            {
+                // Try again next time.
+                m_mapState.store(MapState::Idle, std::memory_order_release);
+                return;
+            }
+
+            if (state == MapState::Idle)
+            {
+                if (!IssueReadback(earliestTicket, endTicket))
+                    return;
+                state = m_mapState.load(std::memory_order_acquire);
+            }
+
+            // If we're in urgent mode, pump until we get a Ready or Failed (or the 1 s timeout expires).
+            if (urgent && state == MapState::Pending)
+            {
+                const auto t0 = std::chrono::steady_clock::now();
+                while ((state = m_mapState.load(std::memory_order_acquire)) == MapState::Pending)
+                {
+                    ProcessEvents();
+                    if (std::chrono::steady_clock::now() - t0 > std::chrono::seconds(1))
+                    {
+                        TracyWebGPULog(Warning, "Timed out waiting for urgent timestamp readback.");
+                        break;
+                    }
+                    std::this_thread::sleep_for(std::chrono::microseconds(50));
+                }
+            }
+            else if (state == MapState::Pending)
+            {
+                // Non-urgent: pump once and bail; the callback may land later.
+                ProcessEvents();
+                state = m_mapState.load(std::memory_order_acquire);
+                if (state != MapState::Ready) return;
+            }
+
+            if (state != MapState::Ready) return;
+
+            // The buffer is mapped; the readback covers tickets [m_pendingFirstTicket, m_pendingEndTicket).
+            HarvestMappedRange(targetTicket, urgent);
+
+            // After we've drained, stop. The next Collect() will issue a new
+            // readback for whatever has accumulated since.
+        }
+
+        // Ticket range [first, end) covered by the most recent IssueReadback() call.
+        uint64_t m_pendingFirstTicket = 0;
+        uint64_t m_pendingEndTicket   = 0;
+
+        // Issue a CopyBufferToBuffer + MapAsync for query slots in [first, end).
+        // 'first' and 'end' are ticket numbers (logical, monotonic); their ring
+        // slots may straddle the end of the ring, in which case two copies are
+        // issued. Returns true once the map has been requested, false otherwise.
+        bool IssueReadback(uint64_t first, uint64_t end)
+        {
+            const int64_t span = Distance(first, end);
+            if (span <= 0) return false;
+
+            // Cap the readback to the ring's size. If span > capacity, the older
+            // entries will have been overwritten in the resolve buffer, so we
+            // can only meaningfully read the most recent capacity worth of
+            // entries.
+            uint64_t actualFirst = first;
+            if (static_cast<uint64_t>(span) > RingCapacity())
+            {
+                actualFirst = end - RingCapacity();
+            }
+
+            const uint32_t firstSlot = RingIndex(actualFirst);
+            const uint32_t lastSlot  = RingIndex(end);  // exclusive end
+            const uint32_t cap       = RingCapacity();
+
+            WGPUCommandEncoderDescriptor encDesc = {};
+            WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, &encDesc);
+            if (!enc) return false;
+
+            // Either a single contiguous copy, or two copies that wrap around.
+            // lastSlot == 0 means the range ends exactly at the end of the ring.
+            if (firstSlot < lastSlot || lastSlot == 0)
+            {
+                const uint32_t count = (lastSlot == 0) ? (cap - firstSlot) : (lastSlot - firstSlot);
+                wgpuCommandEncoderCopyBufferToBuffer(
+                    enc,
+                    m_resolveBuffer,
+                    static_cast<uint64_t>(firstSlot) * sizeof(uint64_t),
+                    m_readbackBuffer,
+                    static_cast<uint64_t>(firstSlot) * sizeof(uint64_t),
+                    static_cast<uint64_t>(count) * sizeof(uint64_t));
+            }
+            else
+            {
+                // Wrap: [firstSlot, cap) and [0, lastSlot).
+                wgpuCommandEncoderCopyBufferToBuffer(
+                    enc,
+                    m_resolveBuffer,
+                    static_cast<uint64_t>(firstSlot) * sizeof(uint64_t),
+                    m_readbackBuffer,
+                    static_cast<uint64_t>(firstSlot) * sizeof(uint64_t),
+                    static_cast<uint64_t>(cap - firstSlot) * sizeof(uint64_t));
+                wgpuCommandEncoderCopyBufferToBuffer(
+                    enc,
+                    m_resolveBuffer,
+                    0,
+                    m_readbackBuffer,
+                    0,
+                    static_cast<uint64_t>(lastSlot) * sizeof(uint64_t));
+            }
+
+            WGPUCommandBufferDescriptor cmdDesc = {};
+            WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cmdDesc);
+            wgpuCommandEncoderRelease(enc);
+            if (!cmd) return false;
+
+            wgpuQueueSubmit(m_queue, 1, &cmd);
+            wgpuCommandBufferRelease(cmd);
+
+            // Map the entire buffer (covers both contiguous and wrapped cases).
+            // We could be tighter and map just the touched range(s), but the
+            // single-range MapAsync makes the wrap case awkward, so we map all.
+            m_pendingFirstTicket = actualFirst;
+            m_pendingEndTicket   = end;
+            m_mapState.store(MapState::Pending, std::memory_order_release);
+            // OnMapped() receives 'this' through userdata1 and moves the state to Ready or Failed.
+            WGPUBufferMapCallbackInfo cbInfo = {};
+            cbInfo.mode = WGPUCallbackMode_AllowProcessEvents;
+            cbInfo.callback = &WebGPUQueueCtx::OnMapped;
+            cbInfo.userdata1 = this;
+
+            wgpuBufferMapAsync(
+                m_readbackBuffer,
+                WGPUMapMode_Read,
+                0,
+                static_cast<uint64_t>(cap) * sizeof(uint64_t),
+                cbInfo);
+
+            // A single pump in case the callback can fire immediately.
+            ProcessEvents();
+            return true;
+        }
+
+        static void OnMapped(WGPUMapAsyncStatus status, WGPUStringView /*msg*/, void* userdata1, void* /*userdata2*/)
+        {
+            // Map completion callback: 'userdata1' is the context that requested the map.
+            const bool mappedOk = (status == WGPUMapAsyncStatus_Success);
+            const MapState outcome = mappedOk ? MapState::Ready : MapState::Failed;
+            auto* const owner = static_cast<WebGPUQueueCtx*>(userdata1);
+            owner->m_mapState.store(outcome, std::memory_order_release);
+        }
+
+        void HarvestMappedRange(uint64_t targetTicket, bool urgent)  // called by Collect() once the map state is Ready
+        {
+            const uint32_t cap = RingCapacity();
+            const void* mapped = wgpuBufferGetConstMappedRange(
+                m_readbackBuffer, 0, static_cast<uint64_t>(cap) * sizeof(uint64_t));
+
+            if (!mapped)
+            {
+                wgpuBufferUnmap(m_readbackBuffer);
+                m_mapState.store(MapState::Idle, std::memory_order_release);
+                TracyWebGPUPanic("Failed to read mapped readback buffer.", return);
+            }
+
+            const uint64_t* timestampBuffer = static_cast<const uint64_t*>(mapped);
+            // Walk the tickets covered by the last IssueReadback(), one (begin, end) pair at a time.
+            uint64_t ticket = m_pendingFirstTicket;
+            const uint64_t end = m_pendingEndTicket;
+            // Stop at the first pair whose end timestamp is not newer than the slot's last emitted value.
+            for (; ticket != end; ticket += 2)
+            {
+                if (!ResolveTimestamp(ticket, timestampBuffer))
+                    break;
+            }
+
+            // Urgent: ensure 'targetTicket' is collected before returning.
+            if (urgent)
+            {
+                while (Distance(ticket, targetTicket) >= 0)
+                {
+                    DropTimestamp(ticket, timestampBuffer);
+                    ticket += 2;
+                }
+            }
+
+            // Overflow handling: drop oldest queries to normalize the situation.
+            uint64_t curEnd = m_queryCounter;
+            while (Distance(ticket, curEnd) > static_cast<int64_t>(RingCapacity()))
+            {
+                DropTimestamp(ticket, timestampBuffer);
+                ticket += 2;
+            }
+            // Hand the buffer back to the GPU; the next Collect() may issue a new readback.
+            wgpuBufferUnmap(m_readbackBuffer);
+            m_mapState.store(MapState::Idle, std::memory_order_release);
+        }
+
+        bool Wait(uint64_t queryTicket, uint64_t timeout_ms)
+        {
+            ZoneScopedC(Color::Red4);
+            const auto start = std::chrono::steady_clock::now();
+            uint64_t elapsed_ms = 0;
+            // Poll (non-urgently) until 'queryTicket' has been retired or the timeout has elapsed.
+            while ((Distance(m_previousCheckpoint, queryTicket) >= 0) && (elapsed_ms < timeout_ms))
+            {
+                std::unique_lock<std::mutex> guard(m_collectionMutex);
+                Collect(guard, queryTicket, false);
+                guard.unlock();
+                std::this_thread::sleep_for(std::chrono::microseconds(100));
+                const auto waited = std::chrono::steady_clock::now() - start;
+                elapsed_ms = static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::milliseconds>(waited).count());
+            }
+            return Distance(m_previousCheckpoint, queryTicket) < 0;
+        }
+
+        void Drain(uint64_t queryTicket, uint64_t gracePeriod_ms)
+        {
+            ZoneScopedC(Color::Red4);
+            const bool retiredInTime = Wait(queryTicket, gracePeriod_ms);
+            if (retiredInTime) return;
+            std::unique_lock<std::mutex> forcedLock(m_collectionMutex);  // grace period over: collect urgently
+            Collect(forcedLock, queryTicket, true);
+        }
+
+        bool ResolveTimestamp(uint64_t queryTicket, const uint64_t* timestampBuffer)
+        {
+            const uint32_t beginSlot = RingIndex(queryTicket);
+            const uint32_t endSlot   = beginSlot + 1;
+            const uint64_t beginTime = timestampBuffer[beginSlot];
+            const uint64_t endTime   = timestampBuffer[endSlot];
+            // Not resolved yet unless the end value is newer than what this slot emitted last.
+            if (Distance(m_shadowBuffer[endSlot], endTime) <= 0)
+                return false;
+            EmitGpuTime(beginTime, beginSlot);
+            EmitGpuTime(endTime,   endSlot);
+            RetireTicket(queryTicket);
+            if (Distance(m_latestKnownGpuTimestamp, endTime) > 0)
+                m_latestKnownGpuTimestamp = endTime;
+            return true;
+        }
+
+        void DropTimestamp(uint64_t queryTicket, const uint64_t* timestampBuffer)
+        {
+            if (ResolveTimestamp(queryTicket, timestampBuffer)) return;  // real timestamps were available after all
+            // Otherwise emit the latest known GPU time for both the begin and the end query.
+            const uint64_t fallbackTime = m_latestKnownGpuTimestamp;
+            const uint32_t beginSlot = RingIndex(queryTicket);
+            EmitGpuTime(fallbackTime, beginSlot);
+            EmitGpuTime(fallbackTime, beginSlot + 1);
+            RetireTicket(queryTicket);
+        }
+
+        void EmitGpuTime(uint64_t gpuTimestamp, uint32_t queryId)
+        {
+            QueueItem* const queued = Profiler::QueueSerial();  // one GpuTime event per query slot
+            MemWrite(&queued->hdr.type, QueueType::GpuTime);
+            MemWrite(&queued->gpuTime.gpuTime, static_cast<int64_t>(gpuTimestamp));
+            MemWrite(&queued->gpuTime.queryId, static_cast<uint16_t>(queryId));
+            MemWrite(&queued->gpuTime.context, GetId());
+            Profiler::QueueSerialFinish();
+            m_shadowBuffer[queryId] = gpuTimestamp;  // remember what this slot reported last
+        }
+
+        tracy_force_inline uint32_t RingCapacity() const  // number of query slots in the ring
+        {
+            return m_queryLimit;
+        }
+
+        tracy_force_inline uint32_t RingIndex(uint64_t logicalSlot) const  // maps a monotonic ticket onto its slot in the ring
+        {
+            return static_cast<uint32_t>(logicalSlot % RingCapacity());
+        }
+
+        tracy_force_inline static int64_t Distance(uint64_t begin, uint64_t end)  // signed (end - begin); positive when 'end' is ahead
+        {
+            return static_cast<int64_t>(end - begin);  // unsigned subtraction first, then reinterpret the difference as signed
+        }
+
+        void RetireTicket(uint64_t ticket)
+        {
+            // Tickets retire strictly in order; the checkpoint then moves past this (begin, end) pair.
+            TracyWebGPUAssert(m_previousCheckpoint == ticket);
+            m_previousCheckpoint.store(ticket + 2, std::memory_order_release);
+        }
+
+        tracy_force_inline uint32_t NextQueryId()
+        {
+            const uint64_t ticket = m_queryCounter.fetch_add(2, std::memory_order_relaxed);
+            const int64_t inFlight = Distance(m_previousCheckpoint.load(std::memory_order_relaxed), ticket);
+            if (inFlight >= static_cast<int64_t>(RingCapacity()))
+            {
+                ZoneScopedC(Color::Red4);
+                TracyWebGPULog(Warning, "Too many pending GPU queries: stalling!");
+                // The slot about to be reused still belongs to the ticket one ring length back.
+                Drain(ticket - RingCapacity(), 0);
+            }
+            return RingIndex(ticket);
+        }
+    };
+
+    class WebGPUZoneScope
+    {
+        const bool m_active;
+        WebGPUQueueCtx* m_ctx = nullptr;
+        WGPUCommandEncoder m_encoder = nullptr;
+        uint32_t m_queryId = 0;
+
+        tracy_force_inline void WriteQueueItem(const SourceLocationData* srcLocation, int32_t callstackDepth, uint32_t sourceLine, const char* sourceFile, size_t sourceFileLen, const char* functionName, size_t functionNameLen, const char* zoneName, size_t zoneNameLen)
+        {
+            if (!m_active) return;
+
+            const bool captureCallstack = callstackDepth > 0 && has_callstack();
+            const bool transientZone = srcLocation == nullptr;
+            uint64_t srcLocationAddr = reinterpret_cast<uint64_t>(srcLocation);
+
+            QueueItem* item = nullptr;
+            QueueType itemType;
+            if (transientZone)
+            {
+                srcLocationAddr = Profiler::AllocSourceLocation(sourceLine, sourceFile, sourceFileLen, functionName, functionNameLen, zoneName, zoneNameLen);
+                if (captureCallstack)
+                {
+                    item = Profiler::QueueSerialCallstack(Callstack(callstackDepth));
+                    itemType = QueueType::GpuZoneBeginAllocSrcLocCallstackSerial;
+                }
+                else
+                {
+                    item = Profiler::QueueSerial();
+                    itemType = QueueType::GpuZoneBeginAllocSrcLocSerial;
+                }
+            }
+            else
+            {
+                if (captureCallstack)
+                {
+                    item = Profiler::QueueSerialCallstack(Callstack(callstackDepth));
+                    itemType = QueueType::GpuZoneBeginCallstackSerial;
+                }
+                else
+                {
+                    item = Profiler::QueueSerial();
+                    itemType = QueueType::GpuZoneBeginSerial;
+                }
+            }
+
+            MemWrite(&item->hdr.type, itemType);
+            MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneBegin.srcloc, srcLocationAddr);
+            MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
+            MemWrite(&item->gpuZoneBegin.context, m_ctx->GetId());
+            Profiler::QueueSerialFinish();
+        }
+
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, bool active)
+#ifdef TRACY_ON_DEMAND
+            : m_active(active && GetProfiler().IsConnected())
+#else
+            : m_active(active)
+#endif
+        {
+            if (!m_active) return;
+
+            m_ctx = ctx;
+            m_encoder = encoder;
+
+            m_queryId = m_ctx->NextQueryId();
+            wgpuCommandEncoderWriteTimestamp(m_encoder, m_ctx->m_querySet, m_queryId);
+        }
+
+    public:
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, const SourceLocationData* srcLocation, bool active)
+            : WebGPUZoneScope(ctx, encoder, active)
+        {
+            WriteQueueItem(srcLocation, 0, 0, nullptr, 0, nullptr, 0, nullptr, 0);
+        }
+
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, const SourceLocationData* srcLocation, int32_t depth, bool active)
+            : WebGPUZoneScope(ctx, encoder, active)
+        {
+            WriteQueueItem(srcLocation, depth, 0, nullptr, 0, nullptr, 0, nullptr, 0);
+        }
+
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, WGPUCommandEncoder encoder, bool active)
+            : WebGPUZoneScope(ctx, encoder, active)
+        {
+            WriteQueueItem(nullptr, 0, line, source, sourceSz, function, functionSz, name, nameSz);
+        }
+
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, WGPUCommandEncoder encoder, int32_t depth, bool active)
+            : WebGPUZoneScope(ctx, encoder, active)
+        {
+            WriteQueueItem(nullptr, depth, line, source, sourceSz, function, functionSz, name, nameSz);
+        }
+
+        tracy_force_inline ~WebGPUZoneScope()
+        {
+            if (!m_active) return;
+
+            const auto queryId = m_queryId + 1;
+
+            auto* item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
+            MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId));
+            MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
+            Profiler::QueueSerialFinish();
+
+            // Write the end timestamp and resolve the (begin, end) pair into
+            // the resolve buffer right away. We cannot move the resolve to
+            // Collect() because the user may finish/destroy the encoder
+            // immediately after the zone closes, and ResolveQuerySet must be
+            // recorded into an encoder belonging to the same submission as the
+            // timestamp writes if we want to read the values for THIS zone in
+            // the same frame. Recording it here also matches the D3D12 backend.
+            wgpuCommandEncoderWriteTimestamp(m_encoder, m_ctx->m_querySet, queryId);
+            wgpuCommandEncoderResolveQuerySet(
+                m_encoder,
+                m_ctx->m_querySet,
+                m_queryId, 2,
+                m_ctx->m_resolveBuffer,
+                static_cast<uint64_t>(m_queryId) * sizeof(uint64_t));
+        }
+    };
+
+    static inline void DestroyWebGPUContext(WebGPUQueueCtx* ctx)
+    {
+        TracyWebGPUAssert(ctx);
+        ctx->~WebGPUQueueCtx();
+        tracy_free(ctx);
+    }
+
+    static inline WebGPUQueueCtx* CreateWebGPUContext(WGPUInstance instance, WGPUDevice device, WGPUQueue queue)
+    {
+        auto* ctx = static_cast<WebGPUQueueCtx*>(tracy_malloc(sizeof(WebGPUQueueCtx)));
+        new (ctx) WebGPUQueueCtx{ instance, device, queue };
+        if (ctx->GetId() == 255)
+        {
+            DestroyWebGPUContext(ctx);
+            return nullptr;
+        }
+        return ctx;
+    }
+
+}
+
+#undef TracyWebGPUPanic
+#undef TracyWebGPULog
+#undef TracyWebGPUAssert
+#undef TracyWebGPUBreak
+#undef TracyWebGPUDebug
+#undef TRACY_WEBGPU_DEBUG_LEVEL
+
+using TracyWebGPUCtx = tracy::WebGPUQueueCtx*;
+
+#define TracyWebGPUContext(instance, device, queue) tracy::CreateWebGPUContext(instance, device, queue);
+#define TracyWebGPUDestroy(ctx) tracy::DestroyWebGPUContext(ctx);
+#define TracyWebGPUContextName(ctx, name, size) ctx->Name(name, size);
+
+#define TracyWebGPUNewFrame(ctx) ((void)(ctx))
+
+#define TracyWebGPUUnnamedZone ___tracy_gpu_webgpu_zone
+#define TracyWebGPUSrcLocSymbol TracyConcat(__tracy_webgpu_source_location,TracyLine)
+#define TracyWebGPUSrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyWebGPUSrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
+
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+#  define TracyWebGPUZone(ctx, encoder, name) TracyWebGPUNamedZoneS(ctx, TracyWebGPUUnnamedZone, encoder, name, TRACY_CALLSTACK, true)
+#  define TracyWebGPUZoneC(ctx, encoder, name, color) TracyWebGPUNamedZoneCS(ctx, TracyWebGPUUnnamedZone, encoder, name, color, TRACY_CALLSTACK, true)
+#  define TracyWebGPUNamedZone(ctx, varname, encoder, name, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, TRACY_CALLSTACK, active };
+#  define TracyWebGPUNamedZoneC(ctx, varname, encoder, name, color, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, TRACY_CALLSTACK, active };
+#  define TracyWebGPUZoneTransient(ctx, varname, encoder, name, active) TracyWebGPUZoneTransientS(ctx, varname, encoder, name, TRACY_CALLSTACK, active)
+#else
+#  define TracyWebGPUZone(ctx, encoder, name) TracyWebGPUNamedZone(ctx, TracyWebGPUUnnamedZone, encoder, name, true)
+#  define TracyWebGPUZoneC(ctx, encoder, name, color) TracyWebGPUNamedZoneC(ctx, TracyWebGPUUnnamedZone, encoder, name, color, true)
+#  define TracyWebGPUNamedZone(ctx, varname, encoder, name, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, active };
+#  define TracyWebGPUNamedZoneC(ctx, varname, encoder, name, color, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, active };
+#  define TracyWebGPUZoneTransient(ctx, varname, encoder, name, active) tracy::WebGPUZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), encoder, active };
+#endif
+
+#ifdef TRACY_HAS_CALLSTACK
+#  define TracyWebGPUZoneS(ctx, encoder, name, depth) TracyWebGPUNamedZoneS(ctx, TracyWebGPUUnnamedZone, encoder, name, depth, true)
+#  define TracyWebGPUZoneCS(ctx, encoder, name, color, depth) TracyWebGPUNamedZoneCS(ctx, TracyWebGPUUnnamedZone, encoder, name, color, depth, true)
+#  define TracyWebGPUNamedZoneS(ctx, varname, encoder, name, depth, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, depth, active };
+#  define TracyWebGPUNamedZoneCS(ctx, varname, encoder, name, color, depth, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, depth, active };
+#  define TracyWebGPUZoneTransientS(ctx, varname, encoder, name, depth, active) tracy::WebGPUZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), encoder, depth, active };
+#else
+#  define TracyWebGPUZoneS(ctx, encoder, name, depth) TracyWebGPUZone(ctx, encoder, name)
+#  define TracyWebGPUZoneCS(ctx, encoder, name, color, depth) TracyWebGPUZoneC(ctx, encoder, name, color)
+#  define TracyWebGPUNamedZoneS(ctx, varname, encoder, name, depth, active) TracyWebGPUNamedZone(ctx, varname, encoder, name, active)
+#  define TracyWebGPUNamedZoneCS(ctx, varname, encoder, name, color, depth, active) TracyWebGPUNamedZoneC(ctx, varname, encoder, name, color, active)
+#  define TracyWebGPUZoneTransientS(ctx, varname, encoder, name, depth, active) TracyWebGPUZoneTransient(ctx, varname, encoder, name, active)
+#endif
+
+#define TracyWebGPUCollect(ctx) ctx->Collect();
+
+#endif
+
+#endif

From 7951f9a8dba4c222348d43678dd1c8c5366e977d Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Tue, 12 May 2026 10:47:08 -0700
Subject: [PATCH 02/21] redesign...

---
 public/tracy/TracyWebGPU.hpp     | 873 +++++++++++++++----------------
 python/bindings/ServerModule.cpp |   4 +-
 2 files changed, 420 insertions(+), 457 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index b90a859159..a4d777b152 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -7,19 +7,17 @@
 #define TracyWebGPUDestroy(ctx)
 #define TracyWebGPUContextName(ctx, name, size)
 
-#define TracyWebGPUNewFrame(ctx)
-
-#define TracyWebGPUZone(ctx, encoder, name)
-#define TracyWebGPUZoneC(ctx, encoder, name, color)
-#define TracyWebGPUNamedZone(ctx, varname, encoder, name, active)
-#define TracyWebGPUNamedZoneC(ctx, varname, encoder, name, color, active)
-#define TracyWebGPUZoneTransient(ctx, varname, encoder, name, active)
-
-#define TracyWebGPUZoneS(ctx, encoder, name, depth)
-#define TracyWebGPUZoneCS(ctx, encoder, name, color, depth)
-#define TracyWebGPUNamedZoneS(ctx, varname, encoder, name, depth, active)
-#define TracyWebGPUNamedZoneCS(ctx, varname, encoder, name, color, depth, active)
-#define TracyWebGPUZoneTransientS(ctx, varname, encoder, name, depth, active)
+#define TracyWebGPUZone(ctx, encoder, passDesc, name)
+#define TracyWebGPUZoneC(ctx, encoder, passDesc, name, color)
+#define TracyWebGPUNamedZone(ctx, varname, encoder, passDesc, name, active)
+#define TracyWebGPUNamedZoneC(ctx, varname, encoder, passDesc, name, color, active)
+#define TracyWebGPUZoneTransient(ctx, varname, encoder, passDesc, name, active)
+
+#define TracyWebGPUZoneS(ctx, encoder, passDesc, name, depth)
+#define TracyWebGPUZoneCS(ctx, encoder, passDesc, name, color, depth)
+#define TracyWebGPUNamedZoneS(ctx, varname, encoder, passDesc, name, depth, active)
+#define TracyWebGPUNamedZoneCS(ctx, varname, encoder, passDesc, name, color, depth, active)
+#define TracyWebGPUZoneTransientS(ctx, varname, encoder, passDesc, name, depth, active)
 
 #define TracyWebGPUCollect(ctx)
 
@@ -33,14 +31,15 @@ using TracyWebGPUCtx = void*;
 #else
 
 #include "Tracy.hpp"
-#include "client/TracyProfiler.hpp"
-#include "client/TracyCallstack.hpp"
-#include "common/TracyAlign.hpp"
-#include "common/TracyAlloc.hpp"
+#include "../client/TracyProfiler.hpp"
+#include "../client/TracyCallstack.hpp"
+#include "../common/TracyAlign.hpp"
+#include "../common/TracyAlloc.hpp"
 
 #include <atomic>
 #include <mutex>
 #include <vector>
+#include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <cassert>
@@ -67,7 +66,7 @@ using TracyWebGPUCtx = void*;
 #define TracyWebGPUAssert(predicate, ...) assert(predicate);
 #endif
 
-#define TracyWebGPULog(severity, msg) tracy::Profiler::LogString( tracy::MessageSourceType::Tracy, tracy::MessageSeverity::severity, tracy::Color::Red4, 0, msg );
+#define TracyWebGPULog(severity, msg) fprintf(stdout, "%s", msg), tracy::Profiler::LogString( tracy::MessageSourceType::Tracy, tracy::MessageSeverity::severity, tracy::Color::Red4, 0, msg );
 #define TracyWebGPUPanic(msg, ...) do { TracyWebGPULog(Error, msg); TracyWebGPUAssert(false && "TracyWebGPU: " msg); __VA_ARGS__; } while(false);
 
 namespace tracy
@@ -85,28 +84,28 @@ namespace tracy
         WGPUDevice   m_device   = nullptr;
         WGPUQueue    m_queue    = nullptr;
 
-        WGPUQuerySet m_querySet       = nullptr;
-        WGPUBuffer   m_resolveBuffer  = nullptr;  // QueryResolve | CopySrc
-        WGPUBuffer   m_readbackBuffer = nullptr;  // CopyDst | MapRead
+        struct ReadbackSlot
+        {
+            WGPUBuffer            buffer = nullptr;  // stays null until created: the ctor can bail out before the creation loop and the dtor tests this before releasing
+            std::atomic<uint64_t> copiedUpto{0};     // zero from construction, so it is defined even when the ctor bails out before its buffer-creation loop
+            std::atomic<WGPUMapAsyncStatus> mapStatus = {};
+            WGPUFuture            pendingFuture = {};
+        };
+        static_assert(std::atomic<WGPUMapAsyncStatus>::is_always_lock_free, "WGPUMapAsyncStatus must be lock-free atomic");
+
+        WGPUQuerySet  m_querySet        = nullptr;
+        WGPUBuffer    m_resolveBuffer   = nullptr;  // QueryResolve | CopySrc
+        ReadbackSlot  m_readbackSlots[3];            // CopyDst | MapRead (3-slot ring)
+        std::atomic<int> m_writeIdx{0};              // WRITE slot index (ring: 0→1→2→0)
 
         using atomic_counter = std::atomic<uint64_t>;
-        atomic_counter m_queryCounter      = 0;
+        atomic_counter m_queryCounter       = 0;
         atomic_counter m_previousCheckpoint = 0;
 
         uint32_t m_queryLimit = 0;
 
         std::vector<uint64_t> m_shadowBuffer;
-        uint64_t m_latestKnownGpuTimestamp = 0;
-
-        // Map-state machine for the readback buffer.
-        enum class MapState : uint8_t
-        {
-            Idle,       // not mapped; GPU may write to it
-            Pending,    // MapAsync in flight
-            Ready,      // callback has fired, buffer is mapped for read
-            Failed      // last map attempt failed
-        };
-        std::atomic<MapState> m_mapState = MapState::Idle;
+        uint64_t m_prevCalibGpuTime = 0;
 
         tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item)
         {
@@ -116,86 +115,182 @@ namespace tracy
             Profiler::QueueSerialFinish();
         }
 
-        // Drive the WebGPU event queue. Some implementations (e.g. Dawn) want
-        // wgpuDeviceTick(); the canonical webgpu.h uses
-        // wgpuInstanceProcessEvents(). We only require the latter here.
-        void ProcessEvents()
-        {
-            if (m_instance)
-                wgpuInstanceProcessEvents(m_instance);
-        }
-
-        bool Anchor(uint64_t& outCpuTime, uint64_t& outGpuTime)
+        bool CalibrateClocks(uint64_t& outCpuTime, uint64_t& outGpuTime)
         {
-            // Anchor() establishes a (cpuTime, gpuTime) anchor pair by querying
-            // a single timestamp (and synchronously resolving/reading it back)
-            WGPUCommandEncoderDescriptor encDesc = {};
-            WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, &encDesc);
-            if (!enc) return false;
-
-            // Snapshot CPU time as close to the GPU work as possible.
-            outCpuTime = static_cast<uint64_t>(Profiler::GetTime());
-
-            // NOTE: m_querySet slot 0 is used by Anchor(), but it can be immediately
-            // reclaimed/reused since Anchor() operates synchronously
-            wgpuCommandEncoderWriteTimestamp(enc, m_querySet, 0);
-            wgpuCommandEncoderResolveQuerySet(enc, m_querySet, 0, 1, m_resolveBuffer, 0);
-            wgpuCommandEncoderCopyBufferToBuffer(enc, m_resolveBuffer, 0, m_readbackBuffer, 0, sizeof(uint64_t));
-
-            WGPUCommandBufferDescriptor cmdDesc = {};
-            WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cmdDesc);
+            ZoneScoped;
+
+            WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, nullptr);
+            if (!enc) { TracyWebGPUPanic("Failed to create calibration command encoder.", return false); }
+
+            // wgpuCommandEncoderWriteTimestamp is deprecated and returns 0 on Metal.
+            // Use a render pass with an actual draw call: on Metal TBDR, begin-of-pass
+            // timestamps fire at tile rasterization start. An empty render pass (no
+            // geometry) may never trigger rasterization, yielding a deferred or
+            // meaningless timestamp that doesn't reflect actual GPU execution order.
+            static const char kCalibShader[] = R"(
+                @vertex fn vs(@builtin(vertex_index) i: u32) -> @builtin(position) vec4f {
+                    var p = array(vec4f(-1,-1,.5,1), vec4f(3,-1,.5,1), vec4f(-1,3,.5,1));
+                    return p[i];
+                }
+                @fragment fn fs() -> @location(0) vec4f { return vec4f(0.0); }
+            )";
+            WGPUShaderSourceWGSL wgslSrc = {};
+            wgslSrc.chain.sType = WGPUSType_ShaderSourceWGSL;
+            wgslSrc.code        = { kCalibShader, WGPU_STRLEN };
+            WGPUShaderModuleDescriptor smDesc = {};
+            smDesc.nextInChain  = reinterpret_cast<WGPUChainedStruct*>(&wgslSrc);
+            WGPUShaderModule calibShader = wgpuDeviceCreateShaderModule(m_device, &smDesc);
+            if (!calibShader) { wgpuCommandEncoderRelease(enc); TracyWebGPUPanic("Failed to create calibration shader.", return false); }
+
+            WGPUTextureDescriptor texDesc = {};
+            texDesc.usage         = WGPUTextureUsage_RenderAttachment;
+            texDesc.dimension     = WGPUTextureDimension_2D;
+            texDesc.size          = { 1, 1, 1 };
+            texDesc.format        = WGPUTextureFormat_BGRA8Unorm;
+            texDesc.mipLevelCount = 1;
+            texDesc.sampleCount   = 1;
+            WGPUTexture tex = wgpuDeviceCreateTexture(m_device, &texDesc);
+            if (!tex) { wgpuShaderModuleRelease(calibShader); wgpuCommandEncoderRelease(enc); TracyWebGPUPanic("Failed to create calibration scratch texture.", return false); }
+            WGPUTextureView texView = wgpuTextureCreateView(tex, nullptr);
+            if (!texView) { wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); wgpuCommandEncoderRelease(enc); TracyWebGPUPanic("Failed to create calibration scratch texture view.", return false); }
+
+            WGPUColorTargetState colorTarget = {};
+            colorTarget.format    = WGPUTextureFormat_BGRA8Unorm;
+            colorTarget.writeMask = WGPUColorWriteMask_All;
+            WGPUFragmentState fragState = {};
+            fragState.module      = calibShader;
+            fragState.entryPoint  = { "fs", WGPU_STRLEN };
+            fragState.targetCount = 1;
+            fragState.targets     = &colorTarget;
+            WGPURenderPipelineDescriptor pipeDesc = {};
+            pipeDesc.vertex.module        = calibShader;
+            pipeDesc.vertex.entryPoint    = { "vs", WGPU_STRLEN };
+            pipeDesc.primitive.topology   = WGPUPrimitiveTopology_TriangleList;
+            pipeDesc.multisample.count    = 1;
+            pipeDesc.fragment             = &fragState;
+            WGPURenderPipeline calibPipeline = wgpuDeviceCreateRenderPipeline(m_device, &pipeDesc);
+            if (!calibPipeline) { wgpuTextureViewRelease(texView); wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); wgpuCommandEncoderRelease(enc); TracyWebGPUPanic("Failed to create calibration pipeline.", return false); }
+
+            //const uint64_t calibTicket = NextQueryId();
+            //const uint32_t calibSlotB  = RingIndex(calibTicket);
+            //const uint32_t calibSlotE  = calibSlotB + 1;
+            const uint32_t calibSlotB  = 0;
+            const uint32_t calibSlotE  = 1;
+
+            WGPUPassTimestampWrites anchorTs = {};
+            anchorTs.querySet                  = m_querySet;
+            anchorTs.beginningOfPassWriteIndex = calibSlotB;
+            anchorTs.endOfPassWriteIndex       = calibSlotE;
+
+            WGPURenderPassColorAttachment att = {};
+            att.view       = texView;
+            att.loadOp     = WGPULoadOp_Clear;
+            att.storeOp    = WGPUStoreOp_Store;
+            att.depthSlice = WGPU_DEPTH_SLICE_UNDEFINED;
+
+            WGPURenderPassDescriptor passDesc = {};
+            passDesc.colorAttachmentCount = 1;
+            passDesc.colorAttachments     = &att;
+            passDesc.timestampWrites      = &anchorTs;
+
+            WGPURenderPassEncoder pass = wgpuCommandEncoderBeginRenderPass(enc, &passDesc);
+            wgpuRenderPassEncoderSetPipeline(pass, calibPipeline);
+            wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
+            wgpuRenderPassEncoderEnd(pass);
+            wgpuRenderPassEncoderRelease(pass);
+            wgpuRenderPipelineRelease(calibPipeline);
+            wgpuShaderModuleRelease(calibShader);
+            wgpuTextureViewRelease(texView);
+            wgpuTextureRelease(tex);
+
+            wgpuCommandEncoderResolveQuerySet(enc, m_querySet, calibSlotB, 2, m_resolveBuffer, calibSlotB * sizeof(uint64_t));
+            wgpuCommandEncoderCopyBufferToBuffer(enc, m_resolveBuffer, calibSlotB * sizeof(uint64_t), m_readbackSlots[0].buffer, calibSlotB * sizeof(uint64_t), 2 * sizeof(uint64_t));
+
+            WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, nullptr);
             wgpuCommandEncoderRelease(enc);
-            if (!cmd) return false;
+            if (!cmd) { TracyWebGPUPanic("Failed to finish calibration command encoder.", return false); }
 
+            auto t0 = Profiler::GetTime();
             wgpuQueueSubmit(m_queue, 1, &cmd);
             wgpuCommandBufferRelease(cmd);
 
-            // Map and pump.
-            struct MapCtx { std::atomic<int> status{-1}; };
-            MapCtx mctx;
-
-            WGPUBufferMapCallbackInfo cbInfo = {};
-            cbInfo.mode = WGPUCallbackMode_AllowProcessEvents;
-            cbInfo.callback = [](WGPUMapAsyncStatus status, WGPUStringView /*msg*/, void* userdata1, void* /*userdata2*/) {
-                auto* c = static_cast<MapCtx*>(userdata1);
-                c->status.store(static_cast<int>(status), std::memory_order_release);
+            // Wait for the GPU to finish executing the command buffer before mapping.
+            bool gpuDone = false;
+            WGPUQueueWorkDoneCallbackInfo doneCB = {};
+            doneCB.mode      = WGPUCallbackMode_AllowSpontaneous;
+            doneCB.callback  = [](WGPUQueueWorkDoneStatus, WGPUStringView, void* ud, void*) {
+                *static_cast<bool*>(ud) = true;
             };
-            cbInfo.userdata1 = &mctx;
+            doneCB.userdata1 = &gpuDone;
+            wgpuQueueOnSubmittedWorkDone(m_queue, doneCB);
 
-            wgpuBufferMapAsync(m_readbackBuffer, WGPUMapMode_Read, 0, sizeof(uint64_t), cbInfo);
+            const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(2);
+            while (!gpuDone && std::chrono::steady_clock::now() < deadline)
+                wgpuInstanceProcessEvents(m_instance);
 
-            // Pump until the callback fires (with a generous timeout).
-            const auto t0 = std::chrono::steady_clock::now();
-            while (mctx.status.load(std::memory_order_acquire) < 0)
-            {
-                ProcessEvents();
-                if (std::chrono::steady_clock::now() - t0 > std::chrono::seconds(2))
+            struct MapCtx { WGPUBuffer buffer; uint32_t slotB; uint64_t gpuTime = 0; bool ok = false; };
+            MapCtx mctx{ m_readbackSlots[0].buffer, calibSlotB };
+            WGPUBufferMapCallbackInfo cbInfo = {};
+            cbInfo.mode      = WGPUCallbackMode_AllowSpontaneous;
+            cbInfo.callback  = [](WGPUMapAsyncStatus status, WGPUStringView, void* ud, void*) {
+                auto* ctx = static_cast<MapCtx*>(ud);
+                if (status != WGPUMapAsyncStatus_Success) return;
+                const auto* ts = static_cast<const uint64_t*>(
+                    wgpuBufferGetConstMappedRange(ctx->buffer, ctx->slotB * sizeof(uint64_t), 2 * sizeof(uint64_t)));
+                if (ts)
                 {
-                    TracyWebGPUPanic("Timed out waiting for anchor timestamp readback.", return false);
+                    ctx->gpuTime = ts[0];
+                    ctx->ok = true;
+                    fprintf(stdout, "CalibrateClocks() -> %llu | %llu | %lld\n", ts[0], ts[1], ts[1]-ts[0]);
                 }
-                std::this_thread::sleep_for(std::chrono::microseconds(100));
-            }
+                wgpuBufferUnmap(ctx->buffer);
+            };
+            cbInfo.userdata1 = &mctx;
+            wgpuBufferMapAsync(m_readbackSlots[0].buffer, WGPUMapMode_Read,
+                               calibSlotB * sizeof(uint64_t), 2 * sizeof(uint64_t), cbInfo);
 
-            if (mctx.status.load(std::memory_order_acquire) != static_cast<int>(WGPUMapAsyncStatus_Success))
-            {
-                TracyWebGPUPanic("Failed to map anchor readback buffer.", return false);
-            }
+            while (!mctx.ok && std::chrono::steady_clock::now() < deadline)
+                wgpuInstanceProcessEvents(m_instance);
+            //m_previousCheckpoint = m_queryCounter.load();
+
+            auto t1 = Profiler::GetTime();
+            //outCpuTime = static_cast<uint64_t>(t0 + (t1-t0)/2);
+            outCpuTime = t1;
 
-            const void* mapped = wgpuBufferGetConstMappedRange(m_readbackBuffer, 0, sizeof(uint64_t));
-            if (!mapped)
+            if (!mctx.ok)
             {
-                wgpuBufferUnmap(m_readbackBuffer);
-                return false;
+                TracyWebGPUPanic("Failed to calibrate CPU/GPU clocks.", return false);
             }
-            uint64_t gpuTs;
-            std::memcpy(&gpuTs, mapped, sizeof(uint64_t));
-            wgpuBufferUnmap(m_readbackBuffer);
 
-            outGpuTime = gpuTs;
+            outGpuTime = mctx.gpuTime;
+            fprintf(stdout, "CalibrateClocks() -> %llu\n", outGpuTime);
+            if (outGpuTime < m_prevCalibGpuTime)
+                fprintf(stdout, "CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", m_prevCalibGpuTime, outGpuTime, outGpuTime-m_prevCalibGpuTime);
+            m_prevCalibGpuTime = outGpuTime;
             return true;
         }
 
     public:
+        static bool SetupDevice(WGPUDeviceDescriptor& deviceDescriptor)  // when Dawn headers are detected, attaches Dawn timestamp toggles to the descriptor
+        {
+            // piggy-back on WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT to detect Dawn header
+#           ifdef WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT
+                fprintf(stderr, "[INFO] [DAWN] ENABLING RAW TIMESTAMP TICKS (disabling ns conversion + quantization)\n");
+                // disable_timestamp_query_conversion: resolve timestamps as raw GPU ticks, not nanoseconds.
+                // timestamp_quantization: disabled defensively (off by default on Metal, but on elsewhere).
+                static const char* dawnDisabledToggles[] = { "timestamp_quantization" };
+                static const char* dawnEnabledToggles[]  = { "disable_timestamp_query_conversion" };
+                static WGPUDawnTogglesDescriptor togglesDesc = {};  // static: the descriptor keeps a pointer to this after the function returns
+                togglesDesc.chain.sType = WGPUSType_DawnTogglesDescriptor;
+                togglesDesc.disabledToggles = dawnDisabledToggles;
+                togglesDesc.disabledToggleCount = 1;
+                togglesDesc.enabledToggles = dawnEnabledToggles;
+                togglesDesc.enabledToggleCount  = 1;
+                deviceDescriptor.nextInChain = reinterpret_cast<WGPUChainedStruct*>(&togglesDesc);  // NOTE(review): overwrites any chain the caller already attached - confirm callers pass none
+#           endif
+            return true;  // always true, whether or not the Dawn branch was compiled in
+        }
+
         WebGPUQueueCtx(WGPUInstance instance, WGPUDevice device, WGPUQueue queue)
             : m_instance(instance)
             , m_device(device)
@@ -208,12 +303,35 @@ namespace tracy
             wgpuDeviceAddRef(m_device);
             wgpuQueueAddRef(m_queue);
 
+            // Graceful early-out: if the logical device was created without the
+            // required timestamp features, GPU zones will silently do nothing.
+            // m_contextId stays 255 (invalid); CreateWebGPUContext destroys and
+            // returns nullptr, and all TracyWebGPU* macros become no-ops.
+            if (!wgpuDeviceHasFeature(m_device, WGPUFeatureName_TimestampQuery))
+            {
+                TracyWebGPUPanic(
+                    "timestamp-query feature not enabled on device; GPU profiling disabled.",
+                    return
+                )
+            }
+            // wgpuCommandEncoderResolveQuerySet requires the wgpu-native
+            // TIMESTAMP_QUERY_INSIDE_ENCODERS feature on some backends.
+#ifdef WGPUNativeFeature_TimestampQueryInsideEncoders
+            if (!wgpuDeviceHasFeature(m_device, (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders))
+            {
+                TracyWebGPUPanic(
+                    "WGPUNativeFeature_TimestampQueryInsideEncoders not enabled on device; "
+                    "GPU profiling disabled (needed for ResolveQuerySet on the command encoder).",
+                    return
+                );
+            }
+#endif
+
             // Pick a query budget. WebGPU has no native upper bound on query
-            // set size in the spec, but per-implementation maxQueriesPerQuerySet
-            // is typically 8192. We start at 64K and halve on failure, mirroring
-            // D3D12. Queries are issued in (begin, end) pairs, so the count is
+            // set size among the device limits, but the spec's createQuerySet
+            // validation rejects counts above 4096. Queries are issued in
+            // (begin, end) pairs, so the count is always even.
-            static constexpr uint32_t MaxQueries = 64 * 1024;
+            static constexpr uint32_t MaxQueries = 512; //4096;
             m_queryLimit = MaxQueries;
 
             WGPUQuerySetDescriptor qsDesc = {};
@@ -242,14 +360,15 @@ namespace tracy
                 TracyWebGPUPanic("Failed to create timestamp resolve buffer.", return);
             }
 
-            // Readback buffer: target of CopyBufferToBuffer; mappable for read.
+            // Readback buffers: targets of CopyBufferToBuffer; mappable for read (3-slot ring).
             WGPUBufferDescriptor readbackDesc = {};
             readbackDesc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
             readbackDesc.size  = static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t);
-            m_readbackBuffer = wgpuDeviceCreateBuffer(m_device, &readbackDesc);
-            if (!m_readbackBuffer)
+            for (auto& slot : m_readbackSlots)
             {
-                TracyWebGPUPanic("Failed to create timestamp readback buffer.", return);
+                slot.buffer = wgpuDeviceCreateBuffer(m_device, &readbackDesc);
+                slot.copiedUpto = 0;
+                if (!slot.buffer) { TracyWebGPUPanic("Failed to create timestamp readback buffer.", return); }
             }
 
             // Establish the (cpuTime, gpuTime) anchor for Tracy's GpuNewContext.
@@ -257,13 +376,14 @@ namespace tracy
             // to estimate a correlation for the CPU and the GPU timestamps.
             uint64_t cpuTimestamp = 0;
             uint64_t gpuTimestamp = 0;
-            if (!Anchor(cpuTimestamp, gpuTimestamp))
+            if (!CalibrateClocks(cpuTimestamp, gpuTimestamp))
             {
-                TracyWebGPUPanic("Failed to establish CPU/GPU timestamp anchor.", return);
+                TracyWebGPUPanic("Failed to calibrate CPU/GPU clocks.", return);
             }
 
-            m_shadowBuffer.resize(m_queryLimit, gpuTimestamp);
-            m_latestKnownGpuTimestamp = gpuTimestamp;
+            fprintf(stdout, "INFO: gpuTimestamp is %llu\n", gpuTimestamp);
+            //m_shadowBuffer.resize(m_queryLimit, gpuTimestamp);
+            m_shadowBuffer.resize(m_queryLimit, 0);
 
             // WebGPU timestamps are in nanoseconds, as per the spec.
             const float period = 1.0f;  // 1ns/tick
@@ -286,25 +406,11 @@ namespace tracy
 
         ~WebGPUQueueCtx()
         {
-            ZoneScopedC(Color::Red4);
-            ZoneValue(m_contextId);
+            Collect(); // best-effort non-blocking flush
 
-            // Drain pending queries.
-            uint64_t endTicket = m_queryCounter;
-            uint64_t lastIssuedTicket = (endTicket >= 2) ? (endTicket - 2) : 0;
-            Drain(lastIssuedTicket, 200);
-
-            if (Distance(endTicket, m_queryCounter) > 0)
-                TracyWebGPUPanic("client is still pushing queries.");
-
-            // If the readback buffer is mapped, unmap it before release.
-            if (m_readbackBuffer && m_mapState.load() == MapState::Ready)
-            {
-                wgpuBufferUnmap(m_readbackBuffer);
-                m_mapState.store(MapState::Idle);
-            }
-
-            if (m_readbackBuffer) { wgpuBufferRelease(m_readbackBuffer); m_readbackBuffer = nullptr; }
+            // NOTE(review): no wait for in-flight maps happens here; the readback buffers are released immediately.
+            for (auto& slot : m_readbackSlots)
+                if (slot.buffer) { wgpuBufferRelease(slot.buffer); slot.buffer = nullptr; }
             if (m_resolveBuffer)  { wgpuBufferRelease(m_resolveBuffer);  m_resolveBuffer  = nullptr; }
             if (m_querySet)       { wgpuQuerySetRelease(m_querySet);     m_querySet       = nullptr; }
             if (m_queue)          { wgpuQueueRelease(m_queue);           m_queue          = nullptr; }
@@ -336,307 +442,130 @@ namespace tracy
             if (!GetProfiler().IsConnected()) return;
 #endif
             if (!m_collectionMutex.try_lock()) return;
-            std::unique_lock lock(m_collectionMutex, std::adopt_lock);
-            Collect(lock, m_queryCounter, false);
-        }
+            std::unique_lock<std::mutex> lock(m_collectionMutex, std::adopt_lock);
 
-    private:
-        // Issue (or progress) the readback for the range [earliest, end). On
-        // entry, the buffer is in some MapState; on return, if a complete
-        // readback was performed, queries up to the resolved point are emitted
-        // to Tracy and m_previousCheckpoint is advanced.
-        //
-        // Strategy:
-        //   * If MapState::Idle, kick off a CopyBufferToBuffer + MapAsync for
-        //     the unread range. Pump events briefly so the callback can land
-        //     before we return. This is the steady-state code path.
-        //   * If MapState::Pending, just pump events.
-        //   * If MapState::Ready, read the timestamps, unmap, mark Idle.
-        //   * If MapState::Failed, reset to Idle and bail.
-        void Collect(std::unique_lock<std::mutex>& lock, uint64_t targetTicket, bool urgent)
-        {
             ZoneScopedC(Color::Red4);
-            TracyWebGPUAssert(lock.owns_lock());
-            TracyWebGPUDebug(ZoneValue(m_contextId));
 
-            uint64_t earliestTicket = m_previousCheckpoint;
-            uint64_t endTicket = m_queryCounter;
-            if (Distance(earliestTicket, endTicket) <= 0)
+            if (Distance(m_previousCheckpoint, m_queryCounter) <= 0)
                 return;
 
-            // Drive the state machine. If the buffer is already mapped, harvest
-            // it. Otherwise, kick off a new map for the current unread range.
-            MapState state = m_mapState.load(std::memory_order_acquire);
-
-            if (state == MapState::Failed)
-            {
-                // Try again next time.
-                m_mapState.store(MapState::Idle, std::memory_order_release);
-                return;
-            }
+            const int collectIdx = (m_writeIdx + 2) % 3;
+            auto& collectSlot = m_readbackSlots[collectIdx];
 
-            if (state == MapState::Idle)
+            // Poll for an in-flight map to complete.
+            if (collectSlot.pendingFuture.id != 0)
             {
-                if (!IssueReadback(earliestTicket, endTicket))
-                    return;
-                state = m_mapState.load(std::memory_order_acquire);
+                wgpuInstanceProcessEvents(m_instance);
+                if (collectSlot.mapStatus == WGPUMapAsyncStatus{})
+                    return;  // callback hasn't fired yet
+                collectSlot.pendingFuture = {};
             }
 
-            // If we're in urgent mode, pump until we get a Ready or Failed.
-            if (urgent && state == MapState::Pending)
+            // If a buffer is mapped, process as many resolved queries as possible.
+            if (collectSlot.mapStatus == WGPUMapAsyncStatus_Success)
             {
-                const auto t0 = std::chrono::steady_clock::now();
-                while ((state = m_mapState.load(std::memory_order_acquire)) == MapState::Pending)
+                const uint64_t* ts = static_cast<const uint64_t*>(
+                    wgpuBufferGetConstMappedRange(collectSlot.buffer, 0,
+                        static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t)));
+                if (ts)
                 {
-                    ProcessEvents();
-                    if (std::chrono::steady_clock::now() - t0 > std::chrono::seconds(1))
+                    uint64_t ticket = m_previousCheckpoint;
+                    const uint64_t end = collectSlot.copiedUpto;
+                    fprintf(stdout, "[TWG] Collect [%d] (%llu, %llu)\n", collectIdx, ticket, end);
+                    for (; Distance(ticket, end) > 0; ticket += 2)
                     {
-                        TracyWebGPULog(Warning, "Timed out waiting for urgent timestamp readback.");
-                        break;
+                        const uint32_t slotB = RingIndex(ticket);
+                        const uint32_t slotE = slotB + 1;
+                        fprintf(stderr,
+                            "[TWG] slot B=%4u E=%4u ts[B]=%llu ts[E]=%llu shadow[E]=%llu ts-diff=%lld shadow-diff=%lld\n",
+                            slotB, slotE,
+                            (unsigned long long)ts[slotB],
+                            (unsigned long long)ts[slotE],
+                            (unsigned long long)m_shadowBuffer[slotE],
+                            (long long)Distance(ts[slotB], ts[slotE]),
+                            (long long)Distance(m_shadowBuffer[slotE], ts[slotE]));
+                        if (Distance(m_shadowBuffer[slotE], ts[slotE]) <= 0)
+                            break; // GPU hasn't written this timestamp yet; retry next Collect()
+                        EmitGpuTime(ts[slotB], slotB);
+                        EmitGpuTime(ts[slotE], slotE);
                     }
-                    std::this_thread::sleep_for(std::chrono::microseconds(50));
-                }
-            }
-            else if (state == MapState::Pending)
-            {
-                // Non-urgent: pump once and bail; the callback may land later.
-                ProcessEvents();
-                state = m_mapState.load(std::memory_order_acquire);
-                if (state != MapState::Ready) return;
-            }
-
-            if (state != MapState::Ready) return;
-
-            // We have a mapped range covering [m_pendingFirst, m_pendingLast).
-            HarvestMappedRange(targetTicket, urgent);
-
-            // After we've drained, stop. The next Collect() will issue a new
-            // readback for whatever has accumulated since.
-        }
+                    m_previousCheckpoint = ticket;
 
-        // Set when the most recent IssueReadback was called.
-        uint64_t m_pendingFirstTicket = 0;
-        uint64_t m_pendingEndTicket   = 0;
+                    if (Distance(ticket, end) > 0)
+                        return; // still unresolved queries in this buffer; come back next Collect()
+                }
 
-        // Issue a CopyBufferToBuffer + MapAsync for query slots in [first, end).
-        // Note: 'first' and 'end' are ticket numbers (logical, monotonic).
-        // Their wrapped slot indices may straddle the end of the ring buffer;
-        // in that case we issue two separate copies.
-        bool IssueReadback(uint64_t first, uint64_t end)
-        {
-            const int64_t span = Distance(first, end);
-            if (span <= 0) return false;
-
-            // Cap the readback to the ring's size. If span > capacity, the older
-            // entries will have been overwritten in the resolve buffer, so we
-            // can only meaningfully read the most recent capacity worth of
-            // entries.
-            uint64_t actualFirst = first;
-            if (static_cast<uint64_t>(span) > RingCapacity())
-            {
-                actualFirst = end - RingCapacity();
+                // All queries resolved (or getMappedRange failed): unmap and fall through to rotate.
+                wgpuBufferUnmap(collectSlot.buffer);
+                collectSlot.mapStatus = {};
             }
 
-            const uint32_t firstSlot = RingIndex(actualFirst);
-            const uint32_t lastSlot  = RingIndex(end);  // exclusive end
-            const uint32_t cap       = RingCapacity();
-
-            WGPUCommandEncoderDescriptor encDesc = {};
-            WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, &encDesc);
-            if (!enc) return false;
+            // Idle: rotate the ring and start the next map if there is committed data to collect.
+            //   WRITE   = m_writeIdx
+            //   PENDING = (m_writeIdx + 1) % 3  ← map this
+            //   COLLECT = (m_writeIdx + 2) % 3  ← recycle as new WRITE
+            const int writeIdx   = m_writeIdx;
+            const int pendingIdx = (writeIdx + 1) % 3;
 
-            // Either a single contiguous copy, or two copies that wrap around.
-            if (firstSlot < lastSlot || lastSlot == 0)
-            {
-                const uint32_t count = (lastSlot == 0) ? (cap - firstSlot) : (lastSlot - firstSlot);
-                wgpuCommandEncoderCopyBufferToBuffer(
-                    enc,
-                    m_resolveBuffer,
-                    static_cast<uint64_t>(firstSlot) * sizeof(uint64_t),
-                    m_readbackBuffer,
-                    static_cast<uint64_t>(firstSlot) * sizeof(uint64_t),
-                    static_cast<uint64_t>(count) * sizeof(uint64_t));
-            }
-            else
-            {
-                // Wrap: [firstSlot, cap) and [0, lastSlot).
-                wgpuCommandEncoderCopyBufferToBuffer(
-                    enc,
-                    m_resolveBuffer,
-                    static_cast<uint64_t>(firstSlot) * sizeof(uint64_t),
-                    m_readbackBuffer,
-                    static_cast<uint64_t>(firstSlot) * sizeof(uint64_t),
-                    static_cast<uint64_t>(cap - firstSlot) * sizeof(uint64_t));
-                wgpuCommandEncoderCopyBufferToBuffer(
-                    enc,
-                    m_resolveBuffer,
-                    0,
-                    m_readbackBuffer,
-                    0,
-                    static_cast<uint64_t>(lastSlot) * sizeof(uint64_t));
-            }
+            if (m_readbackSlots[writeIdx].copiedUpto <= m_previousCheckpoint)
+                return;
 
-            WGPUCommandBufferDescriptor cmdDesc = {};
-            WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cmdDesc);
-            wgpuCommandEncoderRelease(enc);
-            if (!cmd) return false;
+            const int newWriteIdx = (writeIdx + 2) % 3;
 
-            wgpuQueueSubmit(m_queue, 1, &cmd);
-            wgpuCommandBufferRelease(cmd);
+            m_readbackSlots[newWriteIdx].copiedUpto = m_previousCheckpoint.load();
 
-            // Map the entire buffer (covers both contiguous and wrapped cases).
-            // We could be tighter and map just the touched range(s), but the
-            // single-range MapAsync makes the wrap case awkward, so we map all.
-            m_pendingFirstTicket = actualFirst;
-            m_pendingEndTicket   = end;
-            m_mapState.store(MapState::Pending, std::memory_order_release);
+            m_writeIdx = newWriteIdx;
 
             WGPUBufferMapCallbackInfo cbInfo = {};
-            cbInfo.mode = WGPUCallbackMode_AllowProcessEvents;
-            cbInfo.callback = &WebGPUQueueCtx::OnMapped;
+            cbInfo.mode      = WGPUCallbackMode_AllowSpontaneous;
+            cbInfo.callback  = &WebGPUQueueCtx::OnMapped;
             cbInfo.userdata1 = this;
-
-            wgpuBufferMapAsync(
-                m_readbackBuffer,
-                WGPUMapMode_Read,
-                0,
-                static_cast<uint64_t>(cap) * sizeof(uint64_t),
-                cbInfo);
-
-            // A single pump in case the callback can fire immediately.
-            ProcessEvents();
-            return true;
+            m_readbackSlots[pendingIdx].pendingFuture = wgpuBufferMapAsync(
+                m_readbackSlots[pendingIdx].buffer, WGPUMapMode_Read, 0,
+                static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t), cbInfo);
+
+            // Optimistic immediate poll: deliver any already-completed callbacks.
+            wgpuInstanceProcessEvents(m_instance);
+            if (m_readbackSlots[pendingIdx].mapStatus != WGPUMapAsyncStatus{})
+                m_readbackSlots[pendingIdx].pendingFuture = {};
         }
 
-        static void OnMapped(WGPUMapAsyncStatus status, WGPUStringView /*msg*/, void* userdata1, void* /*userdata2*/)
-        {
-            auto* self = static_cast<WebGPUQueueCtx*>(userdata1);
-            if (status == WGPUMapAsyncStatus_Success)
-                self->m_mapState.store(MapState::Ready, std::memory_order_release);
-            else
-                self->m_mapState.store(MapState::Failed, std::memory_order_release);
-        }
-
-        void HarvestMappedRange(uint64_t targetTicket, bool urgent)
-        {
-            const uint32_t cap = RingCapacity();
-            const void* mapped = wgpuBufferGetConstMappedRange(
-                m_readbackBuffer, 0, static_cast<uint64_t>(cap) * sizeof(uint64_t));
-
-            if (!mapped)
-            {
-                wgpuBufferUnmap(m_readbackBuffer);
-                m_mapState.store(MapState::Idle, std::memory_order_release);
-                TracyWebGPUPanic("Failed to read mapped readback buffer.", return);
-            }
-
-            const uint64_t* timestampBuffer = static_cast<const uint64_t*>(mapped);
-
-            uint64_t ticket = m_pendingFirstTicket;
-            const uint64_t end = m_pendingEndTicket;
-
-            for (; ticket != end; ticket += 2)
-            {
-                if (!ResolveTimestamp(ticket, timestampBuffer))
-                    break;
-            }
-
-            // Urgent: ensure 'targetTicket' is collected before returning.
-            if (urgent)
-            {
-                while (Distance(ticket, targetTicket) >= 0)
-                {
-                    DropTimestamp(ticket, timestampBuffer);
-                    ticket += 2;
-                }
-            }
-
-            // Overflow handling: drop oldest queries to normalize the situation.
-            uint64_t curEnd = m_queryCounter;
-            while (Distance(ticket, curEnd) > static_cast<int64_t>(RingCapacity()))
-            {
-                DropTimestamp(ticket, timestampBuffer);
-                ticket += 2;
-            }
-
-            wgpuBufferUnmap(m_readbackBuffer);
-            m_mapState.store(MapState::Idle, std::memory_order_release);
-        }
-
-        bool Wait(uint64_t queryTicket, uint64_t timeout_ms)
-        {
-            ZoneScopedC(Color::Red4);
-            const auto t0 = std::chrono::steady_clock::now();
-            int64_t elapsed = 0;
-            while ((Distance(m_previousCheckpoint, queryTicket) >= 0)
-                   && (static_cast<uint64_t>(elapsed) < timeout_ms))
-            {
-                std::unique_lock lock(m_collectionMutex);
-                Collect(lock, queryTicket, false);
-                lock.unlock();
-                std::this_thread::sleep_for(std::chrono::microseconds(100));
-                elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
-                    std::chrono::steady_clock::now() - t0).count();
-            }
-            return Distance(m_previousCheckpoint, queryTicket) < 0;
-        }
-
-        void Drain(uint64_t queryTicket, uint64_t gracePeriod_ms)
-        {
-            ZoneScopedC(Color::Red4);
-            if (Wait(queryTicket, gracePeriod_ms))
-                return;
-            std::unique_lock lock(m_collectionMutex);
-            Collect(lock, queryTicket, true);
-        }
-
-        bool ResolveTimestamp(uint64_t queryTicket, const uint64_t* timestampBuffer)
+    private:
+        // Drive the WebGPU event queue so that pending callbacks (e.g. buffer-map completions) get delivered.
+        // Calls wgpuInstanceProcessEvents (the canonical webgpu.h API) whenever m_instance is non-null.
+        // When WGPU_H_ is defined (presumably by wgpu-native's wgpu.h), wgpuDevicePoll is called as well.
+        void ProcessEvents()
         {
-            uint32_t queryId = RingIndex(queryTicket);
-            uint64_t gpuZoneBeginTimestamp = timestampBuffer[queryId];
-            uint64_t gpuZoneEndTimestamp   = timestampBuffer[queryId + 1];
-            uint64_t baselineTimestamp     = m_shadowBuffer[queryId + 1];
-            int64_t  baseline_diff = Distance(baselineTimestamp, gpuZoneEndTimestamp);
-            if (baseline_diff <= 0)
-                return false;
-            EmitGpuTime(gpuZoneBeginTimestamp, queryId);
-            EmitGpuTime(gpuZoneEndTimestamp,   queryId + 1);
-            RetireTicket(queryTicket);
-            if (Distance(m_latestKnownGpuTimestamp, gpuZoneEndTimestamp) > 0)
-                m_latestKnownGpuTimestamp = gpuZoneEndTimestamp;
-            return true;
+            if (m_instance)  // nothing to pump through the instance when it is null
+                wgpuInstanceProcessEvents(m_instance);
+#ifdef WGPU_H_
+            wgpuDevicePoll(m_device, false, nullptr);  // 'false' is presumably the wait flag (non-blocking poll) — TODO confirm
+#endif
         }
 
-        void DropTimestamp(uint64_t queryTicket, const uint64_t* timestampBuffer)
+        static void OnMapped(WGPUMapAsyncStatus status, WGPUStringView, void* ud, void*)  // buffer-map callback: stores the reported status on a readback slot
         {
-            if (ResolveTimestamp(queryTicket, timestampBuffer))
-                return;
-            uint32_t queryId = RingIndex(queryTicket);
-            uint64_t latestGpuTimestamp = m_latestKnownGpuTimestamp;
-            EmitGpuTime(latestGpuTimestamp, queryId);
-            EmitGpuTime(latestGpuTimestamp, queryId + 1);
-            RetireTicket(queryTicket);
+            auto* self = static_cast<WebGPUQueueCtx*>(ud);  // ud is userdata1, which is set to 'this' where the callback is registered
+            const int collectIdx = (self->m_writeIdx + 2) % 3;  // matches the slot Collect() mapped only while m_writeIdx is unchanged since the MapAsync call
+            self->m_readbackSlots[collectIdx].mapStatus = status;  // Collect() polls this field to detect that the map finished
         }
 
-        void EmitGpuTime(uint64_t gpuTimestamp, uint32_t queryId)
+        void EmitGpuTime(uint64_t gpuTimestamp, uint32_t slot)  // queues a GpuTime item for ring index 'slot' and records the value in m_shadowBuffer[slot]
         {
             auto* item = Profiler::QueueSerial();
             MemWrite(&item->hdr.type, QueueType::GpuTime);
             MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(gpuTimestamp));
-            MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
+            MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(slot));  // NOTE(review): narrows slot to 16 bits — assumes m_queryLimit <= 65536; confirm
             MemWrite(&item->gpuTime.context, GetId());
             Profiler::QueueSerialFinish();
-            m_shadowBuffer[queryId] = gpuTimestamp;
+            m_shadowBuffer[slot] = gpuTimestamp;  // Collect() compares later readbacks against this value to spot fresh GPU writes
         }
 
-        tracy_force_inline uint32_t RingCapacity() const
-        {
-            return m_queryLimit;
-        }
+        tracy_force_inline uint32_t RingCapacity() const { return m_queryLimit; }  // ring size, in query slots, is m_queryLimit
 
-        tracy_force_inline uint32_t RingIndex(uint64_t logicalSlot) const
+        tracy_force_inline uint32_t RingIndex(uint64_t t) const  // maps a raw ticket to its ring slot (t modulo RingCapacity())
         {
-            return static_cast<uint32_t>(logicalSlot % RingCapacity());
+            return static_cast<uint32_t>(t % RingCapacity());  // RingCapacity() must be non-zero here
         }
 
         tracy_force_inline static int64_t Distance(uint64_t begin, uint64_t end)
@@ -644,34 +573,28 @@ namespace tracy
             return static_cast<int64_t>(end - begin);
         }
 
-        void RetireTicket(uint64_t ticket)
-        {
-            TracyWebGPUAssert(m_previousCheckpoint == ticket);
-            uint64_t nextTicket = ticket + 2;
-            m_previousCheckpoint.store(nextTicket, std::memory_order_release);
-        }
-
-        tracy_force_inline uint32_t NextQueryId()
+        tracy_force_inline uint64_t NextQueryId()  // reserves two consecutive query tickets and returns the first one, un-wrapped
         {
             const uint64_t ticket = m_queryCounter.fetch_add(2, std::memory_order_relaxed);
-            const uint64_t checkpoint = m_previousCheckpoint.load(std::memory_order_relaxed);
-            if (Distance(checkpoint, ticket) >= static_cast<int64_t>(RingCapacity()))
+            if (Distance(m_previousCheckpoint, ticket)
+                >= static_cast<int64_t>(RingCapacity()))  // tickets not yet collected already span the whole ring
             {
-                ZoneScopedC(Color::Red4);
                 TracyWebGPULog(Warning, "Too many pending GPU queries: stalling!");
-                uint64_t oldTicket = ticket - RingCapacity();
-                Drain(oldTicket, 0);
+                Collect();  // NOTE(review): Collect() looks non-blocking (try_lock, early returns), so despite the log text this may not free a slot
             }
-            return RingIndex(ticket);
+            return ticket;  // caller derives the ring index itself (ticket % m_queryLimit)
         }
     };
 
     class WebGPUZoneScope
     {
-        const bool m_active;
-        WebGPUQueueCtx* m_ctx = nullptr;
-        WGPUCommandEncoder m_encoder = nullptr;
-        uint32_t m_queryId = 0;
+        const bool         m_active;
+        WebGPUQueueCtx*    m_ctx       = nullptr;
+        WGPUCommandEncoder m_encoder   = nullptr;
+        uint64_t           m_rawTicket = 0;  // raw (non-modded) ticket from NextQueryId
+        uint32_t           m_queryId   = 0;  // ring index = m_rawTicket % queryLimit
+
+        WGPUPassTimestampWrites m_timestampWrites = {};
 
         tracy_force_inline void WriteQueueItem(const SourceLocationData* srcLocation, int32_t callstackDepth, uint32_t sourceLine, const char* sourceFile, size_t sourceFileLen, const char* functionName, size_t functionNameLen, const char* zoneName, size_t zoneNameLen)
         {
@@ -720,7 +643,25 @@ namespace tracy
             Profiler::QueueSerialFinish();
         }
 
-        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, bool active)
+        // Shared constructor setup: reserves a begin/end query pair, fills in m_timestampWrites and stores its address in passDesc.timestampWrites.
+        // PassDescriptor only needs a timestampWrites pointer member; intended for WGPURenderPassDescriptor and WGPUComputePassDescriptor.
+        template<typename PassDescriptor>
+        tracy_force_inline void InitBase(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, PassDescriptor& passDesc)
+        {
+            m_ctx       = ctx;
+            m_encoder   = encoder;
+            // Reserve a ticket, then derive the ring slots used for the begin/end timestamps.
+            m_rawTicket = m_ctx->NextQueryId();
+            m_queryId   = static_cast<uint32_t>(m_rawTicket % ctx->m_queryLimit);  // same mapping as WebGPUQueueCtx::RingIndex
+            m_timestampWrites.querySet                  = m_ctx->m_querySet;
+            m_timestampWrites.beginningOfPassWriteIndex = m_queryId;
+            m_timestampWrites.endOfPassWriteIndex       = m_queryId + 1;  // end timestamp goes in the slot right after the begin one
+            passDesc.timestampWrites                    = &m_timestampWrites;  // points into this scope object: passDesc must be consumed while the zone is alive
+        }
+
+    public:
+        template<typename PassDescriptor>
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, PassDescriptor& passDesc, const SourceLocationData* srcLocation, bool active)
 #ifdef TRACY_ON_DEMAND
             : m_active(active && GetProfiler().IsConnected())
 #else
@@ -728,36 +669,46 @@ namespace tracy
 #endif
         {
             if (!m_active) return;
-
-            m_ctx = ctx;
-            m_encoder = encoder;
-
-            m_queryId = m_ctx->NextQueryId();
-            wgpuCommandEncoderWriteTimestamp(m_encoder, m_ctx->m_querySet, m_queryId);
-        }
-
-    public:
-        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, const SourceLocationData* srcLocation, bool active)
-            : WebGPUZoneScope(ctx, encoder, active)
-        {
+            InitBase(ctx, encoder, passDesc);
             WriteQueueItem(srcLocation, 0, 0, nullptr, 0, nullptr, 0, nullptr, 0);
         }
 
-        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, const SourceLocationData* srcLocation, int32_t depth, bool active)
-            : WebGPUZoneScope(ctx, encoder, active)
+        template<typename PassDescriptor>  // constructor variant: SourceLocationData pointer plus callstack depth
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, PassDescriptor& passDesc, const SourceLocationData* srcLocation, int32_t depth, bool active)
+#ifdef TRACY_ON_DEMAND
+            : m_active(active && GetProfiler().IsConnected())  // on-demand builds: only active while a profiler is connected
+#else
+            : m_active(active)
+#endif
         {
+            if (!m_active) return;  // inactive zones reserve no queries and leave passDesc untouched
+            InitBase(ctx, encoder, passDesc);  // reserves the query pair and hooks passDesc.timestampWrites
             WriteQueueItem(srcLocation, depth, 0, nullptr, 0, nullptr, 0, nullptr, 0);
         }
 
-        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, WGPUCommandEncoder encoder, bool active)
-            : WebGPUZoneScope(ctx, encoder, active)
+        template<typename PassDescriptor>  // constructor variant: source line/file/function/name passed as runtime strings, no callstack depth
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, WGPUCommandEncoder encoder, PassDescriptor& passDesc, bool active)
+#ifdef TRACY_ON_DEMAND
+            : m_active(active && GetProfiler().IsConnected())  // on-demand builds: only active while a profiler is connected
+#else
+            : m_active(active)
+#endif
         {
+            if (!m_active) return;  // inactive zones reserve no queries and leave passDesc untouched
+            InitBase(ctx, encoder, passDesc);  // reserves the query pair and hooks passDesc.timestampWrites
             WriteQueueItem(nullptr, 0, line, source, sourceSz, function, functionSz, name, nameSz);
         }
 
-        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, WGPUCommandEncoder encoder, int32_t depth, bool active)
-            : WebGPUZoneScope(ctx, encoder, active)
+        template<typename PassDescriptor>  // constructor variant: source line/file/function/name passed as runtime strings, plus callstack depth
+        tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, WGPUCommandEncoder encoder, PassDescriptor& passDesc, int32_t depth, bool active)
+#ifdef TRACY_ON_DEMAND
+            : m_active(active && GetProfiler().IsConnected())  // on-demand builds: only active while a profiler is connected
+#else
+            : m_active(active)
+#endif
         {
+            if (!m_active) return;  // inactive zones reserve no queries and leave passDesc untouched
+            InitBase(ctx, encoder, passDesc);  // reserves the query pair and hooks passDesc.timestampWrites
             WriteQueueItem(nullptr, depth, line, source, sourceSz, function, functionSz, name, nameSz);
         }
 
@@ -775,20 +726,34 @@ namespace tracy
             MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
             Profiler::QueueSerialFinish();
 
-            // Write the end timestamp and resolve the (begin, end) pair into
-            // the resolve buffer right away. We cannot move the resolve to
-            // Collect() because the user may finish/destroy the encoder
-            // immediately after the zone closes, and ResolveQuerySet must be
-            // recorded into an encoder belonging to the same submission as the
-            // timestamp writes if we want to read the values for THIS zone in
-            // the same frame. Recording it here also matches the D3D12 backend.
-            wgpuCommandEncoderWriteTimestamp(m_encoder, m_ctx->m_querySet, queryId);
-            wgpuCommandEncoderResolveQuerySet(
-                m_encoder,
-                m_ctx->m_querySet,
-                m_queryId, 2,
-                m_ctx->m_resolveBuffer,
-                static_cast<uint64_t>(m_queryId) * sizeof(uint64_t));
+            if (m_queryId % 32 == 30)
+            {
+                // 32 queries = 32 * 8 bytes = 256 bytes
+                const uint32_t blockStart  = m_queryId - 30;
+                const uint64_t blockOffset = static_cast<uint64_t>(blockStart) * sizeof(uint64_t);
+                wgpuCommandEncoderResolveQuerySet(
+                    m_encoder,
+                    m_ctx->m_querySet,
+                    blockStart, 32,
+                    m_ctx->m_resolveBuffer,
+                    blockOffset // MUST be a multiple of (aligned to) 256...
+                );
+                auto& slot = m_ctx->m_readbackSlots[m_ctx->m_writeIdx];
+                auto readbackBuffer = slot.buffer;
+                wgpuCommandEncoderCopyBufferToBuffer(
+                    m_encoder,
+                    m_ctx->m_resolveBuffer,
+                    blockOffset,
+                    readbackBuffer,
+                    blockOffset,
+                    32 * sizeof(uint64_t));
+                // Advance this slot's high-water mark to cover the block just encoded.
+                const uint64_t blockEnd = m_rawTicket + 2;
+                uint64_t prev = slot.copiedUpto;
+                while (prev < blockEnd &&
+                       !slot.copiedUpto.compare_exchange_weak(prev, blockEnd)) {}
+                fprintf(stdout, "[TWG] WebGPUZoneScope [%d] (%d,%d)\n", (int)m_ctx->m_writeIdx, blockStart, m_queryId);
+            }
         }
     };
 
@@ -826,38 +791,36 @@ using TracyWebGPUCtx = tracy::WebGPUQueueCtx*;
 #define TracyWebGPUDestroy(ctx) tracy::DestroyWebGPUContext(ctx);
 #define TracyWebGPUContextName(ctx, name, size) ctx->Name(name, size);
 
-#define TracyWebGPUNewFrame(ctx) ((void)(ctx))
-
 #define TracyWebGPUUnnamedZone ___tracy_gpu_webgpu_zone
 #define TracyWebGPUSrcLocSymbol TracyConcat(__tracy_webgpu_source_location,TracyLine)
 #define TracyWebGPUSrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyWebGPUSrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
 
 #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
-#  define TracyWebGPUZone(ctx, encoder, name) TracyWebGPUNamedZoneS(ctx, TracyWebGPUUnnamedZone, encoder, name, TRACY_CALLSTACK, true)
-#  define TracyWebGPUZoneC(ctx, encoder, name, color) TracyWebGPUNamedZoneCS(ctx, TracyWebGPUUnnamedZone, encoder, name, color, TRACY_CALLSTACK, true)
-#  define TracyWebGPUNamedZone(ctx, varname, encoder, name, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, TRACY_CALLSTACK, active };
-#  define TracyWebGPUNamedZoneC(ctx, varname, encoder, name, color, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, TRACY_CALLSTACK, active };
-#  define TracyWebGPUZoneTransient(ctx, varname, encoder, name, active) TracyWebGPUZoneTransientS(ctx, varname, encoder, name, TRACY_CALLSTACK, active)
+#  define TracyWebGPUZone(ctx, encoder, passDesc, name) TracyWebGPUNamedZoneS(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, TRACY_CALLSTACK, true)
+#  define TracyWebGPUZoneC(ctx, encoder, passDesc, name, color) TracyWebGPUNamedZoneCS(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, color, TRACY_CALLSTACK, true)
+#  define TracyWebGPUNamedZone(ctx, varname, encoder, passDesc, name, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, TRACY_CALLSTACK, active };
+#  define TracyWebGPUNamedZoneC(ctx, varname, encoder, passDesc, name, color, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, TRACY_CALLSTACK, active };
+#  define TracyWebGPUZoneTransient(ctx, varname, encoder, passDesc, name, active) TracyWebGPUZoneTransientS(ctx, varname, encoder, passDesc, name, TRACY_CALLSTACK, active)
 #else
-#  define TracyWebGPUZone(ctx, encoder, name) TracyWebGPUNamedZone(ctx, TracyWebGPUUnnamedZone, encoder, name, true)
-#  define TracyWebGPUZoneC(ctx, encoder, name, color) TracyWebGPUNamedZoneC(ctx, TracyWebGPUUnnamedZone, encoder, name, color, true)
-#  define TracyWebGPUNamedZone(ctx, varname, encoder, name, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, active };
-#  define TracyWebGPUNamedZoneC(ctx, varname, encoder, name, color, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, active };
-#  define TracyWebGPUZoneTransient(ctx, varname, encoder, name, active) tracy::WebGPUZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), encoder, active };
+#  define TracyWebGPUZone(ctx, encoder, passDesc, name) TracyWebGPUNamedZone(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, true)
+#  define TracyWebGPUZoneC(ctx, encoder, passDesc, name, color) TracyWebGPUNamedZoneC(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, color, true)
+#  define TracyWebGPUNamedZone(ctx, varname, encoder, passDesc, name, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, active };
+#  define TracyWebGPUNamedZoneC(ctx, varname, encoder, passDesc, name, color, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, active };
+#  define TracyWebGPUZoneTransient(ctx, varname, encoder, passDesc, name, active) tracy::WebGPUZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), encoder, passDesc, active };
 #endif
 
 #ifdef TRACY_HAS_CALLSTACK
-#  define TracyWebGPUZoneS(ctx, encoder, name, depth) TracyWebGPUNamedZoneS(ctx, TracyWebGPUUnnamedZone, encoder, name, depth, true)
-#  define TracyWebGPUZoneCS(ctx, encoder, name, color, depth) TracyWebGPUNamedZoneCS(ctx, TracyWebGPUUnnamedZone, encoder, name, color, depth, true)
-#  define TracyWebGPUNamedZoneS(ctx, varname, encoder, name, depth, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, depth, active };
-#  define TracyWebGPUNamedZoneCS(ctx, varname, encoder, name, color, depth, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, &TracyWebGPUSrcLocSymbol, depth, active };
-#  define TracyWebGPUZoneTransientS(ctx, varname, encoder, name, depth, active) tracy::WebGPUZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), encoder, depth, active };
+#  define TracyWebGPUZoneS(ctx, encoder, passDesc, name, depth) TracyWebGPUNamedZoneS(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, depth, true)
+#  define TracyWebGPUZoneCS(ctx, encoder, passDesc, name, color, depth) TracyWebGPUNamedZoneCS(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, color, depth, true)
+#  define TracyWebGPUNamedZoneS(ctx, varname, encoder, passDesc, name, depth, active) TracyWebGPUSrcLocObject(name, 0); tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, depth, active };
+#  define TracyWebGPUNamedZoneCS(ctx, varname, encoder, passDesc, name, color, depth, active) TracyWebGPUSrcLocObject(name, color); tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, depth, active };
+#  define TracyWebGPUZoneTransientS(ctx, varname, encoder, passDesc, name, depth, active) tracy::WebGPUZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), encoder, passDesc, depth, active };
 #else
-#  define TracyWebGPUZoneS(ctx, encoder, name, depth) TracyWebGPUZone(ctx, encoder, name)
-#  define TracyWebGPUZoneCS(ctx, encoder, name, color, depth) TracyWebGPUZoneC(ctx, encoder, name, color)
-#  define TracyWebGPUNamedZoneS(ctx, varname, encoder, name, depth, active) TracyWebGPUNamedZone(ctx, varname, encoder, name, active)
-#  define TracyWebGPUNamedZoneCS(ctx, varname, encoder, name, color, depth, active) TracyWebGPUNamedZoneC(ctx, varname, encoder, name, color, active)
-#  define TracyWebGPUZoneTransientS(ctx, varname, encoder, name, depth, active) TracyWebGPUZoneTransient(ctx, varname, encoder, name, active)
+#  define TracyWebGPUZoneS(ctx, encoder, passDesc, name, depth) TracyWebGPUZone(ctx, encoder, passDesc, name)
+#  define TracyWebGPUZoneCS(ctx, encoder, passDesc, name, color, depth) TracyWebGPUZoneC(ctx, encoder, passDesc, name, color)
+#  define TracyWebGPUNamedZoneS(ctx, varname, encoder, passDesc, name, depth, active) TracyWebGPUNamedZone(ctx, varname, encoder, passDesc, name, active)
+#  define TracyWebGPUNamedZoneCS(ctx, varname, encoder, passDesc, name, color, depth, active) TracyWebGPUNamedZoneC(ctx, varname, encoder, passDesc, name, color, active)
+#  define TracyWebGPUZoneTransientS(ctx, varname, encoder, passDesc, name, depth, active) TracyWebGPUZoneTransient(ctx, varname, encoder, passDesc, name, active)
 #endif
 
 #define TracyWebGPUCollect(ctx) ctx->Collect();
diff --git a/python/bindings/ServerModule.cpp b/python/bindings/ServerModule.cpp
index 74ee235b61..fc1890a825 100644
--- a/python/bindings/ServerModule.cpp
+++ b/python/bindings/ServerModule.cpp
@@ -1033,14 +1033,14 @@ PYBIND11_MODULE( TracyServerBindings, m )
         // --- GPU contexts ---
         .def( "get_gpu_contexts", []( const Worker& w ) {
         static const char* gpuTypeStr[] = {
-            "Invalid", "OpenGL", "Vulkan", "OpenCL", "Direct3D12", "Direct3D11", "Metal", "Custom", "CUDA", "Rocprof" };
+            "Invalid", "OpenGL", "Vulkan", "OpenCL", "Direct3D12", "Direct3D11", "Metal", "Custom", "CUDA", "Rocprof", "WebGPU" };
         std::vector<GpuContextSummary> result;
         for( const auto* ctx : w.GetGpuData() )
         {
             if( !ctx ) continue;
             const std::string name = ctx->name.Active() ? w.GetString( ctx->name ) : "";
             const uint8_t typeIdx = (uint8_t)ctx->type;
-            const char* typeStr = typeIdx < 10 ? gpuTypeStr[typeIdx] : "Unknown";
+            const char* typeStr = typeIdx < sizeof( gpuTypeStr ) / sizeof( gpuTypeStr[0] ) ? gpuTypeStr[typeIdx] : "Unknown";
             result.push_back( GpuContextSummary{
                 name, ctx->count, std::string( typeStr ), ctx->thread } );
         }

From ecbdfde549fda66a438f1444b919b659e74bbcb4 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Tue, 19 May 2026 09:41:38 -0700
Subject: [PATCH 03/21] getting rid of spontaneous callbacks

---
 public/tracy/TracyWebGPU.hpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index a4d777b152..3c368dacc9 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -217,7 +217,7 @@ namespace tracy
             // Wait for the GPU to finish executing the command buffer before mapping.
             bool gpuDone = false;
             WGPUQueueWorkDoneCallbackInfo doneCB = {};
-            doneCB.mode      = WGPUCallbackMode_AllowSpontaneous;
+            doneCB.mode      = WGPUCallbackMode_AllowProcessEvents;
             doneCB.callback  = [](WGPUQueueWorkDoneStatus, WGPUStringView, void* ud, void*) {
                 *static_cast<bool*>(ud) = true;
             };
@@ -231,7 +231,7 @@ namespace tracy
             struct MapCtx { WGPUBuffer buffer; uint32_t slotB; uint64_t gpuTime = 0; bool ok = false; };
             MapCtx mctx{ m_readbackSlots[0].buffer, calibSlotB };
             WGPUBufferMapCallbackInfo cbInfo = {};
-            cbInfo.mode      = WGPUCallbackMode_AllowSpontaneous;
+            cbInfo.mode      = WGPUCallbackMode_AllowProcessEvents;
             cbInfo.callback  = [](WGPUMapAsyncStatus status, WGPUStringView, void* ud, void*) {
                 auto* ctx = static_cast<MapCtx*>(ud);
                 if (status != WGPUMapAsyncStatus_Success) return;
@@ -436,7 +436,7 @@ namespace tracy
             SubmitQueueItem(item);
         }
 
-        void Collect()
+        void Collect(bool webgpuProcessEvents=false)
         {
 #ifdef TRACY_ON_DEMAND
             if (!GetProfiler().IsConnected()) return;
@@ -455,7 +455,8 @@ namespace tracy
             // Poll for an in-flight map to complete.
             if (collectSlot.pendingFuture.id != 0)
             {
-                wgpuInstanceProcessEvents(m_instance);
+                if (webgpuProcessEvents)
+                    wgpuInstanceProcessEvents(m_instance);
                 if (collectSlot.mapStatus == WGPUMapAsyncStatus{})
                     return;  // callback hasn't fired yet
                 collectSlot.pendingFuture = {};
@@ -517,7 +518,7 @@ namespace tracy
             m_writeIdx = newWriteIdx;
 
             WGPUBufferMapCallbackInfo cbInfo = {};
-            cbInfo.mode      = WGPUCallbackMode_AllowSpontaneous;
+            cbInfo.mode      = WGPUCallbackMode_AllowProcessEvents;
             cbInfo.callback  = &WebGPUQueueCtx::OnMapped;
             cbInfo.userdata1 = this;
             m_readbackSlots[pendingIdx].pendingFuture = wgpuBufferMapAsync(

From 88e87a3348584da093afcea97f4c3d8dbfdc1eac Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Tue, 19 May 2026 11:46:50 -0700
Subject: [PATCH 04/21] refactoring query resolve

---
 public/tracy/TracyWebGPU.hpp | 62 ++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index 3c368dacc9..5add6baa3a 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -597,6 +597,40 @@ namespace tracy
 
         WGPUPassTimestampWrites m_timestampWrites = {};
 
+        void ResolveQueryBatch(uint32_t queryId)
+        {
+            // 32 queries = 32 * 8 bytes = 256 bytes
+            const uint32_t blockStart  = queryId - 30;
+            TracyWebGPUAssert(blockStart % 32 == 0, return);
+
+            const uint64_t blockOffset = static_cast<uint64_t>(blockStart) * sizeof(uint64_t);
+            wgpuCommandEncoderResolveQuerySet(
+                m_encoder,
+                m_ctx->m_querySet,
+                blockStart, 32,
+                m_ctx->m_resolveBuffer,
+                blockOffset // MUST be a multiple of (aligned to) 256...
+            );
+
+            auto& slot = m_ctx->m_readbackSlots[m_ctx->m_writeIdx];
+            auto readbackBuffer = slot.buffer;
+            wgpuCommandEncoderCopyBufferToBuffer(
+                m_encoder,
+                m_ctx->m_resolveBuffer,
+                blockOffset,
+                readbackBuffer,
+                blockOffset,
+                32 * sizeof(uint64_t)
+            );
+
+            // Advance this slot's high-water mark to cover the block just encoded.
+            const uint64_t blockEnd = m_rawTicket + 2;
+            uint64_t prev = slot.copiedUpto;
+            while (prev < blockEnd &&
+                   !slot.copiedUpto.compare_exchange_weak(prev, blockEnd)) {}
+            fprintf(stdout, "[TWG] WebGPUZoneScope [%d] (%d,%d)\n", (int)m_ctx->m_writeIdx, blockStart, queryId);
+        }
+
         tracy_force_inline void WriteQueueItem(const SourceLocationData* srcLocation, int32_t callstackDepth, uint32_t sourceLine, const char* sourceFile, size_t sourceFileLen, const char* functionName, size_t functionNameLen, const char* zoneName, size_t zoneNameLen)
         {
             if (!m_active) return;
@@ -728,33 +762,7 @@ namespace tracy
             Profiler::QueueSerialFinish();
 
             if (m_queryId % 32 == 30)
-            {
-                // 32 queries = 32 * 8 bytes = 256 bytes
-                const uint32_t blockStart  = m_queryId - 30;
-                const uint64_t blockOffset = static_cast<uint64_t>(blockStart) * sizeof(uint64_t);
-                wgpuCommandEncoderResolveQuerySet(
-                    m_encoder,
-                    m_ctx->m_querySet,
-                    blockStart, 32,
-                    m_ctx->m_resolveBuffer,
-                    blockOffset // MUST be a multiple of (aligned to) 256...
-                );
-                auto& slot = m_ctx->m_readbackSlots[m_ctx->m_writeIdx];
-                auto readbackBuffer = slot.buffer;
-                wgpuCommandEncoderCopyBufferToBuffer(
-                    m_encoder,
-                    m_ctx->m_resolveBuffer,
-                    blockOffset,
-                    readbackBuffer,
-                    blockOffset,
-                    32 * sizeof(uint64_t));
-                // Advance this slot's high-water mark to cover the block just encoded.
-                const uint64_t blockEnd = m_rawTicket + 2;
-                uint64_t prev = slot.copiedUpto;
-                while (prev < blockEnd &&
-                       !slot.copiedUpto.compare_exchange_weak(prev, blockEnd)) {}
-                fprintf(stdout, "[TWG] WebGPUZoneScope [%d] (%d,%d)\n", (int)m_ctx->m_writeIdx, blockStart, m_queryId);
-            }
+                ResolveQueryBatch(m_queryId);
         }
     };
 

From f657f1e99d9efaf2b896fc9089753fa5b3e6b2b5 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Tue, 19 May 2026 16:32:52 -0700
Subject: [PATCH 05/21] calibration stability

---
 public/tracy/TracyWebGPU.hpp | 211 +++++++++++++++++++++--------------
 1 file changed, 125 insertions(+), 86 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index 5add6baa3a..aa3042b146 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -105,7 +105,50 @@ namespace tracy
         uint32_t m_queryLimit = 0;
 
         std::vector<uint64_t> m_shadowBuffer;
-        uint64_t m_prevCalibGpuTime = 0;
+
+        struct Calibration {
+            uint64_t cpuTime = 0;
+            uint64_t gpuTime = 0;
+            static bool WaitQueueIdle(WGPUQueue queue, WGPUInstance instance)
+            {
+                bool gpuDone = false;
+                WGPUQueueWorkDoneCallbackInfo doneCB = {};
+                doneCB.mode = WGPUCallbackMode_AllowProcessEvents;
+                doneCB.callback = [](WGPUQueueWorkDoneStatus, WGPUStringView, void* userData, void*) {
+                    *static_cast<bool*>(userData) = true;
+                };
+                doneCB.userdata1 = &gpuDone;
+                wgpuQueueOnSubmittedWorkDone(queue, doneCB);
+
+                const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(2);
+                while (!gpuDone && std::chrono::steady_clock::now() < deadline)
+                    wgpuInstanceProcessEvents(instance);
+                return gpuDone;
+            }
+            static const uint64_t* MapBufferSync(WGPUBuffer buffer, WGPUInstance instance)
+            {
+                struct MapCtx { WGPUMapAsyncStatus status = (WGPUMapAsyncStatus)0; } ctx;
+                WGPUBufferMapCallbackInfo cbInfo = {};
+                cbInfo.mode      = WGPUCallbackMode_AllowProcessEvents;
+                cbInfo.callback  = [](WGPUMapAsyncStatus status, WGPUStringView, void* userData, void*) {
+                    auto* ctx = static_cast<MapCtx*>(userData);
+                    ctx->status = status;
+                };
+                cbInfo.userdata1 = &ctx;
+                size_t offset = 0;
+                size_t size = 2 * sizeof(uint64_t);
+                wgpuBufferMapAsync(buffer, WGPUMapMode_Read, offset, size, cbInfo);
+
+                const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(2);
+                while (ctx.status == 0 && std::chrono::steady_clock::now() < deadline)
+                    wgpuInstanceProcessEvents(instance);
+
+                if (ctx.status != WGPUMapAsyncStatus_Success) return nullptr;
+                auto data = wgpuBufferGetConstMappedRange(buffer, offset, size);
+                return static_cast<const uint64_t*>(data);
+            }
+            bool Update(uint64_t tcpu0, uint64_t tcpu1, uint64_t tgpu) { return false; }
+        } m_calibration;
 
         tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item)
         {
@@ -119,9 +162,6 @@ namespace tracy
         {
             ZoneScoped;
 
-            WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, nullptr);
-            if (!enc) { TracyWebGPUPanic("Failed to create calibration command encoder.", return false); }
-
             // wgpuCommandEncoderWriteTimestamp is deprecated and returns 0 on Metal.
             // Use a render pass with an actual draw call: on Metal TBDR, begin-of-pass
             // timestamps fire at tile rasterization start. An empty render pass (no
@@ -140,7 +180,7 @@ namespace tracy
             WGPUShaderModuleDescriptor smDesc = {};
             smDesc.nextInChain  = reinterpret_cast<WGPUChainedStruct*>(&wgslSrc);
             WGPUShaderModule calibShader = wgpuDeviceCreateShaderModule(m_device, &smDesc);
-            if (!calibShader) { wgpuCommandEncoderRelease(enc); TracyWebGPUPanic("Failed to create calibration shader.", return false); }
+            if (!calibShader) { TracyWebGPUPanic("Failed to create calibration shader.", return false); }
 
             WGPUTextureDescriptor texDesc = {};
             texDesc.usage         = WGPUTextureUsage_RenderAttachment;
@@ -150,9 +190,9 @@ namespace tracy
             texDesc.mipLevelCount = 1;
             texDesc.sampleCount   = 1;
             WGPUTexture tex = wgpuDeviceCreateTexture(m_device, &texDesc);
-            if (!tex) { wgpuShaderModuleRelease(calibShader); wgpuCommandEncoderRelease(enc); TracyWebGPUPanic("Failed to create calibration scratch texture.", return false); }
+            if (!tex) { wgpuShaderModuleRelease(calibShader); TracyWebGPUPanic("Failed to create calibration scratch texture.", return false); }
             WGPUTextureView texView = wgpuTextureCreateView(tex, nullptr);
-            if (!texView) { wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); wgpuCommandEncoderRelease(enc); TracyWebGPUPanic("Failed to create calibration scratch texture view.", return false); }
+            if (!texView) { wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); TracyWebGPUPanic("Failed to create calibration scratch texture view.", return false); }
 
             WGPUColorTargetState colorTarget = {};
             colorTarget.format    = WGPUTextureFormat_BGRA8Unorm;
@@ -169,11 +209,13 @@ namespace tracy
             pipeDesc.multisample.count    = 1;
             pipeDesc.fragment             = &fragState;
             WGPURenderPipeline calibPipeline = wgpuDeviceCreateRenderPipeline(m_device, &pipeDesc);
-            if (!calibPipeline) { wgpuTextureViewRelease(texView); wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); wgpuCommandEncoderRelease(enc); TracyWebGPUPanic("Failed to create calibration pipeline.", return false); }
+            if (!calibPipeline) { wgpuTextureViewRelease(texView); wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); TracyWebGPUPanic("Failed to create calibration pipeline.", return false); }
 
             //const uint64_t calibTicket = NextQueryId();
             //const uint32_t calibSlotB  = RingIndex(calibTicket);
             //const uint32_t calibSlotE  = calibSlotB + 1;
+            //m_previousCheckpoint = m_queryCounter.load();
+
             const uint32_t calibSlotB  = 0;
             const uint32_t calibSlotE  = 1;
 
@@ -193,86 +235,70 @@ namespace tracy
             passDesc.colorAttachments     = &att;
             passDesc.timestampWrites      = &anchorTs;
 
-            WGPURenderPassEncoder pass = wgpuCommandEncoderBeginRenderPass(enc, &passDesc);
-            wgpuRenderPassEncoderSetPipeline(pass, calibPipeline);
-            wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
-            wgpuRenderPassEncoderEnd(pass);
-            wgpuRenderPassEncoderRelease(pass);
-            wgpuRenderPipelineRelease(calibPipeline);
-            wgpuShaderModuleRelease(calibShader);
-            wgpuTextureViewRelease(texView);
-            wgpuTextureRelease(tex);
-
-            wgpuCommandEncoderResolveQuerySet(enc, m_querySet, calibSlotB, 2, m_resolveBuffer, calibSlotB * sizeof(uint64_t));
-            wgpuCommandEncoderCopyBufferToBuffer(enc, m_resolveBuffer, calibSlotB * sizeof(uint64_t), m_readbackSlots[0].buffer, calibSlotB * sizeof(uint64_t), 2 * sizeof(uint64_t));
-
-            WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, nullptr);
-            wgpuCommandEncoderRelease(enc);
-            if (!cmd) { TracyWebGPUPanic("Failed to finish calibration command encoder.", return false); }
-
-            auto t0 = Profiler::GetTime();
-            wgpuQueueSubmit(m_queue, 1, &cmd);
-            wgpuCommandBufferRelease(cmd);
-
-            // Wait for the GPU to finish executing the command buffer before mapping.
-            bool gpuDone = false;
-            WGPUQueueWorkDoneCallbackInfo doneCB = {};
-            doneCB.mode      = WGPUCallbackMode_AllowProcessEvents;
-            doneCB.callback  = [](WGPUQueueWorkDoneStatus, WGPUStringView, void* ud, void*) {
-                *static_cast<bool*>(ud) = true;
-            };
-            doneCB.userdata1 = &gpuDone;
-            wgpuQueueOnSubmittedWorkDone(m_queue, doneCB);
-
-            const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(2);
-            while (!gpuDone && std::chrono::steady_clock::now() < deadline)
-                wgpuInstanceProcessEvents(m_instance);
-
-            struct MapCtx { WGPUBuffer buffer; uint32_t slotB; uint64_t gpuTime = 0; bool ok = false; };
-            MapCtx mctx{ m_readbackSlots[0].buffer, calibSlotB };
-            WGPUBufferMapCallbackInfo cbInfo = {};
-            cbInfo.mode      = WGPUCallbackMode_AllowProcessEvents;
-            cbInfo.callback  = [](WGPUMapAsyncStatus status, WGPUStringView, void* ud, void*) {
-                auto* ctx = static_cast<MapCtx*>(ud);
-                if (status != WGPUMapAsyncStatus_Success) return;
-                const auto* ts = static_cast<const uint64_t*>(
-                    wgpuBufferGetConstMappedRange(ctx->buffer, ctx->slotB * sizeof(uint64_t), 2 * sizeof(uint64_t)));
-                if (ts)
+            int64_t minCpuRange = 999'999'999'999;
+            for (int i=0; i<10; ++i)
+            {
+                WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, nullptr);
+                if (!enc) { TracyWebGPUPanic("Failed to create calibration command encoder.", return false); }
+
+                WGPURenderPassEncoder pass = wgpuCommandEncoderBeginRenderPass(enc, &passDesc);
+                wgpuRenderPassEncoderSetPipeline(pass, calibPipeline);
+                wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
+                wgpuRenderPassEncoderEnd(pass);
+                wgpuRenderPassEncoderRelease(pass);
+
+                wgpuCommandEncoderResolveQuerySet(enc, m_querySet, calibSlotB, 2, m_resolveBuffer, calibSlotB * sizeof(uint64_t));
+                wgpuCommandEncoderCopyBufferToBuffer(enc, m_resolveBuffer, calibSlotB * sizeof(uint64_t), m_readbackSlots[0].buffer, calibSlotB * sizeof(uint64_t), 2 * sizeof(uint64_t));
+
+                WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, nullptr);
+                wgpuCommandEncoderRelease(enc);
+                if (!cmd) { TracyWebGPUPanic("Failed to finish calibration command encoder.", return false); }
+
+                Calibration::WaitQueueIdle(m_queue, m_instance);
+                int64_t cpu [2] = {};
+                cpu[0] = Profiler::GetTime();
+                wgpuQueueSubmit(m_queue, 1, &cmd);
+                wgpuCommandBufferRelease(cmd);
+                Calibration::WaitQueueIdle(m_queue, m_instance);
+                cpu[1] = Profiler::GetTime();
+                int64_t cpuRange = cpu[1] - cpu[0];
+                auto gpu = Calibration::MapBufferSync(m_readbackSlots[0].buffer, m_instance);
+                TracyWebGPUAssert(gpu != nullptr);
+                fprintf(stdout, "CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", gpu[0], gpu[1], gpu[1]-gpu[0], cpuRange);
+                if (cpuRange < minCpuRange)
                 {
-                    ctx->gpuTime = ts[0];
-                    ctx->ok = true;
-                    fprintf(stdout, "CalibrateClocks() -> %llu | %llu | %lld\n", ts[0], ts[1], ts[1]-ts[0]);
+                    outCpuTime = cpu[1];    // static_cast<uint64_t>(t0 + (t1-t0)/2);
+                    outGpuTime = gpu[0];
+                    minCpuRange = cpuRange;
                 }
-                wgpuBufferUnmap(ctx->buffer);
-            };
-            cbInfo.userdata1 = &mctx;
-            wgpuBufferMapAsync(m_readbackSlots[0].buffer, WGPUMapMode_Read,
-                               calibSlotB * sizeof(uint64_t), 2 * sizeof(uint64_t), cbInfo);
-
-            while (!mctx.ok && std::chrono::steady_clock::now() < deadline)
-                wgpuInstanceProcessEvents(m_instance);
-            //m_previousCheckpoint = m_queryCounter.load();
+                wgpuBufferUnmap(m_readbackSlots[0].buffer);
 
-            auto t1 = Profiler::GetTime();
-            //outCpuTime = static_cast<uint64_t>(t0 + (t1-t0)/2);
-            outCpuTime = t1;
-
-            if (!mctx.ok)
-            {
-                TracyWebGPUPanic("Failed to calibrate CPU/GPU clocks.", return false);
+                if (outGpuTime < m_calibration.gpuTime)
+                    fprintf(stdout, "CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", m_calibration.gpuTime, outGpuTime, outGpuTime - m_calibration.gpuTime);
+                m_calibration.gpuTime = outGpuTime;
             }
 
-            outGpuTime = mctx.gpuTime;
-            fprintf(stdout, "CalibrateClocks() -> %llu\n", outGpuTime);
-            if (outGpuTime < m_prevCalibGpuTime)
-                fprintf(stdout, "CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", m_prevCalibGpuTime, outGpuTime, outGpuTime-m_prevCalibGpuTime);
-            m_prevCalibGpuTime = outGpuTime;
+            wgpuRenderPipelineRelease(calibPipeline);
+            wgpuShaderModuleRelease(calibShader);
+            wgpuTextureViewRelease(texView);
+            wgpuTextureRelease(tex);
+
             return true;
         }
 
     public:
         static bool SetupDevice(WGPUDeviceDescriptor& deviceDescriptor)
         {
+            static constexpr int MaxFeatures = 128;
+            static WGPUFeatureName features [MaxFeatures] = {};
+
+            int n = deviceDescriptor.requiredFeatureCount;
+            assert(n + 2 <= MaxFeatures && "Too many required features in WGPUDeviceDescriptor");  // up to two entries are appended below
+            if (n > 0 && deviceDescriptor.requiredFeatures)
+                memcpy(features, deviceDescriptor.requiredFeatures, n * sizeof(WGPUFeatureName));
+
+            features[n++] = WGPUFeatureName_TimestampQuery;
+
             // piggy-back on WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT to detect Dawn header
 #           ifdef WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT
                 fprintf(stderr, "[INFO] [DAWN] ENABLING RAW TIMESTAMP TICKS (disabling ns conversion + quantization)\n");
@@ -287,7 +313,20 @@ namespace tracy
                 togglesDesc.enabledToggles = dawnEnabledToggles;
                 togglesDesc.enabledToggleCount  = 1;
                 deviceDescriptor.nextInChain = reinterpret_cast<WGPUChainedStruct*>(&togglesDesc);
+#           else
+                // wgpu-native: in addition to the standard TimestampQuery feature,
+                // request the non-standard TimestampQueryInsideEncoders native feature
+                // (TimestampQueryInsidePasses is declared below but left disabled).
+                fprintf(stderr, "[INFO] [WGPU] Requesting TimestampQueryInsideEncoders native feature\n");
+                {
+                    constexpr auto WGPUNativeFeature_TimestampQueryInsideEncoders = 0x00030024;
+                    constexpr auto WGPUNativeFeature_TimestampQueryInsidePasses = 0x00030025;
+                    features[n++] = (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders;
+                    //features[n++] = (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsidePasses;
+                }
 #           endif
+            deviceDescriptor.requiredFeatures = features;
+            deviceDescriptor.requiredFeatureCount = static_cast<uint32_t>(n);
             return true;
         }
 
@@ -597,17 +636,17 @@ namespace tracy
 
         WGPUPassTimestampWrites m_timestampWrites = {};
 
-        void ResolveQueryBatch(uint32_t queryId)
+        void ResolveQueryBatch(uint32_t queryBatchStartId)
         {
             // 32 queries = 32 * 8 bytes = 256 bytes
-            const uint32_t blockStart  = queryId - 30;
-            TracyWebGPUAssert(blockStart % 32 == 0, return);
+            TracyWebGPUAssert(queryBatchStartId % 32 == 0, return);
+            queryBatchStartId = queryBatchStartId % m_ctx->m_queryLimit;
 
-            const uint64_t blockOffset = static_cast<uint64_t>(blockStart) * sizeof(uint64_t);
+            const uint64_t blockOffset = static_cast<uint64_t>(queryBatchStartId) * sizeof(uint64_t);
             wgpuCommandEncoderResolveQuerySet(
                 m_encoder,
                 m_ctx->m_querySet,
-                blockStart, 32,
+                queryBatchStartId, 32,
                 m_ctx->m_resolveBuffer,
                 blockOffset // MUST be a multiple of (aligned to) 256...
             );
@@ -624,11 +663,11 @@ namespace tracy
             );
 
             // Advance this slot's high-water mark to cover the block just encoded.
-            const uint64_t blockEnd = m_rawTicket + 2;
+            const uint64_t blockEnd = m_rawTicket;
             uint64_t prev = slot.copiedUpto;
             while (prev < blockEnd &&
                    !slot.copiedUpto.compare_exchange_weak(prev, blockEnd)) {}
-            fprintf(stdout, "[TWG] WebGPUZoneScope [%d] (%d,%d)\n", (int)m_ctx->m_writeIdx, blockStart, queryId);
+            fprintf(stdout, "[TWG] WebGPUZoneScope [%d] (%d,%d)\n", (int)m_ctx->m_writeIdx, queryBatchStartId, queryBatchStartId+32);
         }
 
         tracy_force_inline void WriteQueueItem(const SourceLocationData* srcLocation, int32_t callstackDepth, uint32_t sourceLine, const char* sourceFile, size_t sourceFileLen, const char* functionName, size_t functionNameLen, const char* zoneName, size_t zoneNameLen)
@@ -761,8 +800,8 @@ namespace tracy
             MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
             Profiler::QueueSerialFinish();
 
-            if (m_queryId % 32 == 30)
-                ResolveQueryBatch(m_queryId);
+            if (m_queryId % 32 == 0)
+                ResolveQueryBatch(m_queryId-32);
         }
     };
 

From 388a94fc79802a713ee5aa8f59e0e30420d6026a Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Wed, 20 May 2026 08:15:32 -0700
Subject: [PATCH 06/21] refactoring initial calibration

---
 public/tracy/TracyWebGPU.hpp | 86 ++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 48 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index aa3042b146..9958573113 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -109,6 +109,7 @@ namespace tracy
         struct Calibration {
             uint64_t cpuTime = 0;
             uint64_t gpuTime = 0;
+            int64_t minCpuRange = ~uint64_t(0) >> 1;
             static bool WaitQueueIdle(WGPUQueue queue, WGPUInstance instance)
             {
                 bool gpuDone = false;
@@ -147,7 +148,16 @@ namespace tracy
                 auto data = wgpuBufferGetConstMappedRange(buffer, offset, size);
                 return static_cast<const uint64_t*>(data);
             }
-            bool Update(uint64_t tcpu0, uint64_t tcpu1, uint64_t tgpu) { return false; }
+            bool Update(uint64_t tcpu0, uint64_t tcpu1, uint64_t tgpu)
+            {
+                // TODO: run some interval-based incremental regression here
+                int64_t cpuRange = tcpu1 - tcpu0;
+                if (cpuRange >= minCpuRange) return false;
+                minCpuRange = cpuRange;
+                this->cpuTime = tcpu1;    // t0 + (t1-t0)/2
+                this->gpuTime = tgpu;
+                return true;
+            }
         } m_calibration;
 
         tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item)
@@ -211,18 +221,11 @@ namespace tracy
             WGPURenderPipeline calibPipeline = wgpuDeviceCreateRenderPipeline(m_device, &pipeDesc);
             if (!calibPipeline) { wgpuTextureViewRelease(texView); wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); TracyWebGPUPanic("Failed to create calibration pipeline.", return false); }
 
-            //const uint64_t calibTicket = NextQueryId();
-            //const uint32_t calibSlotB  = RingIndex(calibTicket);
-            //const uint32_t calibSlotE  = calibSlotB + 1;
-            //m_previousCheckpoint = m_queryCounter.load();
-
-            const uint32_t calibSlotB  = 0;
-            const uint32_t calibSlotE  = 1;
-
+            uint32_t queryId = 0;
             WGPUPassTimestampWrites anchorTs = {};
             anchorTs.querySet                  = m_querySet;
-            anchorTs.beginningOfPassWriteIndex = calibSlotB;
-            anchorTs.endOfPassWriteIndex       = calibSlotE;
+            anchorTs.beginningOfPassWriteIndex = queryId;
+            anchorTs.endOfPassWriteIndex       = queryId+1;
 
             WGPURenderPassColorAttachment att = {};
             att.view       = texView;
@@ -235,8 +238,8 @@ namespace tracy
             passDesc.colorAttachments     = &att;
             passDesc.timestampWrites      = &anchorTs;
 
-            int64_t minCpuRange = 999'999'999'999;
-            for (int i=0; i<10; ++i)
+            const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(100);
+            do
             {
                 WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, nullptr);
                 if (!enc) { TracyWebGPUPanic("Failed to create calibration command encoder.", return false); }
@@ -247,8 +250,11 @@ namespace tracy
                 wgpuRenderPassEncoderEnd(pass);
                 wgpuRenderPassEncoderRelease(pass);
 
-                wgpuCommandEncoderResolveQuerySet(enc, m_querySet, calibSlotB, 2, m_resolveBuffer, calibSlotB * sizeof(uint64_t));
-                wgpuCommandEncoderCopyBufferToBuffer(enc, m_resolveBuffer, calibSlotB * sizeof(uint64_t), m_readbackSlots[0].buffer, calibSlotB * sizeof(uint64_t), 2 * sizeof(uint64_t));
+                WGPUBuffer readBackBuffer = m_readbackSlots[0].buffer;
+                uint32_t byteOffset = queryId * sizeof(uint64_t);
+                uint32_t sizeInBytes = 2 * sizeof(uint64_t);
+                wgpuCommandEncoderResolveQuerySet(enc, m_querySet, queryId, 2, m_resolveBuffer, byteOffset);
+                wgpuCommandEncoderCopyBufferToBuffer(enc, m_resolveBuffer, byteOffset, readBackBuffer, byteOffset, sizeInBytes);
 
                 WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, nullptr);
                 wgpuCommandEncoderRelease(enc);
@@ -261,28 +267,24 @@ namespace tracy
                 wgpuCommandBufferRelease(cmd);
                 Calibration::WaitQueueIdle(m_queue, m_instance);
                 cpu[1] = Profiler::GetTime();
-                int64_t cpuRange = cpu[1] - cpu[0];
-                auto gpu = Calibration::MapBufferSync(m_readbackSlots[0].buffer, m_instance);
+                auto gpu = Calibration::MapBufferSync(readBackBuffer, m_instance);
                 TracyWebGPUAssert(gpu != nullptr);
-                fprintf(stdout, "CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", gpu[0], gpu[1], gpu[1]-gpu[0], cpuRange);
-                if (cpuRange < minCpuRange)
-                {
-                    outCpuTime = cpu[1];    // static_cast<uint64_t>(t0 + (t1-t0)/2);
-                    outGpuTime = gpu[0];
-                    minCpuRange = cpuRange;
-                }
-                wgpuBufferUnmap(m_readbackSlots[0].buffer);
+                fprintf(stdout, "CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", gpu[0], gpu[1], gpu[1]-gpu[0], cpu[1]-cpu[0]);
+                if (gpu[0] < m_calibration.gpuTime)
+                    fprintf(stdout, "CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", m_calibration.gpuTime, gpu[0], gpu[0] - m_calibration.gpuTime);
+                m_calibration.Update(cpu[0], cpu[1], gpu[0]);
+                wgpuBufferUnmap(readBackBuffer);
 
-                if (outGpuTime < m_calibration.gpuTime)
-                    fprintf(stdout, "CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", m_calibration.gpuTime, outGpuTime, outGpuTime - m_calibration.gpuTime);
-                m_calibration.gpuTime = outGpuTime;
-            }
+            } while (std::chrono::steady_clock::now() < deadline);
 
             wgpuRenderPipelineRelease(calibPipeline);
             wgpuShaderModuleRelease(calibShader);
             wgpuTextureViewRelease(texView);
             wgpuTextureRelease(tex);
 
+            outCpuTime = m_calibration.cpuTime;
+            outGpuTime = m_calibration.gpuTime;
+
             return true;
         }
 
@@ -366,28 +368,20 @@ namespace tracy
             }
 #endif
 
-            // Pick a query budget. WebGPU has no native upper bound on query
-            // set size in the spec. The WebGPU default/max for maxQuerySetSize
-            // is 4096. Queries are issued in (begin, end) pairs, so the count is
-            // always even.
-            static constexpr uint32_t MaxQueries = 512; //4096;
-            m_queryLimit = MaxQueries;
-
+            // WebGPU maxQuerySetSize is 4096. Queries are issued in (begin, end) pairs.
             WGPUQuerySetDescriptor qsDesc = {};
             qsDesc.type  = WGPUQueryType_Timestamp;
-            qsDesc.count = m_queryLimit;
-
+            qsDesc.count = 4096;
             for (;;)
             {
                 m_querySet = wgpuDeviceCreateQuerySet(m_device, &qsDesc);
                 if (m_querySet) break;
-                m_queryLimit /= 2;
-                qsDesc.count = m_queryLimit;
-                if (m_queryLimit < 64)
-                {
-                    TracyWebGPUPanic("Failed to create timestamp query set (timestamp-query feature missing?).", return);
-                }
+                qsDesc.count /= 2;
+                if (qsDesc.count < 128) break;
             }
+            if (m_querySet == nullptr)
+                TracyWebGPUPanic("Failed to create timestamp query set.", return);
+            m_queryLimit = qsDesc.count;
 
             // Resolve buffer: the GPU resolves query results into this buffer.
             WGPUBufferDescriptor resolveDesc = {};
@@ -395,9 +389,7 @@ namespace tracy
             resolveDesc.size  = static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t);
             m_resolveBuffer = wgpuDeviceCreateBuffer(m_device, &resolveDesc);
             if (!m_resolveBuffer)
-            {
                 TracyWebGPUPanic("Failed to create timestamp resolve buffer.", return);
-            }
 
             // Readback buffers: targets of CopyBufferToBuffer; mappable for read (3-slot ring).
             WGPUBufferDescriptor readbackDesc = {};
@@ -564,8 +556,6 @@ namespace tracy
                 m_readbackSlots[pendingIdx].buffer, WGPUMapMode_Read, 0,
                 static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t), cbInfo);
 
-            // Optimistic immediate poll: deliver any already-completed callbacks.
-            wgpuInstanceProcessEvents(m_instance);
             if (m_readbackSlots[pendingIdx].mapStatus != WGPUMapAsyncStatus{})
                 m_readbackSlots[pendingIdx].pendingFuture = {};
         }

From 746f1d028cddd48adf15817c6ac71c73e2f0da2c Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Wed, 20 May 2026 11:40:23 -0700
Subject: [PATCH 07/21] more cleanup and refactoring

---
 public/tracy/TracyWebGPU.hpp | 167 +++++++++++++++++------------------
 1 file changed, 79 insertions(+), 88 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index 9958573113..7ef3a13d0e 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -48,6 +48,15 @@ using TracyWebGPUCtx = void*;
 
 #include <webgpu/webgpu.h>
 
+// piggy-back on WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT to detect Dawn header
+#ifdef WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT
+#define TRACY_WEBGPU_DAWN_NATIVE (1)
+#include <dawn/native/DawnNative.h>
+#else
+#define TRACY_WEBGPU_WGPU_NATIVE (1)
+#include <webgpu/wgpu.h>
+#endif
+
 #ifndef TRACY_WEBGPU_DEBUG_LEVEL
 #define TRACY_WEBGPU_DEBUG_LEVEL (0)
 #endif//TRACY_WEBGPU_DEBUG_LEVEL
@@ -81,25 +90,25 @@ namespace tracy
         std::mutex m_collectionMutex;
 
         WGPUInstance m_instance = nullptr;
-        WGPUDevice   m_device   = nullptr;
-        WGPUQueue    m_queue    = nullptr;
+        WGPUDevice m_device = nullptr;
+        WGPUQueue m_queue = nullptr;
 
         struct ReadbackSlot
         {
-            WGPUBuffer            buffer;
-            std::atomic<uint64_t> copiedUpto;
+            WGPUBuffer buffer = nullptr;
+            std::atomic<uint64_t> copiedUpto {0};
             std::atomic<WGPUMapAsyncStatus> mapStatus = {};
-            WGPUFuture            pendingFuture = {};
+            WGPUFuture pendingFuture = {};
         };
         static_assert(std::atomic<WGPUMapAsyncStatus>::is_always_lock_free, "WGPUMapAsyncStatus must be lock-free atomic");
 
-        WGPUQuerySet  m_querySet        = nullptr;
-        WGPUBuffer    m_resolveBuffer   = nullptr;  // QueryResolve | CopySrc
-        ReadbackSlot  m_readbackSlots[3];            // CopyDst | MapRead (3-slot ring)
-        std::atomic<int> m_writeIdx{0};              // WRITE slot index (ring: 0→1→2→0)
+        WGPUQuerySet  m_querySet = nullptr;
+        WGPUBuffer    m_resolveBuffer = nullptr;
+        ReadbackSlot  m_readbackSlots [3];
+        std::atomic<int> m_writeIdx {0};
 
         using atomic_counter = std::atomic<uint64_t>;
-        atomic_counter m_queryCounter       = 0;
+        atomic_counter m_queryCounter = 0;
         atomic_counter m_previousCheckpoint = 0;
 
         uint32_t m_queryLimit = 0;
@@ -170,23 +179,26 @@ namespace tracy
 
         bool CalibrateClocks(uint64_t& outCpuTime, uint64_t& outGpuTime)
         {
+            // WebGPU does not have any clock calibration API.
+            // This routine attempts to estimate a reasonable (cpuTime, gpuTime) correlation
+            // by sampling CPU and GPU timestamps around a "synchronous" draw call.
+            // Several samples are taken to tighten the estimation.
+
             ZoneScoped;
 
-            // wgpuCommandEncoderWriteTimestamp is deprecated and returns 0 on Metal.
-            // Use a render pass with an actual draw call: on Metal TBDR, begin-of-pass
-            // timestamps fire at tile rasterization start. An empty render pass (no
-            // geometry) may never trigger rasterization, yielding a deferred or
-            // meaningless timestamp that doesn't reflect actual GPU execution order.
-            static const char kCalibShader[] = R"(
+            WGPUShaderSourceWGSL wgslSrc = {};
+            wgslSrc.chain.sType = WGPUSType_ShaderSourceWGSL;
+            wgslSrc.code =
+            {
+                R"(
                 @vertex fn vs(@builtin(vertex_index) i: u32) -> @builtin(position) vec4f {
                     var p = array(vec4f(-1,-1,.5,1), vec4f(3,-1,.5,1), vec4f(-1,3,.5,1));
                     return p[i];
                 }
                 @fragment fn fs() -> @location(0) vec4f { return vec4f(0.0); }
-            )";
-            WGPUShaderSourceWGSL wgslSrc = {};
-            wgslSrc.chain.sType = WGPUSType_ShaderSourceWGSL;
-            wgslSrc.code        = { kCalibShader, WGPU_STRLEN };
+                )",
+                WGPU_STRLEN
+            };
             WGPUShaderModuleDescriptor smDesc = {};
             smDesc.nextInChain  = reinterpret_cast<WGPUChainedStruct*>(&wgslSrc);
             WGPUShaderModule calibShader = wgpuDeviceCreateShaderModule(m_device, &smDesc);
@@ -301,8 +313,7 @@ namespace tracy
 
             features[n++] = WGPUFeatureName_TimestampQuery;
 
-            // piggy-back on WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT to detect Dawn header
-#           ifdef WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT
+#           if (TRACY_WEBGPU_DAWN_NATIVE)
                 fprintf(stderr, "[INFO] [DAWN] ENABLING RAW TIMESTAMP TICKS (disabling ns conversion + quantization)\n");
                 // disable_timestamp_query_conversion: resolve timestamps as raw GPU ticks, not nanoseconds.
                 // timestamp_quantization: disabled defensively (off by default on Metal, but on elsewhere).
@@ -315,16 +326,13 @@ namespace tracy
                 togglesDesc.enabledToggles = dawnEnabledToggles;
                 togglesDesc.enabledToggleCount  = 1;
                 deviceDescriptor.nextInChain = reinterpret_cast<WGPUChainedStruct*>(&togglesDesc);
-#           else
+#           elif (TRACY_WEBGPU_WGPU_NATIVE)
                 // wgpu-native: passTimestampWrites requires the non-standard
                 // TIMESTAMP_QUERY_INSIDE_PASSES device feature in addition to
                 // the standard TimestampQuery feature.
                 fprintf(stderr, "[INFO] [WGPU] Requesting TimestampQueryInsidePasses native feature\n");
                 {
-                    constexpr auto WGPUNativeFeature_TimestampQueryInsideEncoders = 0x00030024;
-                    constexpr auto WGPUNativeFeature_TimestampQueryInsidePasses = 0x00030025;
                     features[n++] = (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders;
-                    //features[n++] = (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsidePasses;
                 }
 #           endif
             deviceDescriptor.requiredFeatures = features;
@@ -332,6 +340,31 @@ namespace tracy
             return true;
         }
 
+        bool VerifyDevice(WGPUDevice device)
+        {
+            if (device == nullptr)
+                return false;
+            if (wgpuDeviceHasFeature(device, WGPUFeatureName_TimestampQuery) == WGPU_FALSE)
+                return false;
+#           if (TRACY_WEBGPU_DAWN_NATIVE)
+                bool hasDisableConversion = false, hasQuantization = false;
+                for (const char* t : ::dawn::native::GetTogglesUsed(device))
+                {
+                    if (strcmp(t, "disable_timestamp_query_conversion") == 0)
+                        hasDisableConversion = true;
+                    if (strcmp(t, "timestamp_quantization") == 0)
+                        hasQuantization = true;
+                }
+                return hasDisableConversion && !hasQuantization;
+#           elif (TRACY_WEBGPU_WGPU_NATIVE)
+                // wgpu-native also requires TimestampQueryInsideEncoders for ResolveQuerySet.
+                if (wgpuDeviceHasFeature(device, (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders) == WGPU_FALSE)
+                    return false;
+                return true;
+#           endif
+            return false;
+        }
+
         WebGPUQueueCtx(WGPUInstance instance, WGPUDevice device, WGPUQueue queue)
             : m_instance(instance)
             , m_device(device)
@@ -339,38 +372,17 @@ namespace tracy
         {
             ZoneScopedC(Color::Red4);
 
-            // The canonical webgpu.h uses AddRef/Release for refcounting.
-            if (m_instance) wgpuInstanceAddRef(m_instance);
-            wgpuDeviceAddRef(m_device);
-            wgpuQueueAddRef(m_queue);
+            if (!VerifyDevice(m_device))
+                TracyWebGPUPanic("GPU profiling disabled because the device did not enable the necessary features.")
 
-            // Graceful early-out: if the logical device was created without the
-            // required timestamp features, GPU zones will silently do nothing.
-            // m_contextId stays 255 (invalid); CreateWebGPUContext destroys and
-            // returns nullptr, and all TracyWebGPU* macros become no-ops.
-            if (!wgpuDeviceHasFeature(m_device, WGPUFeatureName_TimestampQuery))
-            {
-                TracyWebGPUPanic(
-                    "timestamp-query feature not enabled on device; GPU profiling disabled.",
-                    return
-                )
-            }
-            // wgpuCommandEncoderResolveQuerySet requires the wgpu-native
-            // TIMESTAMP_QUERY_INSIDE_ENCODERS feature on some backends.
-#ifdef WGPUNativeFeature_TimestampQueryInsideEncoders
-            if (!wgpuDeviceHasFeature(m_device, (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders))
-            {
-                TracyWebGPUPanic(
-                    "WGPUNativeFeature_TimestampQueryInsideEncoders not enabled on device; "
-                    "GPU profiling disabled (needed for ResolveQuerySet on the command encoder).",
-                    return
-                );
-            }
-#endif
+            TracyWebGPUAssert(m_instance); wgpuInstanceAddRef(m_instance);
+            TracyWebGPUAssert(m_device); wgpuDeviceAddRef(m_device);
+            TracyWebGPUAssert(m_queue); wgpuQueueAddRef(m_queue);
 
-            // WebGPU maxQuerySetSize is 4096. Queries are issued in (begin, end) pairs.
+            // Setup Query Set: must have even size since queries are issued in pairs.
+            // (The WebGPU spec mandates 4096, with no way to query the device limit.)
             WGPUQuerySetDescriptor qsDesc = {};
-            qsDesc.type  = WGPUQueryType_Timestamp;
+            qsDesc.type = WGPUQueryType_Timestamp;
             qsDesc.count = 4096;
             for (;;)
             {
@@ -383,7 +395,6 @@ namespace tracy
                 TracyWebGPUPanic("Failed to create timestamp query set.", return);
             m_queryLimit = qsDesc.count;
 
-            // Resolve buffer: the GPU resolves query results into this buffer.
             WGPUBufferDescriptor resolveDesc = {};
             resolveDesc.usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc;
             resolveDesc.size  = static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t);
@@ -391,7 +402,6 @@ namespace tracy
             if (!m_resolveBuffer)
                 TracyWebGPUPanic("Failed to create timestamp resolve buffer.", return);
 
-            // Readback buffers: targets of CopyBufferToBuffer; mappable for read (3-slot ring).
             WGPUBufferDescriptor readbackDesc = {};
             readbackDesc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
             readbackDesc.size  = static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t);
@@ -402,22 +412,18 @@ namespace tracy
                 if (!slot.buffer) { TracyWebGPUPanic("Failed to create timestamp readback buffer.", return); }
             }
 
-            // Establish the (cpuTime, gpuTime) anchor for Tracy's GpuNewContext.
-            // WebGPU has no "clock calibration API", so we use a one-shot anchor
-            // to estimate a correlation for the CPU and the GPU timestamps.
             uint64_t cpuTimestamp = 0;
             uint64_t gpuTimestamp = 0;
             if (!CalibrateClocks(cpuTimestamp, gpuTimestamp))
-            {
                 TracyWebGPUPanic("Failed to calibrate CPU/GPU clocks.", return);
-            }
 
             fprintf(stdout, "INFO: gpuTimestamp is %llu\n", gpuTimestamp);
-            //m_shadowBuffer.resize(m_queryLimit, gpuTimestamp);
-            m_shadowBuffer.resize(m_queryLimit, 0);
+            m_shadowBuffer.resize(m_queryLimit, gpuTimestamp);
 
             // WebGPU timestamps are in nanoseconds, as per the spec.
-            const float period = 1.0f;  // 1ns/tick
+            float period = 1.0f;  // 1ns/tick
+            // TODO: however, with raw timestamps, the period may need adjustment
+            // (we measure that during CalibrateClocks())
 
             // All setup completed: register the context.
             m_contextId = GetGpuCtxCounter().fetch_add(1);
@@ -549,8 +555,13 @@ namespace tracy
             m_writeIdx = newWriteIdx;
 
             WGPUBufferMapCallbackInfo cbInfo = {};
-            cbInfo.mode      = WGPUCallbackMode_AllowProcessEvents;
-            cbInfo.callback  = &WebGPUQueueCtx::OnMapped;
+            cbInfo.mode = WGPUCallbackMode_AllowProcessEvents;
+            cbInfo.callback = [](WGPUMapAsyncStatus status, WGPUStringView, void* ud, void*)
+            {
+                auto* self = static_cast<WebGPUQueueCtx*>(ud);
+                const int collectIdx = (self->m_writeIdx + 2) % 3;
+                self->m_readbackSlots[collectIdx].mapStatus = status;
+            };
             cbInfo.userdata1 = this;
             m_readbackSlots[pendingIdx].pendingFuture = wgpuBufferMapAsync(
                 m_readbackSlots[pendingIdx].buffer, WGPUMapMode_Read, 0,
@@ -561,25 +572,6 @@ namespace tracy
         }
 
     private:
-        // Drive the WebGPU event queue to deliver pending callbacks.
-        // wgpuInstanceProcessEvents is the canonical webgpu.h API.
-        // wgpu-native additionally benefits from wgpuDevicePoll.
-        void ProcessEvents()
-        {
-            if (m_instance)
-                wgpuInstanceProcessEvents(m_instance);
-#ifdef WGPU_H_
-            wgpuDevicePoll(m_device, false, nullptr);
-#endif
-        }
-
-        static void OnMapped(WGPUMapAsyncStatus status, WGPUStringView, void* ud, void*)
-        {
-            auto* self = static_cast<WebGPUQueueCtx*>(ud);
-            const int collectIdx = (self->m_writeIdx + 2) % 3;
-            self->m_readbackSlots[collectIdx].mapStatus = status;
-        }
-
         void EmitGpuTime(uint64_t gpuTimestamp, uint32_t slot)
         {
             auto* item = Profiler::QueueSerial();
@@ -630,7 +622,6 @@ namespace tracy
         {
             // 32 queries = 32 * 8 bytes = 256 bytes
             TracyWebGPUAssert(queryBatchStartId % 32 == 0, return);
-            queryBatchStartId = queryBatchStartId % m_ctx->m_queryLimit;
 
             const uint64_t blockOffset = static_cast<uint64_t>(queryBatchStartId) * sizeof(uint64_t);
             wgpuCommandEncoderResolveQuerySet(
@@ -716,7 +707,7 @@ namespace tracy
             m_encoder   = encoder;
 
             m_rawTicket = m_ctx->NextQueryId();
-            m_queryId   = static_cast<uint32_t>(m_rawTicket % ctx->m_queryLimit);
+            m_queryId   = m_ctx->RingIndex(m_rawTicket);
             m_timestampWrites.querySet                  = m_ctx->m_querySet;
             m_timestampWrites.beginningOfPassWriteIndex = m_queryId;
             m_timestampWrites.endOfPassWriteIndex       = m_queryId + 1;

From 4ba1c7ea57aad4915f61c73614b72e1d2e265a8e Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Wed, 20 May 2026 13:48:08 -0700
Subject: [PATCH 08/21] more refactoring

---
 public/tracy/TracyWebGPU.hpp | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index 7ef3a13d0e..e57e6aa68d 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -137,7 +137,7 @@ namespace tracy
             }
             static const uint64_t* MapBufferSync(WGPUBuffer buffer, WGPUInstance instance)
             {
-                struct MapCtx { WGPUMapAsyncStatus status = (WGPUMapAsyncStatus)0; } ctx;
+                struct MapCtx { WGPUMapAsyncStatus status = {}; } ctx;
                 WGPUBufferMapCallbackInfo cbInfo = {};
                 cbInfo.mode      = WGPUCallbackMode_AllowProcessEvents;
                 cbInfo.callback  = [](WGPUMapAsyncStatus status, WGPUStringView, void* userData, void*) {
@@ -544,31 +544,31 @@ namespace tracy
             //   COLLECT = (m_writeIdx + 2) % 3  ← recycle as new WRITE
             const int writeIdx   = m_writeIdx;
             const int pendingIdx = (writeIdx + 1) % 3;
+            const int newWriteIdx = (writeIdx + 2) % 3;
 
             if (m_readbackSlots[writeIdx].copiedUpto <= m_previousCheckpoint)
                 return;
 
-            const int newWriteIdx = (writeIdx + 2) % 3;
-
             m_readbackSlots[newWriteIdx].copiedUpto = m_previousCheckpoint.load();
 
             m_writeIdx = newWriteIdx;
 
+            auto& nextToCollect = m_readbackSlots[pendingIdx];
             WGPUBufferMapCallbackInfo cbInfo = {};
-            cbInfo.mode = WGPUCallbackMode_AllowProcessEvents;
-            cbInfo.callback = [](WGPUMapAsyncStatus status, WGPUStringView, void* ud, void*)
+            // This readback buffer map callback can fire "spontaneously"
+            cbInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+            cbInfo.callback = [](WGPUMapAsyncStatus status, WGPUStringView, void* userData, void*)
             {
-                auto* self = static_cast<WebGPUQueueCtx*>(ud);
-                const int collectIdx = (self->m_writeIdx + 2) % 3;
-                self->m_readbackSlots[collectIdx].mapStatus = status;
+                auto* slot = static_cast<ReadbackSlot*>(userData);
+                slot->mapStatus = status;
             };
-            cbInfo.userdata1 = this;
-            m_readbackSlots[pendingIdx].pendingFuture = wgpuBufferMapAsync(
-                m_readbackSlots[pendingIdx].buffer, WGPUMapMode_Read, 0,
+            cbInfo.userdata1 = &nextToCollect;
+            nextToCollect.pendingFuture = wgpuBufferMapAsync(
+                nextToCollect.buffer, WGPUMapMode_Read, 0,
                 static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t), cbInfo);
 
-            if (m_readbackSlots[pendingIdx].mapStatus != WGPUMapAsyncStatus{})
-                m_readbackSlots[pendingIdx].pendingFuture = {};
+            if (nextToCollect.mapStatus != WGPUMapAsyncStatus{})
+                nextToCollect.pendingFuture = {};
         }
 
     private:
@@ -622,6 +622,7 @@ namespace tracy
         {
             // 32 queries = 32 * 8 bytes = 256 bytes
             TracyWebGPUAssert(queryBatchStartId % 32 == 0, return);
+            queryBatchStartId = m_ctx->RingIndex(queryBatchStartId);
 
             const uint64_t blockOffset = static_cast<uint64_t>(queryBatchStartId) * sizeof(uint64_t);
             wgpuCommandEncoderResolveQuerySet(

From 77dedb7bb10c8799341099e5336a91b22d417266 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Thu, 21 May 2026 11:39:01 -0700
Subject: [PATCH 09/21] refactoring

---
 public/tracy/TracyWebGPU.hpp | 164 +++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 75 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index e57e6aa68d..b948bc317d 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -1,6 +1,14 @@
 #ifndef __TRACYWEBGPU_HPP__
 #define __TRACYWEBGPU_HPP__
 
+// WebGPU, unlike other graphics APIs, has many annoying restrictions that complicate
+// the design of the Tracy WebGPU back-end:
+// - there's no CPU/GPU clock calibration API
+// - submitting GPU commands that touch a buffer that the host is mapping is not permitted
+// - resolving timestamps requires destination offsets aligned to 256 bytes
+// - timestamps are only available at pass granularity (implementations may need to emulate this)
+// - spec mandates timestamps to be in nanoseconds (implementations may need to emulate this)
+
 #ifndef TRACY_ENABLE
 
 #define TracyWebGPUContext(instance, device, queue) nullptr
@@ -93,7 +101,7 @@ namespace tracy
         WGPUDevice m_device = nullptr;
         WGPUQueue m_queue = nullptr;
 
-        struct ReadbackSlot
+        struct ReadbackStage
         {
             WGPUBuffer buffer = nullptr;
             std::atomic<uint64_t> copiedUpto {0};
@@ -104,7 +112,7 @@ namespace tracy
 
         WGPUQuerySet  m_querySet = nullptr;
         WGPUBuffer    m_resolveBuffer = nullptr;
-        ReadbackSlot  m_readbackSlots [3];
+        ReadbackStage m_readbackReel [3];
         std::atomic<int> m_writeIdx {0};
 
         using atomic_counter = std::atomic<uint64_t>;
@@ -262,7 +270,7 @@ namespace tracy
                 wgpuRenderPassEncoderEnd(pass);
                 wgpuRenderPassEncoderRelease(pass);
 
-                WGPUBuffer readBackBuffer = m_readbackSlots[0].buffer;
+                WGPUBuffer readBackBuffer = m_readbackReel[0].buffer;
                 uint32_t byteOffset = queryId * sizeof(uint64_t);
                 uint32_t sizeInBytes = 2 * sizeof(uint64_t);
                 wgpuCommandEncoderResolveQuerySet(enc, m_querySet, queryId, 2, m_resolveBuffer, byteOffset);
@@ -281,9 +289,11 @@ namespace tracy
                 cpu[1] = Profiler::GetTime();
                 auto gpu = Calibration::MapBufferSync(readBackBuffer, m_instance);
                 TracyWebGPUAssert(gpu != nullptr);
-                fprintf(stdout, "CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", gpu[0], gpu[1], gpu[1]-gpu[0], cpu[1]-cpu[0]);
-                if (gpu[0] < m_calibration.gpuTime)
-                    fprintf(stdout, "CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", m_calibration.gpuTime, gpu[0], gpu[0] - m_calibration.gpuTime);
+                TracyWebGPUDebug(
+                    fprintf(stdout, "CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", gpu[0], gpu[1], gpu[1]-gpu[0], cpu[1]-cpu[0]);
+                    if (gpu[0] < m_calibration.gpuTime)
+                        fprintf(stdout, "CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", m_calibration.gpuTime, gpu[0], gpu[0] - m_calibration.gpuTime);
+                );
                 m_calibration.Update(cpu[0], cpu[1], gpu[0]);
                 wgpuBufferUnmap(readBackBuffer);
 
@@ -314,7 +324,7 @@ namespace tracy
             features[n++] = WGPUFeatureName_TimestampQuery;
 
 #           if (TRACY_WEBGPU_DAWN_NATIVE)
-                fprintf(stderr, "[INFO] [DAWN] ENABLING RAW TIMESTAMP TICKS (disabling ns conversion + quantization)\n");
+                TracyWebGPUDebug( fprintf(stderr, "[INFO] [DAWN] ENABLING RAW TIMESTAMP TICKS (disabling ns conversion + quantization)\n") );
                 // disable_timestamp_query_conversion: resolve timestamps as raw GPU ticks, not nanoseconds.
                 // timestamp_quantization: disabled defensively (off by default on Metal, but on elsewhere).
                 static const char* dawnDisabledToggles[] = { "timestamp_quantization" };
@@ -330,10 +340,8 @@ namespace tracy
                 // wgpu-native: passTimestampWrites requires the non-standard
                 // TIMESTAMP_QUERY_INSIDE_PASSES device feature in addition to
                 // the standard TimestampQuery feature.
-                fprintf(stderr, "[INFO] [WGPU] Requesting TimestampQueryInsidePasses native feature\n");
-                {
-                    features[n++] = (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders;
-                }
+                TracyWebGPUDebug( fprintf(stderr, "[INFO] [WGPU] Requesting TimestampQueryInsidePasses native feature\n") );
+                features[n++] = (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders;
 #           endif
             deviceDescriptor.requiredFeatures = features;
             deviceDescriptor.requiredFeatureCount = static_cast<uint32_t>(n);
@@ -405,11 +413,11 @@ namespace tracy
             WGPUBufferDescriptor readbackDesc = {};
             readbackDesc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
             readbackDesc.size  = static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t);
-            for (auto& slot : m_readbackSlots)
+            for (auto& stage : m_readbackReel)
             {
-                slot.buffer = wgpuDeviceCreateBuffer(m_device, &readbackDesc);
-                slot.copiedUpto = 0;
-                if (!slot.buffer) { TracyWebGPUPanic("Failed to create timestamp readback buffer.", return); }
+                stage.buffer = wgpuDeviceCreateBuffer(m_device, &readbackDesc);
+                stage.copiedUpto = 0;
+                if (!stage.buffer) { TracyWebGPUPanic("Failed to create timestamp readback buffer.", return); }
             }
 
             uint64_t cpuTimestamp = 0;
@@ -417,7 +425,7 @@ namespace tracy
             if (!CalibrateClocks(cpuTimestamp, gpuTimestamp))
                 TracyWebGPUPanic("Failed to calibrate CPU/GPU clocks.", return);
 
-            fprintf(stdout, "INFO: gpuTimestamp is %llu\n", gpuTimestamp);
+            TracyWebGPUDebug( fprintf(stdout, "INFO: gpuTimestamp is %llu\n", gpuTimestamp) );
             m_shadowBuffer.resize(m_queryLimit, gpuTimestamp);
 
             // WebGPU timestamps are in nanoseconds, as per the spec.
@@ -443,11 +451,14 @@ namespace tracy
 
         ~WebGPUQueueCtx()
         {
-            Collect(); // best-effort non-blocking flush
-
-            // Block until any in-flight map completes before releasing buffers.
-            for (auto& slot : m_readbackSlots)
-                if (slot.buffer) { wgpuBufferRelease(slot.buffer); slot.buffer = nullptr; }
+            // TODO: a few problems to address later during this final Collect():
+            // 1. ensure "partial" query batches are collected
+            // 2. ensure all readback stages are collected and empty
+            // 3. ensure readback buffers are not mapped before deleting them
+            Collect();
+
+            for (auto& stage : m_readbackReel)
+                if (stage.buffer) { wgpuBufferRelease(stage.buffer);     stage.buffer     = nullptr; }
             if (m_resolveBuffer)  { wgpuBufferRelease(m_resolveBuffer);  m_resolveBuffer  = nullptr; }
             if (m_querySet)       { wgpuQuerySetRelease(m_querySet);     m_querySet       = nullptr; }
             if (m_queue)          { wgpuQueueRelease(m_queue);           m_queue          = nullptr; }
@@ -486,42 +497,47 @@ namespace tracy
             if (Distance(m_previousCheckpoint, m_queryCounter) <= 0)
                 return;
 
-            const int collectIdx = (m_writeIdx + 2) % 3;
-            auto& collectSlot = m_readbackSlots[collectIdx];
+            // Current Readback "Reel" Stages:
+            const int state = m_writeIdx;
+            const int fillingIdx = (state + 0) % 3; // this is where instrumentation is pushing new queries
+            const int pendingIdx = (state + 1) % 3; // instrumentation is done here; ready to be collected
+            const int collectIdx = (state + 2) % 3; // this is where queries are being collected right now
 
-            // Poll for an in-flight map to complete.
-            if (collectSlot.pendingFuture.id != 0)
+            // Ensure readback buffer has been mapped to the host
+            auto& collectStage = m_readbackReel[collectIdx];
+            if (collectStage.pendingFuture.id != 0)
             {
                 if (webgpuProcessEvents)
                     wgpuInstanceProcessEvents(m_instance);
-                if (collectSlot.mapStatus == WGPUMapAsyncStatus{})
+                if (collectStage.mapStatus == WGPUMapAsyncStatus{})
                     return;  // callback hasn't fired yet
-                collectSlot.pendingFuture = {};
+                collectStage.pendingFuture = {};
+                if (collectStage.mapStatus != WGPUMapAsyncStatus_Success)
+                    TracyWebGPUPanic("Colect(): unable to map readback buffer.", return);
             }
 
-            // If a buffer is mapped, process as many resolved queries as possible.
-            if (collectSlot.mapStatus == WGPUMapAsyncStatus_Success)
+            if (collectStage.mapStatus == WGPUMapAsyncStatus_Success)
             {
                 const uint64_t* ts = static_cast<const uint64_t*>(
-                    wgpuBufferGetConstMappedRange(collectSlot.buffer, 0,
+                    wgpuBufferGetConstMappedRange(collectStage.buffer, 0,
                         static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t)));
                 if (ts)
                 {
                     uint64_t ticket = m_previousCheckpoint;
-                    const uint64_t end = collectSlot.copiedUpto;
-                    fprintf(stdout, "[TWG] Collect [%d] (%llu, %llu)\n", collectIdx, ticket, end);
+                    const uint64_t end = collectStage.copiedUpto;
+                    TracyWebGPUDebug( fprintf(stdout, "[TWG] Collect [%d] (%llu, %llu)\n", collectIdx, ticket, end) );
                     for (; Distance(ticket, end) > 0; ticket += 2)
                     {
                         const uint32_t slotB = RingIndex(ticket);
                         const uint32_t slotE = slotB + 1;
-                        fprintf(stderr,
-                            "[TWG] slot B=%4u E=%4u ts[B]=%llu ts[E]=%llu shadow[E]=%llu ts-diff=%lld shadow-diff=%lld\n",
-                            slotB, slotE,
-                            (unsigned long long)ts[slotB],
-                            (unsigned long long)ts[slotE],
-                            (unsigned long long)m_shadowBuffer[slotE],
-                            (long long)Distance(ts[slotB], ts[slotE]),
-                            (long long)Distance(m_shadowBuffer[slotE], ts[slotE]));
+                        TracyWebGPUDebug(
+                            fprintf(stderr,
+                                "[TWG] slot B=%4u E=%4u ts[B]=%llu ts[E]=%llu shadow[E]=%llu ts-diff=%lld shadow-diff=%lld\n",
+                                slotB, slotE,
+                                ts[slotB], ts[slotE], m_shadowBuffer[slotE],
+                                Distance(ts[slotB], ts[slotE]),
+                                Distance(m_shadowBuffer[slotE], ts[slotE]));
+                        );
                         if (Distance(m_shadowBuffer[slotE], ts[slotE]) <= 0)
                             break; // GPU hasn't written this timestamp yet; retry next Collect()
                         EmitGpuTime(ts[slotB], slotB);
@@ -534,53 +550,48 @@ namespace tracy
                 }
 
                 // All queries resolved (or getMappedRange failed): unmap and fall through to rotate.
-                wgpuBufferUnmap(collectSlot.buffer);
-                collectSlot.mapStatus = {};
+                wgpuBufferUnmap(collectStage.buffer);
+                collectStage.mapStatus = {};
             }
 
-            // Idle: rotate the ring and start the next map if there is committed data to collect.
-            //   WRITE   = m_writeIdx
-            //   PENDING = (m_writeIdx + 1) % 3  ← map this
-            //   COLLECT = (m_writeIdx + 2) % 3  ← recycle as new WRITE
-            const int writeIdx   = m_writeIdx;
-            const int pendingIdx = (writeIdx + 1) % 3;
-            const int newWriteIdx = (writeIdx + 2) % 3;
+            // At this point, all queries in the collect buffer have been processed.
+            // (it's now time to "rotate" the buffers around...)
 
-            if (m_readbackSlots[writeIdx].copiedUpto <= m_previousCheckpoint)
+            // Has any ResolveQueryBatch call landed in this reel stage since it was last recycled?
+            // (Are there any queries to resolve and collect at all?)
+            if (m_readbackReel[fillingIdx].copiedUpto <= m_previousCheckpoint)
                 return;
 
-            m_readbackSlots[newWriteIdx].copiedUpto = m_previousCheckpoint.load();
-
-            m_writeIdx = newWriteIdx;
+            // Rotate/Cycle the Readback Pipeline State:
+            // the buffer that was just collected shall now be used for instrumentation
+            collectStage.copiedUpto = m_previousCheckpoint.load();
+            m_writeIdx = collectIdx;    // atomically commit the pipeline rotation
 
-            auto& nextToCollect = m_readbackSlots[pendingIdx];
+            auto& nextToCollect = m_readbackReel[pendingIdx];
             WGPUBufferMapCallbackInfo cbInfo = {};
-            // This readback buffer map callback can fire "spontaneously"
+            // This readback buffer map callback can fire "spontaneously" (it just sets a flag)
             cbInfo.mode = WGPUCallbackMode_AllowSpontaneous;
             cbInfo.callback = [](WGPUMapAsyncStatus status, WGPUStringView, void* userData, void*)
             {
-                auto* slot = static_cast<ReadbackSlot*>(userData);
-                slot->mapStatus = status;
+                auto* stage = static_cast<ReadbackStage*>(userData);
+                stage->mapStatus = status;
             };
             cbInfo.userdata1 = &nextToCollect;
             nextToCollect.pendingFuture = wgpuBufferMapAsync(
                 nextToCollect.buffer, WGPUMapMode_Read, 0,
                 static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t), cbInfo);
-
-            if (nextToCollect.mapStatus != WGPUMapAsyncStatus{})
-                nextToCollect.pendingFuture = {};
         }
 
     private:
-        void EmitGpuTime(uint64_t gpuTimestamp, uint32_t slot)
+        void EmitGpuTime(uint64_t gpuTimestamp, uint32_t queryId)
         {
             auto* item = Profiler::QueueSerial();
             MemWrite(&item->hdr.type, QueueType::GpuTime);
             MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(gpuTimestamp));
-            MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(slot));
+            MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
             MemWrite(&item->gpuTime.context, GetId());
             Profiler::QueueSerialFinish();
-            m_shadowBuffer[slot] = gpuTimestamp;
+            m_shadowBuffer[queryId] = gpuTimestamp;
         }
 
         tracy_force_inline uint32_t RingCapacity() const { return m_queryLimit; }
@@ -610,11 +621,11 @@ namespace tracy
 
     class WebGPUZoneScope
     {
-        const bool         m_active;
-        WebGPUQueueCtx*    m_ctx       = nullptr;
-        WGPUCommandEncoder m_encoder   = nullptr;
-        uint64_t           m_rawTicket = 0;  // raw (non-modded) ticket from NextQueryId
-        uint32_t           m_queryId   = 0;  // ring index = m_rawTicket % queryLimit
+        const bool m_active;
+        WebGPUQueueCtx* m_ctx = nullptr;
+        WGPUCommandEncoder m_encoder = nullptr;
+        uint64_t m_rawTicket = 0;
+        uint32_t m_queryId = 0;
 
         WGPUPassTimestampWrites m_timestampWrites = {};
 
@@ -633,8 +644,8 @@ namespace tracy
                 blockOffset // MUST be a multiple of (aligned to) 256...
             );
 
-            auto& slot = m_ctx->m_readbackSlots[m_ctx->m_writeIdx];
-            auto readbackBuffer = slot.buffer;
+            auto& stage = m_ctx->m_readbackReel[m_ctx->m_writeIdx];
+            auto readbackBuffer = stage.buffer;
             wgpuCommandEncoderCopyBufferToBuffer(
                 m_encoder,
                 m_ctx->m_resolveBuffer,
@@ -644,12 +655,15 @@ namespace tracy
                 32 * sizeof(uint64_t)
             );
 
-            // Advance this slot's high-water mark to cover the block just encoded.
+            // Advance this stage's high-water mark to cover the block just encoded.
+            // TODO: maybe we can use fetch_add to increment the atomic and not need
+            // to keep track of the raw ticket; Collect would need to derive the raw
+            // end ticket number.
             const uint64_t blockEnd = m_rawTicket;
-            uint64_t prev = slot.copiedUpto;
-            while (prev < blockEnd &&
-                   !slot.copiedUpto.compare_exchange_weak(prev, blockEnd)) {}
-            fprintf(stdout, "[TWG] WebGPUZoneScope [%d] (%d,%d)\n", (int)m_ctx->m_writeIdx, queryBatchStartId, queryBatchStartId+32);
+            uint64_t prev = stage.copiedUpto;
+            while ((WebGPUQueueCtx::Distance(prev, blockEnd) > 0) &&
+                   !stage.copiedUpto.compare_exchange_weak(prev, blockEnd)) {}
+            TracyWebGPUDebug( fprintf(stdout, "[TWG] WebGPUZoneScope [%d] (%d,%d)\n", (int)m_ctx->m_writeIdx, queryBatchStartId, queryBatchStartId+32) );
         }
 
         tracy_force_inline void WriteQueueItem(const SourceLocationData* srcLocation, int32_t callstackDepth, uint32_t sourceLine, const char* sourceFile, size_t sourceFileLen, const char* functionName, size_t functionNameLen, const char* zoneName, size_t zoneNameLen)

From 3bfe769675b34d8eeb4a49298dfec4e6e2c26d05 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Fri, 22 May 2026 15:30:07 -0700
Subject: [PATCH 10/21] refactoring calibration estimations

---
 public/tracy/TracyWebGPU.hpp | 201 +++++++++++++++++++++++------------
 1 file changed, 134 insertions(+), 67 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index b948bc317d..e332292d99 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -123,56 +123,107 @@ namespace tracy
 
         std::vector<uint64_t> m_shadowBuffer;
 
+        using WallTime = std::chrono::steady_clock::time_point;
+        static tracy_force_inline auto GetWallTime() { return WallTime::clock::now(); }
+        static tracy_force_inline auto Milliseconds(int value) { return std::chrono::milliseconds(value); }
+
+        static bool WaitQueueIdle(WGPUQueue queue, WGPUInstance instance)
+        {
+            bool gpuDone = false;
+            WGPUQueueWorkDoneCallbackInfo doneCB = {};
+            doneCB.mode = WGPUCallbackMode_AllowProcessEvents;
+            doneCB.callback = [](WGPUQueueWorkDoneStatus, WGPUStringView, void* userData, void*) {
+                *static_cast<bool*>(userData) = true;
+            };
+            doneCB.userdata1 = &gpuDone;
+            wgpuQueueOnSubmittedWorkDone(queue, doneCB);
+
+            const auto deadline = GetWallTime() + Milliseconds(2000);
+            while (!gpuDone && GetWallTime() < deadline)
+                wgpuInstanceProcessEvents(instance);
+            return gpuDone;
+        }
+
+        static const uint64_t* MapBufferSync(WGPUBuffer buffer, WGPUInstance instance)
+        {
+            struct MapCtx { WGPUMapAsyncStatus status = {}; } ctx;
+            WGPUBufferMapCallbackInfo cbInfo = {};
+            cbInfo.mode      = WGPUCallbackMode_AllowProcessEvents;
+            cbInfo.callback  = [](WGPUMapAsyncStatus status, WGPUStringView, void* userData, void*) {
+                auto* ctx = static_cast<MapCtx*>(userData);
+                ctx->status = status;
+            };
+            cbInfo.userdata1 = &ctx;
+            size_t offset = 0;
+            size_t size = 2 * sizeof(uint64_t);
+            wgpuBufferMapAsync(buffer, WGPUMapMode_Read, offset, size, cbInfo);
+
+            const auto deadline = GetWallTime() + Milliseconds(2000);
+            while (ctx.status == 0 && GetWallTime() < deadline)
+                wgpuInstanceProcessEvents(instance);
+
+            if (ctx.status != WGPUMapAsyncStatus_Success) return nullptr;
+            auto data = wgpuBufferGetConstMappedRange(buffer, offset, size);
+            return static_cast<const uint64_t*>(data);
+        }
+
         struct Calibration {
-            uint64_t cpuTime = 0;
-            uint64_t gpuTime = 0;
             int64_t minCpuRange = ~uint64_t(0) >> 1;
-            static bool WaitQueueIdle(WGPUQueue queue, WGPUInstance instance)
+            struct Regression
             {
-                bool gpuDone = false;
-                WGPUQueueWorkDoneCallbackInfo doneCB = {};
-                doneCB.mode = WGPUCallbackMode_AllowProcessEvents;
-                doneCB.callback = [](WGPUQueueWorkDoneStatus, WGPUStringView, void* userData, void*) {
-                    *static_cast<bool*>(userData) = true;
-                };
-                doneCB.userdata1 = &gpuDone;
-                wgpuQueueOnSubmittedWorkDone(queue, doneCB);
-
-                const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(2);
-                while (!gpuDone && std::chrono::steady_clock::now() < deadline)
-                    wgpuInstanceProcessEvents(instance);
-                return gpuDone;
-            }
-            static const uint64_t* MapBufferSync(WGPUBuffer buffer, WGPUInstance instance)
+                int64_t n = 0;
+                int64_t mean_x = 0;
+                int64_t mean_y = 0;
+                int64_t S_xx = 0;
+                int64_t S_xy = 0;
+                void Update(int64_t x, int64_t y)
+                {
+                    n += 1;
+                    int64_t dx = x - mean_x;
+                    int64_t dy = y - mean_y;
+                    mean_x += dx / n;
+                    mean_y += dy / n;
+                    S_xx += dx * (x - mean_x);
+                    S_xy += dx * (y - mean_y);
+                }
+                double Slope() const { return double(S_xy) / S_xx; }
+                double Intercept() const { return mean_y - Slope() * mean_x; }
+            };
+            Regression cpuToGpuModel;   // cpu-ticks to gpu-ticks
+            Regression cpuRangeModel;   // cpu-tick interval uncertainty
+            Regression wallToGpuModel;  // nanoseconds to gpu-ticks
+            void GetReferenceTime(uint64_t& cpuTime, uint64_t& gpuTime) const
             {
-                struct MapCtx { WGPUMapAsyncStatus status = {}; } ctx;
-                WGPUBufferMapCallbackInfo cbInfo = {};
-                cbInfo.mode      = WGPUCallbackMode_AllowProcessEvents;
-                cbInfo.callback  = [](WGPUMapAsyncStatus status, WGPUStringView, void* userData, void*) {
-                    auto* ctx = static_cast<MapCtx*>(userData);
-                    ctx->status = status;
-                };
-                cbInfo.userdata1 = &ctx;
-                size_t offset = 0;
-                size_t size = 2 * sizeof(uint64_t);
-                wgpuBufferMapAsync(buffer, WGPUMapMode_Read, offset, size, cbInfo);
-
-                const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(2);
-                while (ctx.status == 0 && std::chrono::steady_clock::now() < deadline)
-                    wgpuInstanceProcessEvents(instance);
-
-                if (ctx.status != WGPUMapAsyncStatus_Success) return nullptr;
-                auto data = wgpuBufferGetConstMappedRange(buffer, offset, size);
-                return static_cast<const uint64_t*>(data);
+                // the mean belongs to the regression line
+                cpuTime = cpuToGpuModel.mean_x;
+                gpuTime = cpuToGpuModel.mean_y;
+            }
+            double Period() const { return 1.0 / wallToGpuModel.Slope(); }    // ns/tick
+            bool AcceptX(const Regression& r, int64_t x, double threshold = 3.0) const {
+                if (r.n < 2) return true;
+                auto dx = x - r.mean_x;
+                if (dx <= 0) return true; // always accept "tighter" outliers
+                double variance = double(r.S_xx) / (r.n - 1);
+                if (variance == 0.0) return true;
+                double zz = (double)(dx*dx) / variance;
+                return zz <= (threshold*threshold);
             }
-            bool Update(uint64_t tcpu0, uint64_t tcpu1, uint64_t tgpu)
+            bool Update(WallTime twall0, WallTime twall1, uint64_t tcpu0, uint64_t tcpu1, uint64_t tgpu)
             {
-                // TODO: run some interval-based incremental regression here
+                using namespace std::chrono;
                 int64_t cpuRange = tcpu1 - tcpu0;
-                if (cpuRange >= minCpuRange) return false;
-                minCpuRange = cpuRange;
-                this->cpuTime = tcpu1;    // t0 + (t1-t0)/2
-                this->gpuTime = tgpu;
+                cpuRangeModel.Update(cpuRange, 0);
+                if (!AcceptX(cpuRangeModel, cpuRange, 1.0)) return false;
+                // Process sample:
+                int64_t tcpu = tcpu0 + (tcpu1 - tcpu0) / 2; // mid-point
+                int64_t twall = duration_cast<nanoseconds>(
+                    (twall0 + (twall1 - twall0) / 2)        // mid-point
+                    .time_since_epoch()
+                ).count();
+                // incremental regression:
+                cpuToGpuModel.Update(tcpu, tgpu);
+                wallToGpuModel.Update(twall, tgpu);
+                fprintf(stderr, "----- (sample accepted! period = %f)\n", Period());
                 return true;
             }
         } m_calibration;
@@ -185,7 +236,7 @@ namespace tracy
             Profiler::QueueSerialFinish();
         }
 
-        bool CalibrateClocks(uint64_t& outCpuTime, uint64_t& outGpuTime)
+        bool CalibrateClocks(uint64_t& outCpuTime, uint64_t& outGpuTime, double& period)
         {
             // WebGPU does not have any clock calibration API.
             // This routine attempts to estimates a reasonable (cpuTime, gpuTime) correlation
@@ -258,11 +309,16 @@ namespace tracy
             passDesc.colorAttachments     = &att;
             passDesc.timestampWrites      = &anchorTs;
 
-            const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(100);
-            do
+            // calibration loop
+            const auto deadline = GetWallTime() + Milliseconds(100);
+            for (int i = 0; i < 1000; ++i)
             {
+                // loop while the time budget (100ms) allows, but always run at least 6 iterations (5 usable samples; the first is skipped)
+                if ((GetWallTime() >= deadline) && (i > 5))
+                    break;
+
                 WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, nullptr);
-                if (!enc) { TracyWebGPUPanic("Failed to create calibration command encoder.", return false); }
+                if (!enc) { TracyWebGPUPanic("Failed to create command encoder for time calibration.", return false); }
 
                 WGPURenderPassEncoder pass = wgpuCommandEncoderBeginRenderPass(enc, &passDesc);
                 wgpuRenderPassEncoderSetPipeline(pass, calibPipeline);
@@ -280,32 +336,47 @@ namespace tracy
                 wgpuCommandEncoderRelease(enc);
                 if (!cmd) { TracyWebGPUPanic("Failed to finish calibration command encoder.", return false); }
 
-                Calibration::WaitQueueIdle(m_queue, m_instance);
+                WaitQueueIdle(m_queue, m_instance);
                 int64_t cpu [2] = {};
+                int64_t gpu [2] = {};
+                WallTime wall [2] = {};
                 cpu[0] = Profiler::GetTime();
+                wall[0] = GetWallTime();
                 wgpuQueueSubmit(m_queue, 1, &cmd);
                 wgpuCommandBufferRelease(cmd);
-                Calibration::WaitQueueIdle(m_queue, m_instance);
+                WaitQueueIdle(m_queue, m_instance);
+                wall[1] = GetWallTime();
                 cpu[1] = Profiler::GetTime();
-                auto gpu = Calibration::MapBufferSync(readBackBuffer, m_instance);
-                TracyWebGPUAssert(gpu != nullptr);
+                auto gpuTimestamps = MapBufferSync(readBackBuffer, m_instance);
+                TracyWebGPUAssert(gpuTimestamps != nullptr);
+                gpu[0] = gpuTimestamps[0];
+                gpu[1] = gpuTimestamps[0];
+                wgpuBufferUnmap(readBackBuffer);
                 TracyWebGPUDebug(
-                    fprintf(stdout, "CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", gpu[0], gpu[1], gpu[1]-gpu[0], cpu[1]-cpu[0]);
-                    if (gpu[0] < m_calibration.gpuTime)
-                        fprintf(stdout, "CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", m_calibration.gpuTime, gpu[0], gpu[0] - m_calibration.gpuTime);
+                    fprintf(stdout, "[%03d] CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", i, gpu[0], gpu[1], gpu[1]-gpu[0], cpu[1]-cpu[0]);
+                    uint64_t cpuTimeRef, gpuTimeRef;
+                    m_calibration.GetReferenceTime(cpuTimeRef, gpuTimeRef);
+                    if (gpu[0] < gpuTimeRef)
+                        fprintf(stdout, "!!!!! CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", gpuTimeRef, gpu[0], gpu[0] - gpuTimeRef);
                 );
-                m_calibration.Update(cpu[0], cpu[1], gpu[0]);
-                wgpuBufferUnmap(readBackBuffer);
 
-            } while (std::chrono::steady_clock::now() < deadline);
+                // skip first sample since it is quite jittery (lazy initialization of WebGPU objects)
+                if (i == 0)
+                    continue;
+
+                m_calibration.Update(wall[0], wall[1], cpu[0], cpu[1], gpu[0]);
+            };
 
             wgpuRenderPipelineRelease(calibPipeline);
             wgpuShaderModuleRelease(calibShader);
             wgpuTextureViewRelease(texView);
             wgpuTextureRelease(tex);
 
-            outCpuTime = m_calibration.cpuTime;
-            outGpuTime = m_calibration.gpuTime;
+            m_calibration.GetReferenceTime(outCpuTime, outGpuTime);
+            period = m_calibration.Period();
+            // assume 1 ns/tick if the period estimation is close enough to 1
+            if (std::abs(period - 1.0) < 0.001)
+                period = 1.0;
 
             return true;
         }
@@ -422,17 +493,13 @@ namespace tracy
 
             uint64_t cpuTimestamp = 0;
             uint64_t gpuTimestamp = 0;
-            if (!CalibrateClocks(cpuTimestamp, gpuTimestamp))
+            double period = 0.0;  // in nanoseconds per gpu-tick
+            if (!CalibrateClocks(cpuTimestamp, gpuTimestamp, period))
                 TracyWebGPUPanic("Failed to calibrate CPU/GPU clocks.", return);
 
-            TracyWebGPUDebug( fprintf(stdout, "INFO: gpuTimestamp is %llu\n", gpuTimestamp) );
+            TracyWebGPUDebug( fprintf(stdout, "[WebGPUQueueCtx] cpuTimestamp: %llu | gpuTimestamp: %llu | period: %f\n", cpuTimestamp, gpuTimestamp, period) );
             m_shadowBuffer.resize(m_queryLimit, gpuTimestamp);
 
-            // WebGPU timestamps are in nanoseconds, as per the spec.
-            float period = 1.0f;  // 1ns/tick
-            // TODO: however, with raw timestamps, the period may need adjustment
-            // (we measure that that during CalibrateClocks())
-
             // All setup completed: register the context.
             m_contextId = GetGpuCtxCounter().fetch_add(1);
             ZoneValue(m_contextId);
@@ -442,7 +509,7 @@ namespace tracy
             MemWrite(&item->gpuNewContext.cpuTime, static_cast<int64_t>(cpuTimestamp));
             MemWrite(&item->gpuNewContext.gpuTime, static_cast<int64_t>(gpuTimestamp));
             MemWrite(&item->gpuNewContext.thread, static_cast<uint32_t>(0));
-            MemWrite(&item->gpuNewContext.period, period);
+            MemWrite(&item->gpuNewContext.period, static_cast<float>(period));
             MemWrite(&item->gpuNewContext.context, static_cast<uint8_t>(GetId()));
             MemWrite(&item->gpuNewContext.flags, static_cast<uint8_t>(0));  // no calibration available
             MemWrite(&item->gpuNewContext.type, static_cast<uint8_t>(GpuContextType::WebGPU));

From 47397fc9b26e88bd3997faa90c760c298073c001 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Fri, 22 May 2026 16:01:52 -0700
Subject: [PATCH 11/21] minor fixes/comments

---
 public/tracy/TracyWebGPU.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index e332292d99..6bf4d362c1 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -205,6 +205,7 @@ namespace tracy
                 if (dx <= 0) return true; // always accept "tighter" outliers
                 double variance = double(r.S_xx) / (r.n - 1);
                 if (variance == 0.0) return true;
+                // WARN: dx*dx "could" overflow, but very unlikely in practice
                 double zz = (double)(dx*dx) / variance;
                 return zz <= (threshold*threshold);
             }
@@ -350,7 +351,7 @@ namespace tracy
                 auto gpuTimestamps = MapBufferSync(readBackBuffer, m_instance);
                 TracyWebGPUAssert(gpuTimestamps != nullptr);
                 gpu[0] = gpuTimestamps[0];
-                gpu[1] = gpuTimestamps[0];
+                gpu[1] = gpuTimestamps[1];
                 wgpuBufferUnmap(readBackBuffer);
                 TracyWebGPUDebug(
                     fprintf(stdout, "[%03d] CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", i, gpu[0], gpu[1], gpu[1]-gpu[0], cpu[1]-cpu[0]);
@@ -384,6 +385,7 @@ namespace tracy
     public:
         static bool SetupDevice(WGPUDeviceDescriptor& deviceDescriptor)
         {
+            // TODO: pass features array/size as argument to better allow for repeated calls
             static constexpr int MaxFeatures = 128;
             static WGPUFeatureName features [MaxFeatures] = {};
 
@@ -452,7 +454,7 @@ namespace tracy
             ZoneScopedC(Color::Red4);
 
             if (!VerifyDevice(m_device))
-                TracyWebGPUPanic("GPU profiling disabled because the device did not enable the necessary features.")
+                TracyWebGPUPanic("GPU profiling disabled because the device did not enable the necessary features.", return)
 
             TracyWebGPUAssert(m_instance); wgpuInstanceAddRef(m_instance);
             TracyWebGPUAssert(m_device); wgpuDeviceAddRef(m_device);

From 4cf3160c2bfb4105d69a100f754038ad593c758d Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sun, 24 May 2026 07:04:29 -0700
Subject: [PATCH 12/21] missing interface, and more debugging

---
 public/tracy/TracyWebGPU.hpp | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index 6bf4d362c1..8ca03175bd 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -11,6 +11,8 @@
 
 #ifndef TRACY_ENABLE
 
+#define TracyWebGPUSetupDevice(deviceDescriptor)
+
 #define TracyWebGPUContext(instance, device, queue) nullptr
 #define TracyWebGPUDestroy(ctx)
 #define TracyWebGPUContextName(ctx, name, size)
@@ -224,7 +226,7 @@ namespace tracy
                 // incremental regression:
                 cpuToGpuModel.Update(tcpu, tgpu);
                 wallToGpuModel.Update(twall, tgpu);
-                fprintf(stderr, "----- (sample accepted! period = %f)\n", Period());
+                fprintf(stderr, "----- (sample accepted! wall = %lld | cpu = %lld | gpu = %lld | period = %f)\n", twall, tcpu, tgpu, Period());
                 return true;
             }
         } m_calibration;
@@ -354,7 +356,8 @@ namespace tracy
                 gpu[1] = gpuTimestamps[1];
                 wgpuBufferUnmap(readBackBuffer);
                 TracyWebGPUDebug(
-                    fprintf(stdout, "[%03d] CalibrateClocks() -> %llu | %llu | %lld /// %lld\n", i, gpu[0], gpu[1], gpu[1]-gpu[0], cpu[1]-cpu[0]);
+                    fprintf(stdout, "[%03d] CalibrateClocks() [CPU] %16lld | %16lld | /// %lld\n", i, cpu[0], cpu[1], cpu[1]-cpu[0]);
+                    fprintf(stdout,  "----------------------- [GPU] %16llu | %16llu | /// %lld\n",    gpu[0], gpu[1], gpu[1]-gpu[0]);
                     uint64_t cpuTimeRef, gpuTimeRef;
                     m_calibration.GetReferenceTime(cpuTimeRef, gpuTimeRef);
                     if (gpu[0] < gpuTimeRef)
@@ -368,6 +371,14 @@ namespace tracy
                 m_calibration.Update(wall[0], wall[1], cpu[0], cpu[1], gpu[0]);
             };
 
+            TracyWebGPUDebug(
+                fprintf(stdout, "##### CalibrateClocks() WALL = %lld | CPU = %lld | GPU = %lld | period = %f\n",
+                    m_calibration.wallToGpuModel.mean_x,
+                    m_calibration.cpuToGpuModel.mean_x,
+                    m_calibration.cpuToGpuModel.mean_y,
+                    m_calibration.Period());
+            );
+
             wgpuRenderPipelineRelease(calibPipeline);
             wgpuShaderModuleRelease(calibShader);
             wgpuTextureViewRelease(texView);
@@ -900,6 +911,8 @@ namespace tracy
 
 using TracyWebGPUCtx = tracy::WebGPUQueueCtx*;
 
+#define TracyWebGPUSetupDevice(deviceDescriptor) tracy::WebGPUQueueCtx::SetupDevice(deviceDescriptor)
+
 #define TracyWebGPUContext(instance, device, queue) tracy::CreateWebGPUContext(instance, device, queue);
 #define TracyWebGPUDestroy(ctx) tracy::DestroyWebGPUContext(ctx);
 #define TracyWebGPUContextName(ctx, name, size) ctx->Name(name, size);

From 5f82102fba13821c2b2d173bfb1b15127c1e801b Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sun, 24 May 2026 09:10:21 -0700
Subject: [PATCH 13/21] updating docs

---
 README.md        |  2 +-
 manual/tracy.tex | 26 +++++++++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 39e878e359..7f805b04f4 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 ### A real time, nanosecond resolution, remote telemetry, hybrid frame and sampling profiler for games and other applications.
 
-Tracy supports profiling CPU (Direct support is provided for C, C++, Lua, Python and Fortran integration. At the same time, third-party bindings to many other languages exist on the internet, such as [Rust](https://github.com/nagisa/rust_tracy_client), [Zig](https://github.com/tealsnow/zig-tracy), [C#](https://github.com/clibequilibrium/Tracy-CSharp), [OCaml](https://github.com/imandra-ai/ocaml-tracy), [Odin](https://github.com/oskarnp/odin-tracy), etc.), GPU (All major graphic APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL, CUDA.), memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more.
+Tracy supports profiling CPU (Direct support is provided for C, C++, Lua, Python and Fortran integration. At the same time, third-party bindings to many other languages exist on the internet, such as [Rust](https://github.com/nagisa/rust_tracy_client), [Zig](https://github.com/tealsnow/zig-tracy), [C#](https://github.com/clibequilibrium/Tracy-CSharp), [OCaml](https://github.com/imandra-ai/ocaml-tracy), [Odin](https://github.com/oskarnp/odin-tracy), etc.), GPU (All major graphics/compute APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL, CUDA, WebGPU.), memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more.
 
 - [Documentation](https://github.com/wolfpld/tracy/releases/latest/download/tracy.pdf) for usage and build process instructions
 - [Releases](https://github.com/wolfpld/tracy/releases) containing the documentation (`tracy.pdf`) and compiled Windows x64 binaries (`Tracy-<version>.7z`) as assets
diff --git a/manual/tracy.tex b/manual/tracy.tex
index bbd2745156..7cbf63ce58 100644
--- a/manual/tracy.tex
+++ b/manual/tracy.tex
@@ -141,7 +141,7 @@ \section*{Quick-start guide}
 \section{A quick look at Tracy Profiler}
 \label{quicklook}
 
-Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that you can use for remote or embedded telemetry of games and other applications. It can profile CPU\footnote{Direct support is provided for C, C++, Lua, Python and Fortran integration. At the same time, third-party bindings to many other languages exist on the internet, such as Rust, Zig, C\#, OCaml, Odin, etc.}, GPU\footnote{All major graphic APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL.}, memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more.
+Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that you can use for remote or embedded telemetry of games and other applications. It can profile CPU\footnote{Direct support is provided for C, C++, Lua, Python and Fortran integration. At the same time, third-party bindings to many other languages exist on the internet, such as Rust, Zig, C\#, OCaml, Odin, etc.}, GPU\footnote{All major graphics/compute APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL, CUDA, WebGPU.}, memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more.
 
 While Tracy can perform statistical analysis of sampled call stack data, just like other \emph{statistical profilers} (such as VTune, perf, or Very Sleepy), it mainly focuses on manual markup of the source code. Such markup allows frame-by-frame inspection of the program execution. For example, you will be able to see exactly which functions are called, how much time they require, and how they interact with each other in a multi-threaded environment. In contrast, the statistical analysis may show you the hot spots in your code, but it cannot accurately pinpoint the underlying cause for semi-random frame stutter that may occur every couple of seconds.
 
@@ -1050,6 +1050,8 @@ \subsection{Feature support matrix}
 GPU zones (OpenGL) & \faCheck & \faCheck & \faCheck & \faPoo & \faPoo & & \faXmark \\
 GPU zones (Vulkan) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & & \faXmark \\
 GPU zones (Metal) & \faXmark & \faXmark & \faXmark & \faCheck\textsuperscript{\emph{b}} & \faCheck\textsuperscript{\emph{b}} & \faXmark & \faXmark \\
+GPU zones (CUDA) & \faCheck & \faCheck & \faXmark & \faXmark & \faXmark & \faQuestion & \faXmark \\
+GPU zones (WebGPU) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faQuestion & \faQuestion \\
 Call stacks & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faXmark \\
 Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\
 Crash handling & \faCheck & \faCheck & \faCheck & \faXmark & \faXmark & \faXmark & \faXmark \\
@@ -1645,7 +1647,7 @@ \subsubsection{Memory pools}
 \subsection{GPU profiling}
 \label{gpuprofiling}
 
-Tracy provides bindings for profiling OpenGL, Vulkan, Direct3D 11, Direct3D 12, Metal, OpenCL and CUDA execution time on GPU.
+Tracy provides bindings for profiling OpenGL, Vulkan, Direct3D 11, Direct3D 12, Metal, OpenCL, CUDA and WebGPU execution time on GPU.
 
 Note that the CPU and GPU timers may be unsynchronized unless you create a calibrated context, but the availability of calibrated contexts is limited. You can try to correct the desynchronization of uncalibrated contexts in the profiler's options (section~\ref{options}).
 
@@ -1785,6 +1787,16 @@ \subsubsection{CUDA}
 
 To stop profiling, call the \texttt{TracyCUDAStopProfiling(ctx)} macro.
 
+\subsubsection{WebGPU}
+
+WebGPU support is enabled by including the \texttt{public/tracy/TracyWebGPU.hpp} header file. Both major implementations of WebGPU (Dawn and wgpu-native) are supported.
+
+Before creating the WebGPU device, make sure to call \texttt{TracyWebGPUSetupDevice()} to let Tracy request the device features and extensions required for profiling. After the device is created, use the \texttt{TracyWebGPUContext()} macro to instantiate the \texttt{WebGPUQueueCtx} object required for GPU instrumentation. The object should later be cleaned up with the \texttt{TracyWebGPUDestroy()} macro. To set a custom name for the context, use the \texttt{TracyWebGPUContextName()} macro.
+
+To instrument a GPU zone, use the various \texttt{TracyWebGPU*Zone*()} macros. Note that WebGPU only offers command instrumentation at the ``pass'' level. While command-level granularity is possible through implementation-specific WebGPU extensions, Tracy does not support it at the moment. Supply the corresponding WebGPU pass descriptor to the instrumentation macro \emph{before} creating the WebGPU pass encoder.
+
+You are required to periodically collect the GPU events using the \texttt{TracyWebGPUCollect()} macro. Good places for collection are: after synchronous waits, after event processing (\texttt{wgpuInstanceProcessEvents}), after surface present calls (\texttt{wgpuSurfacePresent}), and inside the completion callback of command queues (\texttt{wgpuQueueOnSubmittedWorkDone}).
+
 \subsubsection{ROCm}
 
 On Linux, if rocprofiler-sdk is installed, tracy can automatically trace GPU dispatches and collect
@@ -1818,13 +1830,13 @@ \subsubsection{Multiple zones in one scope}
 
 Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}).
 
-To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan, Direct3D 11/12 and Metal -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone}, \texttt{TracyD3D11Zone}/\texttt{TracyD3D12Zone} with \texttt{TracyD3D11NamedZone}/\texttt{TracyD3D12NamedZone}, and \texttt{TracyMetalZone} with \texttt{TracyMetalNamedZone}.
+To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan, Direct3D 11/12, Metal and WebGPU -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone}, \texttt{TracyD3D11Zone}/\texttt{TracyD3D12Zone} with \texttt{TracyD3D11NamedZone}/\texttt{TracyD3D12NamedZone}, \texttt{TracyMetalZone} with \texttt{TracyMetalNamedZone}, and \texttt{TracyWebGPUZone} with \texttt{TracyWebGPUNamedZone}.
 
 Remember to provide your name for the created stack variable as the first parameter to the macros.
 
 \subsubsection{Transient GPU zones}
 
-Transient zones (see section~\ref{transientzones} for details) are available in OpenGL, Vulkan, and Direct3D 11/12 macros. Transient zones are not available for Metal at this moment.
+Transient zones (see section~\ref{transientzones} for details) are available in OpenGL, Vulkan, Direct3D 11/12 and WebGPU macros. Transient zones are not available for Metal at this moment.
 
 \subsection{Fibers}
 \label{fibers}
@@ -3846,7 +3858,7 @@ \subsubsection{Timeline view}
 The left-hand side \emph{index area} of the timeline view displays various labels (threads, locks), which can be categorized in the following way:
 
 \begin{itemize}
-\item \emph{Light blue label} -- GPU context. Multi-threaded Vulkan, OpenCL, Direct3D 12 and Metal contexts are additionally split into separate threads.
+\item \emph{Light blue label} -- GPU context. Multi-threaded Vulkan, OpenCL, Direct3D 12, Metal and WebGPU contexts are additionally split into separate threads.
 \item \emph{Pink label} -- CPU data graph.
 \item \emph{White label} -- A CPU thread. It will be replaced by a bright red label in a thread that has crashed (section~\ref{crashhandling}). If automated sampling was performed, clicking the~\LMB{}~left mouse button on the \emph{\faGhost{}~ghost zones} button will switch zone display mode between 'instrumented' and 'ghost.'
 \item \emph{Green label} -- Fiber, coroutine, or any other sort of cooperative multitasking 'green thread.'
@@ -3868,7 +3880,7 @@ \subsubsection{Timeline view}
 
 Meanwhile, the \emph{Streaming thread} is performing some \emph{Streaming jobs}. The first \emph{Streaming job} sent a message (section~\ref{messagelog}). In addition to being listed in the message log, it is indicated by a triangle over the thread separator. When multiple messages are in one place, the triangle outline shape changes to a filled triangle.
 
-The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D/Metal/OpenCL context in place of a thread name.
+The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D/Metal/OpenCL/CUDA/WebGPU context in place of a thread name.
 
 Hovering the \faArrowPointer{} mouse pointer over a zone will highlight all other zones that have the exact source location with a white outline. Clicking the \LMB{}~left mouse button on a zone will open the zone information window (section~\ref{zoneinfo}). Holding the \keys{\ctrl} key and clicking the \LMB{}~left mouse button on a zone will open the zone statistics window (section~\ref{findzone}). Clicking the \MMB{}~middle mouse button on a zone will zoom the view to the extent of the zone.
 
@@ -4077,7 +4089,7 @@ \subsection{Options menu}
 \begin{itemize}
 \item \emph{\faSignature{} Draw CPU usage graph} -- You can disable drawing of the CPU usage graph here.
 \end{itemize}
-\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Metal/Direct3D/OpenCL zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets of uncalibrated contexts (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}.
+\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Metal/Direct3D/OpenCL/CUDA/WebGPU zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets of uncalibrated contexts (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}.
 \item \emph{\faMicrochip{} Draw CPU zones} -- Determines whether CPU zones are displayed.
 \begin{itemize}
 \item \emph{\faGhost{} Draw ghost zones} -- Controls if ghost zones should be displayed in threads which don't have any instrumented zones available.

From 5a1629d36c65687a063dc3ba565d2ef07376c838 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sun, 24 May 2026 10:06:55 -0700
Subject: [PATCH 14/21] GPU context name

---
 profiler/src/profiler/TracyView.hpp | 3 ++-
 python/bindings/ServerModule.cpp    | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/profiler/src/profiler/TracyView.hpp b/profiler/src/profiler/TracyView.hpp
index 4e34976079..66fcf46f0c 100644
--- a/profiler/src/profiler/TracyView.hpp
+++ b/profiler/src/profiler/TracyView.hpp
@@ -49,7 +49,8 @@ constexpr const char* GpuContextNames[] = {
     "Metal",
     "Custom",
     "CUDA",
-    "Rocprof"
+    "Rocprof",
+    "WebGPU"
 };
 
 struct MemoryPage;
diff --git a/python/bindings/ServerModule.cpp b/python/bindings/ServerModule.cpp
index fc1890a825..d2bbb772be 100644
--- a/python/bindings/ServerModule.cpp
+++ b/python/bindings/ServerModule.cpp
@@ -1034,13 +1034,14 @@ PYBIND11_MODULE( TracyServerBindings, m )
         .def( "get_gpu_contexts", []( const Worker& w ) {
         static const char* gpuTypeStr[] = {
             "Invalid", "OpenGL", "Vulkan", "OpenCL", "Direct3D12", "Direct3D11", "Metal", "Custom", "CUDA", "Rocprof", "WebGPU" };
+        static size_t numTypes = sizeof(gpuTypeStr) / sizeof(gpuTypeStr[0]);
         std::vector<GpuContextSummary> result;
         for( const auto* ctx : w.GetGpuData() )
         {
             if( !ctx ) continue;
             const std::string name = ctx->name.Active() ? w.GetString( ctx->name ) : "";
             const uint8_t typeIdx = (uint8_t)ctx->type;
-            const char* typeStr = typeIdx < 11 ? gpuTypeStr[typeIdx] : "Unknown";
+            const char* typeStr = typeIdx < numTypes ? gpuTypeStr[typeIdx] : "Unknown";
             result.push_back( GpuContextSummary{
                 name, ctx->count, std::string( typeStr ), ctx->thread } );
         }

From 0f22144e4b0d412dc1da84df7ace3eeaeed3fbf8 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sun, 24 May 2026 12:53:16 -0700
Subject: [PATCH 15/21] adding webgpu example/demo

---
 examples/WebGPUDemo/CMakeLists.txt            | 135 +++++++
 examples/WebGPUDemo/platform/platform.h       |  23 ++
 .../WebGPUDemo/platform/platform_macos.mm     | 120 ++++++
 .../WebGPUDemo/platform/platform_wayland.cpp  | 213 ++++++++++
 .../WebGPUDemo/platform/platform_windows.cpp  | 131 +++++++
 examples/WebGPUDemo/spinning_triangle.cpp     | 364 ++++++++++++++++++
 6 files changed, 986 insertions(+)
 create mode 100644 examples/WebGPUDemo/CMakeLists.txt
 create mode 100644 examples/WebGPUDemo/platform/platform.h
 create mode 100644 examples/WebGPUDemo/platform/platform_macos.mm
 create mode 100644 examples/WebGPUDemo/platform/platform_wayland.cpp
 create mode 100644 examples/WebGPUDemo/platform/platform_windows.cpp
 create mode 100644 examples/WebGPUDemo/spinning_triangle.cpp

diff --git a/examples/WebGPUDemo/CMakeLists.txt b/examples/WebGPUDemo/CMakeLists.txt
new file mode 100644
index 0000000000..638d337f19
--- /dev/null
+++ b/examples/WebGPUDemo/CMakeLists.txt
@@ -0,0 +1,135 @@
+# CMakeLists.txt — WebGPU spinning triangle demo
+#
+#   macOS:
+#     clang++ -std=c++17 -ObjC++ spinning_triangle.cpp platform/platform_macos.mm \
+#         -I/path/to/wgpu/include -L/path/to/wgpu/lib -lwgpu_native \
+#         -Wl,-rpath,@executable_path \
+#         -framework Cocoa -framework Metal -framework QuartzCore \
+#         -framework Foundation -framework IOKit -framework IOSurface \
+#         -o spinning_triangle
+#
+#   Windows (MSVC):
+#     cl /std:c++17 spinning_triangle.cpp platform/platform_windows.cpp \
+#         /I\path\to\wgpu\include \path\to\wgpu\lib\wgpu_native.lib \
+#         user32.lib gdi32.lib /Fe:spinning_triangle.exe
+#
+#   Linux / Wayland:
+#     g++ -std=c++17 spinning_triangle.cpp platform/platform_wayland.cpp \
+#         xdg-shell-protocol.c \
+#         -I/path/to/wgpu/include -L/path/to/wgpu/lib -lwgpu_native \
+#         -lwayland-client -o spinning_triangle
+
+cmake_minimum_required(VERSION 3.16)
+project(spinning_triangle LANGUAGES C CXX)  # C is needed to compile the generated xdg-shell-protocol.c (Wayland)
+
+# ---------------------------------------------------------------------------
+# WebGPU backend — set WGPU_PATH to your wgpu-native or Dawn installation.
+# The library name differs between backends:
+#   wgpu-native  →  wgpu_native
+#   Dawn         →  webgpu_dawn
+# ---------------------------------------------------------------------------
+set(WGPU_PATH "" CACHE PATH "Root of the WebGPU native installation (contains include/ and lib/)")
+set(WGPU_LIB  "webgpu_dawn" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn)")
+
+if(NOT WGPU_PATH)
+    message(FATAL_ERROR "Set WGPU_PATH to the root of your WebGPU native installation.")
+endif()
+
+# ---------------------------------------------------------------------------
+# Tracy root — defaults to two directories above this CMakeLists.txt.
+# ---------------------------------------------------------------------------
+set(TRACY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
+option(TRACY_ENABLE "Enable Tracy profiling" ON)
+
+# ---------------------------------------------------------------------------
+# macOS quarantine — pre-built WebGPU binaries downloaded from the internet
+# carry a com.apple.quarantine extended attribute that prevents dyld from
+# loading them ("damaged or incomplete" / Gatekeeper block).  Strip it once
+# at configure time so the linker and the runtime loader can both access the
+# library directory without further user intervention.
+# ---------------------------------------------------------------------------
+if(APPLE)
+    execute_process(
+        COMMAND xattr -dr com.apple.quarantine "${WGPU_PATH}/lib"
+    )
+endif()
+
+# ---------------------------------------------------------------------------
+# Platform-specific source and link settings
+# ---------------------------------------------------------------------------
+if(APPLE)
+    set(PLATFORM_SOURCES platform/platform_macos.mm)
+    set(PLATFORM_LIBS
+        "-framework Cocoa"
+        "-framework Metal"
+        "-framework QuartzCore"
+        "-framework Foundation"
+        "-framework IOKit"
+        "-framework IOSurface"
+    )
+    set_source_files_properties(platform/platform_macos.mm
+        PROPERTIES COMPILE_FLAGS "-ObjC++"
+    )
+elseif(WIN32)
+    set(PLATFORM_SOURCES platform/platform_windows.cpp)
+    set(PLATFORM_LIBS user32 gdi32)
+else()
+    # Linux / Wayland — also needs the generated xdg-shell protocol glue.
+    set(PLATFORM_SOURCES
+        platform/platform_wayland.cpp
+        xdg-shell-protocol.c
+    )
+    set(PLATFORM_LIBS wayland-client)
+endif()
+
+# ---------------------------------------------------------------------------
+# Target
+# ---------------------------------------------------------------------------
+add_executable(spinning_triangle
+    spinning_triangle.cpp
+    "${TRACY_DIR}/public/TracyClient.cpp"
+    ${PLATFORM_SOURCES}
+)
+
+# Treat TracyClient.cpp as third-party code — suppress all warnings so that
+# upstream changes don't pollute our build output.
+if(MSVC)
+    set_source_files_properties("${TRACY_DIR}/public/TracyClient.cpp"
+        PROPERTIES COMPILE_FLAGS "/w"
+    )
+else()
+    set_source_files_properties("${TRACY_DIR}/public/TracyClient.cpp"
+        PROPERTIES COMPILE_FLAGS "-w"
+    )
+endif()
+
+target_compile_features(spinning_triangle PRIVATE cxx_std_17)
+
+if(TRACY_ENABLE)
+    target_compile_definitions(spinning_triangle PRIVATE TRACY_ENABLE)
+endif()
+
+target_include_directories(spinning_triangle PRIVATE
+    "${WGPU_PATH}/include"
+    "${TRACY_DIR}/public"
+)
+
+target_link_directories(spinning_triangle PRIVATE "${WGPU_PATH}/lib")
+
+target_link_libraries(spinning_triangle PRIVATE
+    ${WGPU_LIB}
+    ${PLATFORM_LIBS}
+)
+
+# Embed the rpath so the binary finds the WebGPU dylib/so next to itself.
+if(APPLE)
+    set_target_properties(spinning_triangle PROPERTIES
+        BUILD_RPATH "${WGPU_PATH}/lib"
+        INSTALL_RPATH "@executable_path"
+    )
+elseif(UNIX)
+    set_target_properties(spinning_triangle PROPERTIES
+        BUILD_RPATH "${WGPU_PATH}/lib"
+        INSTALL_RPATH "$ORIGIN"
+    )
+endif()
diff --git a/examples/WebGPUDemo/platform/platform.h b/examples/WebGPUDemo/platform/platform.h
new file mode 100644
index 0000000000..c1439bc51a
--- /dev/null
+++ b/examples/WebGPUDemo/platform/platform.h
@@ -0,0 +1,23 @@
+// platform.h — interface between platform-agnostic code and platform backends
+//
+// Each platform_*.mm / platform_*.cpp file implements these four functions.
+// Exactly one backend must be linked into the final binary.
+
+#pragma once
+#include <webgpu/webgpu.h>
+
+// Initialize the windowing system and create a window of the given dimensions.
+// Returns true on success.
+bool platformInit(int width, int height, const char* title);
+
+// Create a WebGPU surface backed by the platform window.
+// Must be called after wgpuCreateInstance() and platformInit().
+WGPUSurface platformCreateSurface(WGPUInstance instance);
+
+// Elapsed wall-clock time in seconds since platformInit().
+double platformGetTime();
+
+// Enter the platform event/render loop.
+// Calls render() each frame at ~60 fps.
+// Calls shutdown() exactly once before returning.
+void platformRunLoop(void (*render)(), void (*shutdown)());
diff --git a/examples/WebGPUDemo/platform/platform_macos.mm b/examples/WebGPUDemo/platform/platform_macos.mm
new file mode 100644
index 0000000000..673edf481f
--- /dev/null
+++ b/examples/WebGPUDemo/platform/platform_macos.mm
@@ -0,0 +1,120 @@
+// platform_macos.mm — macOS backend (Cocoa + CAMetalLayer)
+//
+// Compile flags (see spinning_triangle.cpp header for full invocation):
+//   -ObjC++ -framework Cocoa -framework Metal -framework QuartzCore \
+//   -framework Foundation -framework IOKit -framework IOSurface
+
+#import <Cocoa/Cocoa.h>
+#import <QuartzCore/CAMetalLayer.h>
+#include <CoreFoundation/CFDate.h>
+#include <webgpu/webgpu.h>
+#include "platform.h"
+
+static CAMetalLayer*  sMetalLayer  = nullptr;
+static CFAbsoluteTime sStartTime   = 0;
+static void (*sRenderCb)()         = nullptr;
+static void (*sShutdownCb)()       = nullptr;
+
+// ---------------------------------------------------------------------------
+// Cocoa app — window, metal layer, render timer
+// ---------------------------------------------------------------------------
+
+@interface AppDelegate : NSObject <NSApplicationDelegate, NSWindowDelegate>
+@property (strong) NSWindow* window;
+@property (strong) NSTimer*  timer;
+@end
+
+@implementation AppDelegate
+
+- (void)applicationDidFinishLaunching:(NSNotification*)notification {
+    // ~60 fps render loop
+    self.timer = [NSTimer scheduledTimerWithTimeInterval:1.0 / 60.0
+                                                 target:self
+                                               selector:@selector(tick:)
+                                               userInfo:nil
+                                                repeats:YES];
+    [[NSRunLoop currentRunLoop] addTimer:self.timer forMode:NSRunLoopCommonModes];
+
+    [NSEvent addLocalMonitorForEventsMatchingMask:NSEventMaskKeyDown
+                                         handler:^NSEvent*(NSEvent* event) {
+        if (event.keyCode == 53) { // kVK_Escape
+            [NSApp terminate:nil];
+            return nil;
+        }
+        return event;
+    }];
+
+    [self.window makeKeyAndOrderFront:nil];
+}
+
+- (void)tick:(NSTimer*)t {
+    if (sRenderCb) sRenderCb();
+}
+
+- (BOOL)applicationShouldTerminateAfterLastWindowClosed:(NSApplication*)app {
+    return YES;
+}
+
+- (void)applicationWillTerminate:(NSNotification*)notification {
+    [self.timer invalidate];
+    if (sShutdownCb) sShutdownCb();
+}
+
+@end
+
+// ---------------------------------------------------------------------------
+// Platform interface implementation
+// ---------------------------------------------------------------------------
+
+bool platformInit(int width, int height, const char* title) {
+    NSApplication* app = [NSApplication sharedApplication];
+    [app setActivationPolicy:NSApplicationActivationPolicyRegular];
+
+    NSRect frame = NSMakeRect(200, 200, width, height);
+    NSWindow* window = [[NSWindow alloc]
+        initWithContentRect:frame
+                  styleMask:(NSWindowStyleMaskTitled |
+                             NSWindowStyleMaskClosable |
+                             NSWindowStyleMaskMiniaturizable)
+                    backing:NSBackingStoreBuffered
+                      defer:NO];
+    [window setTitle:[NSString stringWithUTF8String:title]];
+
+    // Metal-backed layer
+    NSView* contentView = [window contentView];
+    [contentView setWantsLayer:YES];
+    sMetalLayer = [CAMetalLayer layer];
+    sMetalLayer.frame = contentView.bounds;
+    sMetalLayer.contentsScale = [window backingScaleFactor];
+    sMetalLayer.pixelFormat = MTLPixelFormatBGRA8Unorm;
+    [contentView.layer addSublayer:sMetalLayer];
+
+    AppDelegate* del = [[AppDelegate alloc] init];
+    del.window = window;
+    [app setDelegate:del];
+
+    sStartTime = CFAbsoluteTimeGetCurrent();
+    return true;
+}
+
+WGPUSurface platformCreateSurface(WGPUInstance instance) {
+    WGPUSurfaceSourceMetalLayer metalSrc = {};
+    metalSrc.chain.sType = WGPUSType_SurfaceSourceMetalLayer;
+    metalSrc.layer = sMetalLayer;
+
+    WGPUSurfaceDescriptor surfDesc = {};
+    surfDesc.nextInChain = (WGPUChainedStruct*)&metalSrc;
+    return wgpuInstanceCreateSurface(instance, &surfDesc);
+}
+
+double platformGetTime() {
+    return CFAbsoluteTimeGetCurrent() - sStartTime;
+}
+
+void platformRunLoop(void (*render)(), void (*shutdown)()) {
+    sRenderCb   = render;
+    sShutdownCb = shutdown;
+    @autoreleasepool {
+        [[NSApplication sharedApplication] run];
+    }
+}
diff --git a/examples/WebGPUDemo/platform/platform_wayland.cpp b/examples/WebGPUDemo/platform/platform_wayland.cpp
new file mode 100644
index 0000000000..619abb6ea9
--- /dev/null
+++ b/examples/WebGPUDemo/platform/platform_wayland.cpp
@@ -0,0 +1,213 @@
+// platform_wayland.cpp — Linux/Wayland backend
+//
+// Dependencies:
+//   libwayland-client, wayland-protocols (for xdg-shell)
+//
+// Generate xdg-shell protocol glue before building:
+//   XML=$(pkg-config --variable=pkgdatadir wayland-protocols)/stable/xdg-shell/xdg-shell.xml
+//   wayland-scanner client-header  $XML xdg-shell-client-protocol.h
+//   wayland-scanner private-code   $XML xdg-shell-protocol.c
+//
+// Compile flags (see spinning_triangle.cpp header for full invocation):
+//   g++ -std=c++17 spinning_triangle.cpp platform_wayland.cpp \
+//       xdg-shell-protocol.c \
+//       -I/path/to/wgpu/include -L/path/to/wgpu/lib -lwgpu_native \
+//       $(pkg-config --cflags --libs wayland-client) \
+//       -o spinning_triangle
+
+#include <wayland-client.h>
+#include "xdg-shell-client-protocol.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <time.h>
+#include <webgpu/webgpu.h>
+#include "platform.h"
+
+static wl_display*    sDisplay    = nullptr;
+static wl_compositor* sCompositor = nullptr;
+static xdg_wm_base*   sWmBase     = nullptr;
+static wl_seat*       sSeat       = nullptr;
+static wl_keyboard*   sKeyboard   = nullptr;
+static wl_surface*    sSurface    = nullptr;
+static xdg_surface*   sXdgSurface = nullptr;
+static xdg_toplevel*  sToplevel   = nullptr;
+static bool           sConfigured = false;
+static bool           sRunning    = false;
+static struct timespec sStartTime  = {};
+
+// ---------------------------------------------------------------------------
+// xdg_wm_base listener — ping/pong keepalive
+// ---------------------------------------------------------------------------
+
+static void wmBasePing(void*, xdg_wm_base* wm, uint32_t serial) {
+    xdg_wm_base_pong(wm, serial);
+}
+static const xdg_wm_base_listener kWmBaseListener = { wmBasePing };
+
+// ---------------------------------------------------------------------------
+// xdg_surface listener — acknowledge configure events
+// ---------------------------------------------------------------------------
+
+static void xdgSurfaceConfigure(void*, xdg_surface* surf, uint32_t serial) {
+    xdg_surface_ack_configure(surf, serial);
+    sConfigured = true;
+}
+static const xdg_surface_listener kXdgSurfaceListener = { xdgSurfaceConfigure };
+
+// ---------------------------------------------------------------------------
+// xdg_toplevel listener — window close / resize
+// ---------------------------------------------------------------------------
+
+static void toplevelClose(void*, xdg_toplevel*) {
+    sRunning = false;
+}
+static void toplevelConfigure(void*, xdg_toplevel*, int32_t, int32_t, wl_array*) {}
+static const xdg_toplevel_listener kToplevelListener = { toplevelConfigure, toplevelClose };
+
+// ---------------------------------------------------------------------------
+// Keyboard listener — Escape to quit
+// ---------------------------------------------------------------------------
+
+static void kbdKeymap(void*, wl_keyboard*, uint32_t, int32_t, uint32_t) {}
+static void kbdEnter(void*, wl_keyboard*, uint32_t, wl_surface*, wl_array*) {}
+static void kbdLeave(void*, wl_keyboard*, uint32_t, wl_surface*) {}
+static void kbdKey(void*, wl_keyboard*, uint32_t, uint32_t, uint32_t key, uint32_t state) {
+    const uint32_t kEvdevEscape = 1;  // KEY_ESC in Linux evdev (linux/input-event-codes.h)
+    const bool escapePressed = (key == kEvdevEscape) && (state == WL_KEYBOARD_KEY_STATE_PRESSED);
+    if (escapePressed) sRunning = false;
+}
+static void kbdModifiers(void*, wl_keyboard*, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t) {}
+static void kbdRepeatInfo(void*, wl_keyboard*, int32_t, int32_t) {}
+static const wl_keyboard_listener kKbdListener = {
+    kbdKeymap, kbdEnter, kbdLeave, kbdKey, kbdModifiers, kbdRepeatInfo
+};
+
+// ---------------------------------------------------------------------------
+// wl_seat listener — grab keyboard capability
+// ---------------------------------------------------------------------------
+
+static void seatCapabilities(void*, wl_seat* seat, uint32_t caps) {
+    // Keep sKeyboard in sync with the seat: acquire it when the keyboard
+    // capability is present, release it when the capability is gone.
+    const bool seatHasKeyboard = (caps & WL_SEAT_CAPABILITY_KEYBOARD) != 0;
+    if (seatHasKeyboard == (sKeyboard != nullptr)) return;
+    if (!seatHasKeyboard) { wl_keyboard_release(sKeyboard); sKeyboard = nullptr; return; }
+    sKeyboard = wl_seat_get_keyboard(seat);
+    wl_keyboard_add_listener(sKeyboard, &kKbdListener, nullptr);
+}
+static void seatName(void*, wl_seat*, const char*) {}
+static const wl_seat_listener kSeatListener = { seatCapabilities, seatName };
+
+// ---------------------------------------------------------------------------
+// Registry listener — bind global interfaces
+// ---------------------------------------------------------------------------
+
+static void registryGlobal(void*, wl_registry* reg,
+                            uint32_t name, const char* iface, uint32_t ver) {
+    if (strcmp(iface, wl_compositor_interface.name) == 0)
+        sCompositor = (wl_compositor*)wl_registry_bind(reg, name, &wl_compositor_interface, 4);
+    else if (strcmp(iface, xdg_wm_base_interface.name) == 0) {
+        sWmBase = (xdg_wm_base*)wl_registry_bind(reg, name, &xdg_wm_base_interface, 1);
+        xdg_wm_base_add_listener(sWmBase, &kWmBaseListener, nullptr);
+    } else if (strcmp(iface, wl_seat_interface.name) == 0) {
+        sSeat = (wl_seat*)wl_registry_bind(reg, name, &wl_seat_interface, 5);
+        wl_seat_add_listener(sSeat, &kSeatListener, nullptr);
+    }
+}
+static void registryGlobalRemove(void*, wl_registry*, uint32_t) {}
+static const wl_registry_listener kRegistryListener = { registryGlobal, registryGlobalRemove };
+
+// ---------------------------------------------------------------------------
+// Platform interface implementation
+// ---------------------------------------------------------------------------
+
+// Connects to Wayland, binds the required globals and creates the toplevel window
+// (`width`/`height` are unused here). Logs to stderr and returns false on failure.
+bool platformInit(int width, int height, const char* title) {
+    sDisplay = wl_display_connect(nullptr);
+    if (!sDisplay) { fprintf(stderr, "Cannot connect to Wayland display\n"); return false; }
+
+    wl_registry* registry = wl_display_get_registry(sDisplay);
+    wl_registry_add_listener(registry, &kRegistryListener, nullptr);
+    // Two roundtrips: first to enumerate globals, second for seat capabilities
+    wl_display_roundtrip(sDisplay);
+    wl_display_roundtrip(sDisplay);
+    if (!sCompositor) { fprintf(stderr, "No wl_compositor\n"); return false; }
+    if (!sWmBase)     { fprintf(stderr, "No xdg_wm_base\n");  return false; }
+
+    sSurface    = wl_compositor_create_surface(sCompositor);
+    sXdgSurface = xdg_wm_base_get_xdg_surface(sWmBase, sSurface);
+    sToplevel   = xdg_surface_get_toplevel(sXdgSurface);
+    xdg_surface_add_listener(sXdgSurface, &kXdgSurfaceListener, nullptr);
+    xdg_toplevel_add_listener(sToplevel, &kToplevelListener, nullptr);
+    xdg_toplevel_set_title(sToplevel, title);
+    xdg_toplevel_set_app_id(sToplevel, "spinning_triangle");
+    wl_surface_commit(sSurface);
+
+    // Wait for the first configure. Stop on failure: a broken connection makes
+    // wl_display_dispatch() return -1 every time, so the loop would spin forever.
+    while (!sConfigured) {
+        if (wl_display_dispatch(sDisplay) < 0) { fprintf(stderr, "Wayland dispatch failed\n"); return false; }
+    }
+    clock_gettime(CLOCK_MONOTONIC, &sStartTime);
+    return true;
+}
+
+WGPUSurface platformCreateSurface(WGPUInstance instance) {
+    WGPUSurfaceSourceWaylandSurface waylandSrc = {};
+    waylandSrc.chain.sType = WGPUSType_SurfaceSourceWaylandSurface;
+    waylandSrc.display     = sDisplay;
+    waylandSrc.surface     = sSurface;
+
+    WGPUSurfaceDescriptor surfDesc = {};
+    surfDesc.nextInChain = (WGPUChainedStruct*)&waylandSrc;
+    return wgpuInstanceCreateSurface(instance, &surfDesc);
+}
+
+double platformGetTime() {  // seconds elapsed relative to sStartTime (CLOCK_MONOTONIC)
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    const double wholeSecs = (double)(ts.tv_sec - sStartTime.tv_sec);
+    return wholeSecs + (double)(ts.tv_nsec - sStartTime.tv_nsec) * 1e-9;
+}
+
+// Runs the frame loop until sRunning is cleared or the display connection fails,
+// then calls shutdown() once and destroys the Wayland objects.
+void platformRunLoop(void (*render)(), void (*shutdown)()) {
+    // Target ~16.67 ms per frame (60 fps)
+    static const long kFrameNs = 1000000000L / 60;
+    sRunning = true;
+    while (sRunning) {
+        struct timespec frameStart;
+        clock_gettime(CLOCK_MONOTONIC, &frameStart);
+
+        // wl_display_roundtrip() flushes requests, reads the socket and dispatches;
+        // wl_display_dispatch_pending() alone would never read the display fd.
+        if (wl_display_roundtrip(sDisplay) < 0) break;
+
+        if (sRunning) render();
+
+        // Sleep for the remainder of the frame budget
+        struct timespec frameEnd;
+        clock_gettime(CLOCK_MONOTONIC, &frameEnd);
+        long elapsed = (frameEnd.tv_sec  - frameStart.tv_sec)  * 1000000000L
+                     + (frameEnd.tv_nsec - frameStart.tv_nsec);
+        long remaining = kFrameNs - elapsed;
+        if (remaining > 0) {
+            struct timespec ts = { 0, remaining };
+            nanosleep(&ts, nullptr);
+        }
+    }
+
+    shutdown();
+    // Cleanup Wayland objects
+    if (sKeyboard)   { wl_keyboard_release(sKeyboard);   sKeyboard   = nullptr; }
+    if (sToplevel)   { xdg_toplevel_destroy(sToplevel);  sToplevel   = nullptr; }
+    if (sXdgSurface) { xdg_surface_destroy(sXdgSurface); sXdgSurface = nullptr; }
+    if (sSurface)    { wl_surface_destroy(sSurface);     sSurface    = nullptr; }
+    if (sWmBase)     { xdg_wm_base_destroy(sWmBase);     sWmBase     = nullptr; }
+    if (sSeat)       { wl_seat_release(sSeat);           sSeat       = nullptr; }
+    if (sCompositor) { wl_compositor_destroy(sCompositor); sCompositor = nullptr; }
+    wl_display_disconnect(sDisplay);
+}
diff --git a/examples/WebGPUDemo/platform/platform_windows.cpp b/examples/WebGPUDemo/platform/platform_windows.cpp
new file mode 100644
index 0000000000..80ed4c7f3d
--- /dev/null
+++ b/examples/WebGPUDemo/platform/platform_windows.cpp
@@ -0,0 +1,131 @@
+// platform_windows.cpp — Windows backend (Win32)
+//
+// Compile flags (MSVC, console subsystem):
+//   cl /std:c++17 spinning_triangle.cpp platform_windows.cpp \
+//       /I\path\to\wgpu\include \path\to\wgpu\lib\wgpu_native.lib \
+//       user32.lib gdi32.lib /Fe:spinning_triangle.exe
+//
+// MinGW/Clang equivalent:
+//   clang++ -std=c++17 spinning_triangle.cpp platform_windows.cpp \
+//       -I/path/to/wgpu/include -L/path/to/wgpu/lib -lwgpu_native \
+//       -luser32 -lgdi32 -o spinning_triangle.exe
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#include <webgpu/webgpu.h>
+#include "platform.h"
+
+#pragma comment(lib, "user32.lib")
+#pragma comment(lib, "gdi32.lib")
+
+static HWND   sHwnd      = nullptr;
+static bool   sRunning   = false;
+static LARGE_INTEGER sFreq      = {};
+static LARGE_INTEGER sStartTime = {};
+
+// ---------------------------------------------------------------------------
+// Win32 window procedure
+// ---------------------------------------------------------------------------
+
+static LRESULT CALLBACK wndProc(HWND hwnd, UINT msg, WPARAM wp, LPARAM lp) {  // window procedure registered by platformInit
+    switch (msg) {
+    case WM_KEYDOWN:
+        if (wp == VK_ESCAPE) { sRunning = false; return 0; }  // Escape asks the run loop to stop
+        break;  // any other key gets default processing below
+    case WM_CLOSE:
+    case WM_DESTROY:
+        sRunning = false;  // stop the run loop
+        PostQuitMessage(0);  // posts WM_QUIT, which platformRunLoop also treats as a stop signal
+        return 0;
+    }
+    return DefWindowProcA(hwnd, msg, wp, lp);  // everything not handled above
+}
+
+// ---------------------------------------------------------------------------
+// Platform interface implementation
+// ---------------------------------------------------------------------------
+
+bool platformInit(int width, int height, const char* title) {  // registers the class, creates and shows the window; false on failure
+    WNDCLASSEXA wc  = {};
+    wc.cbSize        = sizeof(wc);
+    wc.style         = CS_HREDRAW | CS_VREDRAW;
+    wc.lpfnWndProc   = wndProc;
+    wc.hInstance     = GetModuleHandleA(nullptr);
+    wc.hCursor       = LoadCursor(nullptr, IDC_ARROW);
+    wc.lpszClassName = "SpinningTriangle";  // must match the class name passed to CreateWindowExA below
+    if (!RegisterClassExA(&wc)) {
+        fprintf(stderr, "RegisterClassExA failed (%lu)\n", GetLastError());
+        return false;
+    }
+
+    // Grow the rect so the client area (not the outer frame) is width x height
+    RECT rect = { 0, 0, width, height };
+    AdjustWindowRect(&rect, WS_OVERLAPPEDWINDOW & ~(WS_THICKFRAME | WS_MAXIMIZEBOX), FALSE);  // same style as the window below
+
+    sHwnd = CreateWindowExA(
+        0, "SpinningTriangle", title,
+        WS_OVERLAPPEDWINDOW & ~(WS_THICKFRAME | WS_MAXIMIZEBOX),  // overlapped window without sizing border or maximize box
+        CW_USEDEFAULT, CW_USEDEFAULT,
+        rect.right - rect.left, rect.bottom - rect.top,  // outer size computed by AdjustWindowRect
+        nullptr, nullptr, GetModuleHandleA(nullptr), nullptr);
+    if (!sHwnd) {
+        fprintf(stderr, "CreateWindowExA failed (%lu)\n", GetLastError());
+        return false;
+    }
+
+    ShowWindow(sHwnd, SW_SHOW);
+    UpdateWindow(sHwnd);
+
+    QueryPerformanceFrequency(&sFreq);  // ticks per second, the divisor in platformGetTime
+    QueryPerformanceCounter(&sStartTime);  // time origin for platformGetTime
+    return true;
+}
+
+WGPUSurface platformCreateSurface(WGPUInstance instance) {  // creates a WGPUSurface for the window held in sHwnd
+    WGPUSurfaceSourceWindowsHWND hwndSrc = {};
+    hwndSrc.chain.sType = WGPUSType_SurfaceSourceWindowsHWND;  // tags the chained struct so the implementation can identify it
+    hwndSrc.hinstance   = GetModuleHandleA(nullptr);
+    hwndSrc.hwnd        = sHwnd;  // window created by platformInit
+
+    WGPUSurfaceDescriptor surfDesc = {};
+    surfDesc.nextInChain = (WGPUChainedStruct*)&hwndSrc;  // the HWND source is supplied through the descriptor chain
+    return wgpuInstanceCreateSurface(instance, &surfDesc);
+}
+
+double platformGetTime() {  // seconds elapsed since platformInit captured sStartTime
+    LARGE_INTEGER now;
+    QueryPerformanceCounter(&now);
+    return (double)(now.QuadPart - sStartTime.QuadPart) / (double)sFreq.QuadPart;  // elapsed ticks / ticks per second
+}
+
+void platformRunLoop(void (*render)(), void (*shutdown)()) {  // frame loop; calls shutdown() once, then destroys the window
+    // Frame budget in seconds: ~16.67 ms per frame (60 fps)
+    static const double kFrameTime = 1.0 / 60.0;
+
+    sRunning = true;
+    while (sRunning) {
+        double frameStart = platformGetTime();
+
+        // Drain the Win32 message queue without blocking (PeekMessage returns immediately when empty)
+        MSG msg;
+        while (PeekMessageA(&msg, nullptr, 0, 0, PM_REMOVE)) {
+            if (msg.message == WM_QUIT) { sRunning = false; break; }  // posted by wndProc via PostQuitMessage
+            TranslateMessage(&msg);
+            DispatchMessageA(&msg);  // ends up in wndProc, which may clear sRunning
+        }
+
+        if (sRunning) render();  // skip the frame if a message just requested exit
+
+        // Sleep for whatever is left of the frame budget after messages + render
+        double elapsed = platformGetTime() - frameStart;
+        if (elapsed < kFrameTime) {
+            DWORD ms = (DWORD)((kFrameTime - elapsed) * 1000.0);  // truncated to whole milliseconds
+            if (ms > 0) Sleep(ms);
+        }
+    }
+
+    shutdown();  // application callback runs before the window is destroyed
+    if (sHwnd) DestroyWindow(sHwnd);
+}
diff --git a/examples/WebGPUDemo/spinning_triangle.cpp b/examples/WebGPUDemo/spinning_triangle.cpp
new file mode 100644
index 0000000000..f5c2660aec
--- /dev/null
+++ b/examples/WebGPUDemo/spinning_triangle.cpp
@@ -0,0 +1,364 @@
+// spinning_triangle.cpp — platform-agnostic WebGPU spinning triangle demo.
+
+#include "platform/platform.h"
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <webgpu/webgpu.h>
+
+#include <tracy/Tracy.hpp>
+#include <tracy/TracyWebGPU.hpp>
+
+#ifndef __TRACYWEBGPU_HPP__
+#define TracyWebGPUCtx void*
+#define TracyWebGPUSetupDevice(...)
+#define TracyWebGPUContext(...) nullptr
+#define TracyWebGPUContextName(...)
+#define TracyWebGPUNamedZone(...)
+#define TracyWebGPUCollect(...)
+#define TracyWebGPUDestroy(...)
+namespace tracy { struct WebGPUQueueCtx { static void SetupDevice(WGPUDeviceDescriptor) {} }; }
+#endif//__TRACYWEBGPU_HPP__
+
+
+// ---------------------------------------------------------------------------
+// Globals
+// ---------------------------------------------------------------------------
+
+static const int kWidth  = 800;
+static const int kHeight = 600;
+
+static WGPUInstance       gInstance   = nullptr;
+static WGPUSurface        gSurface    = nullptr;
+static WGPUAdapter        gAdapter    = nullptr;
+static WGPUDevice         gDevice     = nullptr;
+static WGPUQueue          gQueue      = nullptr;
+static WGPURenderPipeline gPipeline   = nullptr;
+static WGPUBuffer         gUniformBuf = nullptr;
+static WGPUBindGroup      gBindGroup  = nullptr;
+
+static TracyWebGPUCtx     gTracyCtx   = nullptr;
+
+static WGPUTextureFormat gSurfaceFormat = WGPUTextureFormat_BGRA8Unorm;
+
+// Prints `message` and `code` to stderr and returns `code`, so callers can write `return error(...)`. TODO: this can become platformError() instead
+int error(int code, const char* message) {
+    fprintf(stderr, "ERROR: %s (code: %d)\n", message, code);
+    return code;
+}
+
+// ---------------------------------------------------------------------------
+// WGSL shader — vertex colours baked in, rotation via a uniform float.
+// ---------------------------------------------------------------------------
+
+static const char* kShaderSource = R"(
+struct Uniforms {
+    angle: f32,
+};
+@group(0) @binding(0) var<uniform> u: Uniforms;
+
+struct VSOut {
+    @builtin(position) pos: vec4f,
+    @location(0) color: vec3f,
+};
+
+@vertex
+fn vs_main(@builtin(vertex_index) vi: u32) -> VSOut {
+    var positions = array<vec2f, 3>(
+        vec2f( 0.0,  0.5),
+        vec2f(-0.433, -0.25),
+        vec2f( 0.433, -0.25),
+    );
+    var colors = array<vec3f, 3>(
+        vec3f(1.0, 0.0, 0.0),
+        vec3f(0.0, 1.0, 0.0),
+        vec3f(0.0, 0.0, 1.0),
+    );
+
+    let c = cos(u.angle);
+    let s = sin(u.angle);
+    let p = positions[vi];
+    let rotated = vec2f(p.x * c - p.y * s, p.x * s + p.y * c);
+
+    var out: VSOut;
+    out.pos   = vec4f(rotated, 0.0, 1.0);
+    out.color = colors[vi];
+    return out;
+}
+
+@fragment
+fn fs_main(@location(0) color: vec3f) -> @location(0) vec4f {
+    return vec4f(color, 1.0);
+}
+)";
+
+// ---------------------------------------------------------------------------
+// Adapter / Device request callbacks  (current wgpu-native API)
+// ---------------------------------------------------------------------------
+
+// Request callbacks: userdata1 receives the handle on success; userdata2 (optional bool*) is set on any outcome.
+static void onAdapterReady(WGPURequestAdapterStatus status, WGPUAdapter adapter,
+                           WGPUStringView message, void* userdata1, void* userdata2) {
+    if (status == WGPURequestAdapterStatus_Success) {
+        *(WGPUAdapter*)userdata1 = adapter;
+    } else {
+        fprintf(stderr, "Adapter request failed: %.*s\n", (int)message.length, message.data);
+    }
+    if (userdata2) *(bool*)userdata2 = true;  // completion flag, raised on failure too
+}
+
+static void onDeviceReady(WGPURequestDeviceStatus status, WGPUDevice device,
+                          WGPUStringView message, void* userdata1, void* userdata2) {
+    if (status == WGPURequestDeviceStatus_Success) {
+        *(WGPUDevice*)userdata1 = device;
+    } else {
+        fprintf(stderr, "Device request failed: %.*s\n", (int)message.length, message.data);
+    }
+    if (userdata2) *(bool*)userdata2 = true;  // completion flag, raised on failure too
+}
+
+// ---------------------------------------------------------------------------
+// WebGPU init
+// ---------------------------------------------------------------------------
+
+// Creates the adapter, device, queue, Tracy context, surface configuration and render pipeline.
+// Returns 0 on success, or the non-zero code passed to error() when no adapter/device is obtained.
+static int initWebGPU() {
+    bool adapterDone = false, deviceDone = false;  // raised by the request callbacks on any outcome
+    WGPURequestAdapterOptions adapterOpts = {};
+    adapterOpts.compatibleSurface = gSurface;
+
+    WGPURequestAdapterCallbackInfo adapterCB = {};
+    adapterCB.mode     = WGPUCallbackMode_AllowProcessEvents;
+    adapterCB.callback  = onAdapterReady;
+    adapterCB.userdata1 = &gAdapter;
+    adapterCB.userdata2 = &adapterDone;
+    wgpuInstanceRequestAdapter(gInstance, &adapterOpts, adapterCB);
+    while (!adapterDone) { wgpuInstanceProcessEvents(gInstance); }  // wait on the flag: a failed request never sets gAdapter
+    if (!gAdapter) return error(11, "No adapter");
+
+    WGPUUncapturedErrorCallbackInfo errorCB = {};
+    errorCB.callback = [](WGPUDevice const*, WGPUErrorType type,
+                          WGPUStringView message, void*, void*) {
+        fprintf(stderr, "[WGPU ERROR] type=%d  %.*s\n", (int)type, (int)message.length, message.data);
+    };
+
+    WGPUDeviceDescriptor deviceDesc = {};
+    deviceDesc.uncapturedErrorCallbackInfo = errorCB;
+
+    TracyWebGPUSetupDevice(deviceDesc);
+
+    WGPURequestDeviceCallbackInfo deviceCB = {};
+    deviceCB.mode      = WGPUCallbackMode_AllowProcessEvents;
+    deviceCB.callback  = onDeviceReady;
+    deviceCB.userdata1 = &gDevice;
+    deviceCB.userdata2 = &deviceDone;
+    wgpuAdapterRequestDevice(gAdapter, &deviceDesc, deviceCB);
+    while (!deviceDone) { wgpuInstanceProcessEvents(gInstance); }  // wait on the flag: a failed request never sets gDevice
+    if (!gDevice) return error(12, "No device");
+
+    gQueue = wgpuDeviceGetQueue(gDevice);
+    gTracyCtx = TracyWebGPUContext(gInstance, gDevice, gQueue);
+    TracyWebGPUContextName(gTracyCtx, "WebGPU", 6);
+
+    // Configure surface
+    WGPUSurfaceConfiguration config = {};
+    config.device      = gDevice;
+    config.format      = gSurfaceFormat;
+    config.usage       = WGPUTextureUsage_RenderAttachment;
+    config.alphaMode   = WGPUCompositeAlphaMode_Opaque;
+    config.width       = kWidth;
+    config.height      = kHeight;
+    config.presentMode = WGPUPresentMode_Fifo;
+    wgpuSurfaceConfigure(gSurface, &config);
+
+    // Shader module
+    WGPUShaderSourceWGSL wgslSrc = {};
+    wgslSrc.chain.sType = WGPUSType_ShaderSourceWGSL;
+    wgslSrc.code = { kShaderSource, WGPU_STRLEN };
+
+    WGPUShaderModuleDescriptor smDesc = {};
+    smDesc.nextInChain = (WGPUChainedStruct*)&wgslSrc;
+    WGPUShaderModule shaderMod = wgpuDeviceCreateShaderModule(gDevice, &smDesc);
+
+    // Uniform buffer (one f32 for rotation angle)
+    WGPUBufferDescriptor bufDesc = {};
+    bufDesc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+    bufDesc.size  = sizeof(float);
+    gUniformBuf = wgpuDeviceCreateBuffer(gDevice, &bufDesc);
+
+    // Bind group layout + bind group
+    WGPUBindGroupLayoutEntry bglEntry = {};
+    bglEntry.binding    = 0;
+    bglEntry.visibility = WGPUShaderStage_Vertex;
+    bglEntry.buffer.type            = WGPUBufferBindingType_Uniform;
+    bglEntry.buffer.minBindingSize  = sizeof(float);
+
+    WGPUBindGroupLayoutDescriptor bglDesc = {};
+    bglDesc.entryCount = 1;
+    bglDesc.entries    = &bglEntry;
+    WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(gDevice, &bglDesc);
+
+    WGPUBindGroupEntry bgEntry = {};
+    bgEntry.binding = 0;
+    bgEntry.buffer  = gUniformBuf;
+    bgEntry.size    = sizeof(float);
+
+    WGPUBindGroupDescriptor bgDesc = {};
+    bgDesc.layout     = bgl;
+    bgDesc.entryCount = 1;
+    bgDesc.entries    = &bgEntry;
+    gBindGroup = wgpuDeviceCreateBindGroup(gDevice, &bgDesc);
+
+    // Pipeline layout
+    WGPUPipelineLayoutDescriptor plDesc = {};
+    plDesc.bindGroupLayoutCount = 1;
+    plDesc.bindGroupLayouts     = &bgl;
+    WGPUPipelineLayout pipelineLayout = wgpuDeviceCreatePipelineLayout(gDevice, &plDesc);
+
+    // Render pipeline
+    WGPUColorTargetState colorTarget = {};
+    colorTarget.format    = gSurfaceFormat;
+    colorTarget.writeMask = WGPUColorWriteMask_All;
+
+    WGPUFragmentState fragState = {};
+    fragState.module      = shaderMod;
+    fragState.entryPoint  = { "fs_main", WGPU_STRLEN };
+    fragState.targetCount = 1;
+    fragState.targets     = &colorTarget;
+
+    WGPURenderPipelineDescriptor rpDesc = {};
+    rpDesc.layout = pipelineLayout;
+    rpDesc.vertex.module     = shaderMod;
+    rpDesc.vertex.entryPoint = { "vs_main", WGPU_STRLEN };
+    rpDesc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
+    rpDesc.multisample.count  = 1;
+    rpDesc.multisample.mask   = 0xFFFFFFFF;
+    rpDesc.fragment = &fragState;
+
+    gPipeline = wgpuDeviceCreateRenderPipeline(gDevice, &rpDesc);
+
+    // Cleanup intermediates
+    wgpuShaderModuleRelease(shaderMod);
+    wgpuPipelineLayoutRelease(pipelineLayout);
+    wgpuBindGroupLayoutRelease(bgl);
+    return 0;
+}
+
+// ---------------------------------------------------------------------------
+// Frame rendering
+// ---------------------------------------------------------------------------
+
+// Returns the surface texture for the current frame, or {.texture=nullptr} on
+// a skippable condition (timeout, occlusion) or an error. On success the caller releases the texture.
+static WGPUSurfaceTexture getWindowSurface() {
+    WGPUSurfaceTexture surfTex = {};
+    wgpuSurfaceGetCurrentTexture(gSurface, &surfTex);
+    if (surfTex.status == WGPUSurfaceGetCurrentTextureStatus_SuccessOptimal ||
+        surfTex.status == WGPUSurfaceGetCurrentTextureStatus_SuccessSuboptimal)
+        return surfTex;  // both success flavours are rendered to as-is
+
+    // Timeout and Occluded are normal OS events (window covered / on a different Space), so they are not logged.
+    bool silent = surfTex.status == WGPUSurfaceGetCurrentTextureStatus_Timeout;
+#ifdef WGPU_H_  // NOTE(review): looks like the wgpu-native wgpu.h include guard, i.e. Occluded exists only there — confirm
+    silent = silent || surfTex.status == (WGPUSurfaceGetCurrentTextureStatus)WGPUSurfaceGetCurrentTextureStatus_Occluded;
+#endif
+    if (!silent)
+        fprintf(stderr, "Failed to get surface texture (status %d)\n", surfTex.status);
+    if (surfTex.texture) wgpuTextureRelease(surfTex.texture);  // drop any texture handed back alongside a failure status
+    surfTex.texture = nullptr;  // null texture is the "skip this frame" signal renderFrame checks
+    return surfTex;
+}
+
+static void renderFrame() {  // renders and presents one frame; returns early when no surface texture is available
+    ZoneScoped;
+
+    // Update rotation angle: elapsed seconds are used directly as the angle the shader feeds to cos()/sin()
+    float angle = (float)platformGetTime();
+    wgpuQueueWriteBuffer(gQueue, gUniformBuf, 0, &angle, sizeof(float));
+
+    WGPUSurfaceTexture surfTex = getWindowSurface();
+    if (!surfTex.texture) return;  // nothing acquired yet, so there is nothing to release
+
+    WGPUTextureView view = wgpuTextureCreateView(surfTex.texture, nullptr);
+
+    // Command encoder
+    WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(gDevice, nullptr);
+
+    // Render pass: a single colour attachment that is cleared, drawn into and stored
+    WGPURenderPassColorAttachment colorAtt = {};
+    colorAtt.view       = view;
+    colorAtt.loadOp     = WGPULoadOp_Clear;
+    colorAtt.storeOp    = WGPUStoreOp_Store;
+    colorAtt.clearValue  = { 0.05, 0.05, 0.08, 1.0 };
+    colorAtt.depthSlice  = WGPU_DEPTH_SLICE_UNDEFINED;
+
+    WGPURenderPassDescriptor passDesc = {};
+    passDesc.colorAttachmentCount = 1;
+    passDesc.colorAttachments     = &colorAtt;
+
+    {
+        ZoneScopedN("render-pass");
+        TracyWebGPUNamedZone(gTracyCtx, tracyZone, encoder, passDesc, "triangle draw", true);  // NOTE(review): argument list must match the macro in TracyWebGPU.hpp — confirm
+        WGPURenderPassEncoder pass = wgpuCommandEncoderBeginRenderPass(encoder, &passDesc);
+        wgpuRenderPassEncoderSetPipeline(pass, gPipeline);
+        wgpuRenderPassEncoderSetBindGroup(pass, 0, gBindGroup, 0, nullptr);
+        wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);  // 3 vertices, 1 instance: positions come from the shader's constant array
+        wgpuRenderPassEncoderEnd(pass);
+        wgpuRenderPassEncoderRelease(pass);
+    }
+
+    // Submit
+    WGPUCommandBuffer cmdBuf = wgpuCommandEncoderFinish(encoder, nullptr);
+    wgpuQueueSubmit(gQueue, 1, &cmdBuf);
+
+    // Present
+    wgpuSurfacePresent(gSurface);
+
+    // Process Events
+    wgpuInstanceProcessEvents(gInstance);
+    TracyWebGPUCollect(gTracyCtx);  // presumably gathers finished GPU zone data — see TracyWebGPU.hpp
+
+    // Cleanup: release every per-frame object created above
+    wgpuCommandBufferRelease(cmdBuf);
+    wgpuCommandEncoderRelease(encoder);
+    wgpuTextureViewRelease(view);
+    wgpuTextureRelease(surfTex.texture);
+}
+
+// ---------------------------------------------------------------------------
+// Shutdown
+// ---------------------------------------------------------------------------
+
+static void shutdown() {  // passed to platformRunLoop; releases everything initWebGPU and main created
+    fprintf(stderr, "application is shutting down...\n");
+    TracyWebGPUDestroy(gTracyCtx);  // first, while the instance/device/queue it was created from still exist
+    if (gBindGroup)  wgpuBindGroupRelease(gBindGroup);
+    if (gUniformBuf) wgpuBufferRelease(gUniformBuf);
+    if (gPipeline)   wgpuRenderPipelineRelease(gPipeline);
+    if (gQueue)      wgpuQueueRelease(gQueue);
+    if (gDevice)     wgpuDeviceRelease(gDevice);
+    if (gAdapter)    wgpuAdapterRelease(gAdapter);
+    if (gSurface)    wgpuSurfaceRelease(gSurface);
+    if (gInstance)   wgpuInstanceRelease(gInstance);  // instance last: it was created first
+}
+
+// ---------------------------------------------------------------------------
+// main
+// ---------------------------------------------------------------------------
+
+int main(int argc, char* argv[]) {  // exit codes: 0 ok, 1 platform init, 2 instance, 3 surface, 4 WebGPU init
+    if (!platformInit(kWidth, kHeight, "WebGPU Spinning Triangle"))
+        return 1;
+
+    gInstance = wgpuCreateInstance(nullptr);
+    if (!gInstance) return error(2, "Failed to create WebGPU instance.");
+
+    gSurface = platformCreateSurface(gInstance);  // the surface is created before the adapter so initWebGPU can request a compatible one
+    if (!gSurface) return error(3, "Failed to create surface.");
+
+    if (initWebGPU() != 0) return 4;  // adapter/device failures are already printed by initWebGPU via error()
+
+    platformRunLoop(renderFrame, shutdown);  // returns once the loop has stopped and shutdown() has run
+    return 0;
+}

From c47f69a694c790683cfdb24dfe1469c5630eba92 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sun, 24 May 2026 13:41:59 -0700
Subject: [PATCH 16/21] build fix

---
 examples/WebGPUDemo/platform/platform_windows.cpp | 4 ++++
 public/tracy/TracyWebGPU.hpp                      | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/WebGPUDemo/platform/platform_windows.cpp b/examples/WebGPUDemo/platform/platform_windows.cpp
index 80ed4c7f3d..153531ca75 100644
--- a/examples/WebGPUDemo/platform/platform_windows.cpp
+++ b/examples/WebGPUDemo/platform/platform_windows.cpp
@@ -15,10 +15,14 @@
 #endif
 #include <windows.h>
 #include <webgpu/webgpu.h>
+#include <stdio.h>
 #include "platform.h"
 
 #pragma comment(lib, "user32.lib")
 #pragma comment(lib, "gdi32.lib")
+#pragma comment(lib, "dxguid.lib")    // Dawn: WKPDID_D3DDebugObjectName
+#pragma comment(lib, "OneCore")       // Dawn: CompareObjectHandles
+#pragma comment(lib, "ntdll.lib")     // wgpu-native: NtReadFile et al.
 
 static HWND   sHwnd      = nullptr;
 static bool   sRunning   = false;
diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index 8ca03175bd..0f35c4edf7 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -74,6 +74,7 @@ using TracyWebGPUCtx = void*;
 #if TRACY_WEBGPU_DEBUG_LEVEL
 #define TracyWebGPUDebug(...) __VA_ARGS__;
 #if defined(_MSC_VER)
+extern "C" int32_t IsDebuggerPresent(void);
 #define TracyWebGPUBreak() if (IsDebuggerPresent()) __debugbreak()
 #else
 #define TracyWebGPUBreak() ((void)0)
@@ -226,7 +227,7 @@ namespace tracy
                 // incremental regression:
                 cpuToGpuModel.Update(tcpu, tgpu);
                 wallToGpuModel.Update(twall, tgpu);
-                fprintf(stderr, "----- (sample accepted! wall = %lld | cpu = %lld | gpu = %lld | period = %f)\n", twall, tcpu, tgpu, Period());
+                TracyWebGPUDebug( fprintf(stderr, "----- (sample accepted! wall = %lld | cpu = %lld | gpu = %lld | period = %f)\n", twall, tcpu, tgpu, Period()) );
                 return true;
             }
         } m_calibration;

From 4094c89ef679037c7d1ab64d1f3cf5c1d78c867c Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sun, 24 May 2026 16:28:03 -0700
Subject: [PATCH 17/21] fix Linux build

---
 examples/WebGPUDemo/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/WebGPUDemo/CMakeLists.txt b/examples/WebGPUDemo/CMakeLists.txt
index 638d337f19..ac901d150b 100644
--- a/examples/WebGPUDemo/CMakeLists.txt
+++ b/examples/WebGPUDemo/CMakeLists.txt
@@ -20,7 +20,7 @@
 #         -lwayland-client -o spinning_triangle
 
 cmake_minimum_required(VERSION 3.16)
-project(spinning_triangle LANGUAGES CXX)
+project(spinning_triangle LANGUAGES C CXX)
 
 # ---------------------------------------------------------------------------
 # WebGPU backend — set WGPU_PATH to your wgpu-native or Dawn installation.
@@ -77,7 +77,7 @@ else()
     # Linux / Wayland — also needs the generated xdg-shell protocol glue.
     set(PLATFORM_SOURCES
         platform/platform_wayland.cpp
-        xdg-shell-protocol.c
+        platform/xdg-shell-protocol.c
     )
     set(PLATFORM_LIBS wayland-client)
 endif()

From dd7060a4d9d7fd8eef7289df4914ac094930d5ce Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sun, 24 May 2026 18:16:58 -0700
Subject: [PATCH 18/21] Wayland woes...

---
 examples/WebGPUDemo/CMakeLists.txt | 33 ++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/examples/WebGPUDemo/CMakeLists.txt b/examples/WebGPUDemo/CMakeLists.txt
index ac901d150b..06fd7526a4 100644
--- a/examples/WebGPUDemo/CMakeLists.txt
+++ b/examples/WebGPUDemo/CMakeLists.txt
@@ -57,6 +57,8 @@ endif()
 # ---------------------------------------------------------------------------
 # Platform-specific source and link settings
 # ---------------------------------------------------------------------------
+set(PLATFORM_GENERATED_INCLUDES "")
+
 if(APPLE)
     set(PLATFORM_SOURCES platform/platform_macos.mm)
     set(PLATFORM_LIBS
@@ -74,12 +76,38 @@ elseif(WIN32)
     set(PLATFORM_SOURCES platform/platform_windows.cpp)
     set(PLATFORM_LIBS user32 gdi32)
 else()
-    # Linux / Wayland — also needs the generated xdg-shell protocol glue.
+    # Linux / Wayland — generate xdg-shell protocol glue via wayland-scanner.
+    find_package(PkgConfig REQUIRED)
+    pkg_check_modules(WAYLAND_PROTOCOLS REQUIRED wayland-protocols)
+    pkg_get_variable(WAYLAND_PROTOCOLS_DIR wayland-protocols pkgdatadir)
+    # find_program(... REQUIRED) needs CMake >= 3.18, but this file declares 3.16
+    # as its minimum, so the result is checked explicitly instead.
+    find_program(WAYLAND_SCANNER wayland-scanner)
+    if(NOT WAYLAND_SCANNER)
+        message(FATAL_ERROR "wayland-scanner not found; install the Wayland development tools.")
+    endif()
+
+    set(XDG_SHELL_XML "${WAYLAND_PROTOCOLS_DIR}/stable/xdg-shell/xdg-shell.xml")
+    set(XDG_SHELL_H   "${CMAKE_CURRENT_BINARY_DIR}/xdg-shell-client-protocol.h")
+    set(XDG_SHELL_C   "${CMAKE_CURRENT_BINARY_DIR}/xdg-shell-protocol.c")
+
+    # One rule produces both the client header and the private glue code from the same XML.
+    add_custom_command(
+        OUTPUT  "${XDG_SHELL_H}" "${XDG_SHELL_C}"
+        COMMAND "${WAYLAND_SCANNER}" client-header "${XDG_SHELL_XML}" "${XDG_SHELL_H}"
+        COMMAND "${WAYLAND_SCANNER}" private-code  "${XDG_SHELL_XML}" "${XDG_SHELL_C}"
+        DEPENDS "${XDG_SHELL_XML}"
+        COMMENT "Generating xdg-shell protocol glue"
+        VERBATIM
+    )
+
     set(PLATFORM_SOURCES
         platform/platform_wayland.cpp
-        platform/xdg-shell-protocol.c
+        "${XDG_SHELL_C}"
+        "${XDG_SHELL_H}"
     )
     set(PLATFORM_LIBS wayland-client)
+    set(PLATFORM_GENERATED_INCLUDES "${CMAKE_CURRENT_BINARY_DIR}")
 endif()
 
 # ---------------------------------------------------------------------------
@@ -112,6 +140,7 @@ endif()
 target_include_directories(spinning_triangle PRIVATE
     "${WGPU_PATH}/include"
     "${TRACY_DIR}/public"
+    ${PLATFORM_GENERATED_INCLUDES}
 )
 
 target_link_directories(spinning_triangle PRIVATE "${WGPU_PATH}/lib")

From 5597b79f1f6184f1bf1609af41dc9be2a4e5b902 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sat, 6 Jun 2026 12:22:19 -0700
Subject: [PATCH 19/21] relocating webgpu example

---
 examples/{WebGPUDemo => webgpu/triangle}/CMakeLists.txt         | 2 +-
 examples/{WebGPUDemo => webgpu/triangle}/platform/platform.h    | 0
 .../{WebGPUDemo => webgpu/triangle}/platform/platform_macos.mm  | 0
 .../triangle}/platform/platform_wayland.cpp                     | 0
 .../triangle}/platform/platform_windows.cpp                     | 0
 examples/{WebGPUDemo => webgpu/triangle}/spinning_triangle.cpp  | 0
 6 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/{WebGPUDemo => webgpu/triangle}/CMakeLists.txt (99%)
 rename examples/{WebGPUDemo => webgpu/triangle}/platform/platform.h (100%)
 rename examples/{WebGPUDemo => webgpu/triangle}/platform/platform_macos.mm (100%)
 rename examples/{WebGPUDemo => webgpu/triangle}/platform/platform_wayland.cpp (100%)
 rename examples/{WebGPUDemo => webgpu/triangle}/platform/platform_windows.cpp (100%)
 rename examples/{WebGPUDemo => webgpu/triangle}/spinning_triangle.cpp (100%)

diff --git a/examples/WebGPUDemo/CMakeLists.txt b/examples/webgpu/triangle/CMakeLists.txt
similarity index 99%
rename from examples/WebGPUDemo/CMakeLists.txt
rename to examples/webgpu/triangle/CMakeLists.txt
index 06fd7526a4..f288580527 100644
--- a/examples/WebGPUDemo/CMakeLists.txt
+++ b/examples/webgpu/triangle/CMakeLists.txt
@@ -38,7 +38,7 @@ endif()
 # ---------------------------------------------------------------------------
 # Tracy root — defaults to two directories above this CMakeLists.txt.
 # ---------------------------------------------------------------------------
-set(TRACY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
+set(TRACY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
 option(TRACY_ENABLE "Enable Tracy profiling" ON)
 
 # ---------------------------------------------------------------------------
diff --git a/examples/WebGPUDemo/platform/platform.h b/examples/webgpu/triangle/platform/platform.h
similarity index 100%
rename from examples/WebGPUDemo/platform/platform.h
rename to examples/webgpu/triangle/platform/platform.h
diff --git a/examples/WebGPUDemo/platform/platform_macos.mm b/examples/webgpu/triangle/platform/platform_macos.mm
similarity index 100%
rename from examples/WebGPUDemo/platform/platform_macos.mm
rename to examples/webgpu/triangle/platform/platform_macos.mm
diff --git a/examples/WebGPUDemo/platform/platform_wayland.cpp b/examples/webgpu/triangle/platform/platform_wayland.cpp
similarity index 100%
rename from examples/WebGPUDemo/platform/platform_wayland.cpp
rename to examples/webgpu/triangle/platform/platform_wayland.cpp
diff --git a/examples/WebGPUDemo/platform/platform_windows.cpp b/examples/webgpu/triangle/platform/platform_windows.cpp
similarity index 100%
rename from examples/WebGPUDemo/platform/platform_windows.cpp
rename to examples/webgpu/triangle/platform/platform_windows.cpp
diff --git a/examples/WebGPUDemo/spinning_triangle.cpp b/examples/webgpu/triangle/spinning_triangle.cpp
similarity index 100%
rename from examples/WebGPUDemo/spinning_triangle.cpp
rename to examples/webgpu/triangle/spinning_triangle.cpp

From cf81651ea163f926b396f7a5b0e80293d00c8ee5 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sat, 6 Jun 2026 12:22:39 -0700
Subject: [PATCH 20/21] fixing webgpu lib linkage based on WGPU_PATH

---
 examples/webgpu/triangle/CMakeLists.txt | 29 ++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/examples/webgpu/triangle/CMakeLists.txt b/examples/webgpu/triangle/CMakeLists.txt
index f288580527..4714573660 100644
--- a/examples/webgpu/triangle/CMakeLists.txt
+++ b/examples/webgpu/triangle/CMakeLists.txt
@@ -29,12 +29,39 @@ project(spinning_triangle LANGUAGES C CXX)
 #   Dawn         →  webgpu_dawn
 # ---------------------------------------------------------------------------
 set(WGPU_PATH "" CACHE PATH "Root of the WebGPU native installation (contains include/ and lib/)")
-set(WGPU_LIB  "webgpu_dawn" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn)")
+set(WGPU_LIB  "" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn); auto-detected if empty")
 
 if(NOT WGPU_PATH)
     message(FATAL_ERROR "Set WGPU_PATH to the root of your WebGPU native installation.")
 endif()
 
+# When WGPU_PATH changes, discard any previously auto-detected WGPU_LIB so
+# detection re-runs against the new path.
+if(NOT "${WGPU_PATH}" STREQUAL "${_WGPU_PATH_LAST}" AND _WGPU_LIB_AUTO)
+    unset(WGPU_LIB CACHE)
+    set(WGPU_LIB "" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn); auto-detected if empty")
+endif()
+set(_WGPU_PATH_LAST "${WGPU_PATH}" CACHE INTERNAL "")
+
+if(NOT WGPU_LIB)
+    # Always unset the find_library results so they re-probe the current WGPU_PATH.
+    unset(_WGPU_NATIVE_LIB CACHE)
+    unset(_WEBGPU_DAWN_LIB CACHE)
+    find_library(_WGPU_NATIVE_LIB NAMES wgpu_native wgpu_native.dll PATHS "${WGPU_PATH}/lib" NO_DEFAULT_PATH)
+    find_library(_WEBGPU_DAWN_LIB NAMES webgpu_dawn                 PATHS "${WGPU_PATH}/lib" NO_DEFAULT_PATH)
+    if(_WGPU_NATIVE_LIB)
+        set(WGPU_LIB "wgpu_native" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn); auto-detected if empty" FORCE)
+    elseif(_WEBGPU_DAWN_LIB)
+        set(WGPU_LIB "webgpu_dawn" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn); auto-detected if empty" FORCE)
+    else()
+        message(FATAL_ERROR "Could not detect a WebGPU library in ${WGPU_PATH}/lib. Set WGPU_LIB explicitly (wgpu_native or webgpu_dawn).")
+    endif()
+    set(_WGPU_LIB_AUTO TRUE CACHE INTERNAL "")
+    message(STATUS "WebGPU library auto-detected: ${WGPU_LIB}")
+else()
+    set(_WGPU_LIB_AUTO FALSE CACHE INTERNAL "")
+endif()
+
 # ---------------------------------------------------------------------------
 # Tracy root — defaults to two directories above this CMakeLists.txt.
 # ---------------------------------------------------------------------------

From dd13487b867250c8be333bb77950f177c482d055 Mon Sep 17 00:00:00 2001
From: Marcos Slomp <mslomp@gmail.com>
Date: Sat, 6 Jun 2026 12:22:49 -0700
Subject: [PATCH 21/21] fixing MemWrite casts

---
 public/tracy/TracyWebGPU.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/public/tracy/TracyWebGPU.hpp b/public/tracy/TracyWebGPU.hpp
index 0f35c4edf7..6a4c4071fb 100644
--- a/public/tracy/TracyWebGPU.hpp
+++ b/public/tracy/TracyWebGPU.hpp
@@ -525,8 +525,8 @@ namespace tracy
             MemWrite(&item->gpuNewContext.thread, static_cast<uint32_t>(0));
             MemWrite(&item->gpuNewContext.period, static_cast<float>(period));
             MemWrite(&item->gpuNewContext.context, static_cast<uint8_t>(GetId()));
-            MemWrite(&item->gpuNewContext.flags, static_cast<uint8_t>(0));  // no calibration available
-            MemWrite(&item->gpuNewContext.type, static_cast<uint8_t>(GpuContextType::WebGPU));
+            MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0));  // no calibration available
+            MemWrite(&item->gpuNewContext.type, GpuContextType::WebGPU);
             SubmitQueueItem(item);
         }