diff --git a/.gitignore b/.gitignore
index 683f6f0a6ef..b9f3996b51f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -418,3 +418,4 @@ FodyWeavers.xsd
 # JetBrains
 .idea
 cmake-build-*
+nul
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b07dcea87d8..44ad16dc3b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1034,6 +1034,8 @@ set(VIDEO_CORE src/video_core/amdgpu/cb_db_extent.h
     src/video_core/renderer_vulkan/vk_resource_pool.h
     src/video_core/renderer_vulkan/vk_scheduler.cpp
     src/video_core/renderer_vulkan/vk_scheduler.h
+    src/video_core/renderer_vulkan/vk_compute_scheduler.cpp
+    src/video_core/renderer_vulkan/vk_compute_scheduler.h
     src/video_core/renderer_vulkan/vk_shader_hle.cpp
     src/video_core/renderer_vulkan/vk_shader_hle.h
     src/video_core/renderer_vulkan/vk_shader_util.cpp
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 41a0f71c71d..6b1855f6d97 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -151,7 +151,7 @@ endif()
 
 # sirit
 add_subdirectory(sirit)
-if (WIN32)
+if (WIN32 AND NOT MSVC)
     target_compile_options(sirit PRIVATE "-Wno-error=unused-command-line-argument")
 endif()
diff --git a/src/common/config.cpp b/src/common/config.cpp
index fb1181d6279..c02cbb5dcd8 100644
--- a/src/common/config.cpp
+++ b/src/common/config.cpp
@@ -175,6 +175,7 @@ static ConfigEntry shouldCopyGPUBuffers(false);
 static ConfigEntry readbacksEnabled(false);
 static ConfigEntry readbackLinearImagesEnabled(false);
 static ConfigEntry directMemoryAccessEnabled(false);
+static ConfigEntry asyncComputeEnabled(true);
 static ConfigEntry shouldDumpShaders(false);
 static ConfigEntry shouldPatchShaders(false);
 static ConfigEntry vblankFrequency(60);
@@ -452,6 +453,10 @@ bool directMemoryAccess() {
     return directMemoryAccessEnabled.get();
 }
 
+bool asyncCompute() {
+    return asyncComputeEnabled.get();
+}
+
 bool dumpShaders() {
     return shouldDumpShaders.get();
 }
@@ -603,6 +608,10 @@ void setDirectMemoryAccess(bool enable, bool is_game_specific) {
     directMemoryAccessEnabled.set(enable, is_game_specific);
 }
 
+void setAsyncCompute(bool enable, bool is_game_specific) {
+    asyncComputeEnabled.set(enable, is_game_specific);
+}
+
 void setDumpShaders(bool enable, bool is_game_specific) {
     shouldDumpShaders.set(enable, is_game_specific);
 }
diff --git a/src/common/config.h b/src/common/config.h
index eb2b91f523f..60465ce1f29 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -69,6 +69,8 @@ bool readbackLinearImages();
 void setReadbackLinearImages(bool enable, bool is_game_specific = false);
 bool directMemoryAccess();
 void setDirectMemoryAccess(bool enable, bool is_game_specific = false);
+bool asyncCompute();
+void setAsyncCompute(bool enable, bool is_game_specific = false);
 bool dumpShaders();
 void setDumpShaders(bool enable, bool is_game_specific = false);
 u32 vblankFreq();
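The new `Config::asyncCompute()` toggle defaults to enabled, but this section of the patch never shows the call site that consumes it. Below is a minimal sketch of how a renderer could gate the feature on both the user setting and actual hardware support; `MakeComputeScheduler` and its placement are hypothetical, and only `Config::asyncCompute()`, `Instance::HasDedicatedComputeQueue()` and `ComputeScheduler` come from this patch:

```cpp
#include <memory>

// Hypothetical factory (not part of the patch); returns nullptr when async
// compute should not be used, in which case the caller keeps dispatching on
// the graphics queue as before.
std::unique_ptr<Vulkan::ComputeScheduler> MakeComputeScheduler(
    const Vulkan::Instance& instance) {
    if (!Config::asyncCompute() || !instance.HasDedicatedComputeQueue()) {
        return nullptr;
    }
    return std::make_unique<Vulkan::ComputeScheduler>(instance);
}
```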
diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp
index 5ff80facd48..a0638278d29 100644
--- a/src/video_core/buffer_cache/buffer.cpp
+++ b/src/video_core/buffer_cache/buffer.cpp
@@ -1,4 +1,5 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
+#include <array>
 
 #include "common/alignment.h"
@@ -104,10 +105,20 @@ Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
                VAddr cpu_addr_, vk::BufferUsageFlags flags, u64 size_bytes_)
     : cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, scheduler{&scheduler_},
       usage{usage_}, buffer{instance->GetDevice(), instance->GetAllocator()} {
-    // Create buffer object.
+    // Check whether concurrent sharing is needed for async compute.
+    const bool has_async = instance->HasDedicatedComputeQueue();
+    std::array<u32, 2> queue_families = {
+        instance->GetGraphicsQueueFamilyIndex(),
+        instance->GetComputeQueueFamilyIndex(),
+    };
+
+    // Create the buffer object, with concurrent sharing if async compute is available.
     const vk::BufferCreateInfo buffer_ci = {
         .size = size_bytes,
         .usage = flags,
+        .sharingMode = has_async ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive,
+        .queueFamilyIndexCount = has_async ? static_cast<u32>(queue_families.size()) : 0,
+        .pQueueFamilyIndices = has_async ? queue_families.data() : nullptr,
     };
     VmaAllocationInfo alloc_info{};
     buffer.Create(buffer_ci, usage, &alloc_info);
diff --git a/src/video_core/renderer_vulkan/vk_compute_scheduler.cpp b/src/video_core/renderer_vulkan/vk_compute_scheduler.cpp
new file mode 100644
index 00000000000..09fa9781a21
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_compute_scheduler.cpp
@@ -0,0 +1,201 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "video_core/renderer_vulkan/vk_compute_scheduler.h"
+#include "video_core/renderer_vulkan/vk_instance.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+
+namespace Vulkan {
+
+ComputeScheduler::ComputeScheduler(const Instance& instance_)
+    : instance{instance_}, master_semaphore{instance},
+      command_pool{instance, &master_semaphore, instance.GetComputeQueueFamilyIndex()},
+      compute_queue{instance.GetComputeQueue()},
+      is_dedicated{instance.HasDedicatedComputeQueue()} {
+    AllocateWorkerCommandBuffers();
+    LOG_INFO(Render_Vulkan, "ComputeScheduler initialized (dedicated queue: {})", is_dedicated);
+}
+
+ComputeScheduler::~ComputeScheduler() = default;
+
+void ComputeScheduler::Flush() {
+    SubmitExecution();
+}
+
+void ComputeScheduler::Finish() {
+    const u64 presubmit_tick = CurrentTick();
+    SubmitExecution();
+    Wait(presubmit_tick);
+}
+
+void ComputeScheduler::Wait(u64 tick) {
+    if (tick >= master_semaphore.CurrentTick()) {
+        // The tick being waited on has not been submitted yet; flush so it can signal.
+        Flush();
+    }
+    master_semaphore.Wait(tick);
+}
+
+void ComputeScheduler::PopPendingOperations() {
+    master_semaphore.Refresh();
+    while (!pending_ops.empty() && master_semaphore.IsFree(pending_ops.front().gpu_tick)) {
+        pending_ops.front().callback();
+        pending_ops.pop();
+    }
+}
+
+void ComputeScheduler::WaitForGraphics(Scheduler& graphics_scheduler) {
+    if (!is_dedicated) {
+        // When sharing the graphics queue, ordinary pipeline barriers handle this.
+        graphics_scheduler.EndRendering();
+        return;
+    }
+
+    // End any active rendering.
+    graphics_scheduler.EndRendering();
+
+    // Wait for the last *submitted* graphics tick. The graphics scheduler's CurrentTick()
+    // is always the tick it is still building, so the last submitted one is CurrentTick() - 1.
+    const auto graphics_tick = graphics_scheduler.CurrentTick() - 1;
+
+    // If we have already synced with this tick or a later one, skip adding another wait.
+    if (graphics_tick <= last_graphics_sync_tick || graphics_tick == 0) {
+        return;
+    }
+
+    const auto graphics_sem = graphics_scheduler.GetTimelineSemaphore();
+
+    std::lock_guard lk{submit_mutex};
+
+    // Check if we already have a wait for this semaphore in the current batch.
+    bool already_waiting = false;
+    for (size_t i = 0; i < wait_semaphores.size(); ++i) {
+        if (wait_semaphores[i] == graphics_sem) {
+            wait_values[i] = std::max(wait_values[i], graphics_tick);
+            already_waiting = true;
+            break;
+        }
+    }
+
+    if (!already_waiting) {
+        wait_semaphores.push_back(graphics_sem);
+        wait_values.push_back(graphics_tick);
+    }
+
+    last_graphics_sync_tick = graphics_tick;
+}
+
+void ComputeScheduler::SignalGraphics(Scheduler& graphics_scheduler) {
+    if (!is_dedicated || !has_pending_work) {
+        return;
+    }
+
+    // Submit any pending compute work to the GPU. This is safe because graphics work
+    // has not been submitted yet for this draw.
+    Flush();
+
+    const auto compute_sem = master_semaphore.Handle();
+    // The tick we just submitted in Flush() is CurrentTick() - 1.
+    const auto signal_value = master_semaphore.CurrentTick() - 1;
+
+    if (signal_value > 0) {
+        // Register the wait. The graphics scheduler will apply it in its next
+        // SubmitExecution call.
+        graphics_scheduler.Wait(compute_sem, signal_value);
+    }
+}
+
+void ComputeScheduler::OnComputeDispatch(Scheduler& graphics_scheduler) {
+    // Mark that we have work that needs to be synced later.
+    has_pending_work = true;
+
+    // Baseline sync: ensure compute waits for the current graphics state if it is not
+    // already doing so.
+    WaitForGraphics(graphics_scheduler);
+}
+
+void ComputeScheduler::AllocateWorkerCommandBuffers() {
+    const vk::CommandBufferBeginInfo begin_info = {
+        .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
+    };
+
+    current_cmdbuf = command_pool.Commit();
+    Check(current_cmdbuf.begin(begin_info));
+    has_pending_work = false;
+}
+
+void ComputeScheduler::SubmitExecution() {
+    std::lock_guard lk{submit_mutex};
+
+    if (!has_pending_work && wait_semaphores.empty()) {
+        // No work to submit and no waits to process.
+        return;
+    }
+
+    // Apply a global memory barrier so compute results are visible to graphics.
+    if (has_pending_work) {
+        vk::MemoryBarrier2 memory_barrier = {
+            .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
+            .srcAccessMask =
+                vk::AccessFlagBits2::eShaderStorageWrite | vk::AccessFlagBits2::eShaderWrite,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllGraphics |
+                            vk::PipelineStageFlagBits2::eComputeShader |
+                            vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eShaderStorageRead |
+                             vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eUniformRead |
+                             vk::AccessFlagBits2::eIndexRead |
+                             vk::AccessFlagBits2::eVertexAttributeRead |
+                             vk::AccessFlagBits2::eIndirectCommandRead,
+        };
+
+        vk::DependencyInfo dependency_info = {
+            .memoryBarrierCount = 1,
+            .pMemoryBarriers = &memory_barrier,
+        };
+        current_cmdbuf.pipelineBarrier2(dependency_info);
+    }
+
+    Check(current_cmdbuf.end());
+
+    const u64 signal_value = master_semaphore.NextTick();
+    const vk::Semaphore timeline = master_semaphore.Handle();
+
+    // Build wait semaphore infos using synchronization2.
+    std::vector<vk::SemaphoreSubmitInfo> wait_infos;
+    wait_infos.reserve(wait_semaphores.size());
+    for (size_t i = 0; i < wait_semaphores.size(); ++i) {
+        wait_infos.push_back({
+            .semaphore = wait_semaphores[i],
+            .value = wait_values[i],
+            .stageMask = vk::PipelineStageFlagBits2::eComputeShader,
+        });
+    }
+
+    // Signal semaphore info.
+    const vk::SemaphoreSubmitInfo signal_info = {
+        .semaphore = timeline,
+        .value = signal_value,
+        .stageMask = vk::PipelineStageFlagBits2::eComputeShader,
+    };
+
+    // Command buffer info.
+    const vk::CommandBufferSubmitInfo cmdbuf_info = {
+        .commandBuffer = current_cmdbuf,
+    };
+
+    // Use vkQueueSubmit2 (synchronization2).
+    const vk::SubmitInfo2 submit_info = {
+        .waitSemaphoreInfoCount = static_cast<u32>(wait_infos.size()),
+        .pWaitSemaphoreInfos = wait_infos.data(),
+        .commandBufferInfoCount = 1U,
+        .pCommandBufferInfos = &cmdbuf_info,
+        .signalSemaphoreInfoCount = 1U,
+        .pSignalSemaphoreInfos = &signal_info,
+    };
+
+    auto submit_result = compute_queue.submit2(submit_info, nullptr);
+    ASSERT_MSG(submit_result != vk::Result::eErrorDeviceLost,
+               "Device lost during compute submit! signal_value={}", signal_value);
+
+    // Clear waits after submission.
+    wait_semaphores.clear();
+    wait_values.clear();
+
+    master_semaphore.Refresh();
+    AllocateWorkerCommandBuffers();
+
+    PopPendingOperations();
+}
+
+} // namespace Vulkan
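Taken together, `WaitForGraphics`/`SignalGraphics` implement a timeline-semaphore handshake: compute waits on the last submitted graphics tick before running, and graphics waits on the compute tick signalled by `Flush()` before its next submission. A sketch of how a dispatch path might drive this; the free function and its parameters are illustrative, and only the `ComputeScheduler` calls come from this patch:

```cpp
// Illustrative driver for the handshake above (not part of the patch).
void DispatchAsync(Vulkan::ComputeScheduler& compute, Vulkan::Scheduler& gfx,
                   vk::Pipeline pipeline, u32 x, u32 y, u32 z) {
    // 1. Record a wait on everything graphics has already submitted.
    compute.OnComputeDispatch(gfx);

    // 2. Record the dispatch on the dedicated compute command buffer.
    const auto cmdbuf = compute.CommandBuffer();
    cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
    cmdbuf.dispatch(x, y, z);

    // 3. Flush compute and make the graphics queue wait on its timeline
    //    value before the next draw is submitted.
    compute.SignalGraphics(gfx);
}
```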
diff --git a/src/video_core/renderer_vulkan/vk_compute_scheduler.h b/src/video_core/renderer_vulkan/vk_compute_scheduler.h
new file mode 100644
index 00000000000..b6b0289486e
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_compute_scheduler.h
@@ -0,0 +1,114 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <mutex>
+#include <queue>
+#include <vector>
+
+#include "common/unique_function.h"
+#include "video_core/renderer_vulkan/vk_master_semaphore.h"
+#include "video_core/renderer_vulkan/vk_resource_pool.h"
+
+namespace Vulkan {
+
+class Instance;
+class Scheduler;
+
+/// Scheduler for async compute operations on a dedicated compute queue.
+class ComputeScheduler {
+public:
+    explicit ComputeScheduler(const Instance& instance);
+    ~ComputeScheduler();
+
+    /// Sends the current execution context to the GPU and increments the timeline semaphore.
+    void Flush();
+
+    /// Sends the current execution context to the GPU and waits for it to complete.
+    void Finish();
+
+    /// Waits for the given tick to trigger on the GPU.
+    void Wait(u64 tick);
+
+    /// Returns the current command buffer.
+    vk::CommandBuffer CommandBuffer() const {
+        return current_cmdbuf;
+    }
+
+    /// Returns the current command buffer tick.
+    [[nodiscard]] u64 CurrentTick() const noexcept {
+        return master_semaphore.CurrentTick();
+    }
+
+    /// Returns true when a tick has been triggered by the GPU.
+    [[nodiscard]] bool IsFree(u64 tick) noexcept {
+        if (master_semaphore.IsFree(tick)) {
+            return true;
+        }
+        master_semaphore.Refresh();
+        return master_semaphore.IsFree(tick);
+    }
+
+    /// Returns the master timeline semaphore.
+    [[nodiscard]] MasterSemaphore* GetMasterSemaphore() noexcept {
+        return &master_semaphore;
+    }
+
+    /// Defers an operation until the GPU has reached the current CPU tick.
+    void DeferOperation(Common::UniqueFunction<void>&& func) {
+        pending_ops.emplace(std::move(func), CurrentTick());
+    }
+
+    /// Attempts to execute operations whose tick the GPU has caught up with.
+    void PopPendingOperations();
+
+    /// Records a wait so compute does not run ahead of submitted graphics work.
+    void WaitForGraphics(Scheduler& graphics_scheduler);
+
+    /// Signals the graphics queue that compute has finished.
+    void SignalGraphics(Scheduler& graphics_scheduler);
+
+    /// Called during dispatch to track resource dependencies and ensure synchronization.
+    void OnComputeDispatch(Scheduler& graphics_scheduler);
+
+    /// Marks that there is pending compute work.
+    void MarkPendingWork() {
+        has_pending_work = true;
+    }
+
+    /// Returns true if this scheduler uses a dedicated compute queue.
+    bool IsDedicated() const {
+        return is_dedicated;
+    }
+
+private:
+    void AllocateWorkerCommandBuffers();
+    void SubmitExecution();
+
+private:
+    const Instance& instance;
+    MasterSemaphore master_semaphore;
+    CommandPool command_pool;
+    vk::CommandBuffer current_cmdbuf;
+    vk::Queue compute_queue;
+
+    struct PendingOp {
+        Common::UniqueFunction<void> callback;
+        u64 gpu_tick;
+    };
+    std::queue<PendingOp> pending_ops;
+
+    bool is_dedicated{false};
+    bool has_pending_work{false}; ///< Whether any dispatches were recorded since the last flush.
+    std::mutex submit_mutex;
+
+    /// Last graphics tick we have synced with, to avoid redundant waits.
+    u64 last_graphics_sync_tick{0};
+
+    /// Semaphores to wait on for the next submission.
+    std::vector<vk::Semaphore> wait_semaphores;
+    std::vector<u64> wait_values;
+};
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index 44aa79d986f..148a714534e 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -343,12 +343,19 @@ bool Instance::CreateDevice() {
     }
 
     bool graphics_queue_found = false;
+    bool dedicated_compute_found = false;
    for (std::size_t i = 0; i < family_properties.size(); i++) {
         const u32 index = static_cast<u32>(i);
-        if (family_properties[i].queueFlags & vk::QueueFlagBits::eGraphics) {
+        const auto flags = family_properties[i].queueFlags;
+        if (flags & vk::QueueFlagBits::eGraphics) {
             queue_family_index = index;
             graphics_queue_found = true;
         }
+        // Look for a dedicated compute queue (has compute but NOT graphics).
+        if ((flags & vk::QueueFlagBits::eCompute) && !(flags & vk::QueueFlagBits::eGraphics)) {
+            compute_queue_family_index = index;
+            dedicated_compute_found = true;
+        }
     }
 
     if (!graphics_queue_found) {
@@ -356,12 +363,32 @@ bool Instance::CreateDevice() {
         return false;
     }
 
+    // If there is no dedicated compute queue, fall back to the graphics queue for compute.
+    if (!dedicated_compute_found) {
+        compute_queue_family_index = queue_family_index;
+        LOG_INFO(Render_Vulkan, "No dedicated compute queue found, using graphics queue for compute");
+    } else {
+        has_dedicated_compute_queue = true;
+        LOG_INFO(Render_Vulkan, "Found dedicated async compute queue at family index {}",
+                 compute_queue_family_index);
+    }
+
     static constexpr std::array queue_priorities = {1.0f};
-    const vk::DeviceQueueCreateInfo queue_info = {
+    std::vector<vk::DeviceQueueCreateInfo> queue_infos;
+    queue_infos.push_back({
         .queueFamilyIndex = queue_family_index,
         .queueCount = static_cast<u32>(queue_priorities.size()),
         .pQueuePriorities = queue_priorities.data(),
-    };
+    });
+
+    // Add a separate compute queue if available.
+    if (has_dedicated_compute_queue) {
+        queue_infos.push_back({
+            .queueFamilyIndex = compute_queue_family_index,
+            .queueCount = static_cast<u32>(queue_priorities.size()),
+            .pQueuePriorities = queue_priorities.data(),
+        });
+    }
 
     const auto topology_list_restart_features =
         feature_chain.get<vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT>();
@@ -370,8 +397,8 @@ bool Instance::CreateDevice() {
     const auto vk13_features = feature_chain.get<vk::PhysicalDeviceVulkan13Features>();
     vk::StructureChain device_chain = {
         vk::DeviceCreateInfo{
-            .queueCreateInfoCount = 1u,
-            .pQueueCreateInfos = &queue_info,
+            .queueCreateInfoCount = static_cast<u32>(queue_infos.size()),
+            .pQueueCreateInfos = queue_infos.data(),
             .enabledExtensionCount = static_cast<u32>(enabled_extensions.size()),
             .ppEnabledExtensionNames = enabled_extensions.data(),
         },
@@ -575,6 +602,7 @@ bool Instance::CreateDevice() {
 
     graphics_queue = device->getQueue(queue_family_index, 0);
     present_queue = device->getQueue(queue_family_index, 0);
+    compute_queue = device->getQueue(compute_queue_family_index, 0);
 
     if (calibrated_timestamps) {
         const auto [time_domains_result, time_domains] =
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index 8975669bb9b..e14dfd56453 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -75,6 +75,18 @@ class Instance {
         return present_queue;
     }
 
+    vk::Queue GetComputeQueue() const {
+        return compute_queue;
+    }
+
+    u32 GetComputeQueueFamilyIndex() const {
+        return compute_queue_family_index;
+    }
+
+    bool HasDedicatedComputeQueue() const {
+        return has_dedicated_compute_queue;
+    }
+
     TracyVkCtx GetProfilerContext() const {
         return profiler_context;
     }
@@ -470,11 +482,14 @@ class Instance {
     VmaAllocator allocator{};
     vk::Queue present_queue;
     vk::Queue graphics_queue;
+    vk::Queue compute_queue;
     std::vector<vk::PhysicalDevice> physical_devices;
     std::vector<std::string> available_extensions;
     std::unordered_map<vk::Format, vk::FormatProperties3> format_properties;
     TracyVkCtx profiler_context{};
     u32 queue_family_index{0};
+    u32 compute_queue_family_index{0};
+    bool has_dedicated_compute_queue{false};
     bool custom_border_color{};
     bool fragment_shader_barycentric{};
     bool amd_shader_explicit_vertex_parameter{};
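The `BindResources` overloads in the next file exist so the async path can record into the compute scheduler's command buffer and commit descriptor sets from a heap tied to the compute timeline, so sets are not recycled while the compute queue may still read them; passing a heap other than the pipeline's own `desc_heap` is also what flags the call as async and skips the same-queue buffer barriers. A sketch of the intended call shape; `AsyncDispatch` and the compute-side `DescriptorHeap` are assumptions (the heap's actual construction is not shown in this section):

```cpp
// Illustrative only: how the new overload would be driven from an async
// compute path. compute_desc_heap is an assumed per-queue heap created
// against the ComputeScheduler's MasterSemaphore.
void AsyncDispatch(const Vulkan::Pipeline& pipeline, Vulkan::ComputeScheduler& compute,
                   Vulkan::DescriptorHeap& compute_desc_heap,
                   Vulkan::Pipeline::DescriptorWrites& set_writes,
                   const Vulkan::Pipeline::BufferBarriers& barriers,
                   const Shader::PushData& push_data) {
    const auto cmdbuf = compute.CommandBuffer();
    // The heap override routes Commit() to the compute heap; its address
    // differing from the pipeline's own desc_heap marks the call as async.
    pipeline.BindResources(cmdbuf, set_writes, barriers, push_data, compute_desc_heap);
    cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.Handle());
}
```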
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_common.cpp b/src/video_core/renderer_vulkan/vk_pipeline_common.cpp
index ef1623a548f..1a2ed81a1d5 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_common.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_common.cpp
@@ -1,74 +1,92 @@
-// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <fmt/format.h>
-
-#include "shader_recompiler/resource.h"
-#include "video_core/renderer_vulkan/vk_instance.h"
-#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
-#include "video_core/renderer_vulkan/vk_pipeline_common.h"
-#include "video_core/renderer_vulkan/vk_scheduler.h"
-
-namespace Vulkan {
-
-Pipeline::Pipeline(const Instance& instance_, Scheduler& scheduler_, DescriptorHeap& desc_heap_,
-                   const Shader::Profile& profile_, vk::PipelineCache pipeline_cache,
-                   bool is_compute_ /*= false*/)
-    : instance{instance_}, scheduler{scheduler_}, desc_heap{desc_heap_}, profile{profile_},
-      is_compute{is_compute_} {}
-
-Pipeline::~Pipeline() = default;
-
-void Pipeline::BindResources(DescriptorWrites& set_writes, const BufferBarriers& buffer_barriers,
-                             const Shader::PushData& push_data) const {
-    const auto cmdbuf = scheduler.CommandBuffer();
-    const auto bind_point =
-        IsCompute() ? vk::PipelineBindPoint::eCompute : vk::PipelineBindPoint::eGraphics;
-
-    if (!buffer_barriers.empty()) {
-        const auto dependencies = vk::DependencyInfo{
-            .dependencyFlags = vk::DependencyFlagBits::eByRegion,
-            .bufferMemoryBarrierCount = u32(buffer_barriers.size()),
-            .pBufferMemoryBarriers = buffer_barriers.data(),
-        };
-        scheduler.EndRendering();
-        cmdbuf.pipelineBarrier2(dependencies);
-    }
-
-    const auto stage_flags = IsCompute() ? vk::ShaderStageFlagBits::eCompute : AllGraphicsStageBits;
-    cmdbuf.pushConstants(*pipeline_layout, stage_flags, 0u, sizeof(push_data), &push_data);
-
-    // Bind descriptor set.
-    if (set_writes.empty()) {
-        return;
-    }
-
-    if (uses_push_descriptors) {
-        cmdbuf.pushDescriptorSetKHR(bind_point, *pipeline_layout, 0, set_writes);
-        return;
-    }
-
-    const auto desc_set = desc_heap.Commit(*desc_layout);
-    for (auto& set_write : set_writes) {
-        set_write.dstSet = desc_set;
-    }
-    instance.GetDevice().updateDescriptorSets(set_writes, {});
-    cmdbuf.bindDescriptorSets(bind_point, *pipeline_layout, 0, desc_set, {});
-}
-
-std::string Pipeline::GetDebugString() const {
-    std::string stage_desc;
-    for (const auto& stage : stages) {
-        if (stage) {
-            const auto shader_name = PipelineCache::GetShaderName(stage->stage, stage->pgm_hash);
-            if (stage_desc.empty()) {
-                stage_desc = shader_name;
-            } else {
-                stage_desc = fmt::format("{},{}", stage_desc, shader_name);
-            }
-        }
-    }
-    return stage_desc;
-}
-
-} // namespace Vulkan
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <fmt/format.h>
+
+#include "shader_recompiler/resource.h"
+#include "video_core/renderer_vulkan/vk_instance.h"
+#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
+#include "video_core/renderer_vulkan/vk_pipeline_common.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+
+namespace Vulkan {
+
+Pipeline::Pipeline(const Instance& instance_, Scheduler& scheduler_, DescriptorHeap& desc_heap_,
+                   const Shader::Profile& profile_, vk::PipelineCache pipeline_cache,
+                   bool is_compute_ /*= false*/)
+    : instance{instance_}, scheduler{scheduler_}, desc_heap{desc_heap_}, profile{profile_},
+      is_compute{is_compute_} {}
+
+Pipeline::~Pipeline() = default;
+
+void Pipeline::BindResources(DescriptorWrites& set_writes, const BufferBarriers& buffer_barriers,
+                             const Shader::PushData& push_data) const {
+    const auto cmdbuf = scheduler.CommandBuffer();
+    BindResources(cmdbuf, set_writes, buffer_barriers, push_data);
+}
+
+void Pipeline::BindResources(vk::CommandBuffer cmdbuf, DescriptorWrites& set_writes,
+                             const BufferBarriers& buffer_barriers,
+                             const Shader::PushData& push_data) const {
+    BindResources(cmdbuf, set_writes, buffer_barriers, push_data, desc_heap);
+}
+
+void Pipeline::BindResources(vk::CommandBuffer cmdbuf, DescriptorWrites& set_writes,
+                             const BufferBarriers& buffer_barriers,
+                             const Shader::PushData& push_data, DescriptorHeap& heap) const {
+    const auto bind_point =
+        IsCompute() ? vk::PipelineBindPoint::eCompute : vk::PipelineBindPoint::eGraphics;
+
+    if (!buffer_barriers.empty()) {
+        // For async compute (a different heap implies a different queue), skip buffer barriers
+        // entirely. Timeline semaphore waits include memory dependencies per the Vulkan spec,
+        // so cross-queue synchronization is fully handled by WaitForGraphics/SignalGraphics.
+        const bool is_async_compute = IsCompute() && &heap != &desc_heap;
+        if (!is_async_compute) {
+            const auto dependencies = vk::DependencyInfo{
+                .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+                .bufferMemoryBarrierCount = u32(buffer_barriers.size()),
+                .pBufferMemoryBarriers = buffer_barriers.data(),
+            };
+            cmdbuf.pipelineBarrier2(dependencies);
+        }
+    }
+
+    const auto stage_flags = IsCompute() ? vk::ShaderStageFlagBits::eCompute : AllGraphicsStageBits;
+    cmdbuf.pushConstants(*pipeline_layout, stage_flags, 0u, sizeof(push_data), &push_data);
+
+    // Bind descriptor set.
+    if (set_writes.empty()) {
+        return;
+    }
+
+    if (uses_push_descriptors) {
+        cmdbuf.pushDescriptorSetKHR(bind_point, *pipeline_layout, 0, set_writes);
+        return;
+    }
+
+    const auto desc_set = heap.Commit(*desc_layout);
+    for (auto& set_write : set_writes) {
+        set_write.dstSet = desc_set;
+    }
+    instance.GetDevice().updateDescriptorSets(set_writes, {});
+    cmdbuf.bindDescriptorSets(bind_point, *pipeline_layout, 0, desc_set, {});
+}
+
+std::string Pipeline::GetDebugString() const {
+    std::string stage_desc;
+    for (const auto& stage : stages) {
+        if (stage) {
+            const auto shader_name = PipelineCache::GetShaderName(stage->stage, stage->pgm_hash);
+            if (stage_desc.empty()) {
+                stage_desc = shader_name;
+            } else {
+                stage_desc = fmt::format("{},{}", stage_desc, shader_name);
+            }
+        }
+    }
+    return stage_desc;
+}
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_common.h b/src/video_core/renderer_vulkan/vk_pipeline_common.h
index eb4e64c80b0..59279d72b5e 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_common.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_common.h
@@ -1,81 +1,92 @@
-// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#pragma once
-
-#include "shader_recompiler/profile.h"
-#include "shader_recompiler/runtime_info.h"
-#include "video_core/renderer_vulkan/vk_common.h"
-
-#include <boost/container/small_vector.hpp>
-
-namespace Shader {
-struct Info;
-struct PushData;
-} // namespace Shader
-
-namespace Vulkan {
-
-static constexpr auto AllGraphicsStageBits =
-    vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eTessellationControl |
-    vk::ShaderStageFlagBits::eTessellationEvaluation | vk::ShaderStageFlagBits::eGeometry |
-    vk::ShaderStageFlagBits::eFragment;
-
-class Instance;
-class Scheduler;
-class DescriptorHeap;
-
-class Pipeline {
-public:
-    Pipeline(const Instance& instance, Scheduler& scheduler, DescriptorHeap& desc_heap,
-             const Shader::Profile& profile, vk::PipelineCache pipeline_cache,
-             bool is_compute = false);
-    virtual ~Pipeline();
-
-    vk::Pipeline Handle() const noexcept {
-        return *pipeline;
-    }
-
-    vk::PipelineLayout GetLayout() const noexcept {
-        return *pipeline_layout;
-    }
-
-    auto GetStages() const {
-        static_assert(static_cast<u32>(Shader::LogicalStage::Compute) == Shader::MaxStageTypes - 1);
-        if (is_compute) {
-            return std::span{stages.cend() - 1, stages.cend()};
-        } else {
-            return std::span{stages.cbegin(), stages.cend() - 1};
-        }
-    }
-
-    const Shader::Info& GetStage(Shader::LogicalStage stage) const noexcept {
-        return *stages[u32(stage)];
-    }
-
-    bool IsCompute() const {
-        return is_compute;
-    }
-
-    using DescriptorWrites = boost::container::small_vector<vk::WriteDescriptorSet, 16>;
-    using BufferBarriers = boost::container::small_vector<vk::BufferMemoryBarrier2, 16>;
-
-    void BindResources(DescriptorWrites& set_writes, const BufferBarriers& buffer_barriers,
-                       const Shader::PushData& push_data) const;
-
-protected:
-    [[nodiscard]] std::string GetDebugString() const;
-
-    const Instance& instance;
-    Scheduler& scheduler;
-    DescriptorHeap& desc_heap;
-    const Shader::Profile& profile;
-    vk::UniquePipeline pipeline;
-    vk::UniquePipelineLayout pipeline_layout;
-    vk::UniqueDescriptorSetLayout desc_layout;
-    std::array<const Shader::Info*, Shader::MaxStageTypes> stages{};
-    bool uses_push_descriptors{};
-    bool is_compute;
-};
-
-} // namespace Vulkan
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "shader_recompiler/profile.h"
+#include "shader_recompiler/runtime_info.h"
+#include "video_core/renderer_vulkan/vk_common.h"
+
+#include <boost/container/small_vector.hpp>
+
+namespace Shader {
+struct Info;
+struct PushData;
+} // namespace Shader
+
+namespace Vulkan {
+
+static constexpr auto AllGraphicsStageBits =
+    vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eTessellationControl |
+    vk::ShaderStageFlagBits::eTessellationEvaluation | vk::ShaderStageFlagBits::eGeometry |
+    vk::ShaderStageFlagBits::eFragment;
+
+class Instance;
+class Scheduler;
+class DescriptorHeap;
+
+class Pipeline {
+public:
+    Pipeline(const Instance& instance, Scheduler& scheduler, DescriptorHeap& desc_heap,
+             const Shader::Profile& profile, vk::PipelineCache pipeline_cache,
+             bool is_compute = false);
+    virtual ~Pipeline();
+
+    vk::Pipeline Handle() const noexcept {
+        return *pipeline;
+    }
+
+    vk::PipelineLayout GetLayout() const noexcept {
+        return *pipeline_layout;
+    }
+
+    auto GetStages() const {
+        static_assert(static_cast<u32>(Shader::LogicalStage::Compute) == Shader::MaxStageTypes - 1);
+        if (is_compute) {
+            return std::span{stages.cend() - 1, stages.cend()};
+        } else {
+            return std::span{stages.cbegin(), stages.cend() - 1};
+        }
+    }
+
+    const Shader::Info& GetStage(Shader::LogicalStage stage) const noexcept {
+        return *stages[u32(stage)];
+    }
+
+    bool IsCompute() const {
+        return is_compute;
+    }
+
+    using DescriptorWrites = boost::container::small_vector<vk::WriteDescriptorSet, 16>;
+    using BufferBarriers = boost::container::small_vector<vk::BufferMemoryBarrier2, 16>;
+
+    void BindResources(DescriptorWrites& set_writes, const BufferBarriers& buffer_barriers,
+                       const Shader::PushData& push_data) const;
+
+    /// Overload for async compute - binds resources to a specific command buffer.
+    void BindResources(vk::CommandBuffer cmdbuf, DescriptorWrites& set_writes,
+                       const BufferBarriers& buffer_barriers,
+                       const Shader::PushData& push_data) const;
+
+    /// Overload for async compute with a custom descriptor heap.
+    void BindResources(vk::CommandBuffer cmdbuf, DescriptorWrites& set_writes,
+                       const BufferBarriers& buffer_barriers, const Shader::PushData& push_data,
+                       DescriptorHeap& override_heap) const;
+
+protected:
+    [[nodiscard]] std::string GetDebugString() const;
+
+    const Instance& instance;
+    Scheduler& scheduler;
+    DescriptorHeap& desc_heap;
+    const Shader::Profile& profile;
+    vk::UniquePipeline pipeline;
+    vk::UniquePipelineLayout pipeline_layout;
+    vk::UniqueDescriptorSetLayout desc_layout;
+    std::array<const Shader::Info*, Shader::MaxStageTypes> stages{};
+    bool uses_push_descriptors{};
+    bool is_compute;
+};
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_presenter.cpp b/src/video_core/renderer_vulkan/vk_presenter.cpp
index 1694d137fe0..a0501c6491b 100644
--- a/src/video_core/renderer_vulkan/vk_presenter.cpp
+++ b/src/video_core/renderer_vulkan/vk_presenter.cpp
@@ -223,6 +223,9 @@ Frame* Presenter::PrepareLastFrame() {
         return nullptr;
     }
 
+    // Ensure any pending async compute work is synced before presenting.
+    rasterizer->SyncComputeForPresent();
+
     Frame* frame = last_submit_frame;
 
     while (true) {
@@ -296,6 +299,9 @@ Frame* Presenter::PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup&
     const auto image_id = texture_cache.FindImage(desc);
     texture_cache.UpdateImage(image_id);
 
+    // Ensure any pending async compute work is synced before presenting.
+    rasterizer->SyncComputeForPresent();
+
     Frame* frame = GetRenderFrame();
 
     const auto frame_subresources = vk::ImageSubresourceRange{
@@ -352,6 +358,9 @@ Frame* Presenter::PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup&
 }
 
 Frame* Presenter::PrepareBlankFrame(bool present_thread) {
+    // Ensure any pending async compute work is synced before presenting.
+    rasterizer->SyncComputeForPresent();
+
     // Request a free presentation frame.
     Frame* frame = GetRenderFrame();
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 737c9feedc7..a2f2e8c0eae 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -1,1325 +1,1432 @@
-// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include "common/config.h"
-#include "common/debug.h"
-#include "core/memory.h"
-#include "shader_recompiler/runtime_info.h"
-#include "video_core/amdgpu/liverpool.h"
-#include "video_core/renderer_vulkan/liverpool_to_vk.h"
-#include "video_core/renderer_vulkan/vk_instance.h"
-#include "video_core/renderer_vulkan/vk_rasterizer.h"
-#include "video_core/renderer_vulkan/vk_scheduler.h"
-#include "video_core/renderer_vulkan/vk_shader_hle.h"
-#include "video_core/texture_cache/image_view.h"
-#include "video_core/texture_cache/texture_cache.h"
-
-#ifdef MemoryBarrier
-#undef MemoryBarrier
-#endif
-
-namespace Vulkan {
-
-static Shader::PushData MakeUserData(const AmdGpu::Regs& regs) {
-    // TODO(roamic): Add support for multiple viewports and geometry shaders when ViewportIndex
-    // is encountered and implemented in the recompiler.
-    Shader::PushData push_data{};
-    push_data.xoffset = regs.viewport_control.xoffset_enable ? regs.viewports[0].xoffset : 0.f;
-    push_data.xscale = regs.viewport_control.xscale_enable ? regs.viewports[0].xscale : 1.f;
-    push_data.yoffset = regs.viewport_control.yoffset_enable ? regs.viewports[0].yoffset : 0.f;
-    push_data.yscale = regs.viewport_control.yscale_enable ? regs.viewports[0].yscale : 1.f;
-    return push_data;
-}
-
-Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
-                       AmdGpu::Liverpool* liverpool_)
-    : instance{instance_}, scheduler{scheduler_}, page_manager{this},
-      buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager},
-      texture_cache{instance, scheduler, liverpool_, buffer_cache, page_manager},
-      liverpool{liverpool_}, memory{Core::Memory::Instance()},
-      pipeline_cache{instance, scheduler, liverpool} {
-    if (!Config::nullGpu()) {
-        liverpool->BindRasterizer(this);
-    }
-    memory->SetRasterizer(this);
-}
-
-Rasterizer::~Rasterizer() = default;
-
-void Rasterizer::CpSync() {
-    scheduler.EndRendering();
-    auto cmdbuf = scheduler.CommandBuffer();
-
-    const vk::MemoryBarrier ib_barrier{
-        .srcAccessMask = vk::AccessFlagBits::eShaderWrite,
-        .dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead,
-    };
-    cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
-                           vk::PipelineStageFlagBits::eDrawIndirect,
-                           vk::DependencyFlagBits::eByRegion, ib_barrier, {}, {});
-}
-
-bool Rasterizer::FilterDraw() {
-    const auto& regs = liverpool->regs;
-    if (regs.color_control.mode == AmdGpu::ColorControl::OperationMode::EliminateFastClear) {
-        // Clears the render target if FCE is launched before any draws
-        EliminateFastClear();
-        return false;
-    }
-    if (regs.color_control.mode == AmdGpu::ColorControl::OperationMode::FmaskDecompress) {
-        // TODO: check for a valid MRT1 to promote the draw to the resolve pass.
- LOG_TRACE(Render_Vulkan, "FMask decompression pass skipped"); - ScopedMarkerInsert("FmaskDecompress"); - return false; - } - if (regs.color_control.mode == AmdGpu::ColorControl::OperationMode::Resolve) { - LOG_TRACE(Render_Vulkan, "Resolve pass"); - Resolve(); - return false; - } - if (regs.primitive_type == AmdGpu::PrimitiveType::None) { - LOG_TRACE(Render_Vulkan, "Primitive type 'None' skipped"); - ScopedMarkerInsert("PrimitiveTypeNone"); - return false; - } - - const bool cb_disabled = - regs.color_control.mode == AmdGpu::ColorControl::OperationMode::Disable; - const auto depth_copy = - regs.depth_render_override.force_z_dirty && regs.depth_render_override.force_z_valid && - regs.depth_buffer.DepthValid() && regs.depth_buffer.DepthWriteValid() && - regs.depth_buffer.DepthAddress() != regs.depth_buffer.DepthWriteAddress(); - const auto stencil_copy = - regs.depth_render_override.force_stencil_dirty && - regs.depth_render_override.force_stencil_valid && regs.depth_buffer.StencilValid() && - regs.depth_buffer.StencilWriteValid() && - regs.depth_buffer.StencilAddress() != regs.depth_buffer.StencilWriteAddress(); - if (cb_disabled && (depth_copy || stencil_copy)) { - // Games may disable color buffer and enable force depth/stencil dirty and valid to - // do a copy from one depth-stencil surface to another, without a pixel shader. - // We need to detect this case and perform the copy, otherwise it will have no effect. - LOG_TRACE(Render_Vulkan, "Performing depth-stencil override copy"); - DepthStencilCopy(depth_copy, stencil_copy); - return false; - } - - return true; -} - -void Rasterizer::PrepareRenderState(const GraphicsPipeline* pipeline) { - // Prefetch render targets to handle overlaps with bound textures (e.g. mipgen) - const auto& key = pipeline->GetGraphicsKey(); - const auto& regs = liverpool->regs; - if (regs.color_control.degamma_enable) { - LOG_WARNING(Render_Vulkan, "Color buffers require gamma correction"); - } - - const bool skip_cb_binding = - regs.color_control.mode == AmdGpu::ColorControl::OperationMode::Disable; - for (s32 cb = 0; cb < std::bit_width(key.mrt_mask); ++cb) { - auto& [image_id, desc] = cb_descs[cb]; - const auto& col_buf = regs.color_buffers[cb]; - const u32 target_mask = regs.color_target_mask.GetMask(cb); - if (skip_cb_binding || !col_buf || !target_mask || (key.mrt_mask & (1 << cb)) == 0) { - image_id = {}; - continue; - } - const auto& hint = liverpool->last_cb_extent[cb]; - std::construct_at(&desc, col_buf, hint); - image_id = bound_images.emplace_back(texture_cache.FindImage(desc)); - auto& image = texture_cache.GetImage(image_id); - image.binding.is_target = 1u; - } - - if ((regs.depth_control.depth_enable && regs.depth_buffer.DepthValid()) || - (regs.depth_control.stencil_enable && regs.depth_buffer.StencilValid())) { - const auto htile_address = regs.depth_htile_data_base.GetAddress(); - const auto& hint = liverpool->last_db_extent; - auto& [image_id, desc] = db_desc; - std::construct_at(&desc, regs.depth_buffer, regs.depth_view, regs.depth_control, - htile_address, hint); - image_id = bound_images.emplace_back(texture_cache.FindImage(desc)); - auto& image = texture_cache.GetImage(image_id); - image.binding.is_target = 1u; - } else { - db_desc.first = {}; - } -} - -static std::pair GetDrawOffsets( - const AmdGpu::Regs& regs, const Shader::Info& info, - const std::optional& fetch_shader) { - u32 vertex_offset = regs.index_offset; - u32 instance_offset = 0; - if (fetch_shader) { - if (vertex_offset == 0 && fetch_shader->vertex_offset_sgpr != -1) { - 
vertex_offset = info.user_data[fetch_shader->vertex_offset_sgpr]; - } - if (fetch_shader->instance_offset_sgpr != -1) { - instance_offset = info.user_data[fetch_shader->instance_offset_sgpr]; - } - } - return {vertex_offset, instance_offset}; -} - -void Rasterizer::EliminateFastClear() { - auto& col_buf = liverpool->regs.color_buffers[0]; - if (!col_buf || !col_buf.info.fast_clear) { - return; - } - VideoCore::TextureCache::ImageDesc desc(col_buf, liverpool->last_cb_extent[0]); - const auto image_id = texture_cache.FindImage(desc); - const auto& image_view = texture_cache.FindRenderTarget(image_id, desc); - if (!texture_cache.IsMetaCleared(col_buf.CmaskAddress(), col_buf.view.slice_start)) { - return; - } - for (u32 slice = col_buf.view.slice_start; slice <= col_buf.view.slice_max; ++slice) { - texture_cache.TouchMeta(col_buf.CmaskAddress(), slice, false); - } - auto& image = texture_cache.GetImage(image_id); - const auto clear_value = LiverpoolToVK::ColorBufferClearValue(col_buf); - - ScopeMarkerBegin(fmt::format("EliminateFastClear:MRT={:#x}:M={:#x}", col_buf.Address(), - col_buf.CmaskAddress())); - image.Clear(clear_value, desc.view_info.range); - ScopeMarkerEnd(); -} - -void Rasterizer::Draw(bool is_indexed, u32 index_offset) { - RENDERER_TRACE; - - scheduler.PopPendingOperations(); - - if (!FilterDraw()) { - return; - } - - const auto& regs = liverpool->regs; - const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); - if (!pipeline) { - return; - } - - PrepareRenderState(pipeline); - if (!BindResources(pipeline)) { - return; - } - const auto state = BeginRendering(pipeline); - - buffer_cache.BindVertexBuffers(*pipeline); - if (is_indexed) { - buffer_cache.BindIndexBuffer(index_offset); - } - - pipeline->BindResources(set_writes, buffer_barriers, push_data); - UpdateDynamicState(pipeline, is_indexed); - scheduler.BeginRendering(state); - - const auto& vs_info = pipeline->GetStage(Shader::LogicalStage::Vertex); - const auto& fetch_shader = pipeline->GetFetchShader(); - const auto [vertex_offset, instance_offset] = GetDrawOffsets(regs, vs_info, fetch_shader); - - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle()); - - if (is_indexed) { - cmdbuf.drawIndexed(regs.num_indices, regs.num_instances.NumInstances(), 0, - s32(vertex_offset), instance_offset); - } else { - cmdbuf.draw(regs.num_indices, regs.num_instances.NumInstances(), vertex_offset, - instance_offset); - } - - ResetBindings(); -} - -void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 stride, - u32 max_count, VAddr count_address) { - RENDERER_TRACE; - - scheduler.PopPendingOperations(); - - if (!FilterDraw()) { - return; - } - - const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); - if (!pipeline) { - return; - } - - PrepareRenderState(pipeline); - if (!BindResources(pipeline)) { - return; - } - const auto state = BeginRendering(pipeline); - - buffer_cache.BindVertexBuffers(*pipeline); - if (is_indexed) { - buffer_cache.BindIndexBuffer(0); - } - - const auto& [buffer, base] = - buffer_cache.ObtainBuffer(arg_address + offset, stride * max_count, false); - - VideoCore::Buffer* count_buffer{}; - u32 count_base{}; - if (count_address != 0) { - std::tie(count_buffer, count_base) = buffer_cache.ObtainBuffer(count_address, 4, false); - } - - pipeline->BindResources(set_writes, buffer_barriers, push_data); - UpdateDynamicState(pipeline, is_indexed); - scheduler.BeginRendering(state); - - // We can 
safely ignore both SGPR UD indices and results of fetch shader parsing, as vertex and - // instance offsets will be automatically applied by Vulkan from indirect args buffer. - - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle()); - - if (is_indexed) { - ASSERT(sizeof(VkDrawIndexedIndirectCommand) == stride); - - if (count_address != 0) { - cmdbuf.drawIndexedIndirectCount(buffer->Handle(), base, count_buffer->Handle(), - count_base, max_count, stride); - } else { - cmdbuf.drawIndexedIndirect(buffer->Handle(), base, max_count, stride); - } - } else { - ASSERT(sizeof(VkDrawIndirectCommand) == stride); - - if (count_address != 0) { - cmdbuf.drawIndirectCount(buffer->Handle(), base, count_buffer->Handle(), count_base, - max_count, stride); - } else { - cmdbuf.drawIndirect(buffer->Handle(), base, max_count, stride); - } - } - - ResetBindings(); -} - -void Rasterizer::DispatchDirect() { - RENDERER_TRACE; - - scheduler.PopPendingOperations(); - - const auto& cs_program = liverpool->GetCsRegs(); - const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); - if (!pipeline) { - return; - } - - const auto& cs = pipeline->GetStage(Shader::LogicalStage::Compute); - if (ExecuteShaderHLE(cs, liverpool->regs, cs_program, *this)) { - return; - } - - if (!BindResources(pipeline)) { - return; - } - - scheduler.EndRendering(); - pipeline->BindResources(set_writes, buffer_barriers, push_data); - - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle()); - cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z); - - ResetBindings(); -} - -void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { - RENDERER_TRACE; - - scheduler.PopPendingOperations(); - - const auto& cs_program = liverpool->GetCsRegs(); - const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); - if (!pipeline) { - return; - } - - if (!BindResources(pipeline)) { - return; - } - - const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size, false); - - scheduler.EndRendering(); - pipeline->BindResources(set_writes, buffer_barriers, push_data); - - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle()); - cmdbuf.dispatchIndirect(buffer->Handle(), base); - - ResetBindings(); -} - -u64 Rasterizer::Flush() { - const u64 current_tick = scheduler.CurrentTick(); - SubmitInfo info{}; - scheduler.Flush(info); - return current_tick; -} - -void Rasterizer::Finish() { - scheduler.Finish(); -} - -void Rasterizer::OnSubmit() { - if (fault_process_pending) { - fault_process_pending = false; - buffer_cache.ProcessFaultBuffer(); - } - texture_cache.ProcessDownloadImages(); - texture_cache.RunGarbageCollector(); - buffer_cache.RunGarbageCollector(); -} - -bool Rasterizer::BindResources(const Pipeline* pipeline) { - if (IsComputeImageCopy(pipeline) || IsComputeMetaClear(pipeline) || - IsComputeImageClear(pipeline)) { - return false; - } - - set_writes.clear(); - buffer_barriers.clear(); - buffer_infos.clear(); - image_infos.clear(); - - bool uses_dma = false; - - // Bind resource buffers and textures. 
- Shader::Backend::Bindings binding{}; - push_data = MakeUserData(liverpool->regs); - for (const auto* stage : pipeline->GetStages()) { - if (!stage) { - continue; - } - stage->PushUd(binding, push_data); - BindBuffers(*stage, binding, push_data); - BindTextures(*stage, binding); - uses_dma |= stage->uses_dma; - } - - if (uses_dma) { - // We only use fault buffer for DMA right now. - Common::RecursiveSharedLock lock{mapped_ranges_mutex}; - for (auto& range : mapped_ranges) { - buffer_cache.SynchronizeBuffersInRange(range.lower(), range.upper() - range.lower()); - } - fault_process_pending = true; - } - - return true; -} - -bool Rasterizer::IsComputeMetaClear(const Pipeline* pipeline) { - if (!pipeline->IsCompute()) { - return false; - } - - // Most of the time when a metadata is updated with a shader it gets cleared. It means - // we can skip the whole dispatch and update the tracked state instead. Also, it is not - // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we - // will need its full emulation anyways. - const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute); - - // Assume if a shader reads metadata, it is a copy shader. - for (const auto& desc : info.buffers) { - const VAddr address = desc.GetSharp(info).base_address; - if (!desc.IsSpecial() && !desc.is_written && texture_cache.IsMeta(address)) { - return false; - } - } - - // Metadata surfaces are tiled and thus need address calculation to be written properly. - // If a shader wants to encode HTILE, for example, from a depth image it will have to compute - // proper tile address from dispatch invocation id. This address calculation contains an xor - // operation so use it as a heuristic for metadata writes that are probably not clears. - if (!info.has_bitwise_xor) { - // Assume if a shader writes metadata without address calculation, it is a clear shader. - for (const auto& desc : info.buffers) { - const VAddr address = desc.GetSharp(info).base_address; - if (!desc.IsSpecial() && desc.is_written && texture_cache.ClearMeta(address)) { - // Assume all slices were updates - LOG_TRACE(Render_Vulkan, "Metadata update skipped"); - return true; - } - } - } - return false; -} - -bool Rasterizer::IsComputeImageCopy(const Pipeline* pipeline) { - if (!pipeline->IsCompute()) { - return false; - } - - // Ensure shader only has 2 bound buffers - const auto& cs_pgm = liverpool->GetCsRegs(); - const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute); - if (cs_pgm.num_thread_x.full != 64 || info.buffers.size() != 2 || !info.images.empty()) { - return false; - } - - // Those 2 buffers must both be formatted. One must be source and another destination. 
- const auto& desc0 = info.buffers[0]; - const auto& desc1 = info.buffers[1]; - if (!desc0.is_formatted || !desc1.is_formatted || desc0.is_written == desc1.is_written) { - return false; - } - - // Buffers must have the same size and each thread of the dispatch must copy 1 dword of data - const AmdGpu::Buffer buf0 = desc0.GetSharp(info); - const AmdGpu::Buffer buf1 = desc1.GetSharp(info); - if (buf0.GetSize() != buf1.GetSize() || cs_pgm.dim_x != (buf0.GetSize() / 256)) { - return false; - } - - // Find images the buffer alias - const auto image0_id = texture_cache.FindImageFromRange(buf0.base_address, buf0.GetSize()); - if (!image0_id) { - return false; - } - const auto image1_id = - texture_cache.FindImageFromRange(buf1.base_address, buf1.GetSize(), false); - if (!image1_id) { - return false; - } - - // Image copy must be valid - VideoCore::Image& image0 = texture_cache.GetImage(image0_id); - VideoCore::Image& image1 = texture_cache.GetImage(image1_id); - if (image0.info.guest_size != image1.info.guest_size || - image0.info.pitch != image1.info.pitch || image0.info.guest_size != buf0.GetSize() || - image0.info.num_bits != image1.info.num_bits) { - return false; - } - - // Perform image copy - VideoCore::Image& src_image = desc0.is_written ? image1 : image0; - VideoCore::Image& dst_image = desc0.is_written ? image0 : image1; - if (instance.IsMaintenance8Supported() || - src_image.info.props.is_depth == dst_image.info.props.is_depth) { - dst_image.CopyImage(src_image); - } else { - const auto& copy_buffer = - buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::DeviceLocal); - dst_image.CopyImageWithBuffer(src_image, copy_buffer.Handle(), 0); - } - dst_image.flags |= VideoCore::ImageFlagBits::GpuModified; - dst_image.flags &= ~VideoCore::ImageFlagBits::Dirty; - return true; -} - -bool Rasterizer::IsComputeImageClear(const Pipeline* pipeline) { - if (!pipeline->IsCompute()) { - return false; - } - - // Ensure shader only has 2 bound buffers - const auto& cs_pgm = liverpool->GetCsRegs(); - const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute); - if (cs_pgm.num_thread_x.full != 64 || info.buffers.size() != 2 || !info.images.empty()) { - return false; - } - - // From those 2 buffers, first must hold the clear vector and second the image being cleared - const auto& desc0 = info.buffers[0]; - const auto& desc1 = info.buffers[1]; - if (desc0.is_formatted || !desc1.is_formatted || desc0.is_written || !desc1.is_written) { - return false; - } - - // First buffer must have size of vec4 and second the size of a single layer - const AmdGpu::Buffer buf0 = desc0.GetSharp(info); - const AmdGpu::Buffer buf1 = desc1.GetSharp(info); - const u32 buf1_bpp = AmdGpu::NumBitsPerBlock(buf1.GetDataFmt()); - if (buf0.GetSize() != 16 || (cs_pgm.dim_x * 128ULL * (buf1_bpp / 8)) != buf1.GetSize()) { - return false; - } - - // Find image the buffer alias - const auto image1_id = - texture_cache.FindImageFromRange(buf1.base_address, buf1.GetSize(), false); - if (!image1_id) { - return false; - } - - // Image clear must be valid - VideoCore::Image& image1 = texture_cache.GetImage(image1_id); - if (image1.info.guest_size != buf1.GetSize() || image1.info.num_bits != buf1_bpp || - image1.info.props.is_depth) { - return false; - } - - // Perform image clear - const float* values = reinterpret_cast(buf0.base_address); - const vk::ClearValue clear = { - .color = {.float32 = std::array{values[0], values[1], values[2], values[3]}}, - }; - const VideoCore::SubresourceRange range = { - .base = - { - .level = 0, - 
.layer = 0, - }, - .extent = image1.info.resources, - }; - image1.Clear(clear, range); - image1.flags |= VideoCore::ImageFlagBits::GpuModified; - image1.flags &= ~VideoCore::ImageFlagBits::Dirty; - return true; -} - -void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings& binding, - Shader::PushData& push_data) { - buffer_bindings.clear(); - - for (const auto& desc : stage.buffers) { - const auto vsharp = desc.GetSharp(stage); - if (!desc.IsSpecial() && vsharp.base_address != 0 && vsharp.GetSize() > 0) { - const u64 size = memory->ClampRangeSize(vsharp.base_address, vsharp.GetSize()); - const auto buffer_id = buffer_cache.FindBuffer(vsharp.base_address, size); - buffer_bindings.emplace_back(buffer_id, vsharp, size); - } else { - buffer_bindings.emplace_back(VideoCore::BufferId{}, vsharp, 0); - } - } - - // Second pass to re-bind buffers that were updated after binding - for (u32 i = 0; i < buffer_bindings.size(); i++) { - const auto& [buffer_id, vsharp, size] = buffer_bindings[i]; - const auto& desc = stage.buffers[i]; - const bool is_storage = desc.IsStorage(vsharp); - const u32 alignment = - is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); - // Buffer is not from the cache, either a special buffer or unbound. - if (!buffer_id) { - if (desc.buffer_type == Shader::BufferType::GdsBuffer) { - const auto* gds_buf = buffer_cache.GetGdsBuffer(); - buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes()); - } else if (desc.buffer_type == Shader::BufferType::Flatbuf) { - auto& vk_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream); - const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32); - const u64 offset = - vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size, alignment); - buffer_infos.emplace_back(vk_buffer.Handle(), offset, ubo_size); - } else if (desc.buffer_type == Shader::BufferType::BdaPagetable) { - const auto* bda_buffer = buffer_cache.GetBdaPageTableBuffer(); - buffer_infos.emplace_back(bda_buffer->Handle(), 0, bda_buffer->SizeBytes()); - } else if (desc.buffer_type == Shader::BufferType::FaultBuffer) { - const auto* fault_buffer = buffer_cache.GetFaultBuffer(); - buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes()); - } else if (desc.buffer_type == Shader::BufferType::SharedMemory) { - auto& lds_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream); - const auto& cs_program = liverpool->GetCsRegs(); - const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups(); - const auto [data, offset] = lds_buffer.Map(lds_size, alignment); - std::memset(data, 0, lds_size); - buffer_infos.emplace_back(lds_buffer.Handle(), offset, lds_size); - } else if (instance.IsNullDescriptorSupported()) { - buffer_infos.emplace_back(VK_NULL_HANDLE, 0, VK_WHOLE_SIZE); - } else { - auto& null_buffer = buffer_cache.GetBuffer(VideoCore::NULL_BUFFER_ID); - buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE); - } - } else { - const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer( - vsharp.base_address, size, desc.is_written, desc.is_formatted, buffer_id); - const u32 offset_aligned = Common::AlignDown(offset, alignment); - const u32 adjust = offset - offset_aligned; - ASSERT(adjust % 4 == 0); - push_data.AddOffset(binding.buffer, adjust); - buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust); - if (auto barrier = - vk_buffer->GetBarrier(desc.is_written ? 
vk::AccessFlagBits2::eShaderWrite - : vk::AccessFlagBits2::eShaderRead, - vk::PipelineStageFlagBits2::eAllCommands)) { - buffer_barriers.emplace_back(*barrier); - } - if (desc.is_written && desc.is_formatted) { - texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, size); - } - } - - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = is_storage ? vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, - .pBufferInfo = &buffer_infos.back(), - }); - ++binding.buffer; - } -} - -void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding) { - image_bindings.clear(); - - for (const auto& image_desc : stage.images) { - const auto tsharp = image_desc.GetSharp(stage); - if (texture_cache.IsMeta(tsharp.Address())) { - LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a shader (texture)"); - } - - if (tsharp.GetDataFmt() == AmdGpu::DataFormat::FormatInvalid) { - image_bindings.emplace_back(std::piecewise_construct, std::tuple{}, std::tuple{}); - continue; - } - - auto& [image_id, desc] = image_bindings.emplace_back(std::piecewise_construct, std::tuple{}, - std::tuple{tsharp, image_desc}); - image_id = texture_cache.FindImage(desc); - auto* image = &texture_cache.GetImage(image_id); - if (image->depth_id) { - // If this image has an associated depth image, it's a stencil attachment. - // Redirect the access to the actual depth-stencil buffer. - image_id = image->depth_id; - image = &texture_cache.GetImage(image_id); - } - if (image->binding.is_bound) { - // The image is already bound. In case if it is about to be used as storage we need - // to force general layout on it. - image->binding.force_general |= image_desc.is_written; - } - image->binding.is_bound = 1u; - } - - // Second pass to re-bind images that were updated after binding - for (auto& [image_id, desc] : image_bindings) { - bool is_storage = desc.type == VideoCore::TextureCache::BindingType::Storage; - if (!image_id) { - if (instance.IsNullDescriptorSupported()) { - image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); - } else { - auto& null_image_view = texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc); - image_infos.emplace_back(VK_NULL_HANDLE, *null_image_view.image_view, - vk::ImageLayout::eGeneral); - } - } else { - if (auto& old_image = texture_cache.GetImage(image_id); - old_image.binding.needs_rebind) { - old_image.binding = {}; - image_id = texture_cache.FindImage(desc); - } - - bound_images.emplace_back(image_id); - - auto& image = texture_cache.GetImage(image_id); - auto& image_view = texture_cache.FindTexture(image_id, desc); - - // The image is either bound as storage in a separate descriptor or bound as render - // target in feedback loop. Depth images are excluded because they can't be bound as - // storage and feedback loop doesn't make sense for them - if ((image.binding.force_general || image.binding.is_target) && - !image.info.props.is_depth) { - image.Transit(instance.IsAttachmentFeedbackLoopLayoutSupported() && - image.binding.is_target - ? vk::ImageLayout::eAttachmentFeedbackLoopOptimalEXT - : vk::ImageLayout::eGeneral, - vk::AccessFlagBits2::eShaderRead | - (image.info.props.is_depth - ? 
vk::AccessFlagBits2::eDepthStencilAttachmentWrite - : vk::AccessFlagBits2::eColorAttachmentWrite), - {}); - } else { - if (is_storage) { - image.Transit(vk::ImageLayout::eGeneral, - vk::AccessFlagBits2::eShaderRead | - vk::AccessFlagBits2::eShaderWrite, - desc.view_info.range); - } else { - const auto new_layout = image.info.props.is_depth - ? vk::ImageLayout::eDepthStencilReadOnlyOptimal - : vk::ImageLayout::eShaderReadOnlyOptimal; - image.Transit(new_layout, vk::AccessFlagBits2::eShaderRead, - desc.view_info.range); - } - } - image.usage.storage |= is_storage; - image.usage.texture |= !is_storage; - - image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, - image.backing->state.layout); - } - - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = - is_storage ? vk::DescriptorType::eStorageImage : vk::DescriptorType::eSampledImage, - .pImageInfo = &image_infos.back(), - }); - } - - for (const auto& sampler : stage.samplers) { - auto ssharp = sampler.GetSharp(stage); - if (sampler.disable_aniso) { - const auto& tsharp = stage.images[sampler.associated_image].GetSharp(stage); - if (tsharp.base_level == 0 && tsharp.last_level == 0) { - ssharp.max_aniso.Assign(AmdGpu::AnisoRatio::One); - } - } - const auto vk_sampler = texture_cache.GetSampler(ssharp, liverpool->regs.ta_bc_base); - image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = vk::DescriptorType::eSampler, - .pImageInfo = &image_infos.back(), - }); - } -} - -RenderState Rasterizer::BeginRendering(const GraphicsPipeline* pipeline) { - attachment_feedback_loop = false; - const auto& regs = liverpool->regs; - const auto& key = pipeline->GetGraphicsKey(); - RenderState state; - state.width = instance.GetMaxFramebufferWidth(); - state.height = instance.GetMaxFramebufferHeight(); - state.num_layers = std::numeric_limits::max(); - state.num_color_attachments = std::bit_width(key.mrt_mask); - for (auto cb = 0u; cb < state.num_color_attachments; ++cb) { - auto& [image_id, desc] = cb_descs[cb]; - if (!image_id) { - continue; - } - auto* image = &texture_cache.GetImage(image_id); - if (image->binding.needs_rebind) { - image_id = bound_images.emplace_back(texture_cache.FindImage(desc)); - image = &texture_cache.GetImage(image_id); - } - texture_cache.UpdateImage(image_id); - image->SetBackingSamples(key.color_samples[cb]); - const auto& image_view = texture_cache.FindRenderTarget(image_id, desc); - const auto slice = image_view.info.range.base.layer; - const auto mip = image_view.info.range.base.level; - - const auto& col_buf = regs.color_buffers[cb]; - const bool is_clear = texture_cache.IsMetaCleared(col_buf.CmaskAddress(), slice); - texture_cache.TouchMeta(col_buf.CmaskAddress(), slice, false); - - if (image->binding.is_bound) { - ASSERT_MSG(!image->binding.force_general, - "Having image both as storage and render target is unsupported"); - image->Transit(instance.IsAttachmentFeedbackLoopLayoutSupported() - ? 
vk::ImageLayout::eAttachmentFeedbackLoopOptimalEXT - : vk::ImageLayout::eGeneral, - vk::AccessFlagBits2::eColorAttachmentWrite, {}); - attachment_feedback_loop = true; - } else { - image->Transit(vk::ImageLayout::eColorAttachmentOptimal, - vk::AccessFlagBits2::eColorAttachmentWrite | - vk::AccessFlagBits2::eColorAttachmentRead, - desc.view_info.range); - } - - state.width = std::min(state.width, std::max(image->info.size.width >> mip, 1u)); - state.height = std::min(state.height, std::max(image->info.size.height >> mip, 1u)); - state.num_layers = std::min(state.num_layers, image_view.info.range.extent.layers); - state.color_attachments[cb] = { - .imageView = *image_view.image_view, - .imageLayout = image->backing->state.layout, - .loadOp = is_clear ? vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad, - .storeOp = vk::AttachmentStoreOp::eStore, - .clearValue = - is_clear ? LiverpoolToVK::ColorBufferClearValue(col_buf) : vk::ClearValue{}, - }; - image->usage.render_target = 1u; - } - - if (auto image_id = db_desc.first; image_id) { - auto& desc = db_desc.second; - const auto htile_address = regs.depth_htile_data_base.GetAddress(); - const auto& image_view = texture_cache.FindDepthTarget(image_id, desc); - auto& image = texture_cache.GetImage(image_id); - - const auto slice = image_view.info.range.base.layer; - const bool is_depth_clear = regs.depth_render_control.depth_clear_enable || - texture_cache.IsMetaCleared(htile_address, slice); - const bool is_stencil_clear = regs.depth_render_control.stencil_clear_enable; - texture_cache.TouchMeta(htile_address, slice, false); - ASSERT(desc.view_info.range.extent.levels == 1 && !image.binding.needs_rebind); - - const bool has_stencil = image.info.props.has_stencil; - const auto new_layout = desc.view_info.is_storage - ? has_stencil ? vk::ImageLayout::eDepthStencilAttachmentOptimal - : vk::ImageLayout::eDepthAttachmentOptimal - : has_stencil ? vk::ImageLayout::eDepthStencilReadOnlyOptimal - : vk::ImageLayout::eDepthReadOnlyOptimal; - image.Transit(new_layout, - vk::AccessFlagBits2::eDepthStencilAttachmentWrite | - vk::AccessFlagBits2::eDepthStencilAttachmentRead, - desc.view_info.range); - - state.width = std::min(state.width, image.info.size.width); - state.height = std::min(state.height, image.info.size.height); - state.has_depth = regs.depth_buffer.DepthValid(); - state.has_stencil = regs.depth_buffer.StencilValid(); - state.num_layers = std::min(state.num_layers, image_view.info.range.extent.layers); - if (state.has_depth) { - state.depth_attachment = { - .imageView = *image_view.image_view, - .imageLayout = image.backing->state.layout, - .loadOp = - is_depth_clear ? vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad, - .storeOp = vk::AttachmentStoreOp::eStore, - .clearValue = vk::ClearValue{.depthStencil = {.depth = regs.depth_clear}}, - }; - } - if (state.has_stencil) { - state.stencil_attachment = { - .imageView = *image_view.image_view, - .imageLayout = image.backing->state.layout, - .loadOp = - is_stencil_clear ? 
vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad, - .storeOp = vk::AttachmentStoreOp::eStore, - .clearValue = vk::ClearValue{.depthStencil = {.stencil = regs.stencil_clear}}, - }; - } - - image.usage.depth_target = true; - } - - if (state.num_layers == std::numeric_limits::max()) { - state.num_layers = 1; - } - - return state; -} - -void Rasterizer::Resolve() { - const auto& mrt0_hint = liverpool->last_cb_extent[0]; - const auto& mrt1_hint = liverpool->last_cb_extent[1]; - VideoCore::TextureCache::ImageDesc mrt0_desc{liverpool->regs.color_buffers[0], mrt0_hint}; - VideoCore::TextureCache::ImageDesc mrt1_desc{liverpool->regs.color_buffers[1], mrt1_hint}; - auto& mrt0_image = texture_cache.GetImage(texture_cache.FindImage(mrt0_desc, true)); - auto& mrt1_image = texture_cache.GetImage(texture_cache.FindImage(mrt1_desc, true)); - - ScopeMarkerBegin(fmt::format("Resolve:MRT0={:#x}:MRT1={:#x}", - liverpool->regs.color_buffers[0].Address(), - liverpool->regs.color_buffers[1].Address())); - mrt1_image.Resolve(mrt0_image, mrt0_desc.view_info.range, mrt1_desc.view_info.range); - ScopeMarkerEnd(); -} - -void Rasterizer::DepthStencilCopy(bool is_depth, bool is_stencil) { - auto& regs = liverpool->regs; - - auto read_desc = VideoCore::TextureCache::ImageDesc( - regs.depth_buffer, regs.depth_view, regs.depth_control, - regs.depth_htile_data_base.GetAddress(), liverpool->last_db_extent, false); - auto write_desc = VideoCore::TextureCache::ImageDesc( - regs.depth_buffer, regs.depth_view, regs.depth_control, - regs.depth_htile_data_base.GetAddress(), liverpool->last_db_extent, true); - - auto& read_image = texture_cache.GetImage(texture_cache.FindImage(read_desc)); - auto& write_image = texture_cache.GetImage(texture_cache.FindImage(write_desc)); - - VideoCore::SubresourceRange sub_range; - sub_range.base.layer = liverpool->regs.depth_view.slice_start; - sub_range.extent.layers = liverpool->regs.depth_view.NumSlices() - sub_range.base.layer; - - ScopeMarkerBegin(fmt::format( - "DepthStencilCopy:DR={:#x}:SR={:#x}:DW={:#x}:SW={:#x}", regs.depth_buffer.DepthAddress(), - regs.depth_buffer.StencilAddress(), regs.depth_buffer.DepthWriteAddress(), - regs.depth_buffer.StencilWriteAddress())); - - read_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, - sub_range); - write_image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, - sub_range); - - auto aspect_mask = vk::ImageAspectFlags(0); - if (is_depth) { - aspect_mask |= vk::ImageAspectFlagBits::eDepth; - } - if (is_stencil) { - aspect_mask |= vk::ImageAspectFlagBits::eStencil; - } - - vk::ImageCopy region = { - .srcSubresource = - { - .aspectMask = aspect_mask, - .mipLevel = 0, - .baseArrayLayer = sub_range.base.layer, - .layerCount = sub_range.extent.layers, - }, - .srcOffset = {0, 0, 0}, - .dstSubresource = - { - .aspectMask = aspect_mask, - .mipLevel = 0, - .baseArrayLayer = sub_range.base.layer, - .layerCount = sub_range.extent.layers, - }, - .dstOffset = {0, 0, 0}, - .extent = {write_image.info.size.width, write_image.info.size.height, 1}, - }; - scheduler.CommandBuffer().copyImage(read_image.GetImage(), vk::ImageLayout::eTransferSrcOptimal, - write_image.GetImage(), - vk::ImageLayout::eTransferDstOptimal, region); - - ScopeMarkerEnd(); -} - -void Rasterizer::FillBuffer(VAddr address, u32 num_bytes, u32 value, bool is_gds) { - buffer_cache.FillBuffer(address, num_bytes, value, is_gds); -} - -void Rasterizer::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool 
src_gds) { - buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds); -} - -u32 Rasterizer::ReadDataFromGds(u32 gds_offset) { - auto* gds_buf = buffer_cache.GetGdsBuffer(); - u32 value; - std::memcpy(&value, gds_buf->mapped_data.data() + gds_offset, sizeof(u32)); - return value; -} - -bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) { - if (!IsMapped(addr, size)) { - // Not GPU mapped memory, can skip invalidation logic entirely. - return false; - } - buffer_cache.InvalidateMemory(addr, size); - texture_cache.InvalidateMemory(addr, size); - return true; -} - -bool Rasterizer::ReadMemory(VAddr addr, u64 size) { - if (!IsMapped(addr, size)) { - // Not GPU mapped memory, can skip invalidation logic entirely. - return false; - } - buffer_cache.ReadMemory(addr, size); - return true; -} - -bool Rasterizer::IsMapped(VAddr addr, u64 size) { - if (size == 0) { - // There is no memory, so not mapped. - return false; - } - if (static_cast(addr) > std::numeric_limits::max() - size) { - // Memory range wrapped the address space, cannot be mapped. - return false; - } - const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); - - Common::RecursiveSharedLock lock{mapped_ranges_mutex}; - return boost::icl::contains(mapped_ranges, range); -} - -void Rasterizer::MapMemory(VAddr addr, u64 size) { - { - std::scoped_lock lock{mapped_ranges_mutex}; - mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); - } - page_manager.OnGpuMap(addr, size); -} - -void Rasterizer::UnmapMemory(VAddr addr, u64 size) { - buffer_cache.InvalidateMemory(addr, size); - texture_cache.UnmapMemory(addr, size); - page_manager.OnGpuUnmap(addr, size); - { - std::scoped_lock lock{mapped_ranges_mutex}; - mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); - } -} - -void Rasterizer::UpdateDynamicState(const GraphicsPipeline* pipeline, const bool is_indexed) const { - UpdateViewportScissorState(); - UpdateDepthStencilState(); - UpdatePrimitiveState(is_indexed); - UpdateRasterizationState(); - UpdateColorBlendingState(pipeline); - - auto& dynamic_state = scheduler.GetDynamicState(); - dynamic_state.Commit(instance, scheduler.CommandBuffer()); -} - -void Rasterizer::UpdateViewportScissorState() const { - const auto& regs = liverpool->regs; - - const auto combined_scissor_value_tl = [](s16 scr, s16 win, s16 gen, s16 win_offset) { - return std::max({scr, s16(win + win_offset), s16(gen + win_offset)}); - }; - const auto combined_scissor_value_br = [](s16 scr, s16 win, s16 gen, s16 win_offset) { - return std::min({scr, s16(win + win_offset), s16(gen + win_offset)}); - }; - const bool enable_offset = !regs.window_scissor.window_offset_disable; - - AmdGpu::Scissor scsr{}; - scsr.top_left_x = combined_scissor_value_tl( - regs.screen_scissor.top_left_x, s16(regs.window_scissor.top_left_x), - s16(regs.generic_scissor.top_left_x), - enable_offset ? regs.window_offset.window_x_offset : 0); - scsr.top_left_y = combined_scissor_value_tl( - regs.screen_scissor.top_left_y, s16(regs.window_scissor.top_left_y), - s16(regs.generic_scissor.top_left_y), - enable_offset ? regs.window_offset.window_y_offset : 0); - scsr.bottom_right_x = combined_scissor_value_br( - regs.screen_scissor.bottom_right_x, regs.window_scissor.bottom_right_x, - regs.generic_scissor.bottom_right_x, - enable_offset ? 
regs.window_offset.window_x_offset : 0); - scsr.bottom_right_y = combined_scissor_value_br( - regs.screen_scissor.bottom_right_y, regs.window_scissor.bottom_right_y, - regs.generic_scissor.bottom_right_y, - enable_offset ? regs.window_offset.window_y_offset : 0); - - boost::container::static_vector viewports; - boost::container::static_vector scissors; - - if (regs.polygon_control.enable_window_offset && - (regs.window_offset.window_x_offset != 0 || regs.window_offset.window_y_offset != 0)) { - LOG_ERROR(Render_Vulkan, - "PA_SU_SC_MODE_CNTL.VTX_WINDOW_OFFSET_ENABLE support is not yet implemented."); - } - - const auto& vp_ctl = regs.viewport_control; - for (u32 i = 0; i < AmdGpu::NUM_VIEWPORTS; i++) { - const auto& vp = regs.viewports[i]; - const auto& vp_d = regs.viewport_depths[i]; - if (vp.xscale == 0) { - continue; - } - - const auto zoffset = vp_ctl.zoffset_enable ? vp.zoffset : 0.f; - const auto zscale = vp_ctl.zscale_enable ? vp.zscale : 1.f; - - vk::Viewport viewport{}; - - // https://gitlab.freedesktop.org/mesa/mesa/-/blob/209a0ed/src/amd/vulkan/radv_pipeline_graphics.c#L688-689 - // https://gitlab.freedesktop.org/mesa/mesa/-/blob/209a0ed/src/amd/vulkan/radv_cmd_buffer.c#L3103-3109 - // When the clip space is ranged [-1...1], the zoffset is centered. - // By reversing the above viewport calculations, we get the following: - if (regs.clipper_control.clip_space == AmdGpu::ClipSpace::MinusWToW) { - viewport.minDepth = zoffset - zscale; - viewport.maxDepth = zoffset + zscale; - } else { - viewport.minDepth = zoffset; - viewport.maxDepth = zoffset + zscale; - } - - if (!instance.IsDepthRangeUnrestrictedSupported()) { - // Unrestricted depth range not supported by device. Restrict to valid range. - viewport.minDepth = std::max(viewport.minDepth, 0.f); - viewport.maxDepth = std::min(viewport.maxDepth, 1.f); - } - - if (regs.IsClipDisabled()) { - // In case if clipping is disabled we patch the shader to convert vertex position - // from screen space coordinates to NDC by defining a render space as full hardware - // window range [0..16383, 0..16383] and setting the viewport to its size. - viewport.x = 0.f; - viewport.y = 0.f; - viewport.width = float(std::min(instance.GetMaxViewportWidth(), 16_KB)); - viewport.height = float(std::min(instance.GetMaxViewportHeight(), 16_KB)); - } else { - const auto xoffset = vp_ctl.xoffset_enable ? vp.xoffset : 0.f; - const auto xscale = vp_ctl.xscale_enable ? vp.xscale : 1.f; - const auto yoffset = vp_ctl.yoffset_enable ? vp.yoffset : 0.f; - const auto yscale = vp_ctl.yscale_enable ? vp.yscale : 1.f; - - viewport.x = xoffset - xscale; - viewport.y = yoffset - yscale; - viewport.width = xscale * 2.0f; - viewport.height = yscale * 2.0f; - } - - viewports.push_back(viewport); - - auto vp_scsr = scsr; - if (regs.mode_control.vport_scissor_enable) { - vp_scsr.top_left_x = - std::max(vp_scsr.top_left_x, s16(regs.viewport_scissors[i].top_left_x)); - vp_scsr.top_left_y = - std::max(vp_scsr.top_left_y, s16(regs.viewport_scissors[i].top_left_y)); - vp_scsr.bottom_right_x = std::min(AmdGpu::Scissor::Clamp(vp_scsr.bottom_right_x), - regs.viewport_scissors[i].bottom_right_x); - vp_scsr.bottom_right_y = std::min(AmdGpu::Scissor::Clamp(vp_scsr.bottom_right_y), - regs.viewport_scissors[i].bottom_right_y); - } - scissors.push_back({ - .offset = {vp_scsr.top_left_x, vp_scsr.top_left_y}, - .extent = {vp_scsr.GetWidth(), vp_scsr.GetHeight()}, - }); - } - - if (viewports.empty()) { - // Vulkan requires providing at least one viewport. 
- constexpr vk::Viewport empty_viewport = { - .x = -1.0f, - .y = -1.0f, - .width = 1.0f, - .height = 1.0f, - .minDepth = 0.0f, - .maxDepth = 1.0f, - }; - constexpr vk::Rect2D empty_scissor = { - .offset = {0, 0}, - .extent = {1, 1}, - }; - viewports.push_back(empty_viewport); - scissors.push_back(empty_scissor); - } - - auto& dynamic_state = scheduler.GetDynamicState(); - dynamic_state.SetViewports(viewports); - dynamic_state.SetScissors(scissors); -} - -void Rasterizer::UpdateDepthStencilState() const { - const auto& regs = liverpool->regs; - auto& dynamic_state = scheduler.GetDynamicState(); - - const auto depth_test_enabled = - regs.depth_control.depth_enable && regs.depth_buffer.DepthValid(); - dynamic_state.SetDepthTestEnabled(depth_test_enabled); - if (depth_test_enabled) { - dynamic_state.SetDepthWriteEnabled(regs.depth_control.depth_write_enable && - !regs.depth_render_control.depth_clear_enable); - dynamic_state.SetDepthCompareOp(LiverpoolToVK::CompareOp(regs.depth_control.depth_func)); - } - - const auto depth_bounds_test_enabled = regs.depth_control.depth_bounds_enable; - dynamic_state.SetDepthBoundsTestEnabled(depth_bounds_test_enabled); - if (depth_bounds_test_enabled) { - dynamic_state.SetDepthBounds(regs.depth_bounds_min, regs.depth_bounds_max); - } - - const auto depth_bias_enabled = regs.polygon_control.NeedsBias(); - dynamic_state.SetDepthBiasEnabled(depth_bias_enabled); - if (depth_bias_enabled) { - const bool front = regs.polygon_control.enable_polygon_offset_front; - dynamic_state.SetDepthBias( - front ? regs.poly_offset.front_offset : regs.poly_offset.back_offset, - regs.poly_offset.depth_bias, - (front ? regs.poly_offset.front_scale : regs.poly_offset.back_scale) / 16.f); - } - - const auto stencil_test_enabled = - regs.depth_control.stencil_enable && regs.depth_buffer.StencilValid(); - dynamic_state.SetStencilTestEnabled(stencil_test_enabled); - if (stencil_test_enabled) { - const StencilOps front_ops{ - .fail_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_fail_front), - .pass_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_zpass_front), - .depth_fail_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_zfail_front), - .compare_op = LiverpoolToVK::CompareOp(regs.depth_control.stencil_ref_func), - }; - const StencilOps back_ops = regs.depth_control.backface_enable ? StencilOps{ - .fail_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_fail_back), - .pass_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_zpass_back), - .depth_fail_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_zfail_back), - .compare_op = LiverpoolToVK::CompareOp(regs.depth_control.stencil_bf_func), - } : front_ops; - dynamic_state.SetStencilOps(front_ops, back_ops); - - const bool stencil_clear = regs.depth_render_control.stencil_clear_enable; - const auto front = regs.stencil_ref_front; - const auto back = - regs.depth_control.backface_enable ? regs.stencil_ref_back : regs.stencil_ref_front; - dynamic_state.SetStencilReferences(front.stencil_test_val, back.stencil_test_val); - dynamic_state.SetStencilWriteMasks(!stencil_clear ? front.stencil_write_mask : 0U, - !stencil_clear ? 
back.stencil_write_mask : 0U); - dynamic_state.SetStencilCompareMasks(front.stencil_mask, back.stencil_mask); - } -} - -void Rasterizer::UpdatePrimitiveState(const bool is_indexed) const { - const auto& regs = liverpool->regs; - auto& dynamic_state = scheduler.GetDynamicState(); - - const auto prim_restart = (regs.enable_primitive_restart & 1) != 0; - ASSERT_MSG(!is_indexed || !prim_restart || regs.primitive_restart_index == 0xFFFF || - regs.primitive_restart_index == 0xFFFFFFFF, - "Primitive restart index other than -1 is not supported yet"); - - const auto cull_mode = LiverpoolToVK::IsPrimitiveCulled(regs.primitive_type) - ? LiverpoolToVK::CullMode(regs.polygon_control.CullingMode()) - : vk::CullModeFlagBits::eNone; - const auto front_face = LiverpoolToVK::FrontFace(regs.polygon_control.front_face); - - dynamic_state.SetPrimitiveRestartEnabled(prim_restart); - dynamic_state.SetRasterizerDiscardEnabled(regs.clipper_control.dx_rasterization_kill); - dynamic_state.SetCullMode(cull_mode); - dynamic_state.SetFrontFace(front_face); -} - -void Rasterizer::UpdateRasterizationState() const { - const auto& regs = liverpool->regs; - auto& dynamic_state = scheduler.GetDynamicState(); - dynamic_state.SetLineWidth(regs.line_control.Width()); -} - -void Rasterizer::UpdateColorBlendingState(const GraphicsPipeline* pipeline) const { - const auto& regs = liverpool->regs; - auto& dynamic_state = scheduler.GetDynamicState(); - dynamic_state.SetBlendConstants(regs.blend_constants); - dynamic_state.SetColorWriteMasks(pipeline->GetGraphicsKey().write_masks); - dynamic_state.SetAttachmentFeedbackLoopEnabled(attachment_feedback_loop); -} - -void Rasterizer::ScopeMarkerBegin(const std::string_view& str, bool from_guest) { - if ((from_guest && !Config::getVkGuestMarkersEnabled()) || - (!from_guest && !Config::getVkHostMarkersEnabled())) { - return; - } - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.beginDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ - .pLabelName = str.data(), - }); -} - -void Rasterizer::ScopeMarkerEnd(bool from_guest) { - if ((from_guest && !Config::getVkGuestMarkersEnabled()) || - (!from_guest && !Config::getVkHostMarkersEnabled())) { - return; - } - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.endDebugUtilsLabelEXT(); -} - -void Rasterizer::ScopedMarkerInsert(const std::string_view& str, bool from_guest) { - if ((from_guest && !Config::getVkGuestMarkersEnabled()) || - (!from_guest && !Config::getVkHostMarkersEnabled())) { - return; - } - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.insertDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ - .pLabelName = str.data(), - }); -} - -void Rasterizer::ScopedMarkerInsertColor(const std::string_view& str, const u32 color, - bool from_guest) { - if ((from_guest && !Config::getVkGuestMarkersEnabled()) || - (!from_guest && !Config::getVkHostMarkersEnabled())) { - return; - } - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.insertDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ - .pLabelName = str.data(), - .color = std::array( - {(f32)((color >> 16) & 0xff) / 255.0f, (f32)((color >> 8) & 0xff) / 255.0f, - (f32)(color & 0xff) / 255.0f, (f32)((color >> 24) & 0xff) / 255.0f})}); -} - -} // namespace Vulkan +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/assert.h" +#include "common/config.h" +#include "common/debug.h" +#include "core/memory.h" +#include "shader_recompiler/runtime_info.h" +#include "video_core/amdgpu/liverpool.h" +#include 
"video_core/renderer_vulkan/liverpool_to_vk.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_shader_hle.h" +#include "video_core/texture_cache/image_view.h" +#include "video_core/texture_cache/texture_cache.h" + +#ifdef MemoryBarrier +#undef MemoryBarrier +#endif + +namespace Vulkan { + +static Shader::PushData MakeUserData(const AmdGpu::Regs& regs) { + // TODO(roamic): Add support for multiple viewports and geometry shaders when ViewportIndex + // is encountered and implemented in the recompiler. + Shader::PushData push_data{}; + push_data.xoffset = regs.viewport_control.xoffset_enable ? regs.viewports[0].xoffset : 0.f; + push_data.xscale = regs.viewport_control.xscale_enable ? regs.viewports[0].xscale : 1.f; + push_data.yoffset = regs.viewport_control.yoffset_enable ? regs.viewports[0].yoffset : 0.f; + push_data.yscale = regs.viewport_control.yscale_enable ? regs.viewports[0].yscale : 1.f; + return push_data; +} + +Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, + AmdGpu::Liverpool* liverpool_) + : instance{instance_}, scheduler{scheduler_}, page_manager{this}, + buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager}, + texture_cache{instance, scheduler, liverpool_, buffer_cache, page_manager}, + liverpool{liverpool_}, memory{Core::Memory::Instance()}, + pipeline_cache{instance, scheduler, liverpool} { + if (!Config::nullGpu()) { + liverpool->BindRasterizer(this); + } + memory->SetRasterizer(this); + + // Initialize async compute scheduler if dedicated compute queue is available + if (instance.HasDedicatedComputeQueue()) { + compute_scheduler = std::make_unique(instance); + + // Create compute-specific descriptor heap with compute scheduler's semaphore + static constexpr std::array ComputeDescriptorHeapSizes = { + vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 512}, + vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 8192}, + vk::DescriptorPoolSize{vk::DescriptorType::eSampledImage, 8192}, + vk::DescriptorPoolSize{vk::DescriptorType::eStorageImage, 1024}, + vk::DescriptorPoolSize{vk::DescriptorType::eSampler, 1024}, + }; + compute_desc_heap = std::make_unique( + instance, compute_scheduler->GetMasterSemaphore(), + ComputeDescriptorHeapSizes, 1024); + } +} + +Rasterizer::~Rasterizer() = default; + +void Rasterizer::CpSync() { + scheduler.EndRendering(); + + // ASYNC COMPUTE SYNC: + // Ensure pending compute work is submitted and graphics queue waits for it. 
+ if (compute_scheduler && compute_scheduler->IsDedicated()) { + compute_scheduler->Flush(); + + const auto compute_sem = compute_scheduler->GetMasterSemaphore()->Handle(); + // After Flush, CurrentTick is N+1, but Flush signaled N, so wait for N + const auto compute_tick = compute_scheduler->CurrentTick() - 1; + + ASSERT_MSG(compute_tick > 0, "Invalid compute tick {} in CpSync", compute_tick); + scheduler.Wait(compute_sem, compute_tick); + } + + auto cmdbuf = scheduler.CommandBuffer(); + + const vk::MemoryBarrier ib_barrier{ + .srcAccessMask = vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eMemoryWrite | + vk::AccessFlagBits::eTransferWrite, + .dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead | + vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eMemoryRead, + }; + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, + vk::PipelineStageFlagBits::eAllCommands, + vk::DependencyFlagBits::eByRegion, ib_barrier, {}, {}); +} + +bool Rasterizer::FilterDraw() { + const auto& regs = liverpool->regs; + if (regs.color_control.mode == AmdGpu::ColorControl::OperationMode::EliminateFastClear) { + // Clears the render target if FCE is launched before any draws + EliminateFastClear(); + return false; + } + if (regs.color_control.mode == AmdGpu::ColorControl::OperationMode::FmaskDecompress) { + // TODO: check for a valid MRT1 to promote the draw to the resolve pass. + LOG_TRACE(Render_Vulkan, "FMask decompression pass skipped"); + ScopedMarkerInsert("FmaskDecompress"); + return false; + } + if (regs.color_control.mode == AmdGpu::ColorControl::OperationMode::Resolve) { + LOG_TRACE(Render_Vulkan, "Resolve pass"); + Resolve(); + return false; + } + if (regs.primitive_type == AmdGpu::PrimitiveType::None) { + LOG_TRACE(Render_Vulkan, "Primitive type 'None' skipped"); + ScopedMarkerInsert("PrimitiveTypeNone"); + return false; + } + + const bool cb_disabled = + regs.color_control.mode == AmdGpu::ColorControl::OperationMode::Disable; + const auto depth_copy = + regs.depth_render_override.force_z_dirty && regs.depth_render_override.force_z_valid && + regs.depth_buffer.DepthValid() && regs.depth_buffer.DepthWriteValid() && + regs.depth_buffer.DepthAddress() != regs.depth_buffer.DepthWriteAddress(); + const auto stencil_copy = + regs.depth_render_override.force_stencil_dirty && + regs.depth_render_override.force_stencil_valid && regs.depth_buffer.StencilValid() && + regs.depth_buffer.StencilWriteValid() && + regs.depth_buffer.StencilAddress() != regs.depth_buffer.StencilWriteAddress(); + if (cb_disabled && (depth_copy || stencil_copy)) { + // Games may disable color buffer and enable force depth/stencil dirty and valid to + // do a copy from one depth-stencil surface to another, without a pixel shader. + // We need to detect this case and perform the copy, otherwise it will have no effect. + LOG_TRACE(Render_Vulkan, "Performing depth-stencil override copy"); + DepthStencilCopy(depth_copy, stencil_copy); + return false; + } + + return true; +} + +void Rasterizer::PrepareRenderState(const GraphicsPipeline* pipeline) { + // Prefetch render targets to handle overlaps with bound textures (e.g. 
mipgen) + const auto& key = pipeline->GetGraphicsKey(); + const auto& regs = liverpool->regs; + if (regs.color_control.degamma_enable) { + LOG_WARNING(Render_Vulkan, "Color buffers require gamma correction"); + } + + const bool skip_cb_binding = + regs.color_control.mode == AmdGpu::ColorControl::OperationMode::Disable; + for (s32 cb = 0; cb < std::bit_width(key.mrt_mask); ++cb) { + auto& [image_id, desc] = cb_descs[cb]; + const auto& col_buf = regs.color_buffers[cb]; + const u32 target_mask = regs.color_target_mask.GetMask(cb); + if (skip_cb_binding || !col_buf || !target_mask || (key.mrt_mask & (1 << cb)) == 0) { + image_id = {}; + continue; + } + const auto& hint = liverpool->last_cb_extent[cb]; + std::construct_at(&desc, col_buf, hint); + image_id = bound_images.emplace_back(texture_cache.FindImage(desc)); + auto& image = texture_cache.GetImage(image_id); + image.binding.is_target = 1u; + } + + if ((regs.depth_control.depth_enable && regs.depth_buffer.DepthValid()) || + (regs.depth_control.stencil_enable && regs.depth_buffer.StencilValid())) { + const auto htile_address = regs.depth_htile_data_base.GetAddress(); + const auto& hint = liverpool->last_db_extent; + auto& [image_id, desc] = db_desc; + std::construct_at(&desc, regs.depth_buffer, regs.depth_view, regs.depth_control, + htile_address, hint); + image_id = bound_images.emplace_back(texture_cache.FindImage(desc)); + auto& image = texture_cache.GetImage(image_id); + image.binding.is_target = 1u; + } else { + db_desc.first = {}; + } +} + +static std::pair GetDrawOffsets( + const AmdGpu::Regs& regs, const Shader::Info& info, + const std::optional& fetch_shader) { + u32 vertex_offset = regs.index_offset; + u32 instance_offset = 0; + if (fetch_shader) { + if (vertex_offset == 0 && fetch_shader->vertex_offset_sgpr != -1) { + vertex_offset = info.user_data[fetch_shader->vertex_offset_sgpr]; + } + if (fetch_shader->instance_offset_sgpr != -1) { + instance_offset = info.user_data[fetch_shader->instance_offset_sgpr]; + } + } + return {vertex_offset, instance_offset}; +} + +void Rasterizer::EliminateFastClear() { + auto& col_buf = liverpool->regs.color_buffers[0]; + if (!col_buf || !col_buf.info.fast_clear) { + return; + } + VideoCore::TextureCache::ImageDesc desc(col_buf, liverpool->last_cb_extent[0]); + const auto image_id = texture_cache.FindImage(desc); + const auto& image_view = texture_cache.FindRenderTarget(image_id, desc); + if (!texture_cache.IsMetaCleared(col_buf.CmaskAddress(), col_buf.view.slice_start)) { + return; + } + for (u32 slice = col_buf.view.slice_start; slice <= col_buf.view.slice_max; ++slice) { + texture_cache.TouchMeta(col_buf.CmaskAddress(), slice, false); + } + auto& image = texture_cache.GetImage(image_id); + const auto clear_value = LiverpoolToVK::ColorBufferClearValue(col_buf); + + ScopeMarkerBegin(fmt::format("EliminateFastClear:MRT={:#x}:M={:#x}", col_buf.Address(), + col_buf.CmaskAddress())); + image.Clear(clear_value, desc.view_info.range); + ScopeMarkerEnd(); +} + +void Rasterizer::Draw(bool is_indexed, u32 index_offset) { + RENDERER_TRACE; + + scheduler.PopPendingOperations(); + + // Sync with async compute before drawing - compute results may be needed + if (compute_scheduler && compute_scheduler->IsDedicated()) { + compute_scheduler->SignalGraphics(scheduler); + } + + if (!FilterDraw()) { + return; + } + + const auto& regs = liverpool->regs; + const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); + if (!pipeline) { + return; + } + + PrepareRenderState(pipeline); + if 
(!BindResources(pipeline)) { + return; + } + const auto state = BeginRendering(pipeline); + + buffer_cache.BindVertexBuffers(*pipeline); + if (is_indexed) { + buffer_cache.BindIndexBuffer(index_offset); + } + + pipeline->BindResources(set_writes, buffer_barriers, push_data); + UpdateDynamicState(pipeline, is_indexed); + scheduler.BeginRendering(state); + + const auto& vs_info = pipeline->GetStage(Shader::LogicalStage::Vertex); + const auto& fetch_shader = pipeline->GetFetchShader(); + const auto [vertex_offset, instance_offset] = GetDrawOffsets(regs, vs_info, fetch_shader); + + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle()); + + if (is_indexed) { + cmdbuf.drawIndexed(regs.num_indices, regs.num_instances.NumInstances(), 0, + s32(vertex_offset), instance_offset); + } else { + cmdbuf.draw(regs.num_indices, regs.num_instances.NumInstances(), vertex_offset, + instance_offset); + } + + ResetBindings(); +} + +void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 stride, + u32 max_count, VAddr count_address) { + RENDERER_TRACE; + + scheduler.PopPendingOperations(); + + // Sync with async compute before drawing - compute results may be needed + if (compute_scheduler && compute_scheduler->IsDedicated()) { + if (compute_scheduler->HasPendingWork()) { + compute_scheduler->SignalGraphics(scheduler); + // Flush graphics to ensure the NEXT draw (this one) waits for compute + scheduler.Flush(); + } + } + + if (!FilterDraw()) { + return; + } + + const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); + if (!pipeline) { + return; + } + + PrepareRenderState(pipeline); + if (!BindResources(pipeline)) { + return; + } + const auto state = BeginRendering(pipeline); + + buffer_cache.BindVertexBuffers(*pipeline); + if (is_indexed) { + buffer_cache.BindIndexBuffer(0); + } + + const auto& [buffer, base] = + buffer_cache.ObtainBuffer(arg_address + offset, stride * max_count, false); + + VideoCore::Buffer* count_buffer{}; + u32 count_base{}; + if (count_address != 0) { + std::tie(count_buffer, count_base) = buffer_cache.ObtainBuffer(count_address, 4, false); + } + + pipeline->BindResources(set_writes, buffer_barriers, push_data); + UpdateDynamicState(pipeline, is_indexed); + scheduler.BeginRendering(state); + + // We can safely ignore both SGPR UD indices and results of fetch shader parsing, as vertex and + // instance offsets will be automatically applied by Vulkan from indirect args buffer. 
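+    // Illustrative only: the guest-provided args layout that Vulkan consumes directly.
+    // Field names here are descriptive, but the layout matches VkDrawIndirectCommand,
+    // which the stride ASSERTs below rely on (the indexed variant,
+    // VkDrawIndexedIndirectCommand, carries firstIndex/vertexOffset analogously).
+    struct IndirectArgs {
+        u32 vertex_count;
+        u32 instance_count;
+        u32 first_vertex;   // vertex offset, applied by the GPU per draw
+        u32 first_instance; // instance offset, applied by the GPU per draw
+    };
+    static_assert(sizeof(IndirectArgs) == sizeof(VkDrawIndirectCommand));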
+
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle());
+
+    if (is_indexed) {
+        ASSERT(sizeof(VkDrawIndexedIndirectCommand) == stride);
+
+        if (count_address != 0) {
+            cmdbuf.drawIndexedIndirectCount(buffer->Handle(), base, count_buffer->Handle(),
+                                            count_base, max_count, stride);
+        } else {
+            cmdbuf.drawIndexedIndirect(buffer->Handle(), base, max_count, stride);
+        }
+    } else {
+        ASSERT(sizeof(VkDrawIndirectCommand) == stride);
+
+        if (count_address != 0) {
+            cmdbuf.drawIndirectCount(buffer->Handle(), base, count_buffer->Handle(), count_base,
+                                     max_count, stride);
+        } else {
+            cmdbuf.drawIndirect(buffer->Handle(), base, max_count, stride);
+        }
+    }
+
+    ResetBindings();
+}
+
+void Rasterizer::DispatchDirect() {
+    RENDERER_TRACE;
+
+    const auto& cs_program = liverpool->GetCsRegs();
+    const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline();
+    if (!pipeline) {
+        return;
+    }
+
+    const auto& cs = pipeline->GetStage(Shader::LogicalStage::Compute);
+    if (ExecuteShaderHLE(cs, liverpool->regs, cs_program, *this)) {
+        return;
+    }
+
+    if (!BindResources(pipeline)) {
+        return;
+    }
+
+    // Respect the user-facing async compute toggle
+    const bool enable_async_compute = Config::asyncCompute();
+
+    // Use async compute queue if available
+    if (enable_async_compute && compute_scheduler && compute_scheduler->IsDedicated() &&
+        compute_desc_heap) {
+        // Process pending operations on both schedulers
+        scheduler.PopPendingOperations();
+        compute_scheduler->PopPendingOperations();
+
+        // End any active rendering before compute uses shared resources
+        scheduler.EndRendering();
+
+        // WAR hazard: ensure graphics has finished writing before compute reads
+        compute_scheduler->WaitForGraphics(scheduler);
+
+        // Get the compute command buffer and bind resources using the compute descriptor heap
+        const auto cmdbuf = compute_scheduler->CommandBuffer();
+        pipeline->BindResources(cmdbuf, set_writes, buffer_barriers, push_data,
+                                *compute_desc_heap);
+        cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
+        cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z);
+
+        // Mark that compute work is pending, but don't flush yet to allow batching.
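+        // E.g. for a hypothetical stream of back-to-back dispatches:
+        //   DispatchDirect();  // records into the compute cmdbuf, marks pending
+        //   DispatchDirect();  // same cmdbuf, still no submit
+        //   Draw();            // SignalGraphics() -> one Flush(), one signal
+        // so N consecutive dispatches cost one vkQueueSubmit instead of N.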
+        compute_scheduler->MarkPendingWork();
+    } else {
+        // Fallback to graphics queue
+        scheduler.PopPendingOperations();
+        scheduler.EndRendering();
+        pipeline->BindResources(set_writes, buffer_barriers, push_data);
+
+        const auto cmdbuf = scheduler.CommandBuffer();
+        cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
+        cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z);
+    }
+
+    ResetBindings();
+}
+
+void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) {
+    RENDERER_TRACE;
+
+    const auto& cs_program = liverpool->GetCsRegs();
+    const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline();
+    if (!pipeline) {
+        return;
+    }
+
+    if (!BindResources(pipeline)) {
+        return;
+    }
+
+    const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size, false);
+
+    // Respect the user-facing async compute toggle
+    const bool enable_async_compute = Config::asyncCompute();
+
+    // Use async compute queue if available
+    if (enable_async_compute && compute_scheduler && compute_scheduler->IsDedicated() &&
+        compute_desc_heap) {
+        // Process pending operations on both schedulers
+        scheduler.PopPendingOperations();
+        compute_scheduler->PopPendingOperations();
+
+        // End any active rendering before compute uses shared resources
+        scheduler.EndRendering();
+
+        // WAR hazard: ensure graphics has finished writing before compute reads
+        compute_scheduler->WaitForGraphics(scheduler);
+
+        // Get the compute command buffer and bind resources using the compute descriptor heap
+        const auto cmdbuf = compute_scheduler->CommandBuffer();
+        pipeline->BindResources(cmdbuf, set_writes, buffer_barriers, push_data,
+                                *compute_desc_heap);
+        cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
+        cmdbuf.dispatchIndirect(buffer->Handle(), base);
+
+        // Indirect dispatches are not batched: SignalGraphics flushes the pending
+        // compute work immediately and makes graphics wait for it.
+        compute_scheduler->MarkPendingWork();
+        compute_scheduler->SignalGraphics(scheduler);
+    } else {
+        // Fallback to graphics queue
+        scheduler.PopPendingOperations();
+        scheduler.EndRendering();
+        pipeline->BindResources(set_writes, buffer_barriers, push_data);
+
+        const auto cmdbuf = scheduler.CommandBuffer();
+        cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
+        cmdbuf.dispatchIndirect(buffer->Handle(), base);
+    }
+
+    ResetBindings();
+}
+
+u64 Rasterizer::Flush() {
+    const u64 current_tick = scheduler.CurrentTick();
+    SubmitInfo info{};
+    scheduler.Flush(info);
+    return current_tick;
+}
+
+void Rasterizer::Finish() {
+    scheduler.Finish();
+}
+
+void Rasterizer::SyncComputeForPresent() {
+    // Ensure all pending compute work is synced with graphics before presentation
+    if (compute_scheduler && compute_scheduler->IsDedicated()) {
+        // SignalGraphics will flush compute and make graphics wait for it
+        compute_scheduler->SignalGraphics(scheduler);
+    }
+}
+
+void Rasterizer::OnSubmit() {
+    if (fault_process_pending) {
+        fault_process_pending = false;
+        buffer_cache.ProcessFaultBuffer();
+    }
+    texture_cache.ProcessDownloadImages();
+    texture_cache.RunGarbageCollector();
+    buffer_cache.RunGarbageCollector();
+}
+
+bool Rasterizer::BindResources(const Pipeline* pipeline) {
+    if (IsComputeImageCopy(pipeline) || IsComputeMetaClear(pipeline) ||
+        IsComputeImageClear(pipeline)) {
+        return false;
+    }
+
+    set_writes.clear();
+    buffer_barriers.clear();
+    buffer_infos.clear();
+    image_infos.clear();
+
+    bool uses_dma = false;
+
+    // Bind resource buffers and textures.
+    Shader::Backend::Bindings binding{};
+    push_data = MakeUserData(liverpool->regs);
+    for (const auto* stage : pipeline->GetStages()) {
+        if (!stage) {
+            continue;
+        }
+        stage->PushUd(binding, push_data);
+        BindBuffers(*stage, binding, push_data);
+        BindTextures(*stage, binding);
+        uses_dma |= stage->uses_dma;
+    }
+
+    if (uses_dma) {
+        // We only use the fault buffer for DMA right now.
+        Common::RecursiveSharedLock lock{mapped_ranges_mutex};
+        for (auto& range : mapped_ranges) {
+            buffer_cache.SynchronizeBuffersInRange(range.lower(), range.upper() - range.lower());
+        }
+        fault_process_pending = true;
+    }
+
+    return true;
+}
+
+bool Rasterizer::IsComputeMetaClear(const Pipeline* pipeline) {
+    if (!pipeline->IsCompute()) {
+        return false;
+    }
+
+    // Most of the time, when metadata is updated by a shader, it is being cleared. This means
+    // we can skip the whole dispatch and update the tracked state instead. Metadata is also not
+    // intended to be consumed by shaders, and the rare cases that do read it (e.g. HTile
+    // introspection, CRAA) will need full emulation anyway.
+    const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute);
+
+    // Assume if a shader reads metadata, it is a copy shader.
+    for (const auto& desc : info.buffers) {
+        const VAddr address = desc.GetSharp(info).base_address;
+        if (!desc.IsSpecial() && !desc.is_written && texture_cache.IsMeta(address)) {
+            return false;
+        }
+    }
+
+    // Metadata surfaces are tiled and thus need address calculation to be written properly.
+    // If a shader wants to encode HTILE, for example, from a depth image it will have to compute
+    // the proper tile address from the dispatch invocation id. This address calculation contains
+    // an xor operation, so use it as a heuristic for metadata writes that are probably not clears.
+    if (!info.has_bitwise_xor) {
+        // Assume if a shader writes metadata without address calculation, it is a clear shader.
+        for (const auto& desc : info.buffers) {
+            const VAddr address = desc.GetSharp(info).base_address;
+            if (!desc.IsSpecial() && desc.is_written && texture_cache.ClearMeta(address)) {
+                // Assume all slices were updated
+                LOG_TRACE(Render_Vulkan, "Metadata update skipped");
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+bool Rasterizer::IsComputeImageCopy(const Pipeline* pipeline) {
+    if (!pipeline->IsCompute()) {
+        return false;
+    }
+
+    // Ensure the shader only has 2 bound buffers
+    const auto& cs_pgm = liverpool->GetCsRegs();
+    const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute);
+    if (cs_pgm.num_thread_x.full != 64 || info.buffers.size() != 2 || !info.images.empty()) {
+        return false;
+    }
+
+    // Both buffers must be formatted; one must be the source and the other the destination.
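+    // Size heuristic used below, spelled out: 64 threads per group, each copying one
+    // dword, move 64 * 4 = 256 bytes per group, so a straight buffer-to-buffer copy
+    // shader dispatches exactly GetSize() / 256 groups.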
+ const auto& desc0 = info.buffers[0]; + const auto& desc1 = info.buffers[1]; + if (!desc0.is_formatted || !desc1.is_formatted || desc0.is_written == desc1.is_written) { + return false; + } + + // Buffers must have the same size and each thread of the dispatch must copy 1 dword of data + const AmdGpu::Buffer buf0 = desc0.GetSharp(info); + const AmdGpu::Buffer buf1 = desc1.GetSharp(info); + if (buf0.GetSize() != buf1.GetSize() || cs_pgm.dim_x != (buf0.GetSize() / 256)) { + return false; + } + + // Find images the buffer alias + const auto image0_id = texture_cache.FindImageFromRange(buf0.base_address, buf0.GetSize()); + if (!image0_id) { + return false; + } + const auto image1_id = + texture_cache.FindImageFromRange(buf1.base_address, buf1.GetSize(), false); + if (!image1_id) { + return false; + } + + // Image copy must be valid + VideoCore::Image& image0 = texture_cache.GetImage(image0_id); + VideoCore::Image& image1 = texture_cache.GetImage(image1_id); + if (image0.info.guest_size != image1.info.guest_size || + image0.info.pitch != image1.info.pitch || image0.info.guest_size != buf0.GetSize() || + image0.info.num_bits != image1.info.num_bits) { + return false; + } + + // Perform image copy + VideoCore::Image& src_image = desc0.is_written ? image1 : image0; + VideoCore::Image& dst_image = desc0.is_written ? image0 : image1; + if (instance.IsMaintenance8Supported() || + src_image.info.props.is_depth == dst_image.info.props.is_depth) { + dst_image.CopyImage(src_image); + } else { + const auto& copy_buffer = + buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::DeviceLocal); + dst_image.CopyImageWithBuffer(src_image, copy_buffer.Handle(), 0); + } + dst_image.flags |= VideoCore::ImageFlagBits::GpuModified; + dst_image.flags &= ~VideoCore::ImageFlagBits::Dirty; + return true; +} + +bool Rasterizer::IsComputeImageClear(const Pipeline* pipeline) { + if (!pipeline->IsCompute()) { + return false; + } + + // Ensure shader only has 2 bound buffers + const auto& cs_pgm = liverpool->GetCsRegs(); + const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute); + if (cs_pgm.num_thread_x.full != 64 || info.buffers.size() != 2 || !info.images.empty()) { + return false; + } + + // From those 2 buffers, first must hold the clear vector and second the image being cleared + const auto& desc0 = info.buffers[0]; + const auto& desc1 = info.buffers[1]; + if (desc0.is_formatted || !desc1.is_formatted || desc0.is_written || !desc1.is_written) { + return false; + } + + // First buffer must have size of vec4 and second the size of a single layer + const AmdGpu::Buffer buf0 = desc0.GetSharp(info); + const AmdGpu::Buffer buf1 = desc1.GetSharp(info); + const u32 buf1_bpp = AmdGpu::NumBitsPerBlock(buf1.GetDataFmt()); + if (buf0.GetSize() != 16 || (cs_pgm.dim_x * 128ULL * (buf1_bpp / 8)) != buf1.GetSize()) { + return false; + } + + // Find image the buffer alias + const auto image1_id = + texture_cache.FindImageFromRange(buf1.base_address, buf1.GetSize(), false); + if (!image1_id) { + return false; + } + + // Image clear must be valid + VideoCore::Image& image1 = texture_cache.GetImage(image1_id); + if (image1.info.guest_size != buf1.GetSize() || image1.info.num_bits != buf1_bpp || + image1.info.props.is_depth) { + return false; + } + + // Perform image clear + const float* values = reinterpret_cast(buf0.base_address); + const vk::ClearValue clear = { + .color = {.float32 = std::array{values[0], values[1], values[2], values[3]}}, + }; + const VideoCore::SubresourceRange range = { + .base = + { + .level = 0, + 
.layer = 0, + }, + .extent = image1.info.resources, + }; + image1.Clear(clear, range); + image1.flags |= VideoCore::ImageFlagBits::GpuModified; + image1.flags &= ~VideoCore::ImageFlagBits::Dirty; + return true; +} + +void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings& binding, + Shader::PushData& push_data) { + buffer_bindings.clear(); + + for (const auto& desc : stage.buffers) { + const auto vsharp = desc.GetSharp(stage); + if (!desc.IsSpecial() && vsharp.base_address != 0 && vsharp.GetSize() > 0) { + const u64 size = memory->ClampRangeSize(vsharp.base_address, vsharp.GetSize()); + const auto buffer_id = buffer_cache.FindBuffer(vsharp.base_address, size); + buffer_bindings.emplace_back(buffer_id, vsharp, size); + } else { + buffer_bindings.emplace_back(VideoCore::BufferId{}, vsharp, 0); + } + } + + // Second pass to re-bind buffers that were updated after binding + for (u32 i = 0; i < buffer_bindings.size(); i++) { + const auto& [buffer_id, vsharp, size] = buffer_bindings[i]; + const auto& desc = stage.buffers[i]; + const bool is_storage = desc.IsStorage(vsharp); + const u32 alignment = + is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); + // Buffer is not from the cache, either a special buffer or unbound. + if (!buffer_id) { + if (desc.buffer_type == Shader::BufferType::GdsBuffer) { + const auto* gds_buf = buffer_cache.GetGdsBuffer(); + buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes()); + } else if (desc.buffer_type == Shader::BufferType::Flatbuf) { + auto& vk_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream); + const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32); + const u64 offset = + vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size, alignment); + buffer_infos.emplace_back(vk_buffer.Handle(), offset, ubo_size); + } else if (desc.buffer_type == Shader::BufferType::BdaPagetable) { + const auto* bda_buffer = buffer_cache.GetBdaPageTableBuffer(); + buffer_infos.emplace_back(bda_buffer->Handle(), 0, bda_buffer->SizeBytes()); + } else if (desc.buffer_type == Shader::BufferType::FaultBuffer) { + const auto* fault_buffer = buffer_cache.GetFaultBuffer(); + buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes()); + } else if (desc.buffer_type == Shader::BufferType::SharedMemory) { + auto& lds_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream); + const auto& cs_program = liverpool->GetCsRegs(); + const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups(); + const auto [data, offset] = lds_buffer.Map(lds_size, alignment); + std::memset(data, 0, lds_size); + buffer_infos.emplace_back(lds_buffer.Handle(), offset, lds_size); + } else if (instance.IsNullDescriptorSupported()) { + buffer_infos.emplace_back(VK_NULL_HANDLE, 0, VK_WHOLE_SIZE); + } else { + auto& null_buffer = buffer_cache.GetBuffer(VideoCore::NULL_BUFFER_ID); + buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE); + } + } else { + const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer( + vsharp.base_address, size, desc.is_written, desc.is_formatted, buffer_id); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + ASSERT(adjust % 4 == 0); + push_data.AddOffset(binding.buffer, adjust); + buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust); + if (auto barrier = + vk_buffer->GetBarrier(desc.is_written ? 
vk::AccessFlagBits2::eShaderWrite + : vk::AccessFlagBits2::eShaderRead, + vk::PipelineStageFlagBits2::eAllCommands)) { + buffer_barriers.emplace_back(*barrier); + } + if (desc.is_written && desc.is_formatted) { + texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, size); + } + } + + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = is_storage ? vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &buffer_infos.back(), + }); + ++binding.buffer; + } +} + +void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding) { + image_bindings.clear(); + + for (const auto& image_desc : stage.images) { + const auto tsharp = image_desc.GetSharp(stage); + if (texture_cache.IsMeta(tsharp.Address())) { + LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a shader (texture)"); + } + + if (tsharp.GetDataFmt() == AmdGpu::DataFormat::FormatInvalid) { + image_bindings.emplace_back(std::piecewise_construct, std::tuple{}, std::tuple{}); + continue; + } + + auto& [image_id, desc] = image_bindings.emplace_back(std::piecewise_construct, std::tuple{}, + std::tuple{tsharp, image_desc}); + image_id = texture_cache.FindImage(desc); + auto* image = &texture_cache.GetImage(image_id); + if (image->depth_id) { + // If this image has an associated depth image, it's a stencil attachment. + // Redirect the access to the actual depth-stencil buffer. + image_id = image->depth_id; + image = &texture_cache.GetImage(image_id); + } + if (image->binding.is_bound) { + // The image is already bound. In case if it is about to be used as storage we need + // to force general layout on it. + image->binding.force_general |= image_desc.is_written; + } + image->binding.is_bound = 1u; + } + + // Second pass to re-bind images that were updated after binding + for (auto& [image_id, desc] : image_bindings) { + bool is_storage = desc.type == VideoCore::TextureCache::BindingType::Storage; + if (!image_id) { + if (instance.IsNullDescriptorSupported()) { + image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); + } else { + auto& null_image_view = texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc); + image_infos.emplace_back(VK_NULL_HANDLE, *null_image_view.image_view, + vk::ImageLayout::eGeneral); + } + } else { + if (auto& old_image = texture_cache.GetImage(image_id); + old_image.binding.needs_rebind) { + old_image.binding = {}; + image_id = texture_cache.FindImage(desc); + } + + bound_images.emplace_back(image_id); + + auto& image = texture_cache.GetImage(image_id); + auto& image_view = texture_cache.FindTexture(image_id, desc); + + // The image is either bound as storage in a separate descriptor or bound as render + // target in feedback loop. Depth images are excluded because they can't be bound as + // storage and feedback loop doesn't make sense for them + if ((image.binding.force_general || image.binding.is_target) && + !image.info.props.is_depth) { + image.Transit(instance.IsAttachmentFeedbackLoopLayoutSupported() && + image.binding.is_target + ? vk::ImageLayout::eAttachmentFeedbackLoopOptimalEXT + : vk::ImageLayout::eGeneral, + vk::AccessFlagBits2::eShaderRead | + (image.info.props.is_depth + ? 
vk::AccessFlagBits2::eDepthStencilAttachmentWrite + : vk::AccessFlagBits2::eColorAttachmentWrite), + {}); + } else { + if (is_storage) { + image.Transit(vk::ImageLayout::eGeneral, + vk::AccessFlagBits2::eShaderRead | + vk::AccessFlagBits2::eShaderWrite, + desc.view_info.range); + } else { + const auto new_layout = image.info.props.is_depth + ? vk::ImageLayout::eDepthStencilReadOnlyOptimal + : vk::ImageLayout::eShaderReadOnlyOptimal; + image.Transit(new_layout, vk::AccessFlagBits2::eShaderRead, + desc.view_info.range); + } + } + image.usage.storage |= is_storage; + image.usage.texture |= !is_storage; + + image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, + image.backing->state.layout); + } + + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = + is_storage ? vk::DescriptorType::eStorageImage : vk::DescriptorType::eSampledImage, + .pImageInfo = &image_infos.back(), + }); + } + + for (const auto& sampler : stage.samplers) { + auto ssharp = sampler.GetSharp(stage); + if (sampler.disable_aniso) { + const auto& tsharp = stage.images[sampler.associated_image].GetSharp(stage); + if (tsharp.base_level == 0 && tsharp.last_level == 0) { + ssharp.max_aniso.Assign(AmdGpu::AnisoRatio::One); + } + } + const auto vk_sampler = texture_cache.GetSampler(ssharp, liverpool->regs.ta_bc_base); + image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eSampler, + .pImageInfo = &image_infos.back(), + }); + } +} + +RenderState Rasterizer::BeginRendering(const GraphicsPipeline* pipeline) { + attachment_feedback_loop = false; + const auto& regs = liverpool->regs; + const auto& key = pipeline->GetGraphicsKey(); + RenderState state; + state.width = instance.GetMaxFramebufferWidth(); + state.height = instance.GetMaxFramebufferHeight(); + state.num_layers = std::numeric_limits::max(); + state.num_color_attachments = std::bit_width(key.mrt_mask); + for (auto cb = 0u; cb < state.num_color_attachments; ++cb) { + auto& [image_id, desc] = cb_descs[cb]; + if (!image_id) { + continue; + } + auto* image = &texture_cache.GetImage(image_id); + if (image->binding.needs_rebind) { + image_id = bound_images.emplace_back(texture_cache.FindImage(desc)); + image = &texture_cache.GetImage(image_id); + } + texture_cache.UpdateImage(image_id); + image->SetBackingSamples(key.color_samples[cb]); + const auto& image_view = texture_cache.FindRenderTarget(image_id, desc); + const auto slice = image_view.info.range.base.layer; + const auto mip = image_view.info.range.base.level; + + const auto& col_buf = regs.color_buffers[cb]; + const bool is_clear = texture_cache.IsMetaCleared(col_buf.CmaskAddress(), slice); + texture_cache.TouchMeta(col_buf.CmaskAddress(), slice, false); + + if (image->binding.is_bound) { + ASSERT_MSG(!image->binding.force_general, + "Having image both as storage and render target is unsupported"); + image->Transit(instance.IsAttachmentFeedbackLoopLayoutSupported() + ? 
vk::ImageLayout::eAttachmentFeedbackLoopOptimalEXT + : vk::ImageLayout::eGeneral, + vk::AccessFlagBits2::eColorAttachmentWrite, {}); + attachment_feedback_loop = true; + } else { + image->Transit(vk::ImageLayout::eColorAttachmentOptimal, + vk::AccessFlagBits2::eColorAttachmentWrite | + vk::AccessFlagBits2::eColorAttachmentRead, + desc.view_info.range); + } + + state.width = std::min(state.width, std::max(image->info.size.width >> mip, 1u)); + state.height = std::min(state.height, std::max(image->info.size.height >> mip, 1u)); + state.num_layers = std::min(state.num_layers, image_view.info.range.extent.layers); + state.color_attachments[cb] = { + .imageView = *image_view.image_view, + .imageLayout = image->backing->state.layout, + .loadOp = is_clear ? vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = + is_clear ? LiverpoolToVK::ColorBufferClearValue(col_buf) : vk::ClearValue{}, + }; + image->usage.render_target = 1u; + } + + if (auto image_id = db_desc.first; image_id) { + auto& desc = db_desc.second; + const auto htile_address = regs.depth_htile_data_base.GetAddress(); + const auto& image_view = texture_cache.FindDepthTarget(image_id, desc); + auto& image = texture_cache.GetImage(image_id); + + const auto slice = image_view.info.range.base.layer; + const bool is_depth_clear = regs.depth_render_control.depth_clear_enable || + texture_cache.IsMetaCleared(htile_address, slice); + const bool is_stencil_clear = regs.depth_render_control.stencil_clear_enable; + texture_cache.TouchMeta(htile_address, slice, false); + ASSERT(desc.view_info.range.extent.levels == 1 && !image.binding.needs_rebind); + + const bool has_stencil = image.info.props.has_stencil; + const auto new_layout = desc.view_info.is_storage + ? has_stencil ? vk::ImageLayout::eDepthStencilAttachmentOptimal + : vk::ImageLayout::eDepthAttachmentOptimal + : has_stencil ? vk::ImageLayout::eDepthStencilReadOnlyOptimal + : vk::ImageLayout::eDepthReadOnlyOptimal; + image.Transit(new_layout, + vk::AccessFlagBits2::eDepthStencilAttachmentWrite | + vk::AccessFlagBits2::eDepthStencilAttachmentRead, + desc.view_info.range); + + state.width = std::min(state.width, image.info.size.width); + state.height = std::min(state.height, image.info.size.height); + state.has_depth = regs.depth_buffer.DepthValid(); + state.has_stencil = regs.depth_buffer.StencilValid(); + state.num_layers = std::min(state.num_layers, image_view.info.range.extent.layers); + if (state.has_depth) { + state.depth_attachment = { + .imageView = *image_view.image_view, + .imageLayout = image.backing->state.layout, + .loadOp = + is_depth_clear ? vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearValue{.depthStencil = {.depth = regs.depth_clear}}, + }; + } + if (state.has_stencil) { + state.stencil_attachment = { + .imageView = *image_view.image_view, + .imageLayout = image.backing->state.layout, + .loadOp = + is_stencil_clear ? 
vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearValue{.depthStencil = {.stencil = regs.stencil_clear}}, + }; + } + + image.usage.depth_target = true; + } + + if (state.num_layers == std::numeric_limits::max()) { + state.num_layers = 1; + } + + return state; +} + +void Rasterizer::Resolve() { + const auto& mrt0_hint = liverpool->last_cb_extent[0]; + const auto& mrt1_hint = liverpool->last_cb_extent[1]; + VideoCore::TextureCache::ImageDesc mrt0_desc{liverpool->regs.color_buffers[0], mrt0_hint}; + VideoCore::TextureCache::ImageDesc mrt1_desc{liverpool->regs.color_buffers[1], mrt1_hint}; + auto& mrt0_image = texture_cache.GetImage(texture_cache.FindImage(mrt0_desc, true)); + auto& mrt1_image = texture_cache.GetImage(texture_cache.FindImage(mrt1_desc, true)); + + ScopeMarkerBegin(fmt::format("Resolve:MRT0={:#x}:MRT1={:#x}", + liverpool->regs.color_buffers[0].Address(), + liverpool->regs.color_buffers[1].Address())); + mrt1_image.Resolve(mrt0_image, mrt0_desc.view_info.range, mrt1_desc.view_info.range); + ScopeMarkerEnd(); +} + +void Rasterizer::DepthStencilCopy(bool is_depth, bool is_stencil) { + auto& regs = liverpool->regs; + + auto read_desc = VideoCore::TextureCache::ImageDesc( + regs.depth_buffer, regs.depth_view, regs.depth_control, + regs.depth_htile_data_base.GetAddress(), liverpool->last_db_extent, false); + auto write_desc = VideoCore::TextureCache::ImageDesc( + regs.depth_buffer, regs.depth_view, regs.depth_control, + regs.depth_htile_data_base.GetAddress(), liverpool->last_db_extent, true); + + auto& read_image = texture_cache.GetImage(texture_cache.FindImage(read_desc)); + auto& write_image = texture_cache.GetImage(texture_cache.FindImage(write_desc)); + + VideoCore::SubresourceRange sub_range; + sub_range.base.layer = liverpool->regs.depth_view.slice_start; + sub_range.extent.layers = liverpool->regs.depth_view.NumSlices() - sub_range.base.layer; + + ScopeMarkerBegin(fmt::format( + "DepthStencilCopy:DR={:#x}:SR={:#x}:DW={:#x}:SW={:#x}", regs.depth_buffer.DepthAddress(), + regs.depth_buffer.StencilAddress(), regs.depth_buffer.DepthWriteAddress(), + regs.depth_buffer.StencilWriteAddress())); + + read_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, + sub_range); + write_image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, + sub_range); + + auto aspect_mask = vk::ImageAspectFlags(0); + if (is_depth) { + aspect_mask |= vk::ImageAspectFlagBits::eDepth; + } + if (is_stencil) { + aspect_mask |= vk::ImageAspectFlagBits::eStencil; + } + + vk::ImageCopy region = { + .srcSubresource = + { + .aspectMask = aspect_mask, + .mipLevel = 0, + .baseArrayLayer = sub_range.base.layer, + .layerCount = sub_range.extent.layers, + }, + .srcOffset = {0, 0, 0}, + .dstSubresource = + { + .aspectMask = aspect_mask, + .mipLevel = 0, + .baseArrayLayer = sub_range.base.layer, + .layerCount = sub_range.extent.layers, + }, + .dstOffset = {0, 0, 0}, + .extent = {write_image.info.size.width, write_image.info.size.height, 1}, + }; + scheduler.CommandBuffer().copyImage(read_image.GetImage(), vk::ImageLayout::eTransferSrcOptimal, + write_image.GetImage(), + vk::ImageLayout::eTransferDstOptimal, region); + + ScopeMarkerEnd(); +} + +void Rasterizer::FillBuffer(VAddr address, u32 num_bytes, u32 value, bool is_gds) { + buffer_cache.FillBuffer(address, num_bytes, value, is_gds); +} + +void Rasterizer::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool 
src_gds) { + buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds); +} + +u32 Rasterizer::ReadDataFromGds(u32 gds_offset) { + auto* gds_buf = buffer_cache.GetGdsBuffer(); + u32 value; + std::memcpy(&value, gds_buf->mapped_data.data() + gds_offset, sizeof(u32)); + return value; +} + +bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) { + if (!IsMapped(addr, size)) { + // Not GPU mapped memory, can skip invalidation logic entirely. + return false; + } + buffer_cache.InvalidateMemory(addr, size); + texture_cache.InvalidateMemory(addr, size); + return true; +} + +bool Rasterizer::ReadMemory(VAddr addr, u64 size) { + if (!IsMapped(addr, size)) { + // Not GPU mapped memory, can skip invalidation logic entirely. + return false; + } + buffer_cache.ReadMemory(addr, size); + return true; +} + +bool Rasterizer::IsMapped(VAddr addr, u64 size) { + if (size == 0) { + // There is no memory, so not mapped. + return false; + } + if (static_cast(addr) > std::numeric_limits::max() - size) { + // Memory range wrapped the address space, cannot be mapped. + return false; + } + const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); + + Common::RecursiveSharedLock lock{mapped_ranges_mutex}; + return boost::icl::contains(mapped_ranges, range); +} + +void Rasterizer::MapMemory(VAddr addr, u64 size) { + { + std::scoped_lock lock{mapped_ranges_mutex}; + mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); + } + page_manager.OnGpuMap(addr, size); +} + +void Rasterizer::UnmapMemory(VAddr addr, u64 size) { + buffer_cache.InvalidateMemory(addr, size); + texture_cache.UnmapMemory(addr, size); + page_manager.OnGpuUnmap(addr, size); + { + std::scoped_lock lock{mapped_ranges_mutex}; + mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); + } +} + +void Rasterizer::UpdateDynamicState(const GraphicsPipeline* pipeline, const bool is_indexed) const { + UpdateViewportScissorState(); + UpdateDepthStencilState(); + UpdatePrimitiveState(is_indexed); + UpdateRasterizationState(); + UpdateColorBlendingState(pipeline); + + auto& dynamic_state = scheduler.GetDynamicState(); + dynamic_state.Commit(instance, scheduler.CommandBuffer()); +} + +void Rasterizer::UpdateViewportScissorState() const { + const auto& regs = liverpool->regs; + + const auto combined_scissor_value_tl = [](s16 scr, s16 win, s16 gen, s16 win_offset) { + return std::max({scr, s16(win + win_offset), s16(gen + win_offset)}); + }; + const auto combined_scissor_value_br = [](s16 scr, s16 win, s16 gen, s16 win_offset) { + return std::min({scr, s16(win + win_offset), s16(gen + win_offset)}); + }; + const bool enable_offset = !regs.window_scissor.window_offset_disable; + + AmdGpu::Scissor scsr{}; + scsr.top_left_x = combined_scissor_value_tl( + regs.screen_scissor.top_left_x, s16(regs.window_scissor.top_left_x), + s16(regs.generic_scissor.top_left_x), + enable_offset ? regs.window_offset.window_x_offset : 0); + scsr.top_left_y = combined_scissor_value_tl( + regs.screen_scissor.top_left_y, s16(regs.window_scissor.top_left_y), + s16(regs.generic_scissor.top_left_y), + enable_offset ? regs.window_offset.window_y_offset : 0); + scsr.bottom_right_x = combined_scissor_value_br( + regs.screen_scissor.bottom_right_x, regs.window_scissor.bottom_right_x, + regs.generic_scissor.bottom_right_x, + enable_offset ? 
regs.window_offset.window_x_offset : 0); + scsr.bottom_right_y = combined_scissor_value_br( + regs.screen_scissor.bottom_right_y, regs.window_scissor.bottom_right_y, + regs.generic_scissor.bottom_right_y, + enable_offset ? regs.window_offset.window_y_offset : 0); + + boost::container::static_vector viewports; + boost::container::static_vector scissors; + + if (regs.polygon_control.enable_window_offset && + (regs.window_offset.window_x_offset != 0 || regs.window_offset.window_y_offset != 0)) { + LOG_ERROR(Render_Vulkan, + "PA_SU_SC_MODE_CNTL.VTX_WINDOW_OFFSET_ENABLE support is not yet implemented."); + } + + const auto& vp_ctl = regs.viewport_control; + for (u32 i = 0; i < AmdGpu::NUM_VIEWPORTS; i++) { + const auto& vp = regs.viewports[i]; + const auto& vp_d = regs.viewport_depths[i]; + if (vp.xscale == 0) { + continue; + } + + const auto zoffset = vp_ctl.zoffset_enable ? vp.zoffset : 0.f; + const auto zscale = vp_ctl.zscale_enable ? vp.zscale : 1.f; + + vk::Viewport viewport{}; + + // https://gitlab.freedesktop.org/mesa/mesa/-/blob/209a0ed/src/amd/vulkan/radv_pipeline_graphics.c#L688-689 + // https://gitlab.freedesktop.org/mesa/mesa/-/blob/209a0ed/src/amd/vulkan/radv_cmd_buffer.c#L3103-3109 + // When the clip space is ranged [-1...1], the zoffset is centered. + // By reversing the above viewport calculations, we get the following: + if (regs.clipper_control.clip_space == AmdGpu::ClipSpace::MinusWToW) { + viewport.minDepth = zoffset - zscale; + viewport.maxDepth = zoffset + zscale; + } else { + viewport.minDepth = zoffset; + viewport.maxDepth = zoffset + zscale; + } + + if (!instance.IsDepthRangeUnrestrictedSupported()) { + // Unrestricted depth range not supported by device. Restrict to valid range. + viewport.minDepth = std::max(viewport.minDepth, 0.f); + viewport.maxDepth = std::min(viewport.maxDepth, 1.f); + } + + if (regs.IsClipDisabled()) { + // In case if clipping is disabled we patch the shader to convert vertex position + // from screen space coordinates to NDC by defining a render space as full hardware + // window range [0..16383, 0..16383] and setting the viewport to its size. + viewport.x = 0.f; + viewport.y = 0.f; + viewport.width = float(std::min(instance.GetMaxViewportWidth(), 16_KB)); + viewport.height = float(std::min(instance.GetMaxViewportHeight(), 16_KB)); + } else { + const auto xoffset = vp_ctl.xoffset_enable ? vp.xoffset : 0.f; + const auto xscale = vp_ctl.xscale_enable ? vp.xscale : 1.f; + const auto yoffset = vp_ctl.yoffset_enable ? vp.yoffset : 0.f; + const auto yscale = vp_ctl.yscale_enable ? vp.yscale : 1.f; + + viewport.x = xoffset - xscale; + viewport.y = yoffset - yscale; + viewport.width = xscale * 2.0f; + viewport.height = yscale * 2.0f; + } + + viewports.push_back(viewport); + + auto vp_scsr = scsr; + if (regs.mode_control.vport_scissor_enable) { + vp_scsr.top_left_x = + std::max(vp_scsr.top_left_x, s16(regs.viewport_scissors[i].top_left_x)); + vp_scsr.top_left_y = + std::max(vp_scsr.top_left_y, s16(regs.viewport_scissors[i].top_left_y)); + vp_scsr.bottom_right_x = std::min(AmdGpu::Scissor::Clamp(vp_scsr.bottom_right_x), + regs.viewport_scissors[i].bottom_right_x); + vp_scsr.bottom_right_y = std::min(AmdGpu::Scissor::Clamp(vp_scsr.bottom_right_y), + regs.viewport_scissors[i].bottom_right_y); + } + scissors.push_back({ + .offset = {vp_scsr.top_left_x, vp_scsr.top_left_y}, + .extent = {vp_scsr.GetWidth(), vp_scsr.GetHeight()}, + }); + } + + if (viewports.empty()) { + // Vulkan requires providing at least one viewport. 
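The depth-range mapping above is worth seeing in isolation. A minimal standalone sketch, assuming the same zoffset/zscale semantics as the registers above (names are illustrative, not patch code):

    #include <algorithm>
    #include <utility>

    enum class ClipSpace { MinusWToW, ZeroToW };

    // Returns {minDepth, maxDepth} for a guest viewport transform.
    std::pair<float, float> GuestDepthRange(ClipSpace clip_space, float zoffset,
                                            float zscale, bool unrestricted_depth) {
        // GL-style [-1, 1] clip space centers the range on zoffset;
        // DX-style [0, 1] clip space starts the range at zoffset.
        float min_depth = clip_space == ClipSpace::MinusWToW ? zoffset - zscale : zoffset;
        float max_depth = zoffset + zscale;
        if (!unrestricted_depth) {
            // Without VK_EXT_depth_range_unrestricted, Vulkan requires [0, 1].
            min_depth = std::max(min_depth, 0.0f);
            max_depth = std::min(max_depth, 1.0f);
        }
        return {min_depth, max_depth};
    }

For example, zoffset = 0.5f and zscale = 0.5f under MinusWToW yields the full [0, 1] range. The fallback below keeps the pipeline state valid when the xscale == 0 filter above rejected every viewport: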
+ constexpr vk::Viewport empty_viewport = { + .x = -1.0f, + .y = -1.0f, + .width = 1.0f, + .height = 1.0f, + .minDepth = 0.0f, + .maxDepth = 1.0f, + }; + constexpr vk::Rect2D empty_scissor = { + .offset = {0, 0}, + .extent = {1, 1}, + }; + viewports.push_back(empty_viewport); + scissors.push_back(empty_scissor); + } + + auto& dynamic_state = scheduler.GetDynamicState(); + dynamic_state.SetViewports(viewports); + dynamic_state.SetScissors(scissors); +} + +void Rasterizer::UpdateDepthStencilState() const { + const auto& regs = liverpool->regs; + auto& dynamic_state = scheduler.GetDynamicState(); + + const auto depth_test_enabled = + regs.depth_control.depth_enable && regs.depth_buffer.DepthValid(); + dynamic_state.SetDepthTestEnabled(depth_test_enabled); + if (depth_test_enabled) { + dynamic_state.SetDepthWriteEnabled(regs.depth_control.depth_write_enable && + !regs.depth_render_control.depth_clear_enable); + dynamic_state.SetDepthCompareOp(LiverpoolToVK::CompareOp(regs.depth_control.depth_func)); + } + + const auto depth_bounds_test_enabled = regs.depth_control.depth_bounds_enable; + dynamic_state.SetDepthBoundsTestEnabled(depth_bounds_test_enabled); + if (depth_bounds_test_enabled) { + dynamic_state.SetDepthBounds(regs.depth_bounds_min, regs.depth_bounds_max); + } + + const auto depth_bias_enabled = regs.polygon_control.NeedsBias(); + dynamic_state.SetDepthBiasEnabled(depth_bias_enabled); + if (depth_bias_enabled) { + const bool front = regs.polygon_control.enable_polygon_offset_front; + dynamic_state.SetDepthBias( + front ? regs.poly_offset.front_offset : regs.poly_offset.back_offset, + regs.poly_offset.depth_bias, + (front ? regs.poly_offset.front_scale : regs.poly_offset.back_scale) / 16.f); + } + + const auto stencil_test_enabled = + regs.depth_control.stencil_enable && regs.depth_buffer.StencilValid(); + dynamic_state.SetStencilTestEnabled(stencil_test_enabled); + if (stencil_test_enabled) { + const StencilOps front_ops{ + .fail_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_fail_front), + .pass_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_zpass_front), + .depth_fail_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_zfail_front), + .compare_op = LiverpoolToVK::CompareOp(regs.depth_control.stencil_ref_func), + }; + const StencilOps back_ops = regs.depth_control.backface_enable ? StencilOps{ + .fail_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_fail_back), + .pass_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_zpass_back), + .depth_fail_op = LiverpoolToVK::StencilOp(regs.stencil_control.stencil_zfail_back), + .compare_op = LiverpoolToVK::CompareOp(regs.depth_control.stencil_bf_func), + } : front_ops; + dynamic_state.SetStencilOps(front_ops, back_ops); + + const bool stencil_clear = regs.depth_render_control.stencil_clear_enable; + const auto front = regs.stencil_ref_front; + const auto back = + regs.depth_control.backface_enable ? regs.stencil_ref_back : regs.stencil_ref_front; + dynamic_state.SetStencilReferences(front.stencil_test_val, back.stencil_test_val); + dynamic_state.SetStencilWriteMasks(!stencil_clear ? front.stencil_write_mask : 0U, + !stencil_clear ? 
back.stencil_write_mask : 0U); + dynamic_state.SetStencilCompareMasks(front.stencil_mask, back.stencil_mask); + } +} + +void Rasterizer::UpdatePrimitiveState(const bool is_indexed) const { + const auto& regs = liverpool->regs; + auto& dynamic_state = scheduler.GetDynamicState(); + + const auto prim_restart = (regs.enable_primitive_restart & 1) != 0; + ASSERT_MSG(!is_indexed || !prim_restart || regs.primitive_restart_index == 0xFFFF || + regs.primitive_restart_index == 0xFFFFFFFF, + "Primitive restart index other than -1 is not supported yet"); + + const auto cull_mode = LiverpoolToVK::IsPrimitiveCulled(regs.primitive_type) + ? LiverpoolToVK::CullMode(regs.polygon_control.CullingMode()) + : vk::CullModeFlagBits::eNone; + const auto front_face = LiverpoolToVK::FrontFace(regs.polygon_control.front_face); + + dynamic_state.SetPrimitiveRestartEnabled(prim_restart); + dynamic_state.SetRasterizerDiscardEnabled(regs.clipper_control.dx_rasterization_kill); + dynamic_state.SetCullMode(cull_mode); + dynamic_state.SetFrontFace(front_face); +} + +void Rasterizer::UpdateRasterizationState() const { + const auto& regs = liverpool->regs; + auto& dynamic_state = scheduler.GetDynamicState(); + dynamic_state.SetLineWidth(regs.line_control.Width()); +} + +void Rasterizer::UpdateColorBlendingState(const GraphicsPipeline* pipeline) const { + const auto& regs = liverpool->regs; + auto& dynamic_state = scheduler.GetDynamicState(); + dynamic_state.SetBlendConstants(regs.blend_constants); + dynamic_state.SetColorWriteMasks(pipeline->GetGraphicsKey().write_masks); + dynamic_state.SetAttachmentFeedbackLoopEnabled(attachment_feedback_loop); +} + +void Rasterizer::ScopeMarkerBegin(const std::string_view& str, bool from_guest) { + if ((from_guest && !Config::getVkGuestMarkersEnabled()) || + (!from_guest && !Config::getVkHostMarkersEnabled())) { + return; + } + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.beginDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ + .pLabelName = str.data(), + }); +} + +void Rasterizer::ScopeMarkerEnd(bool from_guest) { + if ((from_guest && !Config::getVkGuestMarkersEnabled()) || + (!from_guest && !Config::getVkHostMarkersEnabled())) { + return; + } + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.endDebugUtilsLabelEXT(); +} + +void Rasterizer::ScopedMarkerInsert(const std::string_view& str, bool from_guest) { + if ((from_guest && !Config::getVkGuestMarkersEnabled()) || + (!from_guest && !Config::getVkHostMarkersEnabled())) { + return; + } + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.insertDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ + .pLabelName = str.data(), + }); +} + +void Rasterizer::ScopedMarkerInsertColor(const std::string_view& str, const u32 color, + bool from_guest) { + if ((from_guest && !Config::getVkGuestMarkersEnabled()) || + (!from_guest && !Config::getVkHostMarkersEnabled())) { + return; + } + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.insertDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ + .pLabelName = str.data(), + .color = std::array( + {(f32)((color >> 16) & 0xff) / 255.0f, (f32)((color >> 8) & 0xff) / 255.0f, + (f32)(color & 0xff) / 255.0f, (f32)((color >> 24) & 0xff) / 255.0f})}); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index c73626f3f53..006f8881ade 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -1,150 +1,164 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator 
Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include "common/recursive_lock.h" -#include "common/shared_first_mutex.h" -#include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/page_manager.h" -#include "video_core/renderer_vulkan/vk_pipeline_cache.h" -#include "video_core/texture_cache/texture_cache.h" - -namespace AmdGpu { -struct Liverpool; -} - -namespace Core { -class MemoryManager; -} - -namespace Vulkan { - -class Scheduler; -class RenderState; -class GraphicsPipeline; - -class Rasterizer { -public: - explicit Rasterizer(const Instance& instance, Scheduler& scheduler, - AmdGpu::Liverpool* liverpool); - ~Rasterizer(); - - [[nodiscard]] Scheduler& GetScheduler() noexcept { - return scheduler; - } - - [[nodiscard]] VideoCore::BufferCache& GetBufferCache() noexcept { - return buffer_cache; - } - - [[nodiscard]] VideoCore::TextureCache& GetTextureCache() noexcept { - return texture_cache; - } - - void Draw(bool is_indexed, u32 index_offset = 0); - void DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 size, u32 max_count, - VAddr count_address); - - void DispatchDirect(); - void DispatchIndirect(VAddr address, u32 offset, u32 size); - - void ScopeMarkerBegin(const std::string_view& str, bool from_guest = false); - void ScopeMarkerEnd(bool from_guest = false); - void ScopedMarkerInsert(const std::string_view& str, bool from_guest = false); - void ScopedMarkerInsertColor(const std::string_view& str, const u32 color, - bool from_guest = false); - - void FillBuffer(VAddr address, u32 num_bytes, u32 value, bool is_gds); - void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); - u32 ReadDataFromGds(u32 gsd_offset); - bool InvalidateMemory(VAddr addr, u64 size); - bool ReadMemory(VAddr addr, u64 size); - bool IsMapped(VAddr addr, u64 size); - void MapMemory(VAddr addr, u64 size); - void UnmapMemory(VAddr addr, u64 size); - - void CpSync(); - u64 Flush(); - void Finish(); - void OnSubmit(); - - PipelineCache& GetPipelineCache() { - return pipeline_cache; - } - - template - void ForEachMappedRangeInRange(VAddr addr, u64 size, Func&& func) { - const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); - Common::RecursiveSharedLock lock{mapped_ranges_mutex}; - for (const auto& mapped_range : (mapped_ranges & range)) { - func(mapped_range); - } - } - -private: - void PrepareRenderState(const GraphicsPipeline* pipeline); - RenderState BeginRendering(const GraphicsPipeline* pipeline); - void Resolve(); - void DepthStencilCopy(bool is_depth, bool is_stencil); - void EliminateFastClear(); - - void UpdateDynamicState(const GraphicsPipeline* pipeline, bool is_indexed) const; - void UpdateViewportScissorState() const; - void UpdateDepthStencilState() const; - void UpdatePrimitiveState(bool is_indexed) const; - void UpdateRasterizationState() const; - void UpdateColorBlendingState(const GraphicsPipeline* pipeline) const; - - bool FilterDraw(); - - void BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings& binding, - Shader::PushData& push_data); - void BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding); - bool BindResources(const Pipeline* pipeline); - - void ResetBindings() { - for (auto& image_id : bound_images) { - texture_cache.GetImage(image_id).binding = {}; - } - bound_images.clear(); - } - - bool IsComputeMetaClear(const Pipeline* pipeline); - bool IsComputeImageCopy(const Pipeline* pipeline); - bool IsComputeImageClear(const Pipeline* pipeline); - 
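The rewritten header that follows exposes a per-rasterizer ComputeScheduler alongside the graphics Scheduler. For context, ordering work across two queues with a timeline semaphore boils down to a submit-time wait on the other queue's counter; a hedged sketch in plain Vulkan C API terms (hypothetical function, not patch code):

    #include <vulkan/vulkan.h>

    // Make graphics_queue wait until compute_timeline reaches `value`
    // before executing cmdbuf; all names here are illustrative.
    VkResult SubmitWithComputeWait(VkQueue graphics_queue, VkCommandBuffer cmdbuf,
                                   VkSemaphore compute_timeline, uint64_t value) {
        const VkTimelineSemaphoreSubmitInfo timeline_info = {
            .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
            .waitSemaphoreValueCount = 1,
            .pWaitSemaphoreValues = &value,
        };
        const VkPipelineStageFlags wait_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
        const VkSubmitInfo submit_info = {
            .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
            .pNext = &timeline_info,
            .waitSemaphoreCount = 1,
            .pWaitSemaphores = &compute_timeline,
            .pWaitDstStageMask = &wait_stage,
            .commandBufferCount = 1,
            .pCommandBuffers = &cmdbuf,
        };
        return vkQueueSubmit(graphics_queue, 1, &submit_info, VK_NULL_HANDLE);
    }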
-private: - friend class VideoCore::BufferCache; - - const Instance& instance; - Scheduler& scheduler; - VideoCore::PageManager page_manager; - VideoCore::BufferCache buffer_cache; - VideoCore::TextureCache texture_cache; - AmdGpu::Liverpool* liverpool; - Core::MemoryManager* memory; - boost::icl::interval_set mapped_ranges; - Common::SharedFirstMutex mapped_ranges_mutex; - PipelineCache pipeline_cache; - - using RenderTargetInfo = std::pair; - std::array cb_descs; - std::pair db_desc; - boost::container::static_vector image_infos; - boost::container::static_vector buffer_infos; - boost::container::static_vector bound_images; - - Pipeline::DescriptorWrites set_writes; - Pipeline::BufferBarriers buffer_barriers; - Shader::PushData push_data; - - using BufferBindingInfo = std::tuple; - boost::container::static_vector buffer_bindings; - using ImageBindingInfo = std::pair; - boost::container::static_vector image_bindings; - bool fault_process_pending{}; - bool attachment_feedback_loop{}; -}; - -} // namespace Vulkan +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/recursive_lock.h" +#include "common/shared_first_mutex.h" +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/page_manager.h" +#include "video_core/renderer_vulkan/vk_compute_scheduler.h" +#include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/texture_cache/texture_cache.h" + +namespace AmdGpu { +struct Liverpool; +} + +namespace Core { +class MemoryManager; +} + +namespace Vulkan { + +class Scheduler; +class RenderState; +class GraphicsPipeline; + +class Rasterizer { +public: + explicit Rasterizer(const Instance& instance, Scheduler& scheduler, + AmdGpu::Liverpool* liverpool); + ~Rasterizer(); + + [[nodiscard]] Scheduler& GetScheduler() noexcept { + return scheduler; + } + + [[nodiscard]] ComputeScheduler& GetComputeScheduler() noexcept { + return *compute_scheduler; + } + + [[nodiscard]] bool HasAsyncCompute() const noexcept { + return compute_scheduler && compute_scheduler->IsDedicated(); + } + + [[nodiscard]] VideoCore::BufferCache& GetBufferCache() noexcept { + return buffer_cache; + } + + [[nodiscard]] VideoCore::TextureCache& GetTextureCache() noexcept { + return texture_cache; + } + + void Draw(bool is_indexed, u32 index_offset = 0); + void DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 size, u32 max_count, + VAddr count_address); + + void DispatchDirect(); + void DispatchIndirect(VAddr address, u32 offset, u32 size); + + void ScopeMarkerBegin(const std::string_view& str, bool from_guest = false); + void ScopeMarkerEnd(bool from_guest = false); + void ScopedMarkerInsert(const std::string_view& str, bool from_guest = false); + void ScopedMarkerInsertColor(const std::string_view& str, const u32 color, + bool from_guest = false); + + void FillBuffer(VAddr address, u32 num_bytes, u32 value, bool is_gds); + void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); + u32 ReadDataFromGds(u32 gsd_offset); + bool InvalidateMemory(VAddr addr, u64 size); + bool ReadMemory(VAddr addr, u64 size); + bool IsMapped(VAddr addr, u64 size); + void MapMemory(VAddr addr, u64 size); + void UnmapMemory(VAddr addr, u64 size); + + void CpSync(); + u64 Flush(); + void Finish(); + void OnSubmit(); + + /// Ensures all pending compute work is synced with graphics before presentation + void SyncComputeForPresent(); + + PipelineCache& GetPipelineCache() { + return 
pipeline_cache; + } + + template + void ForEachMappedRangeInRange(VAddr addr, u64 size, Func&& func) { + const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); + Common::RecursiveSharedLock lock{mapped_ranges_mutex}; + for (const auto& mapped_range : (mapped_ranges & range)) { + func(mapped_range); + } + } + +private: + void PrepareRenderState(const GraphicsPipeline* pipeline); + RenderState BeginRendering(const GraphicsPipeline* pipeline); + void Resolve(); + void DepthStencilCopy(bool is_depth, bool is_stencil); + void EliminateFastClear(); + + void UpdateDynamicState(const GraphicsPipeline* pipeline, bool is_indexed) const; + void UpdateViewportScissorState() const; + void UpdateDepthStencilState() const; + void UpdatePrimitiveState(bool is_indexed) const; + void UpdateRasterizationState() const; + void UpdateColorBlendingState(const GraphicsPipeline* pipeline) const; + + bool FilterDraw(); + + void BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings& binding, + Shader::PushData& push_data); + void BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding); + bool BindResources(const Pipeline* pipeline); + + void ResetBindings() { + for (auto& image_id : bound_images) { + texture_cache.GetImage(image_id).binding = {}; + } + bound_images.clear(); + } + + bool IsComputeMetaClear(const Pipeline* pipeline); + bool IsComputeImageCopy(const Pipeline* pipeline); + bool IsComputeImageClear(const Pipeline* pipeline); + +private: + friend class VideoCore::BufferCache; + + const Instance& instance; + Scheduler& scheduler; + std::unique_ptr compute_scheduler; + std::unique_ptr compute_desc_heap; + VideoCore::PageManager page_manager; + VideoCore::BufferCache buffer_cache; + VideoCore::TextureCache texture_cache; + AmdGpu::Liverpool* liverpool; + Core::MemoryManager* memory; + boost::icl::interval_set mapped_ranges; + Common::SharedFirstMutex mapped_ranges_mutex; + PipelineCache pipeline_cache; + + using RenderTargetInfo = std::pair; + std::array cb_descs; + std::pair db_desc; + boost::container::static_vector image_infos; + boost::container::static_vector buffer_infos; + boost::container::static_vector bound_images; + + Pipeline::DescriptorWrites set_writes; + Pipeline::BufferBarriers buffer_barriers; + Shader::PushData push_data; + + using BufferBindingInfo = std::tuple; + boost::container::static_vector buffer_bindings; + using ImageBindingInfo = std::pair; + boost::container::static_vector image_bindings; + bool fault_process_pending{}; + bool attachment_feedback_loop{}; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp index 5bd8025aa52..437b0fd11c4 100644 --- a/src/video_core/renderer_vulkan/vk_resource_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp @@ -1,193 +1,211 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include "common/assert.h" -#include "video_core/renderer_vulkan/vk_instance.h" -#include "video_core/renderer_vulkan/vk_master_semaphore.h" -#include "video_core/renderer_vulkan/vk_resource_pool.h" - -namespace Vulkan { - -ResourcePool::ResourcePool(MasterSemaphore* master_semaphore_, std::size_t grow_step_) - : master_semaphore{master_semaphore_}, grow_step{grow_step_} {} - -std::size_t ResourcePool::CommitResource() { - u64 gpu_tick = master_semaphore->KnownGpuTick(); - const auto search = [this, 
gpu_tick](std::size_t begin, - std::size_t end) -> std::optional { - for (std::size_t iterator = begin; iterator < end; ++iterator) { - if (gpu_tick >= ticks[iterator]) { - ticks[iterator] = master_semaphore->CurrentTick(); - return iterator; - } - } - return std::nullopt; - }; - - // Try to find a free resource from the hinted position to the end. - auto found = search(hint_iterator, ticks.size()); - if (!found) { - // Refresh semaphore to query updated results - master_semaphore->Refresh(); - gpu_tick = master_semaphore->KnownGpuTick(); - found = search(hint_iterator, ticks.size()); - } - if (!found) { - // Search from beginning to the hinted position. - found = search(0, hint_iterator); - if (!found) { - // Both searches failed, the pool is full; handle it. - const std::size_t free_resource = ManageOverflow(); - - ticks[free_resource] = master_semaphore->CurrentTick(); - found = free_resource; - } - } - - // Free iterator is hinted to the resource after the one that's been commited. - hint_iterator = (*found + 1) % ticks.size(); - return *found; -} - -std::size_t ResourcePool::ManageOverflow() { - const std::size_t old_capacity = ticks.size(); - ticks.resize(old_capacity + grow_step); - Allocate(old_capacity, old_capacity + grow_step); - return old_capacity; -} - -constexpr std::size_t COMMAND_BUFFER_POOL_SIZE = 4; - -CommandPool::CommandPool(const Instance& instance, MasterSemaphore* master_semaphore) - : ResourcePool{master_semaphore, COMMAND_BUFFER_POOL_SIZE}, instance{instance} { - const vk::CommandPoolCreateInfo pool_create_info = { - .flags = vk::CommandPoolCreateFlagBits::eTransient | - vk::CommandPoolCreateFlagBits::eResetCommandBuffer, - .queueFamilyIndex = instance.GetGraphicsQueueFamilyIndex(), - }; - const vk::Device device = instance.GetDevice(); - auto [pool_result, pool] = device.createCommandPoolUnique(pool_create_info); - ASSERT_MSG(pool_result == vk::Result::eSuccess, "Failed to create command pool: {}", - vk::to_string(pool_result)); - cmd_pool = std::move(pool); - SetObjectName(device, *cmd_pool, "CommandPool"); -} - -CommandPool::~CommandPool() = default; - -void CommandPool::Allocate(std::size_t begin, std::size_t end) { - cmd_buffers.resize(end); - - const vk::CommandBufferAllocateInfo buffer_alloc_info = { - .commandPool = *cmd_pool, - .level = vk::CommandBufferLevel::ePrimary, - .commandBufferCount = COMMAND_BUFFER_POOL_SIZE, - }; - - const vk::Device device = instance.GetDevice(); - const auto result = - device.allocateCommandBuffers(&buffer_alloc_info, cmd_buffers.data() + begin); - ASSERT(result == vk::Result::eSuccess); - - for (std::size_t i = begin; i < end; ++i) { - SetObjectName(device, cmd_buffers[i], "CommandPool: Command Buffer {}", i); - } -} - -vk::CommandBuffer CommandPool::Commit() { - const std::size_t index = CommitResource(); - return cmd_buffers[index]; -} - -DescriptorHeap::DescriptorHeap(const Instance& instance, MasterSemaphore* master_semaphore_, - std::span pool_sizes_, - u32 descriptor_heap_count_) - : device{instance.GetDevice()}, master_semaphore{master_semaphore_}, - descriptor_heap_count{descriptor_heap_count_}, pool_sizes{pool_sizes_} { - CreateDescriptorPool(); -} - -DescriptorHeap::~DescriptorHeap() { - device.destroyDescriptorPool(curr_pool); - for (const auto [pool, tick] : pending_pools) { - master_semaphore->Wait(tick); - device.destroyDescriptorPool(pool); - } -} - -vk::DescriptorSet DescriptorHeap::Commit(vk::DescriptorSetLayout set_layout) { - const u64 set_key = std::bit_cast(set_layout); - const auto [it, _] = 
descriptor_sets.try_emplace(set_key); - - // Check if allocated sets exist and pick one. - if (!it->second.empty()) { - const auto desc_set = it->second.back(); - it.value().pop_back(); - return desc_set; - } - - DescSetBatch desc_sets(DescriptorSetBatch); - std::array layouts; - layouts.fill(set_layout); - - vk::DescriptorSetAllocateInfo alloc_info = { - .descriptorPool = curr_pool, - .descriptorSetCount = DescriptorSetBatch, - .pSetLayouts = layouts.data(), - }; - - // Attempt to allocate the descriptor set batch. - auto result = device.allocateDescriptorSets(&alloc_info, desc_sets.data()); - if (result == vk::Result::eSuccess) { - const auto desc_set = desc_sets.back(); - desc_sets.pop_back(); - it.value() = std::move(desc_sets); - return desc_set; - } - - // The pool has run out. Record current tick and place it in pending list. - ASSERT_MSG(result == vk::Result::eErrorOutOfPoolMemory || - result == vk::Result::eErrorFragmentedPool, - "Unexpected error during descriptor set allocation: {}", vk::to_string(result)); - pending_pools.emplace_back(curr_pool, master_semaphore->CurrentTick()); - if (const auto [pool, tick] = pending_pools.front(); master_semaphore->IsFree(tick)) { - curr_pool = pool; - pending_pools.pop_front(); - - const auto reset_result = device.resetDescriptorPool(curr_pool); - ASSERT_MSG(reset_result == vk::Result::eSuccess, - "Unexpected error resetting descriptor pool: {}", vk::to_string(reset_result)); - } else { - CreateDescriptorPool(); - } - - // Attempt to allocate again with fresh pool. - alloc_info.descriptorPool = curr_pool; - result = device.allocateDescriptorSets(&alloc_info, desc_sets.data()); - ASSERT_MSG(result == vk::Result::eSuccess, - "Unexpected error during descriptor set allocation {}", vk::to_string(result)); - - // We've changed pool so also reset descriptor batch cache. - descriptor_sets.clear(); - const auto desc_set = desc_sets.back(); - desc_sets.pop_back(); - descriptor_sets[set_key] = std::move(desc_sets); - return desc_set; -} - -void DescriptorHeap::CreateDescriptorPool() { - const vk::DescriptorPoolCreateInfo pool_info = { - .flags = vk::DescriptorPoolCreateFlagBits::eUpdateAfterBind, - .maxSets = descriptor_heap_count, - .poolSizeCount = static_cast(pool_sizes.size()), - .pPoolSizes = pool_sizes.data(), - }; - auto [pool_result, pool] = device.createDescriptorPool(pool_info); - ASSERT_MSG(pool_result == vk::Result::eSuccess, "Failed to create descriptor pool: {}", - vk::to_string(pool_result)); - curr_pool = pool; -} - -} // namespace Vulkan +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include "common/assert.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" + +namespace Vulkan { + +ResourcePool::ResourcePool(MasterSemaphore* master_semaphore_, std::size_t grow_step_) + : master_semaphore{master_semaphore_}, grow_step{grow_step_} {} + +std::size_t ResourcePool::CommitResource() { + u64 gpu_tick = master_semaphore->KnownGpuTick(); + const auto search = [this, gpu_tick](std::size_t begin, + std::size_t end) -> std::optional { + for (std::size_t iterator = begin; iterator < end; ++iterator) { + if (gpu_tick >= ticks[iterator]) { + ticks[iterator] = master_semaphore->CurrentTick(); + return iterator; + } + } + return std::nullopt; + }; + + // Try to find a free resource from the hinted position to the end. 
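The scan strategy is easier to see in isolation: a forward scan from the hint, a semaphore refresh and retry (elided below), a wrap-around scan, and only then growth. A minimal sketch assuming a monotonically increasing timeline tick (illustrative class, not patch code):

    #include <cstdint>
    #include <optional>
    #include <vector>

    class TickPool {
    public:
        explicit TickPool(std::size_t grow_step) : grow_step_{grow_step} {}

        // known_gpu_tick: last value observed on the timeline semaphore.
        // next_tick: the tick the next submission will signal.
        std::size_t Commit(std::uint64_t known_gpu_tick, std::uint64_t next_tick) {
            const auto scan = [&](std::size_t begin,
                                  std::size_t end) -> std::optional<std::size_t> {
                for (std::size_t i = begin; i < end; ++i) {
                    if (known_gpu_tick >= ticks_[i]) {
                        return i; // Slot i is no longer in flight.
                    }
                }
                return std::nullopt;
            };
            auto found = scan(hint_, ticks_.size());
            if (!found) {
                found = scan(0, hint_);
            }
            if (!found) {
                // Pool exhausted: grow and hand out the first new slot.
                found = ticks_.size();
                ticks_.resize(ticks_.size() + grow_step_, 0);
            }
            ticks_[*found] = next_tick; // Busy until the GPU passes next_tick.
            hint_ = (*found + 1) % ticks_.size();
            return *found;
        }

    private:
        std::size_t grow_step_; // Must be > 0.
        std::size_t hint_ = 0;
        std::vector<std::uint64_t> ticks_;
    };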
+ auto found = search(hint_iterator, ticks.size()); + if (!found) { + // Refresh semaphore to query updated results + master_semaphore->Refresh(); + gpu_tick = master_semaphore->KnownGpuTick(); + found = search(hint_iterator, ticks.size()); + } + if (!found) { + // Search from beginning to the hinted position. + found = search(0, hint_iterator); + if (!found) { + // Both searches failed, the pool is full; handle it. + const std::size_t free_resource = ManageOverflow(); + + ticks[free_resource] = master_semaphore->CurrentTick(); + found = free_resource; + } + } + + // Free iterator is hinted to the resource after the one that's been commited. + hint_iterator = (*found + 1) % ticks.size(); + return *found; +} + +std::size_t ResourcePool::ManageOverflow() { + const std::size_t old_capacity = ticks.size(); + ticks.resize(old_capacity + grow_step); + Allocate(old_capacity, old_capacity + grow_step); + return old_capacity; +} + +constexpr std::size_t COMMAND_BUFFER_POOL_SIZE = 4; + +CommandPool::CommandPool(const Instance& instance, MasterSemaphore* master_semaphore) + : ResourcePool{master_semaphore, COMMAND_BUFFER_POOL_SIZE}, instance{instance}, + queue_family{instance.GetGraphicsQueueFamilyIndex()} { + const vk::CommandPoolCreateInfo pool_create_info = { + .flags = vk::CommandPoolCreateFlagBits::eTransient | + vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queue_family, + }; + const vk::Device device = instance.GetDevice(); + auto [pool_result, pool] = device.createCommandPoolUnique(pool_create_info); + ASSERT_MSG(pool_result == vk::Result::eSuccess, "Failed to create command pool: {}", + vk::to_string(pool_result)); + cmd_pool = std::move(pool); + SetObjectName(device, *cmd_pool, "CommandPool"); +} + +CommandPool::CommandPool(const Instance& instance, MasterSemaphore* master_semaphore, + u32 queue_family_index) + : ResourcePool{master_semaphore, COMMAND_BUFFER_POOL_SIZE}, instance{instance}, + queue_family{queue_family_index} { + const vk::CommandPoolCreateInfo pool_create_info = { + .flags = vk::CommandPoolCreateFlagBits::eTransient | + vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queue_family, + }; + const vk::Device device = instance.GetDevice(); + auto [pool_result, pool] = device.createCommandPoolUnique(pool_create_info); + ASSERT_MSG(pool_result == vk::Result::eSuccess, "Failed to create command pool: {}", + vk::to_string(pool_result)); + cmd_pool = std::move(pool); + SetObjectName(device, *cmd_pool, "ComputeCommandPool"); +} + +CommandPool::~CommandPool() = default; + +void CommandPool::Allocate(std::size_t begin, std::size_t end) { + cmd_buffers.resize(end); + + const vk::CommandBufferAllocateInfo buffer_alloc_info = { + .commandPool = *cmd_pool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = COMMAND_BUFFER_POOL_SIZE, + }; + + const vk::Device device = instance.GetDevice(); + const auto result = + device.allocateCommandBuffers(&buffer_alloc_info, cmd_buffers.data() + begin); + ASSERT(result == vk::Result::eSuccess); + + for (std::size_t i = begin; i < end; ++i) { + SetObjectName(device, cmd_buffers[i], "CommandPool: Command Buffer {}", i); + } +} + +vk::CommandBuffer CommandPool::Commit() { + const std::size_t index = CommitResource(); + return cmd_buffers[index]; +} + +DescriptorHeap::DescriptorHeap(const Instance& instance, MasterSemaphore* master_semaphore_, + std::span pool_sizes_, + u32 descriptor_heap_count_) + : device{instance.GetDevice()}, master_semaphore{master_semaphore_}, + 
descriptor_heap_count{descriptor_heap_count_}, pool_sizes{pool_sizes_} { + CreateDescriptorPool(); +} + +DescriptorHeap::~DescriptorHeap() { + device.destroyDescriptorPool(curr_pool); + for (const auto [pool, tick] : pending_pools) { + master_semaphore->Wait(tick); + device.destroyDescriptorPool(pool); + } +} + +vk::DescriptorSet DescriptorHeap::Commit(vk::DescriptorSetLayout set_layout) { + const u64 set_key = std::bit_cast(set_layout); + const auto [it, _] = descriptor_sets.try_emplace(set_key); + + // Check if allocated sets exist and pick one. + if (!it->second.empty()) { + const auto desc_set = it->second.back(); + it.value().pop_back(); + return desc_set; + } + + DescSetBatch desc_sets(DescriptorSetBatch); + std::array layouts; + layouts.fill(set_layout); + + vk::DescriptorSetAllocateInfo alloc_info = { + .descriptorPool = curr_pool, + .descriptorSetCount = DescriptorSetBatch, + .pSetLayouts = layouts.data(), + }; + + // Attempt to allocate the descriptor set batch. + auto result = device.allocateDescriptorSets(&alloc_info, desc_sets.data()); + if (result == vk::Result::eSuccess) { + const auto desc_set = desc_sets.back(); + desc_sets.pop_back(); + it.value() = std::move(desc_sets); + return desc_set; + } + + // The pool has run out. Record current tick and place it in pending list. + ASSERT_MSG(result == vk::Result::eErrorOutOfPoolMemory || + result == vk::Result::eErrorFragmentedPool, + "Unexpected error during descriptor set allocation: {}", vk::to_string(result)); + pending_pools.emplace_back(curr_pool, master_semaphore->CurrentTick()); + if (const auto [pool, tick] = pending_pools.front(); master_semaphore->IsFree(tick)) { + curr_pool = pool; + pending_pools.pop_front(); + + const auto reset_result = device.resetDescriptorPool(curr_pool); + ASSERT_MSG(reset_result == vk::Result::eSuccess, + "Unexpected error resetting descriptor pool: {}", vk::to_string(reset_result)); + } else { + CreateDescriptorPool(); + } + + // Attempt to allocate again with fresh pool. + alloc_info.descriptorPool = curr_pool; + result = device.allocateDescriptorSets(&alloc_info, desc_sets.data()); + ASSERT_MSG(result == vk::Result::eSuccess, + "Unexpected error during descriptor set allocation {}", vk::to_string(result)); + + // We've changed pool so also reset descriptor batch cache. 
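The recovery path above either recycles the oldest retired pool, once the GPU has passed its recorded tick, or creates a fresh one. The decision in isolation, with the Vulkan calls stubbed out (illustrative types, not patch code):

    #include <cstdint>
    #include <deque>
    #include <utility>

    struct Pool { int id; };

    inline Pool CreatePool() { static int next = 0; return Pool{next++}; } // vkCreateDescriptorPool stand-in.
    inline void ResetPool(Pool&) {}                                        // vkResetDescriptorPool stand-in.

    Pool NextPool(std::deque<std::pair<Pool, std::uint64_t>>& pending,
                  std::uint64_t highest_free_tick) {
        if (!pending.empty() && pending.front().second <= highest_free_tick) {
            // Oldest retired pool is no longer referenced by in-flight work: reuse it.
            Pool pool = pending.front().first;
            pending.pop_front();
            ResetPool(pool);
            return pool;
        }
        return CreatePool(); // Otherwise allocate a brand new pool.
    }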
+ descriptor_sets.clear(); + const auto desc_set = desc_sets.back(); + desc_sets.pop_back(); + descriptor_sets[set_key] = std::move(desc_sets); + return desc_set; +} + +void DescriptorHeap::CreateDescriptorPool() { + const vk::DescriptorPoolCreateInfo pool_info = { + .flags = vk::DescriptorPoolCreateFlagBits::eUpdateAfterBind, + .maxSets = descriptor_heap_count, + .poolSizeCount = static_cast(pool_sizes.size()), + .pPoolSizes = pool_sizes.data(), + }; + auto [pool_result, pool] = device.createDescriptorPool(pool_info); + ASSERT_MSG(pool_result == vk::Result::eSuccess, "Failed to create descriptor pool: {}", + vk::to_string(pool_result)); + curr_pool = pool; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.h b/src/video_core/renderer_vulkan/vk_resource_pool.h index 98c2ddb8c49..302f11f65c0 100644 --- a/src/video_core/renderer_vulkan/vk_resource_pool.h +++ b/src/video_core/renderer_vulkan/vk_resource_pool.h @@ -1,92 +1,95 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include -#include - -#include "common/types.h" -#include "video_core/renderer_vulkan/vk_common.h" - -namespace Vulkan { - -class Instance; -class MasterSemaphore; - -/** - * Handles a pool of resources protected by fences. Manages resource overflow allocating more - * resources. - */ -class ResourcePool { -public: - explicit ResourcePool() = default; - explicit ResourcePool(MasterSemaphore* master_semaphore, std::size_t grow_step); - virtual ~ResourcePool() = default; - - ResourcePool& operator=(ResourcePool&&) noexcept = default; - ResourcePool(ResourcePool&&) noexcept = default; - - ResourcePool& operator=(const ResourcePool&) = default; - ResourcePool(const ResourcePool&) = default; - -protected: - std::size_t CommitResource(); - - /// Called when a chunk of resources have to be allocated. - virtual void Allocate(std::size_t begin, std::size_t end) = 0; - -private: - /// Manages pool overflow allocating new resources. 
- std::size_t ManageOverflow(); - -protected: - MasterSemaphore* master_semaphore{nullptr}; - std::size_t grow_step = 0; ///< Number of new resources created after an overflow - std::size_t hint_iterator = 0; ///< Hint to where the next free resources is likely to be found - std::vector ticks; ///< Ticks for each resource -}; - -class CommandPool final : public ResourcePool { -public: - explicit CommandPool(const Instance& instance, MasterSemaphore* master_semaphore); - ~CommandPool() override; - - void Allocate(std::size_t begin, std::size_t end) override; - - vk::CommandBuffer Commit(); - -private: - const Instance& instance; - vk::UniqueCommandPool cmd_pool; - std::vector cmd_buffers; -}; - -class DescriptorHeap final { - static constexpr u32 DescriptorSetBatch = 32; - -public: - explicit DescriptorHeap(const Instance& instance, MasterSemaphore* master_semaphore, - std::span pool_sizes, - u32 descriptor_heap_count = 1024); - ~DescriptorHeap(); - - vk::DescriptorSet Commit(vk::DescriptorSetLayout set_layout); - -private: - void CreateDescriptorPool(); - -private: - vk::Device device; - MasterSemaphore* master_semaphore; - u32 descriptor_heap_count; - std::span pool_sizes; - vk::DescriptorPool curr_pool; - std::deque> pending_pools; - using DescSetBatch = boost::container::static_vector; - tsl::robin_map descriptor_sets; -}; - -} // namespace Vulkan +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include + +#include "common/types.h" +#include "video_core/renderer_vulkan/vk_common.h" + +namespace Vulkan { + +class Instance; +class MasterSemaphore; + +/** + * Handles a pool of resources protected by fences. Manages resource overflow allocating more + * resources. + */ +class ResourcePool { +public: + explicit ResourcePool() = default; + explicit ResourcePool(MasterSemaphore* master_semaphore, std::size_t grow_step); + virtual ~ResourcePool() = default; + + ResourcePool& operator=(ResourcePool&&) noexcept = default; + ResourcePool(ResourcePool&&) noexcept = default; + + ResourcePool& operator=(const ResourcePool&) = default; + ResourcePool(const ResourcePool&) = default; + +protected: + std::size_t CommitResource(); + + /// Called when a chunk of resources have to be allocated. + virtual void Allocate(std::size_t begin, std::size_t end) = 0; + +private: + /// Manages pool overflow allocating new resources. 
+ std::size_t ManageOverflow(); + +protected: + MasterSemaphore* master_semaphore{nullptr}; + std::size_t grow_step = 0; ///< Number of new resources created after an overflow + std::size_t hint_iterator = 0; ///< Hint to where the next free resources is likely to be found + std::vector ticks; ///< Ticks for each resource +}; + +class CommandPool final : public ResourcePool { +public: + explicit CommandPool(const Instance& instance, MasterSemaphore* master_semaphore); + explicit CommandPool(const Instance& instance, MasterSemaphore* master_semaphore, + u32 queue_family_index); + ~CommandPool() override; + + void Allocate(std::size_t begin, std::size_t end) override; + + vk::CommandBuffer Commit(); + +private: + const Instance& instance; + u32 queue_family{0}; + vk::UniqueCommandPool cmd_pool; + std::vector cmd_buffers; +}; + +class DescriptorHeap final { + static constexpr u32 DescriptorSetBatch = 32; + +public: + explicit DescriptorHeap(const Instance& instance, MasterSemaphore* master_semaphore, + std::span pool_sizes, + u32 descriptor_heap_count = 1024); + ~DescriptorHeap(); + + vk::DescriptorSet Commit(vk::DescriptorSetLayout set_layout); + +private: + void CreateDescriptorPool(); + +private: + vk::Device device; + MasterSemaphore* master_semaphore; + u32 descriptor_heap_count; + std::span pool_sizes; + vk::DescriptorPool curr_pool; + std::deque> pending_pools; + using DescSetBatch = boost::container::static_vector; + tsl::robin_map descriptor_sets; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index fee0b408e6d..e87ba2426b1 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -1,358 +1,383 @@ -// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include "common/assert.h" -#include "common/debug.h" -#include "common/thread.h" -#include "imgui/renderer/texture_manager.h" -#include "video_core/renderer_vulkan/vk_instance.h" -#include "video_core/renderer_vulkan/vk_scheduler.h" - -namespace Vulkan { - -std::mutex Scheduler::submit_mutex; - -Scheduler::Scheduler(const Instance& instance) - : instance{instance}, master_semaphore{instance}, command_pool{instance, &master_semaphore} { -#if TRACY_GPU_ENABLED - profiler_scope = reinterpret_cast(std::malloc(sizeof(tracy::VkCtxScope))); -#endif - AllocateWorkerCommandBuffers(); - priority_pending_ops_thread = - std::jthread(std::bind_front(&Scheduler::PriorityPendingOpsThread, this)); -} - -Scheduler::~Scheduler() { -#if TRACY_GPU_ENABLED - std::free(profiler_scope); -#endif -} - -void Scheduler::BeginRendering(const RenderState& new_state) { - if (is_rendering && render_state == new_state) { - return; - } - EndRendering(); - is_rendering = true; - render_state = new_state; - - const vk::RenderingInfo rendering_info = { - .renderArea = - { - .offset = {0, 0}, - .extent = {render_state.width, render_state.height}, - }, - .layerCount = render_state.num_layers, - .colorAttachmentCount = render_state.num_color_attachments, - .pColorAttachments = render_state.num_color_attachments > 0 - ? render_state.color_attachments.data() - : nullptr, - .pDepthAttachment = render_state.has_depth ? &render_state.depth_attachment : nullptr, - .pStencilAttachment = render_state.has_stencil ? 
&render_state.stencil_attachment : nullptr, - }; - - current_cmdbuf.beginRendering(rendering_info); -} - -void Scheduler::EndRendering() { - if (!is_rendering) { - return; - } - is_rendering = false; - current_cmdbuf.endRendering(); -} - -void Scheduler::Flush(SubmitInfo& info) { - // When flushing, we only send data to the driver; no waiting is necessary. - SubmitExecution(info); -} - -void Scheduler::Flush() { - SubmitInfo info{}; - Flush(info); -} - -void Scheduler::Finish() { - // When finishing, we need to wait for the submission to have executed on the device. - const u64 presubmit_tick = CurrentTick(); - SubmitInfo info{}; - SubmitExecution(info); - Wait(presubmit_tick); -} - -void Scheduler::Wait(u64 tick) { - if (tick >= master_semaphore.CurrentTick()) { - // Make sure we are not waiting for the current tick without signalling - SubmitInfo info{}; - Flush(info); - } - master_semaphore.Wait(tick); -} - -void Scheduler::PopPendingOperations() { - master_semaphore.Refresh(); - while (!pending_ops.empty() && master_semaphore.IsFree(pending_ops.front().gpu_tick)) { - pending_ops.front().callback(); - pending_ops.pop(); - } -} - -void Scheduler::AllocateWorkerCommandBuffers() { - const vk::CommandBufferBeginInfo begin_info = { - .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, - }; - - current_cmdbuf = command_pool.Commit(); - Check(current_cmdbuf.begin(begin_info)); - - // Invalidate dynamic state so it gets applied to the new command buffer. - dynamic_state.Invalidate(); - -#if TRACY_GPU_ENABLED - auto* profiler_ctx = instance.GetProfilerContext(); - if (profiler_ctx) { - static const auto scope_loc = - GPU_SCOPE_LOCATION("Guest Frame", MarkersPalette::GpuMarkerColor); - new (profiler_scope) tracy::VkCtxScope{profiler_ctx, &scope_loc, current_cmdbuf, true}; - } -#endif -} - -void Scheduler::SubmitExecution(SubmitInfo& info) { - std::scoped_lock lk{submit_mutex}; - const u64 signal_value = master_semaphore.NextTick(); - -#if TRACY_GPU_ENABLED - auto* profiler_ctx = instance.GetProfilerContext(); - if (profiler_ctx) { - profiler_scope->~VkCtxScope(); - TracyVkCollect(profiler_ctx, current_cmdbuf); - } -#endif - - EndRendering(); - Check(current_cmdbuf.end()); - - const vk::Semaphore timeline = master_semaphore.Handle(); - info.AddSignal(timeline, signal_value); - - static constexpr std::array wait_stage_masks = { - vk::PipelineStageFlagBits::eAllCommands, - vk::PipelineStageFlagBits::eColorAttachmentOutput, - }; - - const vk::TimelineSemaphoreSubmitInfo timeline_si = { - .waitSemaphoreValueCount = info.num_wait_semas, - .pWaitSemaphoreValues = info.wait_ticks.data(), - .signalSemaphoreValueCount = info.num_signal_semas, - .pSignalSemaphoreValues = info.signal_ticks.data(), - }; - - const vk::SubmitInfo submit_info = { - .pNext = &timeline_si, - .waitSemaphoreCount = info.num_wait_semas, - .pWaitSemaphores = info.wait_semas.data(), - .pWaitDstStageMask = wait_stage_masks.data(), - .commandBufferCount = 1U, - .pCommandBuffers = ¤t_cmdbuf, - .signalSemaphoreCount = info.num_signal_semas, - .pSignalSemaphores = info.signal_semas.data(), - }; - - ImGui::Core::TextureManager::Submit(); - auto submit_result = instance.GetGraphicsQueue().submit(submit_info, info.fence); - ASSERT_MSG(submit_result != vk::Result::eErrorDeviceLost, "Device lost during submit"); - - master_semaphore.Refresh(); - AllocateWorkerCommandBuffers(); - - // Apply pending operations - PopPendingOperations(); -} - -void Scheduler::PriorityPendingOpsThread(std::stop_token stoken) { - 
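This worker is a std::jthread draining a queue through std::condition_variable_any, so a stop request wakes it even while it is idle. A hedged standalone sketch of the skeleton, minus the GPU-tick wait (illustrative names, not patch code):

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <queue>
    #include <stop_token>
    #include <thread>
    #include <utility>

    class DeferredRunner {
    public:
        DeferredRunner() : worker_{[this](std::stop_token st) { Run(st); }} {}

        void Push(std::function<void()> op) {
            {
                std::scoped_lock lk{mutex_};
                ops_.push(std::move(op));
            }
            cv_.notify_one();
        }

    private:
        void Run(std::stop_token st) {
            while (!st.stop_requested()) {
                std::function<void()> op;
                {
                    std::unique_lock lk{mutex_};
                    // Returns false if stop was requested before the predicate held.
                    if (!cv_.wait(lk, st, [this] { return !ops_.empty(); })) {
                        break;
                    }
                    op = std::move(ops_.front());
                    ops_.pop();
                }
                op(); // The real thread first waits on the op's GPU tick here.
            }
        }

        std::mutex mutex_;
        std::condition_variable_any cv_;
        std::queue<std::function<void()>> ops_;
        std::jthread worker_; // Declared last: joins and requests stop on destruction.
    };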
Common::SetCurrentThreadName("shadPS4:GpuSchedPriorityPendingOpsRunner"); - - while (!stoken.stop_requested()) { - PendingOp op; - { - std::unique_lock lk(priority_pending_ops_mutex); - priority_pending_ops_cv.wait(lk, stoken, - [this] { return !priority_pending_ops.empty(); }); - if (stoken.stop_requested()) { - break; - } - - op = std::move(priority_pending_ops.front()); - priority_pending_ops.pop(); - } - - master_semaphore.Wait(op.gpu_tick); - if (stoken.stop_requested()) { - break; - } - - op.callback(); - } -} - -void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) { - if (dirty_state.viewports) { - dirty_state.viewports = false; - cmdbuf.setViewportWithCount(viewports); - } - if (dirty_state.scissors) { - dirty_state.scissors = false; - cmdbuf.setScissorWithCount(scissors); - } - if (dirty_state.depth_test_enabled) { - dirty_state.depth_test_enabled = false; - cmdbuf.setDepthTestEnable(depth_test_enabled); - } - if (dirty_state.depth_write_enabled) { - dirty_state.depth_write_enabled = false; - // Note that this must be set in a command buffer even if depth test is disabled. - cmdbuf.setDepthWriteEnable(depth_write_enabled); - } - if (depth_test_enabled && dirty_state.depth_compare_op) { - dirty_state.depth_compare_op = false; - cmdbuf.setDepthCompareOp(depth_compare_op); - } - if (dirty_state.depth_bounds_test_enabled) { - dirty_state.depth_bounds_test_enabled = false; - if (instance.IsDepthBoundsSupported()) { - cmdbuf.setDepthBoundsTestEnable(depth_bounds_test_enabled); - } - } - if (depth_bounds_test_enabled && dirty_state.depth_bounds) { - dirty_state.depth_bounds = false; - if (instance.IsDepthBoundsSupported()) { - cmdbuf.setDepthBounds(depth_bounds_min, depth_bounds_max); - } - } - if (dirty_state.depth_bias_enabled) { - dirty_state.depth_bias_enabled = false; - cmdbuf.setDepthBiasEnable(depth_bias_enabled); - } - if (depth_bias_enabled && dirty_state.depth_bias) { - dirty_state.depth_bias = false; - cmdbuf.setDepthBias(depth_bias_constant, depth_bias_clamp, depth_bias_slope); - } - if (dirty_state.stencil_test_enabled) { - dirty_state.stencil_test_enabled = false; - cmdbuf.setStencilTestEnable(stencil_test_enabled); - } - if (stencil_test_enabled) { - if (dirty_state.stencil_front_ops && dirty_state.stencil_back_ops && - stencil_front_ops == stencil_back_ops) { - dirty_state.stencil_front_ops = false; - dirty_state.stencil_back_ops = false; - cmdbuf.setStencilOp(vk::StencilFaceFlagBits::eFrontAndBack, stencil_front_ops.fail_op, - stencil_front_ops.pass_op, stencil_front_ops.depth_fail_op, - stencil_front_ops.compare_op); - } else { - if (dirty_state.stencil_front_ops) { - dirty_state.stencil_front_ops = false; - cmdbuf.setStencilOp(vk::StencilFaceFlagBits::eFront, stencil_front_ops.fail_op, - stencil_front_ops.pass_op, stencil_front_ops.depth_fail_op, - stencil_front_ops.compare_op); - } - if (dirty_state.stencil_back_ops) { - dirty_state.stencil_back_ops = false; - cmdbuf.setStencilOp(vk::StencilFaceFlagBits::eBack, stencil_back_ops.fail_op, - stencil_back_ops.pass_op, stencil_back_ops.depth_fail_op, - stencil_back_ops.compare_op); - } - } - if (dirty_state.stencil_front_reference && dirty_state.stencil_back_reference && - stencil_front_reference == stencil_back_reference) { - dirty_state.stencil_front_reference = false; - dirty_state.stencil_back_reference = false; - cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eFrontAndBack, - stencil_front_reference); - } else { - if (dirty_state.stencil_front_reference) { - 
dirty_state.stencil_front_reference = false; - cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eFront, - stencil_front_reference); - } - if (dirty_state.stencil_back_reference) { - dirty_state.stencil_back_reference = false; - cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eBack, stencil_back_reference); - } - } - if (dirty_state.stencil_front_write_mask && dirty_state.stencil_back_write_mask && - stencil_front_write_mask == stencil_back_write_mask) { - dirty_state.stencil_front_write_mask = false; - dirty_state.stencil_back_write_mask = false; - cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eFrontAndBack, - stencil_front_write_mask); - } else { - if (dirty_state.stencil_front_write_mask) { - dirty_state.stencil_front_write_mask = false; - cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eFront, - stencil_front_write_mask); - } - if (dirty_state.stencil_back_write_mask) { - dirty_state.stencil_back_write_mask = false; - cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eBack, stencil_back_write_mask); - } - } - if (dirty_state.stencil_front_compare_mask && dirty_state.stencil_back_compare_mask && - stencil_front_compare_mask == stencil_back_compare_mask) { - dirty_state.stencil_front_compare_mask = false; - dirty_state.stencil_back_compare_mask = false; - cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eFrontAndBack, - stencil_front_compare_mask); - } else { - if (dirty_state.stencil_front_compare_mask) { - dirty_state.stencil_front_compare_mask = false; - cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eFront, - stencil_front_compare_mask); - } - if (dirty_state.stencil_back_compare_mask) { - dirty_state.stencil_back_compare_mask = false; - cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eBack, - stencil_back_compare_mask); - } - } - } - if (dirty_state.primitive_restart_enable) { - dirty_state.primitive_restart_enable = false; - cmdbuf.setPrimitiveRestartEnable(primitive_restart_enable); - } - if (dirty_state.rasterizer_discard_enable) { - dirty_state.rasterizer_discard_enable = false; - cmdbuf.setRasterizerDiscardEnable(rasterizer_discard_enable); - } - if (dirty_state.cull_mode) { - dirty_state.cull_mode = false; - cmdbuf.setCullMode(cull_mode); - } - if (dirty_state.front_face) { - dirty_state.front_face = false; - cmdbuf.setFrontFace(front_face); - } - if (dirty_state.blend_constants) { - dirty_state.blend_constants = false; - cmdbuf.setBlendConstants(blend_constants.data()); - } - if (dirty_state.color_write_masks) { - dirty_state.color_write_masks = false; - if (instance.IsDynamicColorWriteMaskSupported()) { - cmdbuf.setColorWriteMaskEXT(0, color_write_masks); - } - } - if (dirty_state.line_width) { - dirty_state.line_width = false; - cmdbuf.setLineWidth(line_width); - } - if (dirty_state.feedback_loop_enabled && instance.IsAttachmentFeedbackLoopLayoutSupported()) { - dirty_state.feedback_loop_enabled = false; - cmdbuf.setAttachmentFeedbackLoopEnableEXT(feedback_loop_enabled - ? 
vk::ImageAspectFlagBits::eColor
-                                                      : vk::ImageAspectFlagBits::eNone);
-    }
-}
-
-} // namespace Vulkan
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "common/assert.h"
+#include "common/debug.h"
+#include "common/thread.h"
+#include "imgui/renderer/texture_manager.h"
+#include "video_core/renderer_vulkan/vk_instance.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+
+namespace Vulkan {
+
+std::mutex Scheduler::submit_mutex;
+
+Scheduler::Scheduler(const Instance& instance)
+    : instance{instance}, master_semaphore{instance}, command_pool{instance, &master_semaphore} {
+#if TRACY_GPU_ENABLED
+    profiler_scope = reinterpret_cast<tracy::VkCtxScope*>(std::malloc(sizeof(tracy::VkCtxScope)));
+#endif
+    AllocateWorkerCommandBuffers();
+    priority_pending_ops_thread =
+        std::jthread(std::bind_front(&Scheduler::PriorityPendingOpsThread, this));
+}
+
+Scheduler::~Scheduler() {
+#if TRACY_GPU_ENABLED
+    std::free(profiler_scope);
+#endif
+}
+
+void Scheduler::BeginRendering(const RenderState& new_state) {
+    if (is_rendering && render_state == new_state) {
+        return;
+    }
+    EndRendering();
+    is_rendering = true;
+    render_state = new_state;
+
+    const vk::RenderingInfo rendering_info = {
+        .renderArea =
+            {
+                .offset = {0, 0},
+                .extent = {render_state.width, render_state.height},
+            },
+        .layerCount = render_state.num_layers,
+        .colorAttachmentCount = render_state.num_color_attachments,
+        .pColorAttachments = render_state.num_color_attachments > 0
+                                 ? render_state.color_attachments.data()
+                                 : nullptr,
+        .pDepthAttachment = render_state.has_depth ? &render_state.depth_attachment : nullptr,
+        .pStencilAttachment = render_state.has_stencil ? &render_state.stencil_attachment : nullptr,
+    };
+
+    current_cmdbuf.beginRendering(rendering_info);
+}
+
+void Scheduler::EndRendering() {
+    if (!is_rendering) {
+        return;
+    }
+    is_rendering = false;
+    current_cmdbuf.endRendering();
+}
+
+void Scheduler::Flush(SubmitInfo& info) {
+    // When flushing, we only send data to the driver; no waiting is necessary.
+    SubmitExecution(info);
+}
+
+void Scheduler::Flush() {
+    SubmitInfo info{};
+    Flush(info);
+}
+
+void Scheduler::Finish() {
+    // When finishing, we need to wait for the submission to have executed on the device.
+    const u64 presubmit_tick = CurrentTick();
+    SubmitInfo info{};
+    SubmitExecution(info);
+    Wait(presubmit_tick);
+}
+
+void Scheduler::Wait(u64 tick) {
+    if (tick >= master_semaphore.CurrentTick()) {
+        // Make sure we are not waiting for the current tick without signalling
+        SubmitInfo info{};
+        Flush(info);
+    }
+    master_semaphore.Wait(tick);
+}
+
+void Scheduler::Wait(vk::Semaphore semaphore, u64 value) {
+    // Queued here and consumed by the next SubmitExecution(), which waits on the
+    // semaphore/value pair before executing the submitted command buffer.
+    std::scoped_lock lk{submit_mutex};
+    wait_semaphores.push_back(semaphore);
+    wait_values.push_back(value);
+}
+
+void Scheduler::PopPendingOperations() {
+    master_semaphore.Refresh();
+    while (!pending_ops.empty() && master_semaphore.IsFree(pending_ops.front().gpu_tick)) {
+        pending_ops.front().callback();
+        pending_ops.pop();
+    }
+}
+
+void Scheduler::AllocateWorkerCommandBuffers() {
+    const vk::CommandBufferBeginInfo begin_info = {
+        .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
+    };
+
+    current_cmdbuf = command_pool.Commit();
+    Check(current_cmdbuf.begin(begin_info));
+
+    // Invalidate dynamic state so it gets applied to the new command buffer.
+    dynamic_state.Invalidate();
+
+#if TRACY_GPU_ENABLED
+    auto* profiler_ctx = instance.GetProfilerContext();
+    if (profiler_ctx) {
+        static const auto scope_loc =
+            GPU_SCOPE_LOCATION("Guest Frame", MarkersPalette::GpuMarkerColor);
+        new (profiler_scope) tracy::VkCtxScope{profiler_ctx, &scope_loc, current_cmdbuf, true};
+    }
+#endif
+}
+
+void Scheduler::SubmitExecution(SubmitInfo& info) {
+    std::scoped_lock lk{submit_mutex};
+    const u64 signal_value = master_semaphore.NextTick();
+
+#if TRACY_GPU_ENABLED
+    auto* profiler_ctx = instance.GetProfilerContext();
+    if (profiler_ctx) {
+        profiler_scope->~VkCtxScope();
+        TracyVkCollect(profiler_ctx, current_cmdbuf);
+    }
+#endif
+
+    EndRendering();
+    Check(current_cmdbuf.end());
+
+    const vk::Semaphore timeline = master_semaphore.Handle();
+    info.AddSignal(timeline, signal_value);
+
+    // Merge cross-queue wait semaphores with info wait semaphores
+    std::vector<vk::Semaphore> all_wait_semas;
+    std::vector<u64> all_wait_values;
+    std::vector<vk::PipelineStageFlags> all_wait_stages;
+
+    // Add info semaphores first, mirroring the fixed stage masks used previously:
+    // the first wait blocks all commands, subsequent waits only color attachment output.
+    for (u32 i = 0; i < info.num_wait_semas; ++i) {
+        all_wait_semas.push_back(info.wait_semas[i]);
+        all_wait_values.push_back(info.wait_ticks[i]);
+        all_wait_stages.push_back(i == 0 ? vk::PipelineStageFlagBits::eAllCommands
+                                         : vk::PipelineStageFlagBits::eColorAttachmentOutput);
+    }
+
+    // Add cross-queue sync semaphores
+    for (size_t i = 0; i < wait_semaphores.size(); ++i) {
+        all_wait_semas.push_back(wait_semaphores[i]);
+        all_wait_values.push_back(wait_values[i]);
+        all_wait_stages.push_back(vk::PipelineStageFlagBits::eAllCommands);
+    }
+
+    const vk::TimelineSemaphoreSubmitInfo timeline_si = {
+        .waitSemaphoreValueCount = static_cast<u32>(all_wait_values.size()),
+        .pWaitSemaphoreValues = all_wait_values.data(),
+        .signalSemaphoreValueCount = info.num_signal_semas,
+        .pSignalSemaphoreValues = info.signal_ticks.data(),
+    };
+
+    const vk::SubmitInfo submit_info = {
+        .pNext = &timeline_si,
+        .waitSemaphoreCount = static_cast<u32>(all_wait_semas.size()),
+        .pWaitSemaphores = all_wait_semas.data(),
+        .pWaitDstStageMask = all_wait_stages.data(),
+        .commandBufferCount = 1U,
+        .pCommandBuffers = &current_cmdbuf,
+        .signalSemaphoreCount = info.num_signal_semas,
+        .pSignalSemaphores = info.signal_semas.data(),
+    };
+
+    ImGui::Core::TextureManager::Submit();
+    auto submit_result = instance.GetGraphicsQueue().submit(submit_info, info.fence);
+    ASSERT_MSG(submit_result != vk::Result::eErrorDeviceLost, "Device lost during submit");
+
+    // Clear cross-queue waits after submission
+    wait_semaphores.clear();
+    wait_values.clear();
+
+    master_semaphore.Refresh();
+    AllocateWorkerCommandBuffers();
+
+    // Apply pending operations
+    PopPendingOperations();
+}
+
+void Scheduler::PriorityPendingOpsThread(std::stop_token stoken) {
+    Common::SetCurrentThreadName("shadPS4:GpuSchedPriorityPendingOpsRunner");
+
+    while (!stoken.stop_requested()) {
+        PendingOp op;
+        {
+            std::unique_lock lk(priority_pending_ops_mutex);
+            priority_pending_ops_cv.wait(lk, stoken,
+                                         [this] { return !priority_pending_ops.empty(); });
+            if (stoken.stop_requested()) {
+                break;
+            }
+
+            op = std::move(priority_pending_ops.front());
+            priority_pending_ops.pop();
+        }
+
+        master_semaphore.Wait(op.gpu_tick);
+        if (stoken.stop_requested()) {
+            break;
+        }
+
+        op.callback();
+    }
+}
+
+void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) {
+    if (dirty_state.viewports) {
+        dirty_state.viewports = false;
+        cmdbuf.setViewportWithCount(viewports);
+    }
+    if (dirty_state.scissors) {
+        dirty_state.scissors = false;
cmdbuf.setScissorWithCount(scissors); + } + if (dirty_state.depth_test_enabled) { + dirty_state.depth_test_enabled = false; + cmdbuf.setDepthTestEnable(depth_test_enabled); + } + if (dirty_state.depth_write_enabled) { + dirty_state.depth_write_enabled = false; + // Note that this must be set in a command buffer even if depth test is disabled. + cmdbuf.setDepthWriteEnable(depth_write_enabled); + } + if (depth_test_enabled && dirty_state.depth_compare_op) { + dirty_state.depth_compare_op = false; + cmdbuf.setDepthCompareOp(depth_compare_op); + } + if (dirty_state.depth_bounds_test_enabled) { + dirty_state.depth_bounds_test_enabled = false; + if (instance.IsDepthBoundsSupported()) { + cmdbuf.setDepthBoundsTestEnable(depth_bounds_test_enabled); + } + } + if (depth_bounds_test_enabled && dirty_state.depth_bounds) { + dirty_state.depth_bounds = false; + if (instance.IsDepthBoundsSupported()) { + cmdbuf.setDepthBounds(depth_bounds_min, depth_bounds_max); + } + } + if (dirty_state.depth_bias_enabled) { + dirty_state.depth_bias_enabled = false; + cmdbuf.setDepthBiasEnable(depth_bias_enabled); + } + if (depth_bias_enabled && dirty_state.depth_bias) { + dirty_state.depth_bias = false; + cmdbuf.setDepthBias(depth_bias_constant, depth_bias_clamp, depth_bias_slope); + } + if (dirty_state.stencil_test_enabled) { + dirty_state.stencil_test_enabled = false; + cmdbuf.setStencilTestEnable(stencil_test_enabled); + } + if (stencil_test_enabled) { + if (dirty_state.stencil_front_ops && dirty_state.stencil_back_ops && + stencil_front_ops == stencil_back_ops) { + dirty_state.stencil_front_ops = false; + dirty_state.stencil_back_ops = false; + cmdbuf.setStencilOp(vk::StencilFaceFlagBits::eFrontAndBack, stencil_front_ops.fail_op, + stencil_front_ops.pass_op, stencil_front_ops.depth_fail_op, + stencil_front_ops.compare_op); + } else { + if (dirty_state.stencil_front_ops) { + dirty_state.stencil_front_ops = false; + cmdbuf.setStencilOp(vk::StencilFaceFlagBits::eFront, stencil_front_ops.fail_op, + stencil_front_ops.pass_op, stencil_front_ops.depth_fail_op, + stencil_front_ops.compare_op); + } + if (dirty_state.stencil_back_ops) { + dirty_state.stencil_back_ops = false; + cmdbuf.setStencilOp(vk::StencilFaceFlagBits::eBack, stencil_back_ops.fail_op, + stencil_back_ops.pass_op, stencil_back_ops.depth_fail_op, + stencil_back_ops.compare_op); + } + } + if (dirty_state.stencil_front_reference && dirty_state.stencil_back_reference && + stencil_front_reference == stencil_back_reference) { + dirty_state.stencil_front_reference = false; + dirty_state.stencil_back_reference = false; + cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eFrontAndBack, + stencil_front_reference); + } else { + if (dirty_state.stencil_front_reference) { + dirty_state.stencil_front_reference = false; + cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eFront, + stencil_front_reference); + } + if (dirty_state.stencil_back_reference) { + dirty_state.stencil_back_reference = false; + cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eBack, stencil_back_reference); + } + } + if (dirty_state.stencil_front_write_mask && dirty_state.stencil_back_write_mask && + stencil_front_write_mask == stencil_back_write_mask) { + dirty_state.stencil_front_write_mask = false; + dirty_state.stencil_back_write_mask = false; + cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eFrontAndBack, + stencil_front_write_mask); + } else { + if (dirty_state.stencil_front_write_mask) { + dirty_state.stencil_front_write_mask = false; + 
cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eFront, + stencil_front_write_mask); + } + if (dirty_state.stencil_back_write_mask) { + dirty_state.stencil_back_write_mask = false; + cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eBack, stencil_back_write_mask); + } + } + if (dirty_state.stencil_front_compare_mask && dirty_state.stencil_back_compare_mask && + stencil_front_compare_mask == stencil_back_compare_mask) { + dirty_state.stencil_front_compare_mask = false; + dirty_state.stencil_back_compare_mask = false; + cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eFrontAndBack, + stencil_front_compare_mask); + } else { + if (dirty_state.stencil_front_compare_mask) { + dirty_state.stencil_front_compare_mask = false; + cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eFront, + stencil_front_compare_mask); + } + if (dirty_state.stencil_back_compare_mask) { + dirty_state.stencil_back_compare_mask = false; + cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eBack, + stencil_back_compare_mask); + } + } + } + if (dirty_state.primitive_restart_enable) { + dirty_state.primitive_restart_enable = false; + cmdbuf.setPrimitiveRestartEnable(primitive_restart_enable); + } + if (dirty_state.rasterizer_discard_enable) { + dirty_state.rasterizer_discard_enable = false; + cmdbuf.setRasterizerDiscardEnable(rasterizer_discard_enable); + } + if (dirty_state.cull_mode) { + dirty_state.cull_mode = false; + cmdbuf.setCullMode(cull_mode); + } + if (dirty_state.front_face) { + dirty_state.front_face = false; + cmdbuf.setFrontFace(front_face); + } + if (dirty_state.blend_constants) { + dirty_state.blend_constants = false; + cmdbuf.setBlendConstants(blend_constants.data()); + } + if (dirty_state.color_write_masks) { + dirty_state.color_write_masks = false; + if (instance.IsDynamicColorWriteMaskSupported()) { + cmdbuf.setColorWriteMaskEXT(0, color_write_masks); + } + } + if (dirty_state.line_width) { + dirty_state.line_width = false; + cmdbuf.setLineWidth(line_width); + } + if (dirty_state.feedback_loop_enabled && instance.IsAttachmentFeedbackLoopLayoutSupported()) { + dirty_state.feedback_loop_enabled = false; + cmdbuf.setAttachmentFeedbackLoopEnableEXT(feedback_loop_enabled + ? 
vk::ImageAspectFlagBits::eColor + : vk::ImageAspectFlagBits::eNone); + } +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index aff299e54c4..06cb7d5daa3 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -1,450 +1,462 @@ -// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include -#include - -#include "common/unique_function.h" -#include "video_core/amdgpu/regs_color.h" -#include "video_core/amdgpu/regs_primitive.h" -#include "video_core/renderer_vulkan/vk_master_semaphore.h" -#include "video_core/renderer_vulkan/vk_resource_pool.h" - -namespace tracy { -class VkCtxScope; -} - -namespace Vulkan { - -class Instance; - -struct RenderState { - std::array color_attachments; - vk::RenderingAttachmentInfo depth_attachment; - vk::RenderingAttachmentInfo stencil_attachment; - u32 num_color_attachments; - u32 num_layers; - bool has_depth; - bool has_stencil; - u32 width; - u32 height; - - RenderState() { - std::memset(this, 0, sizeof(*this)); - color_attachments.fill(vk::RenderingAttachmentInfo{}); - depth_attachment = vk::RenderingAttachmentInfo{}; - stencil_attachment = vk::RenderingAttachmentInfo{}; - num_layers = 1; - } - - bool operator==(const RenderState& other) const noexcept { - return std::memcmp(this, &other, sizeof(RenderState)) == 0; - } -}; - -struct SubmitInfo { - std::array wait_semas; - std::array wait_ticks; - std::array signal_semas; - std::array signal_ticks; - vk::Fence fence; - u32 num_wait_semas; - u32 num_signal_semas; - - void AddWait(vk::Semaphore semaphore, u64 tick = 1) { - wait_semas[num_wait_semas] = semaphore; - wait_ticks[num_wait_semas++] = tick; - } - - void AddSignal(vk::Semaphore semaphore, u64 tick = 1) { - signal_semas[num_signal_semas] = semaphore; - signal_ticks[num_signal_semas++] = tick; - } - - void AddSignal(vk::Fence fence) { - this->fence = fence; - } -}; - -using Viewports = boost::container::static_vector; -using Scissors = boost::container::static_vector; -using ColorWriteMasks = std::array; -struct StencilOps { - vk::StencilOp fail_op{}; - vk::StencilOp pass_op{}; - vk::StencilOp depth_fail_op{}; - vk::CompareOp compare_op{}; - - bool operator==(const StencilOps& other) const { - return fail_op == other.fail_op && pass_op == other.pass_op && - depth_fail_op == other.depth_fail_op && compare_op == other.compare_op; - } -}; -struct DynamicState { - struct { - bool viewports : 1; - bool scissors : 1; - - bool depth_test_enabled : 1; - bool depth_write_enabled : 1; - bool depth_compare_op : 1; - - bool depth_bounds_test_enabled : 1; - bool depth_bounds : 1; - - bool depth_bias_enabled : 1; - bool depth_bias : 1; - - bool stencil_test_enabled : 1; - bool stencil_front_ops : 1; - bool stencil_front_reference : 1; - bool stencil_front_write_mask : 1; - bool stencil_front_compare_mask : 1; - bool stencil_back_ops : 1; - bool stencil_back_reference : 1; - bool stencil_back_write_mask : 1; - bool stencil_back_compare_mask : 1; - - bool primitive_restart_enable : 1; - bool rasterizer_discard_enable : 1; - bool cull_mode : 1; - bool front_face : 1; - - bool blend_constants : 1; - bool color_write_masks : 1; - bool line_width : 1; - bool feedback_loop_enabled : 1; - } dirty_state{}; - - Viewports viewports{}; - Scissors scissors{}; - - bool depth_test_enabled{}; - bool depth_write_enabled{}; - vk::CompareOp 
depth_compare_op{}; - - bool depth_bounds_test_enabled{}; - float depth_bounds_min{}; - float depth_bounds_max{}; - - bool depth_bias_enabled{}; - float depth_bias_constant{}; - float depth_bias_clamp{}; - float depth_bias_slope{}; - - bool stencil_test_enabled{}; - StencilOps stencil_front_ops{}; - u32 stencil_front_reference{}; - u32 stencil_front_write_mask{}; - u32 stencil_front_compare_mask{}; - StencilOps stencil_back_ops{}; - u32 stencil_back_reference{}; - u32 stencil_back_write_mask{}; - u32 stencil_back_compare_mask{}; - - bool primitive_restart_enable{}; - bool rasterizer_discard_enable{}; - vk::CullModeFlags cull_mode{}; - vk::FrontFace front_face{}; - - std::array blend_constants{}; - ColorWriteMasks color_write_masks{}; - float line_width{}; - bool feedback_loop_enabled{}; - - /// Commits the dynamic state to the provided command buffer. - void Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf); - - /// Invalidates all dynamic state to be flushed into the next command buffer. - void Invalidate() { - std::memset(&dirty_state, 0xFF, sizeof(dirty_state)); - } - - void SetViewports(const Viewports& viewports_) { - if (!std::ranges::equal(viewports, viewports_)) { - viewports = viewports_; - dirty_state.viewports = true; - } - } - - void SetScissors(const Scissors& scissors_) { - if (!std::ranges::equal(scissors, scissors_)) { - scissors = scissors_; - dirty_state.scissors = true; - } - } - - void SetDepthTestEnabled(const bool enabled) { - if (depth_test_enabled != enabled) { - depth_test_enabled = enabled; - dirty_state.depth_test_enabled = true; - } - } - - void SetDepthWriteEnabled(const bool enabled) { - if (depth_write_enabled != enabled) { - depth_write_enabled = enabled; - dirty_state.depth_write_enabled = true; - } - } - - void SetDepthCompareOp(const vk::CompareOp compare_op) { - if (depth_compare_op != compare_op) { - depth_compare_op = compare_op; - dirty_state.depth_compare_op = true; - } - } - - void SetDepthBoundsTestEnabled(const bool enabled) { - if (depth_bounds_test_enabled != enabled) { - depth_bounds_test_enabled = enabled; - dirty_state.depth_bounds_test_enabled = true; - } - } - - void SetDepthBounds(const float min, const float max) { - if (depth_bounds_min != min || depth_bounds_max != max) { - depth_bounds_min = min; - depth_bounds_max = max; - dirty_state.depth_bounds = true; - } - } - - void SetDepthBiasEnabled(const bool enabled) { - if (depth_bias_enabled != enabled) { - depth_bias_enabled = enabled; - dirty_state.depth_bias_enabled = true; - } - } - - void SetDepthBias(const float constant, const float clamp, const float slope) { - if (depth_bias_constant != constant || depth_bias_clamp != clamp || - depth_bias_slope != slope) { - depth_bias_constant = constant; - depth_bias_clamp = clamp; - depth_bias_slope = slope; - dirty_state.depth_bias = true; - } - } - - void SetStencilTestEnabled(const bool enabled) { - if (stencil_test_enabled != enabled) { - stencil_test_enabled = enabled; - dirty_state.stencil_test_enabled = true; - } - } - - void SetStencilOps(const StencilOps& front_ops, const StencilOps& back_ops) { - if (stencil_front_ops != front_ops) { - stencil_front_ops = front_ops; - dirty_state.stencil_front_ops = true; - } - if (stencil_back_ops != back_ops) { - stencil_back_ops = back_ops; - dirty_state.stencil_back_ops = true; - } - } - - void SetStencilReferences(const u32 front_reference, const u32 back_reference) { - if (stencil_front_reference != front_reference) { - stencil_front_reference = front_reference; - 
dirty_state.stencil_front_reference = true; - } - if (stencil_back_reference != back_reference) { - stencil_back_reference = back_reference; - dirty_state.stencil_back_reference = true; - } - } - - void SetStencilWriteMasks(const u32 front_write_mask, const u32 back_write_mask) { - if (stencil_front_write_mask != front_write_mask) { - stencil_front_write_mask = front_write_mask; - dirty_state.stencil_front_write_mask = true; - } - if (stencil_back_write_mask != back_write_mask) { - stencil_back_write_mask = back_write_mask; - dirty_state.stencil_back_write_mask = true; - } - } - - void SetStencilCompareMasks(const u32 front_compare_mask, const u32 back_compare_mask) { - if (stencil_front_compare_mask != front_compare_mask) { - stencil_front_compare_mask = front_compare_mask; - dirty_state.stencil_front_compare_mask = true; - } - if (stencil_back_compare_mask != back_compare_mask) { - stencil_back_compare_mask = back_compare_mask; - dirty_state.stencil_back_compare_mask = true; - } - } - - void SetPrimitiveRestartEnabled(const bool enabled) { - if (primitive_restart_enable != enabled) { - primitive_restart_enable = enabled; - dirty_state.primitive_restart_enable = true; - } - } - - void SetCullMode(const vk::CullModeFlags cull_mode_) { - if (cull_mode != cull_mode_) { - cull_mode = cull_mode_; - dirty_state.cull_mode = true; - } - } - - void SetFrontFace(const vk::FrontFace front_face_) { - if (front_face != front_face_) { - front_face = front_face_; - dirty_state.front_face = true; - } - } - - void SetBlendConstants(const std::array blend_constants_) { - if (blend_constants != blend_constants_) { - blend_constants = blend_constants_; - dirty_state.blend_constants = true; - } - } - - void SetRasterizerDiscardEnabled(const bool enabled) { - if (rasterizer_discard_enable != enabled) { - rasterizer_discard_enable = enabled; - dirty_state.rasterizer_discard_enable = true; - } - } - - void SetColorWriteMasks(const ColorWriteMasks& color_write_masks_) { - if (!std::ranges::equal(color_write_masks, color_write_masks_)) { - color_write_masks = color_write_masks_; - dirty_state.color_write_masks = true; - } - } - - void SetLineWidth(const float width) { - if (line_width != width) { - line_width = width; - dirty_state.line_width = true; - } - } - - void SetAttachmentFeedbackLoopEnabled(const bool enabled) { - if (feedback_loop_enabled != enabled) { - feedback_loop_enabled = enabled; - dirty_state.feedback_loop_enabled = true; - } - } -}; - -class Scheduler { -public: - explicit Scheduler(const Instance& instance); - ~Scheduler(); - - /// Sends the current execution context to the GPU - /// and increments the scheduler timeline semaphore. - void Flush(SubmitInfo& info); - - /// Sends the current execution context to the GPU - /// and increments the scheduler timeline semaphore. - void Flush(); - - /// Sends the current execution context to the GPU and waits for it to complete. - void Finish(); - - /// Waits for the given tick to trigger on the GPU. - void Wait(u64 tick); - - /// Attempts to execute operations whose tick the GPU has caught up with. - void PopPendingOperations(); - - /// Starts a new rendering scope with provided state. - void BeginRendering(const RenderState& new_state); - - /// Ends current rendering scope. - void EndRendering(); - - /// Returns the current render state. - const RenderState& GetRenderState() const { - return render_state; - } - - /// Returns the current pipeline dynamic state tracking. 
- DynamicState& GetDynamicState() { - return dynamic_state; - } - - /// Returns the current command buffer. - vk::CommandBuffer CommandBuffer() const { - return current_cmdbuf; - } - - /// Returns the current command buffer tick. - [[nodiscard]] u64 CurrentTick() const noexcept { - return master_semaphore.CurrentTick(); - } - - /// Returns true when a tick has been triggered by the GPU. - [[nodiscard]] bool IsFree(u64 tick) noexcept { - if (master_semaphore.IsFree(tick)) { - return true; - } - master_semaphore.Refresh(); - return master_semaphore.IsFree(tick); - } - - /// Returns the master timeline semaphore. - [[nodiscard]] MasterSemaphore* GetMasterSemaphore() noexcept { - return &master_semaphore; - } - - /// Defers an operation until the gpu has reached the current cpu tick. - /// Will be run when submitting or calling PopPendingOperations. - void DeferOperation(Common::UniqueFunction&& func) { - pending_ops.emplace(std::move(func), CurrentTick()); - } - - /// Defers an operation until the gpu has reached the current cpu tick. - /// Runs as soon as possible in another thread. - void DeferPriorityOperation(Common::UniqueFunction&& func) { - { - std::unique_lock lk(priority_pending_ops_mutex); - priority_pending_ops.emplace(std::move(func), CurrentTick()); - } - priority_pending_ops_cv.notify_one(); - } - - static std::mutex submit_mutex; - -private: - void AllocateWorkerCommandBuffers(); - - void SubmitExecution(SubmitInfo& info); - - void PriorityPendingOpsThread(std::stop_token stoken); - -private: - const Instance& instance; - MasterSemaphore master_semaphore; - CommandPool command_pool; - DynamicState dynamic_state; - vk::CommandBuffer current_cmdbuf; - std::condition_variable_any event_cv; - struct PendingOp { - Common::UniqueFunction callback; - u64 gpu_tick; - }; - std::queue pending_ops; - std::queue priority_pending_ops; - std::mutex priority_pending_ops_mutex; - std::condition_variable_any priority_pending_ops_cv; - std::jthread priority_pending_ops_thread; - RenderState render_state; - bool is_rendering = false; - tracy::VkCtxScope* profiler_scope{}; -}; - -} // namespace Vulkan +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include + +#include "common/unique_function.h" +#include "video_core/amdgpu/regs_color.h" +#include "video_core/amdgpu/regs_primitive.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" + +namespace tracy { +class VkCtxScope; +} + +namespace Vulkan { + +class Instance; + +struct RenderState { + std::array color_attachments; + vk::RenderingAttachmentInfo depth_attachment; + vk::RenderingAttachmentInfo stencil_attachment; + u32 num_color_attachments; + u32 num_layers; + bool has_depth; + bool has_stencil; + u32 width; + u32 height; + + RenderState() { + std::memset(this, 0, sizeof(*this)); + color_attachments.fill(vk::RenderingAttachmentInfo{}); + depth_attachment = vk::RenderingAttachmentInfo{}; + stencil_attachment = vk::RenderingAttachmentInfo{}; + num_layers = 1; + } + + bool operator==(const RenderState& other) const noexcept { + return std::memcmp(this, &other, sizeof(RenderState)) == 0; + } +}; + +struct SubmitInfo { + std::array wait_semas; + std::array wait_ticks; + std::array signal_semas; + std::array signal_ticks; + vk::Fence fence; + u32 num_wait_semas; + u32 num_signal_semas; + + void AddWait(vk::Semaphore semaphore, u64 tick = 1) { + 
wait_semas[num_wait_semas] = semaphore; + wait_ticks[num_wait_semas++] = tick; + } + + void AddSignal(vk::Semaphore semaphore, u64 tick = 1) { + signal_semas[num_signal_semas] = semaphore; + signal_ticks[num_signal_semas++] = tick; + } + + void AddSignal(vk::Fence fence) { + this->fence = fence; + } +}; + +using Viewports = boost::container::static_vector; +using Scissors = boost::container::static_vector; +using ColorWriteMasks = std::array; +struct StencilOps { + vk::StencilOp fail_op{}; + vk::StencilOp pass_op{}; + vk::StencilOp depth_fail_op{}; + vk::CompareOp compare_op{}; + + bool operator==(const StencilOps& other) const { + return fail_op == other.fail_op && pass_op == other.pass_op && + depth_fail_op == other.depth_fail_op && compare_op == other.compare_op; + } +}; +struct DynamicState { + struct { + bool viewports : 1; + bool scissors : 1; + + bool depth_test_enabled : 1; + bool depth_write_enabled : 1; + bool depth_compare_op : 1; + + bool depth_bounds_test_enabled : 1; + bool depth_bounds : 1; + + bool depth_bias_enabled : 1; + bool depth_bias : 1; + + bool stencil_test_enabled : 1; + bool stencil_front_ops : 1; + bool stencil_front_reference : 1; + bool stencil_front_write_mask : 1; + bool stencil_front_compare_mask : 1; + bool stencil_back_ops : 1; + bool stencil_back_reference : 1; + bool stencil_back_write_mask : 1; + bool stencil_back_compare_mask : 1; + + bool primitive_restart_enable : 1; + bool rasterizer_discard_enable : 1; + bool cull_mode : 1; + bool front_face : 1; + + bool blend_constants : 1; + bool color_write_masks : 1; + bool line_width : 1; + bool feedback_loop_enabled : 1; + } dirty_state{}; + + Viewports viewports{}; + Scissors scissors{}; + + bool depth_test_enabled{}; + bool depth_write_enabled{}; + vk::CompareOp depth_compare_op{}; + + bool depth_bounds_test_enabled{}; + float depth_bounds_min{}; + float depth_bounds_max{}; + + bool depth_bias_enabled{}; + float depth_bias_constant{}; + float depth_bias_clamp{}; + float depth_bias_slope{}; + + bool stencil_test_enabled{}; + StencilOps stencil_front_ops{}; + u32 stencil_front_reference{}; + u32 stencil_front_write_mask{}; + u32 stencil_front_compare_mask{}; + StencilOps stencil_back_ops{}; + u32 stencil_back_reference{}; + u32 stencil_back_write_mask{}; + u32 stencil_back_compare_mask{}; + + bool primitive_restart_enable{}; + bool rasterizer_discard_enable{}; + vk::CullModeFlags cull_mode{}; + vk::FrontFace front_face{}; + + std::array blend_constants{}; + ColorWriteMasks color_write_masks{}; + float line_width{}; + bool feedback_loop_enabled{}; + + /// Commits the dynamic state to the provided command buffer. + void Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf); + + /// Invalidates all dynamic state to be flushed into the next command buffer. 
+ void Invalidate() { + std::memset(&dirty_state, 0xFF, sizeof(dirty_state)); + } + + void SetViewports(const Viewports& viewports_) { + if (!std::ranges::equal(viewports, viewports_)) { + viewports = viewports_; + dirty_state.viewports = true; + } + } + + void SetScissors(const Scissors& scissors_) { + if (!std::ranges::equal(scissors, scissors_)) { + scissors = scissors_; + dirty_state.scissors = true; + } + } + + void SetDepthTestEnabled(const bool enabled) { + if (depth_test_enabled != enabled) { + depth_test_enabled = enabled; + dirty_state.depth_test_enabled = true; + } + } + + void SetDepthWriteEnabled(const bool enabled) { + if (depth_write_enabled != enabled) { + depth_write_enabled = enabled; + dirty_state.depth_write_enabled = true; + } + } + + void SetDepthCompareOp(const vk::CompareOp compare_op) { + if (depth_compare_op != compare_op) { + depth_compare_op = compare_op; + dirty_state.depth_compare_op = true; + } + } + + void SetDepthBoundsTestEnabled(const bool enabled) { + if (depth_bounds_test_enabled != enabled) { + depth_bounds_test_enabled = enabled; + dirty_state.depth_bounds_test_enabled = true; + } + } + + void SetDepthBounds(const float min, const float max) { + if (depth_bounds_min != min || depth_bounds_max != max) { + depth_bounds_min = min; + depth_bounds_max = max; + dirty_state.depth_bounds = true; + } + } + + void SetDepthBiasEnabled(const bool enabled) { + if (depth_bias_enabled != enabled) { + depth_bias_enabled = enabled; + dirty_state.depth_bias_enabled = true; + } + } + + void SetDepthBias(const float constant, const float clamp, const float slope) { + if (depth_bias_constant != constant || depth_bias_clamp != clamp || + depth_bias_slope != slope) { + depth_bias_constant = constant; + depth_bias_clamp = clamp; + depth_bias_slope = slope; + dirty_state.depth_bias = true; + } + } + + void SetStencilTestEnabled(const bool enabled) { + if (stencil_test_enabled != enabled) { + stencil_test_enabled = enabled; + dirty_state.stencil_test_enabled = true; + } + } + + void SetStencilOps(const StencilOps& front_ops, const StencilOps& back_ops) { + if (stencil_front_ops != front_ops) { + stencil_front_ops = front_ops; + dirty_state.stencil_front_ops = true; + } + if (stencil_back_ops != back_ops) { + stencil_back_ops = back_ops; + dirty_state.stencil_back_ops = true; + } + } + + void SetStencilReferences(const u32 front_reference, const u32 back_reference) { + if (stencil_front_reference != front_reference) { + stencil_front_reference = front_reference; + dirty_state.stencil_front_reference = true; + } + if (stencil_back_reference != back_reference) { + stencil_back_reference = back_reference; + dirty_state.stencil_back_reference = true; + } + } + + void SetStencilWriteMasks(const u32 front_write_mask, const u32 back_write_mask) { + if (stencil_front_write_mask != front_write_mask) { + stencil_front_write_mask = front_write_mask; + dirty_state.stencil_front_write_mask = true; + } + if (stencil_back_write_mask != back_write_mask) { + stencil_back_write_mask = back_write_mask; + dirty_state.stencil_back_write_mask = true; + } + } + + void SetStencilCompareMasks(const u32 front_compare_mask, const u32 back_compare_mask) { + if (stencil_front_compare_mask != front_compare_mask) { + stencil_front_compare_mask = front_compare_mask; + dirty_state.stencil_front_compare_mask = true; + } + if (stencil_back_compare_mask != back_compare_mask) { + stencil_back_compare_mask = back_compare_mask; + dirty_state.stencil_back_compare_mask = true; + } + } + + void 
SetPrimitiveRestartEnabled(const bool enabled) { + if (primitive_restart_enable != enabled) { + primitive_restart_enable = enabled; + dirty_state.primitive_restart_enable = true; + } + } + + void SetCullMode(const vk::CullModeFlags cull_mode_) { + if (cull_mode != cull_mode_) { + cull_mode = cull_mode_; + dirty_state.cull_mode = true; + } + } + + void SetFrontFace(const vk::FrontFace front_face_) { + if (front_face != front_face_) { + front_face = front_face_; + dirty_state.front_face = true; + } + } + + void SetBlendConstants(const std::array blend_constants_) { + if (blend_constants != blend_constants_) { + blend_constants = blend_constants_; + dirty_state.blend_constants = true; + } + } + + void SetRasterizerDiscardEnabled(const bool enabled) { + if (rasterizer_discard_enable != enabled) { + rasterizer_discard_enable = enabled; + dirty_state.rasterizer_discard_enable = true; + } + } + + void SetColorWriteMasks(const ColorWriteMasks& color_write_masks_) { + if (!std::ranges::equal(color_write_masks, color_write_masks_)) { + color_write_masks = color_write_masks_; + dirty_state.color_write_masks = true; + } + } + + void SetLineWidth(const float width) { + if (line_width != width) { + line_width = width; + dirty_state.line_width = true; + } + } + + void SetAttachmentFeedbackLoopEnabled(const bool enabled) { + if (feedback_loop_enabled != enabled) { + feedback_loop_enabled = enabled; + dirty_state.feedback_loop_enabled = true; + } + } +}; + +class Scheduler { +public: + explicit Scheduler(const Instance& instance); + ~Scheduler(); + + /// Sends the current execution context to the GPU + /// and increments the scheduler timeline semaphore. + void Flush(SubmitInfo& info); + + /// Sends the current execution context to the GPU + /// and increments the scheduler timeline semaphore. + void Flush(); + + /// Sends the current execution context to the GPU and waits for it to complete. + void Finish(); + + /// Waits for the given tick to trigger on the GPU. + void Wait(u64 tick); + + /// Registers a semaphore wait for cross-queue synchronization (async compute). + void Wait(vk::Semaphore semaphore, u64 value); + + /// Returns the timeline semaphore handle for cross-queue sync. + vk::Semaphore GetTimelineSemaphore() const { + return master_semaphore.Handle(); + } + + /// Attempts to execute operations whose tick the GPU has caught up with. + void PopPendingOperations(); + + /// Starts a new rendering scope with provided state. + void BeginRendering(const RenderState& new_state); + + /// Ends current rendering scope. + void EndRendering(); + + /// Returns the current render state. + const RenderState& GetRenderState() const { + return render_state; + } + + /// Returns the current pipeline dynamic state tracking. + DynamicState& GetDynamicState() { + return dynamic_state; + } + + /// Returns the current command buffer. + vk::CommandBuffer CommandBuffer() const { + return current_cmdbuf; + } + + /// Returns the current command buffer tick. + [[nodiscard]] u64 CurrentTick() const noexcept { + return master_semaphore.CurrentTick(); + } + + /// Returns true when a tick has been triggered by the GPU. + [[nodiscard]] bool IsFree(u64 tick) noexcept { + if (master_semaphore.IsFree(tick)) { + return true; + } + master_semaphore.Refresh(); + return master_semaphore.IsFree(tick); + } + + /// Returns the master timeline semaphore. + [[nodiscard]] MasterSemaphore* GetMasterSemaphore() noexcept { + return &master_semaphore; + } + + /// Defers an operation until the gpu has reached the current cpu tick. 
+    /// Will be run when submitting or calling PopPendingOperations.
+    void DeferOperation(Common::UniqueFunction<void>&& func) {
+        pending_ops.emplace(std::move(func), CurrentTick());
+    }
+
+    /// Defers an operation until the gpu has reached the current cpu tick.
+    /// Runs as soon as possible in another thread.
+    void DeferPriorityOperation(Common::UniqueFunction<void>&& func) {
+        {
+            std::unique_lock lk(priority_pending_ops_mutex);
+            priority_pending_ops.emplace(std::move(func), CurrentTick());
+        }
+        priority_pending_ops_cv.notify_one();
+    }
+
+    static std::mutex submit_mutex;
+
+private:
+    void AllocateWorkerCommandBuffers();
+
+    void SubmitExecution(SubmitInfo& info);
+
+    void PriorityPendingOpsThread(std::stop_token stoken);
+
+private:
+    const Instance& instance;
+    MasterSemaphore master_semaphore;
+    CommandPool command_pool;
+    DynamicState dynamic_state;
+    vk::CommandBuffer current_cmdbuf;
+    std::condition_variable_any event_cv;
+    struct PendingOp {
+        Common::UniqueFunction<void> callback;
+        u64 gpu_tick;
+    };
+    std::queue<PendingOp> pending_ops;
+    std::queue<PendingOp> priority_pending_ops;
+    std::mutex priority_pending_ops_mutex;
+    std::condition_variable_any priority_pending_ops_cv;
+    std::jthread priority_pending_ops_thread;
+    RenderState render_state;
+    bool is_rendering = false;
+    tracy::VkCtxScope* profiler_scope{};
+
+    // For cross-queue sync (async compute)
+    std::vector<vk::Semaphore> wait_semaphores;
+    std::vector<u64> wait_values;
+};
+
+} // namespace Vulkan
diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp
index 418641bc347..f8f1be5ff20 100644
--- a/src/video_core/texture_cache/image.cpp
+++ b/src/video_core/texture_cache/image.cpp
@@ -1,4 +1,5 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+#include
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include
@@ -152,6 +153,13 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
                                ? image_format_properties.value.imageFormatProperties.sampleCounts
                                : vk::SampleCountFlagBits::e1;
 
+    // Check if we need concurrent sharing for async compute
+    const bool has_async = instance->HasDedicatedComputeQueue();
+    std::array<u32, 2> queue_families = {
+        instance->GetGraphicsQueueFamilyIndex(),
+        instance->GetComputeQueueFamilyIndex(),
+    };
+
     const vk::ImageCreateInfo image_ci = {
         .flags = flags,
         .imageType = ConvertImageType(info.type),
@@ -166,6 +174,9 @@
         .samples = LiverpoolToVK::NumSamples(info.num_samples, supported_samples),
         .tiling = tiling,
         .usage = usage_flags,
+        .sharingMode = has_async ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive,
+        .queueFamilyIndexCount = has_async ? static_cast<u32>(queue_families.size()) : 0,
+        .pQueueFamilyIndices = has_async ? queue_families.data() : nullptr,
         .initialLayout = vk::ImageLayout::eUndefined,
     };
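
A rough sketch of how the pieces above are meant to compose at a call site. This is
illustrative only and not part of the patch: DispatchAsync is a hypothetical helper, and
ComputeScheduler::GetTimelineSemaphore() is assumed to exist by analogy with the accessor
this patch adds to the graphics Scheduler.

// Record compute work, submit it on the dedicated queue, then make the next
// graphics submission wait on the compute timeline before consuming the results.
void DispatchAsync(Vulkan::Scheduler& gfx, Vulkan::ComputeScheduler& compute) {
    // Keep the compute queue from running ahead of already-submitted graphics work.
    compute.WaitForGraphics(gfx);

    // ... record dispatches into the compute scheduler's command buffer ...

    // CurrentTick() is the tick the scheduler is currently building; Flush()
    // submits the command buffer and signals that tick on the compute timeline.
    const u64 compute_tick = compute.CurrentTick();
    compute.Flush();

    // Register a cross-queue wait: the next graphics SubmitExecution() folds this
    // semaphore/value pair into its merged pWaitSemaphores list.
    gfx.Wait(compute.GetTimelineSemaphore(), compute_tick);
}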