From 7d763f060eb0fe151a629aa36cce3d7ce076e12a Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 18 Jun 2020 17:47:19 -0300
Subject: vk_update_descriptor: Upload descriptor sets data directly

Instead of staging entries in a temporary container and copying them into
the payload before sending the update task to the worker thread, insert
elements into the payload directly.
---
 src/video_core/renderer_vulkan/vk_rasterizer.cpp   |  4 +--
 .../renderer_vulkan/vk_update_descriptor.cpp       | 36 ++++++++--------------
 .../renderer_vulkan/vk_update_descriptor.h         | 32 +++++++++----------
 3 files changed, 30 insertions(+), 42 deletions(-)

(limited to 'src/video_core')
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 184b2238a..91da9ff80 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -1154,7 +1154,7 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
     const auto sampler = sampler_cache.GetSampler(texture.tsc);
     update_descriptor_queue.AddSampledImage(sampler, image_view);
 
-    const auto image_layout = update_descriptor_queue.GetLastImageLayout();
+    VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
     *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
     sampled_views.push_back(ImageView{std::move(view), image_layout});
 }
@@ -1180,7 +1180,7 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima
         view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
     update_descriptor_queue.AddImage(image_view);
 
-    const auto image_layout = update_descriptor_queue.GetLastImageLayout();
+    VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
     *image_layout = VK_IMAGE_LAYOUT_GENERAL;
     image_views.push_back(ImageView{std::move(view), image_layout});
 }
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
index 681ecde98..351c048d2 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
@@ -24,35 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() {
 }
 
 void VKUpdateDescriptorQueue::Acquire() {
-    entries.clear();
-}
+    // Minimum number of entries required.
+    // This is the maximum number of entries a single draw call might use.
+    static constexpr std::size_t MIN_ENTRIES = 0x400;
 
-void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template,
-                                   VkDescriptorSet set) {
-    if (payload.size() + entries.size() >= payload.max_size()) {
+    if (payload.size() + MIN_ENTRIES >= payload.max_size()) {
         LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread");
         scheduler.WaitWorker();
         payload.clear();
     }
+    upload_start = payload.data() + payload.size();
+}
 
-    // TODO(Rodrigo): Rework to write the payload directly
-    const auto payload_start = payload.data() + payload.size();
-    for (const auto& entry : entries) {
-        if (const auto image = std::get_if<VkDescriptorImageInfo>(&entry)) {
-            payload.push_back(*image);
-        } else if (const auto buffer = std::get_if<VkDescriptorBufferInfo>(&entry)) {
-            payload.push_back(*buffer);
-        } else if (const auto texel = std::get_if<VkBufferView>(&entry)) {
-            payload.push_back(*texel);
-        } else {
-            UNREACHABLE();
-        }
-    }
-
-    scheduler.Record(
-        [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) {
-            logical->UpdateDescriptorSet(set, update_template, payload_start);
-        });
+void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template,
+                                   VkDescriptorSet set) {
+    const void* const data = upload_start;
+    const vk::Device* const logical = &device.GetLogical();
+    scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) {
+        logical->UpdateDescriptorSet(set, update_template, data);
+    });
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h
index cc7e3dff4..945320c72 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.h
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h
@@ -15,17 +15,13 @@ namespace Vulkan {
 class VKDevice;
 class VKScheduler;
 
-class DescriptorUpdateEntry {
-public:
-    explicit DescriptorUpdateEntry() {}
-
-    DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {}
+struct DescriptorUpdateEntry {
+    DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {}
 
-    DescriptorUpdateEntry(VkDescriptorBufferInfo buffer) : buffer{buffer} {}
+    DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {}
 
-    DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {}
+    DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {}
 
-private:
     union {
         VkDescriptorImageInfo image;
         VkDescriptorBufferInfo buffer;
@@ -45,32 +41,34 @@ public:
     void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set);
 
     void AddSampledImage(VkSampler sampler, VkImageView image_view) {
-        entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});
+        payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});
     }
 
     void AddImage(VkImageView image_view) {
-        entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});
+        payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});
     }
 
     void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) {
-        entries.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});
+        payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});
     }
 
     void AddTexelBuffer(VkBufferView texel_buffer) {
-        entries.emplace_back(texel_buffer);
+        payload.emplace_back(texel_buffer);
     }
 
-    VkImageLayout* GetLastImageLayout() {
-        return &std::get<VkDescriptorImageInfo>(entries.back()).imageLayout;
+    VkImageLayout* LastImageLayout() {
+        return &payload.back().image.imageLayout;
     }
 
-private:
-    using Variant = std::variant<VkDescriptorImageInfo, VkDescriptorBufferInfo, VkBufferView>;
+    const VkImageLayout* LastImageLayout() const {
+        return &payload.back().image.imageLayout;
+    }
 
+private:
     const VKDevice& device;
     VKScheduler& scheduler;
 
-    boost::container::static_vector<Variant, 0x400> entries;
+    const DescriptorUpdateEntry* upload_start = nullptr;
     boost::container::static_vector<DescriptorUpdateEntry, 0x10000> payload;
 };
 
-- 
cgit v1.2.3


From cf137ea40b8770310773cf9d51ae5e47bdbddf9d Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 18 Jun 2020 18:16:21 -0300
Subject: vk_rasterizer: Don't preserve contents on full screen clears

There's no need to load contents from the CPU when a clear resets all
the contents of the underlying memory. This is already implemented on
OpenGL and the texture cache.
---
 src/video_core/renderer_vulkan/vk_rasterizer.cpp | 60 +++++++++++++++++++++---
 src/video_core/renderer_vulkan/vk_rasterizer.h   |  5 +-
 2 files changed, 58 insertions(+), 7 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 184b2238a..a5fd68358 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -143,6 +143,49 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
     }
 }
 
+/// @brief Determine if a color attachment being updated has to preserve its contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) {
+    if (!is_clear) {
+        return true;
+    }
+    // First we have to make sure all clear masks (R, G, B and A) are enabled.
+    if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B ||
+        !regs.clear_buffers.A) {
+        return true;
+    }
+    // If scissors are disabled for clears, the whole image is cleared
+    if (!regs.clear_flags.scissor) {
+        return false;
+    }
+    // Then we have to confirm scissor 0 covers the whole render target being cleared
+    const std::size_t index = regs.clear_buffers.RT;
+    const auto& scissor = regs.scissor_test[0];
+    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width ||
+           scissor.max_y < regs.rt[index].height;
+}
+
+/// @brief Determine if the depth (zeta) attachment being updated has to preserve its contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) {
+    // If we are not clearing, the contents have to be preserved
+    if (!is_clear) {
+        return true;
+    }
+    // For depth stencil clears we only have to confirm the scissor covers the whole image
+    if (!regs.clear_flags.scissor) {
+        return false;
+    }
+    // Make sure the clear covers the whole image (scissor 0 against the zeta dimensions)
+    const auto& scissor = regs.scissor_test[0];
+    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width ||
+           scissor.max_y < regs.zeta_height;
+}
+
 } // Anonymous namespace
 
 class BufferBindings final {
@@ -344,7 +387,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 
     buffer_cache.Unmap();
 
-    const Texceptions texceptions = UpdateAttachments();
+    const Texceptions texceptions = UpdateAttachments(false);
     SetupImageTransitions(texceptions, color_attachments, zeta_attachment);
 
     key.renderpass_params = GetRenderPassParams(texceptions);
@@ -400,7 +443,7 @@ void RasterizerVulkan::Clear() {
         return;
     }
 
-    [[maybe_unused]] const auto texceptions = UpdateAttachments();
+    [[maybe_unused]] const auto texceptions = UpdateAttachments(true);
     DEBUG_ASSERT(texceptions.none());
     SetupImageTransitions(0, color_attachments, zeta_attachment);
 
@@ -677,9 +720,12 @@ void RasterizerVulkan::FlushWork() {
     draw_counter = 0;
 }
 
-RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
+RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) {
     MICROPROFILE_SCOPE(Vulkan_RenderTargets);
-    auto& dirty = system.GPU().Maxwell3D().dirty.flags;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    auto& dirty = maxwell3d.dirty.flags;
+    auto& regs = maxwell3d.regs;
+
     const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets];
     dirty[VideoCommon::Dirty::RenderTargets] = false;
 
@@ -688,7 +734,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     Texceptions texceptions;
     for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
         if (update_rendertargets) {
-            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true);
+            const bool preserve_contents = HasToPreserveColorContents(is_clear, regs);
+            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents);
         }
         if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
             texceptions[rt] = true;
@@ -696,7 +743,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     }
 
     if (update_rendertargets) {
-        zeta_attachment = texture_cache.GetDepthBufferSurface(true);
+        const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs);
+        zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents);
     }
     if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
         texceptions[ZETA_TEXCEPTION_INDEX] = true;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index c8c187606..83e00e7e9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -159,7 +159,10 @@ private:
 
     void FlushWork();
 
-    Texceptions UpdateAttachments();
+    /// @brief Updates the currently bound attachments
+    /// @param is_clear True when the framebuffer is updated as a clear
+    /// @return Bitfield of attachments being used as sampled textures
+    Texceptions UpdateAttachments(bool is_clear);
 
     std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass);
 
-- 
cgit v1.2.3


From 4514b80b3eedff01e994f225ea3d2da292c23e01 Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Fri, 19 Jun 2020 21:55:00 -0400
Subject: buffer_cache: Eliminate local variable shadowing

We can just make use of the instance in the scope above this one.
---
 src/video_core/buffer_cache/buffer_cache.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 308d8b55f..bae1d527c 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -47,7 +47,7 @@ public:
                             bool is_written = false, bool use_fast_cbuf = false) {
         std::lock_guard lock{mutex};
 
-        const auto& memory_manager = system.GPU().MemoryManager();
+        auto& memory_manager = system.GPU().MemoryManager();
         const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr_opt) {
             return {GetEmptyBuffer(size), 0};
@@ -59,7 +59,6 @@ public:
         constexpr std::size_t max_stream_size = 0x800;
         if (use_fast_cbuf || size < max_stream_size) {
             if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
-                auto& memory_manager = system.GPU().MemoryManager();
                 const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
                 if (use_fast_cbuf) {
                     u8* dest;
-- 
cgit v1.2.3


From 811bff009eca0d0fa2ddb1455fc73fdaec4474da Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Fri, 19 Jun 2020 21:57:41 -0400
Subject: macro_jit_x64: Eliminate variable shadowing in
 Compile_ProcessResult()

We can reduce the capture scope so that it's not possible for both "reg"
variables to clash with one another.

While we're at it, we can also prevent unnecessary copies.
---
 src/video_core/macro/macro_jit_x64.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
index bee34a7c0..9eface47e 100644
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -546,7 +546,7 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
 }
 
 void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
-    auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) {
+    const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) {
         // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
         // register.
         if (reg == 0) {
@@ -554,7 +554,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3
         }
         mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
     };
-    auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); };
+    const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };
 
     switch (operation) {
     case Macro::ResultOperation::IgnoreAndFetch:
-- 
cgit v1.2.3


From 479605b3e5a3b88128455b8357da471c713d0f90 Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Fri, 19 Jun 2020 22:02:56 -0400
Subject: memory_manager: Eliminate variable shadowing

Renames some variables to prevent ones in inner scopes from shadowing
outer-scoped variables.

The Copy* functions have no shadowing, but we rename their parameters anyway
to remain consistent with the other functions.
---
 src/video_core/memory_manager.cpp | 40 +++++++++++++++++++++------------------
 src/video_core/memory_manager.h   | 12 ++++++------
 2 files changed, 28 insertions(+), 24 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index dbee9f634..ff5505d12 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -210,10 +210,11 @@ bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t si
     return range == inner_size;
 }
 
-void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const {
+void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer,
+                              const std::size_t size) const {
     std::size_t remaining_size{size};
-    std::size_t page_index{src_addr >> page_bits};
-    std::size_t page_offset{src_addr & page_mask};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -234,11 +235,11 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
     }
 }
 
-void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
+void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
                                     const std::size_t size) const {
     std::size_t remaining_size{size};
-    std::size_t page_index{src_addr >> page_bits};
-    std::size_t page_offset{src_addr & page_mask};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -259,10 +260,11 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
     }
 }
 
-void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) {
+void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                               const std::size_t size) {
     std::size_t remaining_size{size};
-    std::size_t page_index{dest_addr >> page_bits};
-    std::size_t page_offset{dest_addr & page_mask};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -283,11 +285,11 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
     }
 }
 
-void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
+void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
                                      const std::size_t size) {
     std::size_t remaining_size{size};
-    std::size_t page_index{dest_addr >> page_bits};
-    std::size_t page_offset{dest_addr & page_mask};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -306,16 +308,18 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
     }
 }
 
-void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                              const std::size_t size) {
     std::vector<u8> tmp_buffer(size);
-    ReadBlock(src_addr, tmp_buffer.data(), size);
-    WriteBlock(dest_addr, tmp_buffer.data(), size);
+    ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
+    WriteBlock(gpu_dest_addr, tmp_buffer.data(), size);
 }
 
-void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                                    const std::size_t size) {
     std::vector<u8> tmp_buffer(size);
-    ReadBlockUnsafe(src_addr, tmp_buffer.data(), size);
-    WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size);
+    ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size);
+    WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size);
 }
 
 bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) {
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 0ddd52d5a..87658e87a 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -79,9 +79,9 @@ public:
      * in the Host Memory counterpart. Note: This functions cause Host GPU Memory
      * Flushes and Invalidations, respectively to each operation.
      */
-    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
 
     /**
      * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
@@ -93,9 +93,9 @@ public:
      * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
      * being flushed.
      */
-    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
 
     /**
      * IsGranularRange checks if a gpu region can be simply read with a pointer
-- 
cgit v1.2.3


From a6e5b84d1fbfa976819645d8b7234d847756fc88 Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Fri, 19 Jun 2020 23:01:56 -0400
Subject: vulkan/wrapper: Remove noexcept from GetSurfaceCapabilitiesKHR()

Check() can throw an exception if the Vulkan result isn't successful.

We remove the noexcept specifier so that std::terminate isn't outright called,
which allows for better debugging (should it ever actually fail).
---
 src/video_core/renderer_vulkan/wrapper.cpp | 3 +--
 src/video_core/renderer_vulkan/wrapper.h   | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp
index 2ce9b0626..42eff85d3 100644
--- a/src/video_core/renderer_vulkan/wrapper.cpp
+++ b/src/video_core/renderer_vulkan/wrapper.cpp
@@ -725,8 +725,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s
     return supported == VK_TRUE;
 }
 
-VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const
-    noexcept {
+VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const {
     VkSurfaceCapabilitiesKHR capabilities;
     Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities));
     return capabilities;
diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h
index 98937a77a..da42ca88e 100644
--- a/src/video_core/renderer_vulkan/wrapper.h
+++ b/src/video_core/renderer_vulkan/wrapper.h
@@ -779,7 +779,7 @@ public:
 
     bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const;
 
-    VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept;
+    VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const;
 
     std::vector<VkSurfaceFormatKHR> GetSurfaceFormatsKHR(VkSurfaceKHR) const;
 
-- 
cgit v1.2.3


From 480e1fa987ce427ce3208a49ae3f08494c417c5c Mon Sep 17 00:00:00 2001
From: Morph <39850852+Morph1984@users.noreply.github.com>
Date: Sun, 14 Jun 2020 00:02:42 -0400
Subject: decode/image: Implement B10G11R11F

- Used by Kirby Star Allies
---
 src/video_core/shader/decode/image.cpp | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
index 60b6ad72a..07778dc3e 100644
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
         break;
     case TextureFormat::B5G6R5:
     case TextureFormat::B6G5R5:
+    case TextureFormat::BF10GF11RF11:
         if (component == 0) {
             return descriptor.b_type;
         }
@@ -119,7 +120,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
         }
         break;
     }
-    UNIMPLEMENTED_MSG("texture format not implement={}", format);
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
     return ComponentType::FLOAT;
 }
 
@@ -191,6 +192,14 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
             return 6;
         }
         return 0;
+    case TextureFormat::BF10GF11RF11:
+        if (component == 1 || component == 2) {
+            return 11;
+        }
+        if (component == 0) {
+            return 10;
+        }
+        return 0;
     case TextureFormat::G8R24:
         if (component == 0) {
             return 8;
@@ -211,10 +220,9 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
         return (component == 0 || component == 1) ? 8 : 0;
     case TextureFormat::G4R4:
         return (component == 0 || component == 1) ? 4 : 0;
-    default:
-        UNIMPLEMENTED_MSG("texture format not implement={}", format);
-        return 0;
     }
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+    return 0;
 }
 
 std::size_t GetImageComponentMask(TextureFormat format) {
@@ -235,6 +243,7 @@ std::size_t GetImageComponentMask(TextureFormat format) {
     case TextureFormat::R32_B24G8:
     case TextureFormat::B5G6R5:
     case TextureFormat::B6G5R5:
+    case TextureFormat::BF10GF11RF11:
         return std::size_t{R | G | B};
     case TextureFormat::R32_G32:
     case TextureFormat::R16_G16:
@@ -248,10 +257,9 @@ std::size_t GetImageComponentMask(TextureFormat format) {
     case TextureFormat::R8:
     case TextureFormat::R1:
         return std::size_t{R};
-    default:
-        UNIMPLEMENTED_MSG("texture format not implement={}", format);
-        return std::size_t{R | G | B | A};
     }
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+    return std::size_t{R | G | B | A};
 }
 
 std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) {
@@ -299,7 +307,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type,
             return {std::move(original_value), true};
         }
     default:
-        UNIMPLEMENTED_MSG("Unimplement component type={}", component_type);
+        UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type);
         return {std::move(original_value), true};
     }
 }
@@ -459,7 +467,7 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
             default:
                 break;
             }
-            UNIMPLEMENTED_MSG("Unimplemented operation={} type={}",
+            UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}",
                               static_cast<u64>(instr.suatom_d.operation.Value()),
                               static_cast<u64>(instr.suatom_d.operation_type.Value()));
             return OperationCode::AtomicImageAdd;
-- 
cgit v1.2.3


From ef53b2fd08f1122f22456500bfdc707f1c18906c Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Fri, 19 Jun 2020 23:13:48 -0400
Subject: texture_cache: Fix incorrect address used in a DeduceSurface() call

Previously the source was being deduced twice in a row.
---
 src/video_core/texture_cache/texture_cache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index b543fc8c0..85075e868 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -1053,7 +1053,7 @@ private:
     void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params,
                         const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) {
         auto deduced_src = DeduceSurface(src_gpu_addr, src_params);
-        auto deduced_dst = DeduceSurface(src_gpu_addr, src_params);
+        auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params);
         if (deduced_src.Failed() || deduced_dst.Failed()) {
             return;
         }
-- 
cgit v1.2.3


From 1e65da971bf6edd5611e6e409ba1cc4f99e58655 Mon Sep 17 00:00:00 2001
From: Morph <39850852+Morph1984@users.noreply.github.com>
Date: Sat, 20 Jun 2020 07:41:55 -0400
Subject: gl_device: Check for GL_EXT_texture_shadow_lod

---
 src/video_core/renderer_opengl/gl_device.cpp | 2 ++
 src/video_core/renderer_opengl/gl_device.h   | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index b31d604e4..1011c7738 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -216,6 +216,7 @@ Device::Device()
     has_shader_ballot = GLAD_GL_ARB_shader_ballot;
     has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
     has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+    has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
     has_astc = IsASTCSupported();
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
@@ -245,6 +246,7 @@ Device::Device(std::nullptr_t) {
     has_shader_ballot = true;
     has_vertex_viewport_layer = true;
     has_image_load_formatted = true;
+    has_texture_shadow_lod = true;
     has_variable_aoffi = true;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 145347943..c86e709b1 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -68,6 +68,10 @@ public:
         return has_image_load_formatted;
     }
 
+    bool HasTextureShadowLod() const {
+        return has_texture_shadow_lod;
+    }
+
     bool HasASTC() const {
         return has_astc;
     }
@@ -110,6 +114,7 @@ private:
     bool has_shader_ballot{};
     bool has_vertex_viewport_layer{};
     bool has_image_load_formatted{};
+    bool has_texture_shadow_lod{};
     bool has_astc{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
-- 
cgit v1.2.3


From f77c897b8d5287adb64e08f65e494dac45033de3 Mon Sep 17 00:00:00 2001
From: Morph <39850852+Morph1984@users.noreply.github.com>
Date: Sat, 20 Jun 2020 07:43:04 -0400
Subject: gl_shader_decompiler: Enable GL_EXT_texture_shadow_lod if available

Enable GL_EXT_texture_shadow_lod if available. If this extension is not available, such as on Intel/AMD proprietary drivers, use textureGrad as a workaround.
---
 .../renderer_opengl/gl_shader_decompiler.cpp       | 50 +++++++++++++++++++---
 1 file changed, 43 insertions(+), 7 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index d6e30b321..2c49aeaac 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::PixelImap;
 using Tegra::Shader::Register;
+using Tegra::Shader::TextureType;
 using VideoCommon::Shader::BuildTransformFeedback;
 using VideoCommon::Shader::Registry;
 
@@ -526,6 +527,9 @@ private:
         if (device.HasImageLoadFormatted()) {
             code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
         }
+        if (device.HasTextureShadowLod()) {
+            code.AddLine("#extension GL_EXT_texture_shadow_lod : require");
+        }
         if (device.HasWarpIntrinsics()) {
             code.AddLine("#extension GL_NV_gpu_shader5 : require");
             code.AddLine("#extension GL_NV_shader_thread_group : require");
@@ -909,13 +913,13 @@ private:
                     return "samplerBuffer";
                 }
                 switch (sampler.type) {
-                case Tegra::Shader::TextureType::Texture1D:
+                case TextureType::Texture1D:
                     return "sampler1D";
-                case Tegra::Shader::TextureType::Texture2D:
+                case TextureType::Texture2D:
                     return "sampler2D";
-                case Tegra::Shader::TextureType::Texture3D:
+                case TextureType::Texture3D:
                     return "sampler3D";
-                case Tegra::Shader::TextureType::TextureCube:
+                case TextureType::TextureCube:
                     return "samplerCube";
                 default:
                     UNREACHABLE();
@@ -1380,8 +1384,19 @@ private:
         const std::size_t count = operation.GetOperandsCount();
         const bool has_array = meta->sampler.is_array;
         const bool has_shadow = meta->sampler.is_shadow;
+        const bool workaround_lod_array_shadow_as_grad =
+            !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube);
+
+        std::string expr = "texture";
+
+        if (workaround_lod_array_shadow_as_grad) {
+            expr += "Grad";
+        } else {
+            expr += function_suffix;
+        }
 
-        std::string expr = "texture" + function_suffix;
         if (!meta->aoffi.empty()) {
             expr += "Offset";
         } else if (!meta->ptp.empty()) {
@@ -1415,6 +1430,16 @@ private:
             expr += ')';
         }
 
+        if (workaround_lod_array_shadow_as_grad) {
+            switch (meta->sampler.type) {
+            case TextureType::Texture2D:
+                return expr + ", vec2(0.0), vec2(0.0))";
+            case TextureType::TextureCube:
+                return expr + ", vec3(0.0), vec3(0.0))";
+            }
+            UNREACHABLE();
+        }
+
         for (const auto& variant : extras) {
             if (const auto argument = std::get_if<TextureArgument>(&variant)) {
                 expr += GenerateTextureArgument(*argument);
@@ -2041,8 +2066,19 @@ private:
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
-        std::string expr = GenerateTexture(
-            operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        std::string expr{};
+
+        if (!device.HasTextureShadowLod() && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube)) {
+            LOG_ERROR(Render_OpenGL,
+                      "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround");
+            expr = GenerateTexture(operation, "Lod", {});
+        } else {
+            expr = GenerateTexture(operation, "Lod",
+                                   {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        }
+
         if (meta->sampler.is_shadow) {
             expr = "vec4(" + expr + ')';
         }
-- 
cgit v1.2.3


From 2f09c7ddd314f03da0fbafacfcae6b0a47a209ae Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 22 Jun 2020 04:10:45 -0300
Subject: renderer_vulkan: Update validation layer name and test before
 enabling

Update validation layer string to VK_LAYER_KHRONOS_validation.

While we are at it, properly check for available validation layers
before enabling them.
---
 src/video_core/renderer_vulkan/renderer_vulkan.cpp | 28 ++++++++++++++++++----
 src/video_core/renderer_vulkan/wrapper.cpp         | 16 ++++++++++++-
 src/video_core/renderer_vulkan/wrapper.h           |  4 ++++
 3 files changed, 43 insertions(+), 5 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index cd9673d1f..2d9b18ed9 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -155,11 +155,31 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc
         }
     }
 
-    static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"};
-    vk::Span<const char*> layers = layers_data;
-    if (!enable_layers) {
-        layers = {};
+    std::vector<const char*> layers;
+    layers.reserve(1);
+    if (enable_layers) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+
+    const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld);
+    if (!layer_properties) {
+        LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers");
+        layers.clear();
+    }
+
+    for (auto layer_it = layers.begin(); layer_it != layers.end();) {
+        const char* const layer = *layer_it;
+        const auto it = std::find_if(
+            layer_properties->begin(), layer_properties->end(),
+            [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); });
+        if (it == layer_properties->end()) {
+            LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer);
+            layer_it = layers.erase(layer_it);
+        } else {
+            ++layer_it;
+        }
     }
+
     vk::Instance instance = vk::Instance::Create(layers, extensions, dld);
     if (!instance) {
         LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance");
diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp
index 42eff85d3..0d485a662 100644
--- a/src/video_core/renderer_vulkan/wrapper.cpp
+++ b/src/video_core/renderer_vulkan/wrapper.cpp
@@ -153,7 +153,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
 
 bool Load(InstanceDispatch& dld) noexcept {
 #define X(name) Proc(dld.name, dld, #name)
-    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties);
+    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) &&
+           X(vkEnumerateInstanceLayerProperties);
 #undef X
 }
 
@@ -770,4 +771,17 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp
     return properties;
 }
 
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+    const InstanceDispatch& dld) {
+    u32 num;
+    if (dld.vkEnumerateInstanceLayerProperties(&num, nullptr) != VK_SUCCESS) {
+        return std::nullopt;
+    }
+    std::vector<VkLayerProperties> properties(num);
+    if (dld.vkEnumerateInstanceLayerProperties(&num, properties.data()) != VK_SUCCESS) {
+        return std::nullopt;
+    }
+    return properties;
+}
+
 } // namespace Vulkan::vk
diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h
index da42ca88e..d56fdb3f9 100644
--- a/src/video_core/renderer_vulkan/wrapper.h
+++ b/src/video_core/renderer_vulkan/wrapper.h
@@ -141,6 +141,7 @@ struct InstanceDispatch {
     PFN_vkCreateInstance vkCreateInstance;
     PFN_vkDestroyInstance vkDestroyInstance;
     PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties;
+    PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties;
 
     PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT;
     PFN_vkCreateDevice vkCreateDevice;
@@ -996,4 +997,7 @@ private:
 std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties(
     const InstanceDispatch& dld);
 
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+    const InstanceDispatch& dld);
+
 } // namespace Vulkan::vk
-- 
cgit v1.2.3


From 544b15e8e415d56b415189717805a88b2e5dc06f Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Mon, 22 Jun 2020 11:29:55 -0400
Subject: TextureCache: Fix case where layer goes off bound.

The returned layer is expected to be in the range [0, depth) of the
surface; any value equal to or larger than the depth is out of bounds.
---
 src/video_core/texture_cache/surface_base.cpp | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 94d3a6ae5..0caf3b4f0 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -120,6 +120,9 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap(
     }
     const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)};
     const auto layer{static_cast<u32>(relative_address / layer_size)};
+    if (layer >= params.depth) {
+        return {};
+    }
     const GPUVAddr mipmap_address = relative_address - layer_size * layer;
     const auto mipmap_it =
         Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address);
-- 
cgit v1.2.3


From 39ab33ee1c976d2653ceef724c0e60ece0c2ba06 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 22 Jun 2020 20:46:25 -0300
Subject: shader/half_set: Implement HSET2_IMM

Add HSET2_IMM. Due to the complexity of the encoding, avoid using
BitField unions and read the relevant bits from the instruction word
itself. This is less error-prone.
---
 src/video_core/engines/shader_bytecode.h  |  8 +++
 src/video_core/shader/decode/half_set.cpp | 88 +++++++++++++++++++++++--------
 2 files changed, 75 insertions(+), 21 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index e7cb87589..d374b73cf 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -661,6 +661,10 @@ union Instruction {
     constexpr Instruction(u64 value) : value{value} {}
     constexpr Instruction(const Instruction& instr) : value(instr.value) {}
 
+    constexpr bool Bit(u64 offset) const {
+        return ((value >> offset) & 1) != 0;
+    }
+
     BitField<0, 8, Register> gpr0;
     BitField<8, 8, Register> gpr8;
     union {
@@ -1874,7 +1878,9 @@ public:
         HSETP2_C,
         HSETP2_R,
         HSETP2_IMM,
+        HSET2_C,
         HSET2_R,
+        HSET2_IMM,
         POPC_C,
         POPC_R,
         POPC_IMM,
@@ -2194,7 +2200,9 @@ private:
             INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
             INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
             INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
+            INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"),
             INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
+            INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"),
             INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),
             INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 848e46874..b2e88fa20 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -13,55 +13,101 @@
 
 namespace VideoCommon::Shader {
 
+using std::move;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
+using Tegra::Shader::PredCondition;
 
 u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    if (instr.hset2.ftz == 0) {
-        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    PredCondition cond;
+    bool bf;
+    bool ftz;
+    bool neg_a;
+    bool abs_a;
+    bool neg_b;
+    bool abs_b;
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSET2_C:
+    case OpCode::Id::HSET2_IMM:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        bf = instr.Bit(53);
+        ftz = instr.Bit(54);
+        neg_a = instr.Bit(43);
+        abs_a = instr.Bit(44);
+        neg_b = instr.Bit(56);
+        abs_b = instr.Bit(54);
+        break;
+    case OpCode::Id::HSET2_R:
+        cond = instr.hsetp2.reg.cond;
+        bf = instr.Bit(49);
+        ftz = instr.Bit(50);
+        neg_a = instr.Bit(43);
+        abs_a = instr.Bit(44);
+        neg_b = instr.Bit(31);
+        abs_b = instr.Bit(30);
+        break;
+    default:
+        UNREACHABLE();
     }
 
-    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
-    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
-
-    Node op_b = [&]() {
+    Node op_b = [this, instr, opcode] {
         switch (opcode->get().GetId()) {
+        case OpCode::Id::HSET2_C:
+            // Inform as unimplemented as this is not tested.
+            UNIMPLEMENTED_MSG("HSET2_C is not implemented");
+            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
         case OpCode::Id::HSET2_R:
             return GetRegister(instr.gpr20);
+        case OpCode::Id::HSET2_IMM:
+            return UnpackHalfImmediate(instr, true);
         default:
             UNREACHABLE();
-            return Immediate(0);
+            return Node{};
         }
     }();
-    op_b = UnpackHalfFloat(op_b, instr.hset2.type_b);
-    op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b);
 
-    const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+    if (!ftz) {
+        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    }
+
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
+    op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSET2_R:
+        op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b);
+        [[fallthrough]];
+    case OpCode::Id::HSET2_C:
+        op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b);
+        break;
+    default:
+        break;
+    }
 
-    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b);
+    Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+
+    Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b);
 
     const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);
 
     // HSET2 operates on each half float in the pack.
     std::array<Node, 2> values;
     for (u32 i = 0; i < 2; ++i) {
-        const u32 raw_value = instr.hset2.bf ? 0x3c00 : 0xffff;
-        const Node true_value = Immediate(raw_value << (i * 16));
-        const Node false_value = Immediate(0);
-
-        const Node comparison =
-            Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
-        const Node predicate = Operation(combiner, comparison, second_pred);
+        const u32 raw_value = bf ? 0x3c00 : 0xffff;
+        Node true_value = Immediate(raw_value << (i * 16));
+        Node false_value = Immediate(0);
 
+        Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
+        Node predicate = Operation(combiner, comparison, second_pred);
         values[i] =
-            Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value);
+            Operation(OperationCode::Select, predicate, move(true_value), move(false_value));
     }
 
-    const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]);
-    SetRegister(bb, instr.gpr0, value);
+    Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]);
+    SetRegister(bb, instr.gpr0, move(value));
 
     return pc;
 }
-- 
cgit v1.2.3


From 9f54cd4dad58c2c99874a9fe6bb4c34052a65555 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Tue, 23 Jun 2020 22:51:03 -0300
Subject: gl_shader_cache: Avoid use after move for program size

All programs had a size of zero due to this bug, skipping invalidations.

While we are at it, remove some unused forward declarations.
---
 src/video_core/renderer_opengl/gl_shader_cache.cpp | 12 +++++++-----
 src/video_core/renderer_opengl/gl_shader_cache.h   |  1 -
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 46e780a06..c6a3bf3a1 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -460,8 +460,9 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
         const u8* host_ptr_b = memory_manager.GetPointer(address_b);
         code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);
     }
+    const std::size_t code_size = code.size() * sizeof(u64);
 
-    const auto unique_identifier = GetUniqueIdentifier(
+    const u64 unique_identifier = GetUniqueIdentifier(
         GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
 
     const ShaderParameters params{system,    disk_cache, device,
@@ -477,7 +478,7 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     Shader* const result = shader.get();
     if (cpu_addr) {
-        Register(std::move(shader), *cpu_addr, code.size() * sizeof(u64));
+        Register(std::move(shader), *cpu_addr, code_size);
     } else {
         null_shader = std::move(shader);
     }
@@ -495,8 +496,9 @@ Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
 
     const auto host_ptr{memory_manager.GetPointer(code_addr)};
     // No kernel found, create a new one
-    auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+    ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
+    const std::size_t code_size{code.size() * sizeof(u64)};
+    const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
 
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
@@ -511,7 +513,7 @@ Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
 
     Shader* const result = kernel.get();
     if (cpu_addr) {
-        Register(std::move(kernel), *cpu_addr, code.size() * sizeof(u64));
+        Register(std::move(kernel), *cpu_addr, code_size);
     } else {
         null_kernel = std::move(kernel);
     }
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 6848f1388..994aaeaf2 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -37,7 +37,6 @@ namespace OpenGL {
 
 class Device;
 class RasterizerOpenGL;
-struct UnspecializedShader;
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
-- 
cgit v1.2.3


From 6ce5f3120be6a65a798d3abc6fda0fe6171d0296 Mon Sep 17 00:00:00 2001
From: David Marcec <dmarcecguzman@gmail.com>
Date: Fri, 5 Jun 2020 01:42:19 +1000
Subject: Macro HLE support

---
 src/video_core/CMakeLists.txt              |   2 +
 src/video_core/engines/maxwell_3d.cpp      |   2 +-
 src/video_core/engines/maxwell_3d.h        |   4 ++
 src/video_core/macro/macro.cpp             |  35 ++++++++--
 src/video_core/macro/macro.h               |  19 ++++-
 src/video_core/macro/macro_hle.cpp         | 108 +++++++++++++++++++++++++++++
 src/video_core/macro/macro_hle.h           |  43 ++++++++++++
 src/video_core/macro/macro_interpreter.cpp |   3 +-
 src/video_core/macro/macro_jit_x64.cpp     |   3 +-
 9 files changed, 209 insertions(+), 10 deletions(-)
 create mode 100644 src/video_core/macro/macro_hle.cpp
 create mode 100644 src/video_core/macro/macro_hle.h

(limited to 'src/video_core')

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 099bb446e..2dc752aa9 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -27,6 +27,8 @@ add_library(video_core STATIC
     engines/shader_type.h
     macro/macro.cpp
     macro/macro.h
+    macro/macro_hle.cpp
+    macro/macro_hle.h
     macro/macro_interpreter.cpp
     macro/macro_interpreter.h
     macro/macro_jit_x64.cpp
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index ea3c8a963..c01436295 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -128,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters)
         ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
 
     // Execute the current macro.
-    macro_engine->Execute(macro_positions[entry], parameters);
+    macro_engine->Execute(*this, macro_positions[entry], parameters);
     if (mme_draw.current_mode != MMEDrawMode::Undefined) {
         FlushMMEInlineDraw();
     }
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index d5fe25065..5926c4d2d 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1418,6 +1418,10 @@ public:
         return execute_on;
     }
 
+    VideoCore::RasterizerInterface& GetRasterizer() {
+        return rasterizer;
+    }
+
     /// Notify a memory write has happened.
     void OnMemoryWrite() {
         dirty.flags |= dirty.on_write_stores;
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
index 89077a2d8..c8aa2534a 100644
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -2,23 +2,37 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <boost/container_hash/hash.hpp>
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/macro/macro.h"
+#include "video_core/macro/macro_hle.h"
 #include "video_core/macro/macro_interpreter.h"
 #include "video_core/macro/macro_jit_x64.h"
 
 namespace Tegra {
 
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
+    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
+
+MacroEngine::~MacroEngine() {}
+
 void MacroEngine::AddCode(u32 method, u32 data) {
     uploaded_macro_code[method].push_back(data);
 }
 
-void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
+void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
+                          const std::vector<u32>& parameters) {
     auto compiled_macro = macro_cache.find(method);
     if (compiled_macro != macro_cache.end()) {
-        compiled_macro->second->Execute(parameters, method);
+        const auto& cache_info = compiled_macro->second;
+        if (cache_info.has_hle_program) {
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
+        }
     } else {
         // Macro not compiled, check if it's uploaded and if so, compile it
         auto macro_code = uploaded_macro_code.find(method);
@@ -26,8 +40,21 @@ void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
             UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
             return;
         }
-        macro_cache[method] = Compile(macro_code->second);
-        macro_cache[method]->Execute(parameters, method);
+        auto& cache_info = macro_cache[method];
+        cache_info.hash = boost::hash_value(macro_code->second);
+        cache_info.lle_program = Compile(macro_code->second);
+
+        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+        if (hle_program.has_value()) {
+            cache_info.has_hle_program = true;
+            cache_info.hle_program = std::move(hle_program.value());
+        }
+
+        if (cache_info.has_hle_program) {
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
+        }
     }
 }
 
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
index b76ed891f..5fa8023af 100644
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -11,9 +11,11 @@
 #include "common/common_types.h"
 
 namespace Tegra {
+
 namespace Engines {
 class Maxwell3D;
 }
+
 namespace Macro {
 constexpr std::size_t NUM_MACRO_REGISTERS = 8;
 enum class Operation : u32 {
@@ -94,6 +96,8 @@ union MethodAddress {
 
 } // namespace Macro
 
+class HLEMacro;
+
 class CachedMacro {
 public:
     virtual ~CachedMacro() = default;
@@ -107,20 +111,29 @@ public:
 
 class MacroEngine {
 public:
-    virtual ~MacroEngine() = default;
+    MacroEngine(Engines::Maxwell3D& maxwell3d);
+    virtual ~MacroEngine();
 
     // Store the uploaded macro code to compile them when they're called.
     void AddCode(u32 method, u32 data);
 
     // Compiles the macro if its not in the cache, and executes the compiled macro
-    void Execute(u32 method, const std::vector<u32>& parameters);
+    void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
 
 protected:
     virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
 
 private:
-    std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
+    struct CacheInfo {
+        std::unique_ptr<CachedMacro> lle_program{};
+        std::unique_ptr<CachedMacro> hle_program{};
+        u64 hash{};
+        bool has_hle_program{};
+    };
+
+    std::unordered_map<u32, CacheInfo> macro_cache;
     std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+    std::unique_ptr<HLEMacro> hle_macros;
 };
 
 std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
new file mode 100644
index 000000000..51827c822
--- /dev/null
+++ b/src/video_core/macro/macro_hle.cpp
@@ -0,0 +1,108 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <unordered_map>
+#include <vector>
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra {
+
+// HLE'd functions
+static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
+
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] &
+                                                                        ~(0x3ffffff << 26)));
+    maxwell3d.regs.vb_base_instance = parameters[5];
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.regs.vb_element_base = parameters[3];
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.index_array.first = parameters[4];
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+}
+
+static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+
+    maxwell3d.regs.vertex_buffer.first = parameters[3];
+    maxwell3d.regs.vertex_buffer.count = parameters[1];
+    maxwell3d.regs.vb_base_instance = parameters[4];
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    maxwell3d.mme_draw.instance_count = count;
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(false, true);
+    }
+    maxwell3d.regs.vertex_buffer.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+}
+
+static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+    const u32 element_base = parameters[4];
+    const u32 base_instance = parameters[5];
+    maxwell3d.regs.index_array.first = parameters[3];
+    maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.vb_element_base = element_base;
+    maxwell3d.regs.vb_base_instance = base_instance;
+    maxwell3d.regs.const_buffer.cb_pos = 0x640;
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.regs.const_buffer.cb_data[0] = element_base;
+    maxwell3d.regs.const_buffer.cb_data[1] = base_instance;
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.regs.vb_element_base = 0x0;
+    maxwell3d.regs.vb_base_instance = 0x0;
+    maxwell3d.regs.const_buffer.cb_pos = 0x640;
+    maxwell3d.regs.const_buffer.cb_data[0] = 0;
+    maxwell3d.regs.const_buffer.cb_data[1] = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+}
+
+static const std::unordered_map<u64, HLEFunction> hle_funcs{
+    {0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
+    {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD},
+    {0x0217920100488FF7, &HLE_0217920100488FF7},
+};
+
+HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+HLEMacro::~HLEMacro() = default;
+
+std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
+    auto it = hle_funcs.find(hash);
+    if (it != hle_funcs.end()) {
+        return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
+    } else {
+        return {};
+    }
+}
+
+HLEMacroImpl::~HLEMacroImpl() = default;
+
+HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
+    : maxwell3d(maxwell3d), func(func) {}
+
+void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
+    func(maxwell3d, parameters);
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
new file mode 100644
index 000000000..de7f43dc4
--- /dev/null
+++ b/src/video_core/macro/macro_hle.h
@@ -0,0 +1,43 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
+
+class HLEMacro {
+public:
+    HLEMacro(Engines::Maxwell3D& maxwell3d);
+    ~HLEMacro();
+    std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+};
+
+class HLEMacroImpl : public CachedMacro {
+public:
+    explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
+    ~HLEMacroImpl();
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+    HLEFunction func;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 5edff27aa..aa5256419 100644
--- a/src/video_core/macro/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -11,7 +11,8 @@
 MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
 
 namespace Tegra {
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
 
 std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
     return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
index 30abb66e5..07292702f 100644
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -28,7 +28,8 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
     BRANCH_HOLDER,
 });
 
-MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
 
 std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
     return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
-- 
cgit v1.2.3


From 74b4334d510b58d96e8305bc3f5a7c8d05e842ba Mon Sep 17 00:00:00 2001
From: David Marcec <dmarcecguzman@gmail.com>
Date: Fri, 5 Jun 2020 12:59:59 +1000
Subject: Fix constbuffer for 0217920100488FF7

---
 src/video_core/macro/macro_hle.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 51827c822..887f40310 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -59,10 +59,10 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
     maxwell3d.regs.index_array.count = parameters[1];
     maxwell3d.regs.vb_element_base = element_base;
     maxwell3d.regs.vb_base_instance = base_instance;
-    maxwell3d.regs.const_buffer.cb_pos = 0x640;
     maxwell3d.mme_draw.instance_count = instance_count;
-    maxwell3d.regs.const_buffer.cb_data[0] = element_base;
-    maxwell3d.regs.const_buffer.cb_data[1] = base_instance;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, element_base);
+    maxwell3d.CallMethodFromMME(0x8e5, base_instance);
     maxwell3d.regs.draw.topology.Assign(
         static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
     if (maxwell3d.ShouldExecute()) {
@@ -72,10 +72,10 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
     maxwell3d.regs.index_array.count = 0;
     maxwell3d.regs.vb_element_base = 0x0;
     maxwell3d.regs.vb_base_instance = 0x0;
-    maxwell3d.regs.const_buffer.cb_pos = 0x640;
-    maxwell3d.regs.const_buffer.cb_data[0] = 0;
-    maxwell3d.regs.const_buffer.cb_data[1] = 0;
     maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, 0x0);
+    maxwell3d.CallMethodFromMME(0x8e5, 0x0);
 }
 
 static const std::unordered_map<u64, HLEFunction> hle_funcs{
-- 
cgit v1.2.3


From fabdf5d3850c078d173653f259845c26a2ce6e7d Mon Sep 17 00:00:00 2001
From: David Marcec <dmarcecguzman@gmail.com>
Date: Fri, 5 Jun 2020 13:09:52 +1000
Subject: Addressed issues

---
 src/video_core/engines/maxwell_3d.h |  4 ++++
 src/video_core/macro/macro.cpp      |  2 +-
 src/video_core/macro/macro.h        |  2 +-
 src/video_core/macro/macro_hle.cpp  | 20 ++++++++++----------
 src/video_core/macro/macro_hle.h    |  2 +-
 5 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 5926c4d2d..ef1618990 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1422,6 +1422,10 @@ public:
         return rasterizer;
     }
 
+    const VideoCore::RasterizerInterface& GetRasterizer() const {
+        return rasterizer;
+    }
+
     /// Notify a memory write has happened.
     void OnMemoryWrite() {
         dirty.flags |= dirty.on_write_stores;
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
index c8aa2534a..ef7dad349 100644
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -17,7 +17,7 @@ namespace Tegra {
 MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
     : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
 
-MacroEngine::~MacroEngine() {}
+MacroEngine::~MacroEngine() = default;
 
 void MacroEngine::AddCode(u32 method, u32 data) {
     uploaded_macro_code[method].push_back(data);
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
index 5fa8023af..4d00b84b0 100644
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -111,7 +111,7 @@ public:
 
 class MacroEngine {
 public:
-    MacroEngine(Engines::Maxwell3D& maxwell3d);
+    explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
     virtual ~MacroEngine();
 
     // Store the uploaded macro code to compile them when they're called.
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 887f40310..1f1348df3 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -2,7 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <unordered_map>
+#include <array>
 #include <vector>
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/macro/macro_hle.h"
@@ -78,22 +78,22 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
     maxwell3d.CallMethodFromMME(0x8e5, 0x0);
 }
 
-static const std::unordered_map<u64, HLEFunction> hle_funcs{
-    {0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
-    {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD},
-    {0x0217920100488FF7, &HLE_0217920100488FF7},
+static const std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{
+    std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0),
+    std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD),
+    std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7),
 };
 
 HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
 HLEMacro::~HLEMacro() = default;
 
 std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
-    auto it = hle_funcs.find(hash);
-    if (it != hle_funcs.end()) {
-        return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
-    } else {
-        return {};
+    const auto it = std::find_if(hle_funcs.begin(), hle_funcs.end(),
+                                 [hash](auto& pair) { return pair.first == hash; });
+    if (it == hle_funcs.end()) {
+        return std::nullopt;
     }
+    return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
 }
 
 HLEMacroImpl::~HLEMacroImpl() = default;
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
index de7f43dc4..7cd492a8f 100644
--- a/src/video_core/macro/macro_hle.h
+++ b/src/video_core/macro/macro_hle.h
@@ -20,7 +20,7 @@ using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u3
 
 class HLEMacro {
 public:
-    HLEMacro(Engines::Maxwell3D& maxwell3d);
+    explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
     ~HLEMacro();
     std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
 
-- 
cgit v1.2.3


From 52340e94ac5a64572643f01a23316ad492a40f66 Mon Sep 17 00:00:00 2001
From: David Marcec <dmarcecguzman@gmail.com>
Date: Fri, 5 Jun 2020 14:00:00 +1000
Subject: clear mme draw mode

The draw has already been issued at this point, so the MME draw mode can be cleared.
---
 src/video_core/macro/macro_hle.cpp | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 1f1348df3..689533f6a 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -29,6 +29,7 @@ static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
     }
     maxwell3d.regs.index_array.count = 0;
     maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
 }
 
 static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
@@ -47,6 +48,7 @@ static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
     }
     maxwell3d.regs.vertex_buffer.count = 0;
     maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
 }
 
 static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
@@ -76,6 +78,7 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
     maxwell3d.CallMethodFromMME(0x8e3, 0x640);
     maxwell3d.CallMethodFromMME(0x8e4, 0x0);
     maxwell3d.CallMethodFromMME(0x8e5, 0x0);
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
 }
 
 static const std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{
-- 
cgit v1.2.3


From f5e2aec4220ee2b72ec2986e0e60625897b2fd44 Mon Sep 17 00:00:00 2001
From: David Marcec <dmarcecguzman@gmail.com>
Date: Wed, 24 Jun 2020 12:18:33 +1000
Subject: addressed issues

---
 src/video_core/macro/macro_hle.cpp | 10 ++++++----
 src/video_core/macro/macro_hle.h   |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 689533f6a..410f99018 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -10,6 +10,7 @@
 
 namespace Tegra {
 
+namespace {
 // HLE'd functions
 static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
                                  const std::vector<u32>& parameters) {
@@ -80,19 +81,20 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
     maxwell3d.CallMethodFromMME(0x8e5, 0x0);
     maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
 }
+} // namespace
 
-static const std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{
+constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
     std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0),
     std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD),
     std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7),
-};
+}};
 
 HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
 HLEMacro::~HLEMacro() = default;
 
 std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
-    const auto it = std::find_if(hle_funcs.begin(), hle_funcs.end(),
-                                 [hash](auto& pair) { return pair.first == hash; });
+    const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
+                                 [hash](const auto& pair) { return pair.first == hash; });
     if (it == hle_funcs.end()) {
         return std::nullopt;
     }
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
index 7cd492a8f..37af875a0 100644
--- a/src/video_core/macro/macro_hle.h
+++ b/src/video_core/macro/macro_hle.h
@@ -22,6 +22,7 @@ class HLEMacro {
 public:
     explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
     ~HLEMacro();
+
     std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
 
 private:
-- 
cgit v1.2.3


From da79ec9565f670bcf1f09fdf7d9ae0241d97a241 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 11 May 2020 16:18:53 -0300
Subject: gl_stream_buffer: Always use persistent memory maps

yuzu no longer supports platforms without persistent maps.
---
 .../renderer_opengl/gl_stream_buffer.cpp           | 40 +++++++---------------
 src/video_core/renderer_opengl/gl_stream_buffer.h  |  4 +--
 2 files changed, 14 insertions(+), 30 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 932a2f69e..9cf0f6b46 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -14,8 +14,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
 
 namespace OpenGL {
 
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
-                                 bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent)
     : buffer_size(size) {
     gl_buffer.Create();
 
@@ -29,23 +28,16 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
         allocate_size *= 2;
     }
 
-    if (use_persistent) {
-        persistent = true;
-        coherent = prefer_coherent;
-        const GLbitfield flags =
-            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
-        glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
-        mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
-            gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
-    } else {
-        glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
-    }
+    coherent = prefer_coherent;
+    const GLbitfield flags =
+        GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+    glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+    mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
+        gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
 }
 
 OGLStreamBuffer::~OGLStreamBuffer() {
-    if (persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
+    glUnmapNamedBuffer(gl_buffer.handle);
     gl_buffer.Release();
 }
 
@@ -63,16 +55,14 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
         buffer_pos = 0;
         invalidate = true;
 
-        if (persistent) {
-            glUnmapNamedBuffer(gl_buffer.handle);
-        }
+        glUnmapNamedBuffer(gl_buffer.handle);
     }
 
-    if (invalidate || !persistent) {
+    if (invalidate) {
         MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
-                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
-                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
+        const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
+                                 GL_MAP_INVALIDATE_BUFFER_BIT |
+                                 (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT);
         mapped_ptr = static_cast<u8*>(
             glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
         mapped_offset = buffer_pos;
@@ -88,10 +78,6 @@ void OGLStreamBuffer::Unmap(GLsizeiptr size) {
         glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
     }
 
-    if (!persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
-
     buffer_pos += size;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index 866da3594..65c3da93f 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -13,8 +13,7 @@ namespace OpenGL {
 
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
-                             bool use_persistent = true);
+    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false);
     ~OGLStreamBuffer();
 
     /*
@@ -41,7 +40,6 @@ private:
     OGLBuffer gl_buffer;
 
     bool coherent = false;
-    bool persistent = false;
 
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
-- 
cgit v1.2.3


From 00c66a728958c3b2804131ce5baf44880119e018 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 11 May 2020 16:21:08 -0300
Subject: gl_stream_buffer: Always use a non-coherent buffer

---
 src/video_core/renderer_opengl/gl_stream_buffer.cpp | 20 +++++++++-----------
 src/video_core/renderer_opengl/gl_stream_buffer.h   |  4 +---
 2 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 9cf0f6b46..aeafcfbfe 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -4,6 +4,7 @@
 
 #include <deque>
 #include <vector>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
@@ -14,8 +15,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
 
 namespace OpenGL {
 
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent)
-    : buffer_size(size) {
+OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage) : buffer_size(size) {
     gl_buffer.Create();
 
     GLsizeiptr allocate_size = size;
@@ -28,12 +28,10 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
         allocate_size *= 2;
     }
 
-    coherent = prefer_coherent;
-    const GLbitfield flags =
-        GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
     glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
-    mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
-        gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
+    mapped_ptr = static_cast<u8*>(
+        glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
 }
 
 OGLStreamBuffer::~OGLStreamBuffer() {
@@ -59,10 +57,10 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
     }
 
     if (invalidate) {
+        static const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
+                                        GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_FLUSH_EXPLICIT_BIT;
+
         MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
-                                 GL_MAP_INVALIDATE_BUFFER_BIT |
-                                 (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT);
         mapped_ptr = static_cast<u8*>(
             glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
         mapped_offset = buffer_pos;
@@ -74,7 +72,7 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
     ASSERT(size <= mapped_size);
 
-    if (!coherent && size > 0) {
+    if (size > 0) {
         glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
     }
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index 65c3da93f..826c2e361 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -13,7 +13,7 @@ namespace OpenGL {
 
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false);
+    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage);
     ~OGLStreamBuffer();
 
     /*
@@ -39,8 +39,6 @@ public:
 private:
     OGLBuffer gl_buffer;
 
-    bool coherent = false;
-
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
     GLintptr mapped_offset = 0;
-- 
cgit v1.2.3


From 73fb3a304b215abce3cfb1c0c5eb2b43740b65ed Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 18 Jun 2020 03:54:13 -0300
Subject: gl_device: Expose NV_vertex_buffer_unified_memory except on Turing

Expose NV_vertex_buffer_unified_memory when the driver supports it.

This commit adds a function to determine if a GL_RENDERER is a Turing
GPU. This is required because on Turing GPUs Nvidia's driver crashes
when the buffer is marked as resident or on DeleteBuffers. Without a
synchronous debug output (single threaded driver), it's likely that
the driver will crash in the first blocking call.
---
 src/video_core/renderer_opengl/gl_device.cpp | 26 +++++++++++++++++++++++++-
 src/video_core/renderer_opengl/gl_device.h   |  5 +++++
 2 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 1011c7738..447a19595 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -188,16 +188,32 @@ bool IsASTCSupported() {
     return true;
 }
 
+/// @brief Returns true when a GL_RENDERER is a Turing GPU
+/// @param renderer GL_RENDERER string
+bool IsTuring(std::string_view renderer) {
+    static constexpr std::array<std::string_view, 12> TURING_GPUS = {
+        "GTX 1650",        "GTX 1660",        "RTX 2060",        "RTX 2070",
+        "RTX 2080",        "TITAN RTX",       "Quadro RTX 3000", "Quadro RTX 4000",
+        "Quadro RTX 5000", "Quadro RTX 6000", "Quadro RTX 8000", "Tesla T4",
+    };
+    return std::any_of(TURING_GPUS.begin(), TURING_GPUS.end(),
+                       [renderer](std::string_view candidate) {
+                           return renderer.find(candidate) != std::string_view::npos;
+                       });
+}
+
 } // Anonymous namespace
 
 Device::Device()
     : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+    const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
     const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
     const std::vector extensions = GetExtensions();
 
     const bool is_nvidia = vendor == "NVIDIA Corporation";
     const bool is_amd = vendor == "ATI Technologies Inc.";
+    const bool is_turing = is_nvidia && IsTuring(renderer);
 
     bool disable_fast_buffer_sub_data = false;
     if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
@@ -221,8 +237,16 @@ Device::Device()
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
-    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
     has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+
+    // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
+    // uniform buffers as "push constants"
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
+    // Nvidia's driver on Turing GPUs randomly crashes when the buffer is made resident, or on
+    // DeleteBuffers. Disable unified memory on these devices.
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory && !is_turing;
+
     use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
                            GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
                            GLAD_GL_NV_transform_feedback2;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index c86e709b1..e1d811966 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -72,6 +72,10 @@ public:
         return has_texture_shadow_lod;
     }
 
+    bool HasVertexBufferUnifiedMemory() const {
+        return has_vertex_buffer_unified_memory;
+    }
+
     bool HasASTC() const {
         return has_astc;
     }
@@ -115,6 +119,7 @@ private:
     bool has_vertex_viewport_layer{};
     bool has_image_load_formatted{};
     bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
     bool has_astc{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
-- 
cgit v1.2.3


From 32485917ba7cb7b2f0cad766c0897365294650a7 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 11 May 2020 16:35:04 -0300
Subject: gl_buffer_cache: Mark buffers as resident

Mark the stream buffer and cached buffers as resident and query their
address. This allows us to use GPU addresses for several proprietary
Nvidia extensions.
---
 src/video_core/buffer_cache/buffer_cache.h         | 21 ++++++-----
 src/video_core/renderer_opengl/gl_buffer_cache.cpp | 24 ++++++++----
 src/video_core/renderer_opengl/gl_buffer_cache.h   | 20 +++++++---
 src/video_core/renderer_opengl/gl_rasterizer.cpp   | 44 +++++++++++-----------
 .../renderer_opengl/gl_stream_buffer.cpp           | 11 +++++-
 src/video_core/renderer_opengl/gl_stream_buffer.h  | 11 +++++-
 src/video_core/renderer_vulkan/vk_buffer_cache.cpp |  4 +-
 src/video_core/renderer_vulkan/vk_buffer_cache.h   |  6 ++-
 src/video_core/renderer_vulkan/vk_rasterizer.cpp   | 31 ++++++++-------
 src/video_core/renderer_vulkan/vk_stream_buffer.h  |  6 ++-
 10 files changed, 111 insertions(+), 67 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index bae1d527c..6ea59253a 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -41,7 +41,11 @@ class BufferCache {
     static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
 
 public:
-    using BufferInfo = std::pair<BufferType, u64>;
+    struct BufferInfo {
+        BufferType handle;
+        u64 offset;
+        u64 address;
+    };
 
     BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
                             bool is_written = false, bool use_fast_cbuf = false) {
@@ -50,7 +54,7 @@ public:
         auto& memory_manager = system.GPU().MemoryManager();
         const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr_opt) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
         }
         const VAddr cpu_addr = *cpu_addr_opt;
 
@@ -88,7 +92,7 @@ public:
         Buffer* const block = GetBlock(cpu_addr, size);
         MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
         if (!map) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
         }
         if (is_written) {
             map->MarkAsModified(true, GetModifiedTicks());
@@ -101,7 +105,7 @@ public:
             }
         }
 
-        return {block->Handle(), static_cast<u64>(block->Offset(cpu_addr))};
+        return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()};
     }
 
     /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
@@ -254,13 +258,12 @@ public:
         committed_flushes.pop_front();
     }
 
-    virtual BufferType GetEmptyBuffer(std::size_t size) = 0;
+    virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
 
 protected:
     explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                         std::unique_ptr<StreamBuffer> stream_buffer_)
-        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)},
-          stream_buffer_handle{stream_buffer->Handle()} {}
+                         std::unique_ptr<StreamBuffer> stream_buffer)
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {}
 
     ~BufferCache() = default;
 
@@ -449,7 +452,7 @@ private:
 
         buffer_ptr += size;
         buffer_offset += size;
-        return {stream_buffer_handle, uploaded_offset};
+        return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
     }
 
     void AlignBuffer(std::size_t alignment) {
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index ad0577a4f..e09b47f57 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -22,21 +22,28 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
 
-Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
+    : VideoCommon::BufferBlock{cpu_addr, size} {
     gl_buffer.Create();
     glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    }
 }
 
 Buffer::~Buffer() = default;
 
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               const Device& device, std::size_t stream_size)
-    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+                               const Device& device_, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system,
+                         std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+      device{device_} {
     if (!device.HasFastBufferSubData()) {
         return;
     }
 
-    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
     glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
     for (const GLuint cbuf : cbufs) {
         glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -48,11 +55,11 @@ OGLBufferCache::~OGLBufferCache() {
 }
 
 std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<Buffer>(cpu_addr, size);
+    return std::make_shared<Buffer>(device, cpu_addr, size);
 }
 
-GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
-    return 0;
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    return {0, 0, 0};
 }
 
 void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
@@ -79,8 +86,9 @@ OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_poi
                                                              std::size_t size) {
     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
     const GLuint cbuf = cbufs[cbuf_cursor++];
+
     glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
-    return {cbuf, 0};
+    return {cbuf, 0, 0};
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a49aaf9c4..6462cfae5 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -25,15 +25,20 @@ class RasterizerOpenGL;
 
 class Buffer : public VideoCommon::BufferBlock {
 public:
-    explicit Buffer(VAddr cpu_addr, const std::size_t size);
+    explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
     ~Buffer();
 
-    GLuint Handle() const {
+    GLuint Handle() const noexcept {
         return gl_buffer.handle;
     }
 
+    u64 Address() const noexcept {
+        return gpu_address;
+    }
+
 private:
     OGLBuffer gl_buffer;
+    u64 gpu_address = 0;
 };
 
 using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
@@ -43,7 +48,7 @@ public:
                             const Device& device, std::size_t stream_size);
     ~OGLBufferCache();
 
-    GLuint GetEmptyBuffer(std::size_t) override;
+    BufferInfo GetEmptyBuffer(std::size_t) override;
 
     void Acquire() noexcept {
         cbuf_cursor = 0;
@@ -64,10 +69,13 @@ protected:
     BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
 
 private:
+    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+    const Device& device;
+
     std::size_t cbuf_cursor = 0;
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
-        cbufs;
+    std::array<GLuint, NUM_CBUFS> cbufs{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 2d6c11320..7cb378a71 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -253,8 +253,8 @@ void RasterizerOpenGL::SetupVertexBuffer() {
             glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
             continue;
         }
-        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
-        glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
+        const auto info = buffer_cache.UploadMemory(start, size);
+        glBindVertexBuffer(static_cast<GLuint>(index), info.handle, info.offset,
                            vertex_array.stride);
     }
 }
@@ -285,9 +285,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
     MICROPROFILE_SCOPE(OpenGL_Index);
     const auto& regs = system.GPU().Maxwell3D().regs;
     const std::size_t size = CalculateIndexBufferSize();
-    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer);
-    return offset;
+    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+    return info.offset;
 }
 
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
@@ -643,9 +643,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     if (!device.UseAssemblyShaders()) {
         MaxwellUniformData ubo;
         ubo.SetFromRegs(gpu);
-        const auto [buffer, offset] =
+        const auto info =
             buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
                           static_cast<GLsizeiptr>(sizeof(ubo)));
     }
 
@@ -956,8 +956,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
         if (device.UseAssemblyShaders()) {
             glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
         } else {
-            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
-                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
         }
         return;
     }
@@ -970,24 +969,25 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
 
     const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
     const GPUVAddr gpu_addr = buffer.address;
-    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
 
     if (device.UseAssemblyShaders()) {
         UNIMPLEMENTED_IF(use_unified);
-        if (offset != 0) {
+        if (info.offset != 0) {
             const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
-            cbuf = staging_cbuf;
-            offset = 0;
+            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+            info.handle = staging_cbuf;
+            info.offset = 0;
         }
-        glBindBufferRangeNV(stage, binding, cbuf, offset, size);
+        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
         return;
     }
 
     if (use_unified) {
-        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+                                 unified_offset, size);
     } else {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
     }
 }
 
@@ -1023,9 +1023,8 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
 void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
-    const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset,
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
                       static_cast<GLsizeiptr>(size));
 }
 
@@ -1712,8 +1711,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
         const GLuint handle = transform_feedback_buffers[index].handle;
         const GPUVAddr gpu_addr = binding.Address();
         const std::size_t size = binding.buffer_size;
-        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-        glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+                                 static_cast<GLsizeiptr>(size));
     }
 }
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index aeafcfbfe..164df4feb 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,12 +2,13 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <deque>
+#include <tuple>
 #include <vector>
 
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
 MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -15,7 +16,8 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
 
 namespace OpenGL {
 
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage) : buffer_size(size) {
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
+    : buffer_size(size) {
     gl_buffer.Create();
 
     GLsizeiptr allocate_size = size;
@@ -32,6 +34,11 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage) : buff
     glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
     mapped_ptr = static_cast<u8*>(
         glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    }
 }
 
 OGLStreamBuffer::~OGLStreamBuffer() {
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index 826c2e361..e67a82980 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,9 +11,11 @@
 
 namespace OpenGL {
 
+class Device;
+
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage);
+    explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
     ~OGLStreamBuffer();
 
     /*
@@ -32,13 +34,18 @@ public:
         return gl_buffer.handle;
     }
 
-    GLsizeiptr Size() const {
+    u64 Address() const {
+        return gpu_address;
+    }
+
+    GLsizeiptr Size() const noexcept {
         return buffer_size;
     }
 
 private:
     OGLBuffer gl_buffer;
 
+    GLuint64EXT gpu_address = 0;
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
     GLintptr mapped_offset = 0;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 1fde38328..df258d7a4 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -71,14 +71,14 @@ std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t s
     return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size);
 }
 
-VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
+VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
     size = std::max(size, std::size_t(4));
     const auto& empty = staging_pool.GetUnusedBuffer(size, false);
     scheduler.RequestOutsideRenderPassOperationContext();
     scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
         cmdbuf.FillBuffer(buffer, 0, size, 0);
     });
-    return *empty.handle;
+    return {*empty.handle, 0, 0};
 }
 
 void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 9ebbef835..682383ff2 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -33,6 +33,10 @@ public:
         return *buffer.handle;
     }
 
+    u64 Address() const {
+        return 0;
+    }
+
 private:
     VKBuffer buffer;
 };
@@ -44,7 +48,7 @@ public:
                            VKScheduler& scheduler, VKStagingBufferPool& staging_pool);
     ~VKBufferCache();
 
-    VkBuffer GetEmptyBuffer(std::size_t size) override;
+    BufferInfo GetEmptyBuffer(std::size_t size) override;
 
 protected:
     std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 29001953c..e3714ee6d 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -870,10 +870,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
     UNIMPLEMENTED_IF(binding.buffer_offset != 0);
 
     const GPUVAddr gpu_addr = binding.Address();
-    const auto size = static_cast<VkDeviceSize>(binding.buffer_size);
-    const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+    const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size);
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
 
-    scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {
         cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);
         cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
     });
@@ -925,8 +925,8 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
             buffer_bindings.AddVertexBinding(DefaultBuffer(), 0);
             continue;
         }
-        const auto [buffer, offset] = buffer_cache.UploadMemory(start, size);
-        buffer_bindings.AddVertexBinding(buffer, offset);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        buffer_bindings.AddVertexBinding(info.handle, info.offset);
     }
 }
 
@@ -948,7 +948,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
             break;
         }
         const GPUVAddr gpu_addr = regs.index_array.IndexStart();
-        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        VkBuffer buffer = info.handle;
+        u64 offset = info.offset;
         std::tie(buffer, offset) = quad_indexed_pass.Assemble(
             regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset);
 
@@ -962,7 +964,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
             break;
         }
         const GPUVAddr gpu_addr = regs.index_array.IndexStart();
-        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        VkBuffer buffer = info.handle;
+        u64 offset = info.offset;
 
         auto format = regs.index_array.format;
         const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte;
@@ -1109,10 +1113,9 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
         Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
     ASSERT(size <= MaxConstbufferSize);
 
-    const auto [buffer_handle, offset] =
+    const auto info =
         buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
-
-    update_descriptor_queue.AddBuffer(buffer_handle, offset, size);
+    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
 }
 
 void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) {
@@ -1126,14 +1129,14 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
         // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the
         // default buffer.
         static constexpr std::size_t dummy_size = 4;
-        const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size);
-        update_descriptor_queue.AddBuffer(buffer, 0, dummy_size);
+        const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
+        update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
         return;
     }
 
-    const auto [buffer, offset] = buffer_cache.UploadMemory(
+    const auto info = buffer_cache.UploadMemory(
         actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten());
-    update_descriptor_queue.AddBuffer(buffer, offset, size);
+    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
 }
 
 void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index c765c60a0..689f0d276 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -35,10 +35,14 @@ public:
     /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
     void Unmap(u64 size);
 
-    VkBuffer Handle() const {
+    VkBuffer Handle() const noexcept {
         return *buffer;
     }
 
+    u64 Address() const noexcept {
+        return 0;
+    }
+
 private:
     struct Watch final {
         VKFenceWatch fence;
-- 
cgit v1.2.3


From 41a4090320ee52e914e8b4c789dfe14210794fed Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 18 Jun 2020 03:56:31 -0300
Subject: gl_rasterizer: Use NV_vertex_buffer_unified_memory for vertex buffer
 robustness

Switch games are allowed to bind less data than what they use in a
vertex buffer; the expected behavior here is that these values are read
as zero. At the time of writing, only D3D12, OpenGL and NVN through
NV_vertex_buffer_unified_memory support vertex buffers with a size limit.

In theory this could be emulated on Vulkan by creating a new VkBuffer for
each (handle, offset, length) tuple and binding the expected data to it.
This is likely going to be slow and memory expensive when used on the
vertex buffer, and we would have to do it on all draws because we can't
know without analyzing indices when a game is going to read vertex data
out of bounds.

This is not a problem on OpenGL's BufferAddressRangeNV because it takes
a length parameter, unlike Vulkan's CmdBindVertexBuffers that only takes
buffers and offsets (the length is implicit in VkBuffer). It isn't a
problem on D3D12 either, because D3D12_VERTEX_BUFFER_VIEW on
IASetVertexBuffers takes SizeInBytes as a parameter (although I am not
familiar with robustness on D3D12).

Currently this only implements buffer ranges for vertex buffers,
although indices can also be affected. A KHR_robustness profile is not
created, but Nvidia's driver reads out-of-bounds vertex data as zero
anyway; this might have to be changed in the future.

- Fixes SMO random triangles when capturing an enemy, getting hit, or
looking at the environment on certain maps.
---
 src/video_core/renderer_opengl/gl_rasterizer.cpp   | 28 +++++++++++++++-------
 src/video_core/renderer_opengl/renderer_opengl.cpp | 17 ++++++++++++-
 src/video_core/renderer_opengl/renderer_opengl.h   |  3 +++
 3 files changed, 39 insertions(+), 9 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 7cb378a71..362457ffe 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -61,7 +61,8 @@ constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
 constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
     NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
 
-constexpr std::size_t NumSupportedVertexAttributes = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
 
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
@@ -193,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
     // avoid OpenGL errors.
     // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
     // assume every shader uses them all.
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexFormat0 + index]) {
             continue;
         }
@@ -231,9 +232,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
     MICROPROFILE_SCOPE(OpenGL_VB);
 
+    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
     // Upload all guest vertex arrays sequentially to our buffer
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
         }
@@ -246,16 +249,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
         const GPUVAddr start = vertex_array.StartAddress();
         const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
         ASSERT(end >= start);
+
+        const GLuint gl_index = static_cast<GLuint>(index);
         const u64 size = end - start;
         if (size == 0) {
-            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            if (use_unified_memory) {
+                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+            }
             continue;
         }
         const auto info = buffer_cache.UploadMemory(start, size);
-        glBindVertexBuffer(static_cast<GLuint>(index), info.handle, info.offset,
-                           vertex_array.stride);
+        if (use_unified_memory) {
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+                                   info.address + info.offset, size);
+        } else {
+            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+        }
     }
 }
 
@@ -268,7 +280,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
     flags[Dirty::VertexInstances] = false;
 
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexInstance0 + index]) {
             continue;
         }
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 6214fcbc3..c40adb6e7 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() {
 
     // Clear screen to black
     LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
+
+    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+
+        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+                                         &vertex_buffer_address);
+    }
 }
 
 void RendererOpenGL::AddTelemetryFields() {
@@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
                          offsetof(ScreenRectVertex, tex_coord));
     glVertexAttribBinding(PositionLocation, 0);
     glVertexAttribBinding(TexCoordLocation, 0);
-    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+                               sizeof(vertices));
+    } else {
+        glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    }
 
     glBindTextureUnit(0, screen_info.display_texture);
     glBindSampler(0, 0);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 61bf507f4..8b18d32e6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -107,6 +107,9 @@ private:
     OGLPipeline pipeline;
     OGLFramebuffer screenshot_framebuffer;
 
+    // GPU address of the vertex buffer
+    GLuint64EXT vertex_buffer_address = 0;
+
     /// Display information for Switch screen
     ScreenInfo screen_info;
 
-- 
cgit v1.2.3


From 39c97f1b652898dbd0e5e6d028de2ba4b9fa94a0 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 18 Jun 2020 21:53:47 -0300
Subject: gl_stream_buffer: Use InvalidateBufferData instead of unmap and map

Making the stream buffer resident increases GPU usage significantly on
some games. This seems to be addressed by invalidating the stream buffer
with InvalidateBufferData instead of using an Unmap + Map (with
invalidation flags).
---
 src/video_core/renderer_opengl/gl_stream_buffer.cpp | 19 +++++--------------
 src/video_core/renderer_opengl/gl_stream_buffer.h   |  1 -
 2 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 164df4feb..3655ff629 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -57,30 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
 
     bool invalidate = false;
     if (buffer_pos + size > buffer_size) {
+        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+        glInvalidateBufferData(gl_buffer.handle);
+
         buffer_pos = 0;
         invalidate = true;
-
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
-
-    if (invalidate) {
-        static const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
-                                        GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_FLUSH_EXPLICIT_BIT;
-
-        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        mapped_ptr = static_cast<u8*>(
-            glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
-        mapped_offset = buffer_pos;
     }
 
-    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
 }
 
 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
     ASSERT(size <= mapped_size);
 
     if (size > 0) {
-        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
+        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
     }
 
     buffer_pos += size;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index e67a82980..307a67113 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -48,7 +48,6 @@ private:
     GLuint64EXT gpu_address = 0;
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
-    GLintptr mapped_offset = 0;
     GLsizeiptr mapped_size = 0;
     u8* mapped_ptr = nullptr;
 };
-- 
cgit v1.2.3


From 32a2dcd4153f4e2aea7b5f88c85d8a352f647f12 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 19 Jun 2020 20:47:48 -0300
Subject: buffer_cache: Use buffer methods instead of cache virtual methods

---
 src/video_core/buffer_cache/buffer_cache.h         | 23 ++----
 src/video_core/renderer_opengl/gl_buffer_cache.cpp | 38 +++++----
 src/video_core/renderer_opengl/gl_buffer_cache.h   | 16 ++--
 src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 89 +++++++++++-----------
 src/video_core/renderer_vulkan/vk_buffer_cache.h   | 23 +++---
 5 files changed, 90 insertions(+), 99 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 6ea59253a..cf8bdd021 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -269,15 +269,6 @@ protected:
 
     virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
 
-    virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                 const u8* data) = 0;
-
-    virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                   u8* data) = 0;
-
-    virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                           std::size_t dst_offset, std::size_t size) = 0;
-
     virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
         return {};
     }
@@ -339,11 +330,11 @@ private:
             const VAddr cpu_addr_end = cpu_addr + size;
             if (memory_manager.IsGranularRange(gpu_addr, size)) {
                 u8* host_ptr = memory_manager.GetPointer(gpu_addr);
-                UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr);
+                block->Upload(block->Offset(cpu_addr), size, host_ptr);
             } else {
                 staging_buffer.resize(size);
                 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data());
+                block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
             }
             return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
         }
@@ -402,7 +393,7 @@ private:
             }
             staging_buffer.resize(size);
             system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
-            UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data());
+            block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
         }
     }
 
@@ -439,7 +430,7 @@ private:
 
         const std::size_t size = map->end - map->start;
         staging_buffer.resize(size);
-        DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data());
+        block->Download(block->Offset(map->start), size, staging_buffer.data());
         system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
         map->MarkAsModified(false, 0);
     }
@@ -467,7 +458,7 @@ private:
         const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
         const VAddr cpu_addr = buffer->CpuAddr();
         std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
-        CopyBlock(*buffer, *new_buffer, 0, 0, old_size);
+        new_buffer->CopyFrom(*buffer, 0, 0, old_size);
         QueueDestruction(std::move(buffer));
 
         const VAddr cpu_addr_end = cpu_addr + new_size - 1;
@@ -489,8 +480,8 @@ private:
         const std::size_t new_size = size_1 + size_2;
 
         std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
-        CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1);
-        CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2);
+        new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
+        new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
         QueueDestruction(std::move(first));
         QueueDestruction(std::move(second));
 
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index e09b47f57..d9f7b4cc6 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -34,6 +34,24 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
 
 Buffer::~Buffer() = default;
 
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
+    glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                         data);
+}
+
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+    glGetNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                            data);
+}
+
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) const {
+    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+}
+
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                                const Device& device_, std::size_t stream_size)
     : GenericBufferCache{rasterizer, system,
@@ -62,26 +80,6 @@ OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
     return {0, 0, 0};
 }
 
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                     const u8* data) {
-    glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
-                         static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                       u8* data) {
-    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
-    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
-                            static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                               std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
-}
-
 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                              std::size_t size) {
     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 6462cfae5..59d95adbc 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -28,6 +28,13 @@ public:
     explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
     ~Buffer();
 
+    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+
+    void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size) const;
+
     GLuint Handle() const noexcept {
         return gl_buffer.handle;
     }
@@ -57,15 +64,6 @@ public:
 protected:
     std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
-
     BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
 
 private:
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index df258d7a4..f10f96cd8 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -37,9 +37,9 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch
 
 } // Anonymous namespace
 
-Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
-               std::size_t size)
-    : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_,
+               VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size)
+    : VideoCommon::BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} {
     VkBufferCreateInfo ci;
     ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
     ci.pNext = nullptr;
@@ -56,40 +56,15 @@ Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cp
 
 Buffer::~Buffer() = default;
 
-VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                             const VKDevice& device, VKMemoryManager& memory_manager,
-                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
-    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
-                                                                 CreateStreamBuffer(device,
-                                                                                    scheduler)},
-      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
-                                                                                staging_pool} {}
-
-VKBufferCache::~VKBufferCache() = default;
-
-std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size);
-}
-
-VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
-    size = std::max(size, std::size_t(4));
-    const auto& empty = staging_pool.GetUnusedBuffer(size, false);
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
-        cmdbuf.FillBuffer(buffer, 0, size, 0);
-    });
-    return {*empty.handle, 0, 0};
-}
-
-void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                    const u8* data) {
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     std::memcpy(staging.commit->Map(size), data, size);
 
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
-                      size](vk::CommandBuffer cmdbuf) {
-        cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});
+
+    const VkBuffer handle = Handle();
+    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
+        cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size});
 
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -98,7 +73,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
         barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS;
         barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
+        barrier.buffer = handle;
         barrier.offset = offset;
         barrier.size = size;
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
@@ -106,12 +81,12 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
     });
 }
 
-void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                      u8* data) {
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
-                      size](vk::CommandBuffer cmdbuf) {
+
+    const VkBuffer handle = Handle();
+    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
         barrier.pNext = nullptr;
@@ -119,7 +94,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
         barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
         barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
+        barrier.buffer = handle;
         barrier.offset = offset;
         barrier.size = size;
 
@@ -127,17 +102,19 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
                                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
-        cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size});
+        cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size});
     });
     scheduler.Finish();
 
     std::memcpy(data, staging.commit->Map(size), size);
 }
 
-void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                              std::size_t dst_offset, std::size_t size) {
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) const {
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset,
+
+    const VkBuffer dst_buffer = Handle();
+    scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
                       size](vk::CommandBuffer cmdbuf) {
         cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
 
@@ -165,4 +142,30 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t
     });
 }
 
+VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+                             const VKDevice& device, VKMemoryManager& memory_manager,
+                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
+    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
+                                                                 CreateStreamBuffer(device,
+                                                                                    scheduler)},
+      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
+                                                                                staging_pool} {}
+
+VKBufferCache::~VKBufferCache() = default;
+
+std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr,
+                                    size);
+}
+
+VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
+    size = std::max(size, std::size_t(4));
+    const auto& empty = staging_pool.GetUnusedBuffer(size, false);
+    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
+        cmdbuf.FillBuffer(buffer, 0, size, 0);
+    });
+    return {*empty.handle, 0, 0};
+}
+
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 682383ff2..3630aca77 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -25,10 +25,17 @@ class VKScheduler;
 
 class Buffer final : public VideoCommon::BufferBlock {
 public:
-    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
-                    std::size_t size);
+    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler,
+                    VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size);
     ~Buffer();
 
+    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+
+    void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size) const;
+
     VkBuffer Handle() const {
         return *buffer.handle;
     }
@@ -38,6 +45,9 @@ public:
     }
 
 private:
+    VKScheduler& scheduler;
+    VKStagingBufferPool& staging_pool;
+
     VKBuffer buffer;
 };
 
@@ -53,15 +63,6 @@ public:
 protected:
     std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
-
 private:
     const VKDevice& device;
     VKMemoryManager& memory_manager;
-- 
cgit v1.2.3


From bc8d3b8f82c06e5d0b5a7c1640ef00b83e826dbf Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 25 Jun 2020 01:28:45 -0300
Subject: gl_device: Enable NV_vertex_buffer_unified_memory on Turing devices

Once we make sure not to corrupt Nvidia's driver, we can safely use
resident buffers on Turing devices.

See GitHub pull request #4156
---
 src/video_core/renderer_opengl/gl_device.cpp | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 447a19595..bb1375f82 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -188,20 +188,6 @@ bool IsASTCSupported() {
     return true;
 }
 
-/// @brief Returns true when a GL_RENDERER is a Turing GPU
-/// @param renderer GL_RENDERER string
-bool IsTuring(std::string_view renderer) {
-    static constexpr std::array<std::string_view, 12> TURING_GPUS = {
-        "GTX 1650",        "GTX 1660",        "RTX 2060",        "RTX 2070",
-        "RTX 2080",        "TITAN RTX",       "Quadro RTX 3000", "Quadro RTX 4000",
-        "Quadro RTX 5000", "Quadro RTX 6000", "Quadro RTX 8000", "Tesla T4",
-    };
-    return std::any_of(TURING_GPUS.begin(), TURING_GPUS.end(),
-                       [renderer](std::string_view candidate) {
-                           return renderer.find(candidate) != std::string_view::npos;
-                       });
-}
-
 } // Anonymous namespace
 
 Device::Device()
@@ -213,7 +199,6 @@ Device::Device()
 
     const bool is_nvidia = vendor == "NVIDIA Corporation";
     const bool is_amd = vendor == "ATI Technologies Inc.";
-    const bool is_turing = is_nvidia && IsTuring(renderer);
 
     bool disable_fast_buffer_sub_data = false;
     if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
@@ -238,15 +223,12 @@ Device::Device()
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
     has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
 
     // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
     // uniform buffers as "push constants"
     has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
 
-    // Nvidia's driver on Turing GPUs randomly crashes when the buffer is made resident, or on
-    // DeleteBuffers. Disable unified memory on these devices.
-    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory && !is_turing;
-
     use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
                            GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
                            GLAD_GL_NV_transform_feedback2;
-- 
cgit v1.2.3


From a927d8be52c343bc1025e5df822c56470eb27919 Mon Sep 17 00:00:00 2001
From: David Marcec <dmarcecguzman@gmail.com>
Date: Thu, 25 Jun 2020 19:12:56 +1000
Subject: gl_device: Fix IsASTCSupported

Other targets were never actually checked
---
 src/video_core/renderer_opengl/gl_device.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 447a19595..b6b6659c1 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -178,7 +178,7 @@ bool IsASTCSupported() {
         for (const GLenum format : formats) {
             for (const GLenum support : required_support) {
                 GLint value;
-                glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value);
+                glGetInternalformativ(target, format, support, 1, &value);
                 if (value != GL_FULL_SUPPORT) {
                     return false;
                 }
-- 
cgit v1.2.3


From 6481d91e4a5b5fbae899c3a7924af0b132c16bc8 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 26 Jun 2020 16:58:40 -0300
Subject: gl_buffer_cache: Copy to buffers created as STREAM_READ before
 downloading

After marking buffers as resident, Nvidia's driver seems to take a
slow path. To work around this issue, copy to a STREAM_READ buffer and
then call GetNamedBufferSubData on it.

This is a temporary solution until we have asynchronous flushing.
---
 src/video_core/buffer_cache/buffer_cache.h         |  6 ++----
 src/video_core/renderer_opengl/gl_buffer_cache.cpp | 17 ++++++++++++-----
 src/video_core/renderer_opengl/gl_buffer_cache.h   |  7 ++++---
 src/video_core/renderer_vulkan/vk_buffer_cache.cpp |  6 +++---
 src/video_core/renderer_vulkan/vk_buffer_cache.h   |  6 +++---
 5 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index cf8bdd021..c6479af9f 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -322,8 +322,7 @@ protected:
     }
 
 private:
-    MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,
-                            std::size_t size) {
+    MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) {
         const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
         if (overlaps.empty()) {
             auto& memory_manager = system.GPU().MemoryManager();
@@ -377,8 +376,7 @@ private:
         return map;
     }
 
-    void UpdateBlock(const Buffer* block, VAddr start, VAddr end,
-                     const VectorMapInterval& overlaps) {
+    void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) {
         const IntervalType base_interval{start, end};
         IntervalSet interval_set{};
         interval_set.add(base_interval);
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index d9f7b4cc6..e461e4c70 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -34,20 +34,27 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
 
 Buffer::~Buffer() = default;
 
-void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
     glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
                          data);
 }
 
-void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
     MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size);
+    const GLintptr gl_offset = static_cast<GLintptr>(offset);
+    if (read_buffer.handle == 0) {
+        read_buffer.Create();
+        glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr,
+                          GL_STREAM_READ);
+    }
     glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
-                            data);
+    glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size);
+    glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data);
 }
 
 void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                      std::size_t size) const {
+                      std::size_t size) {
     glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
                              static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
 }
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 59d95adbc..88fdc0536 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -28,12 +28,12 @@ public:
     explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
     ~Buffer();
 
-    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+    void Upload(std::size_t offset, std::size_t size, const u8* data);
 
-    void Download(std::size_t offset, std::size_t size, u8* data) const;
+    void Download(std::size_t offset, std::size_t size, u8* data);
 
     void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                  std::size_t size) const;
+                  std::size_t size);
 
     GLuint Handle() const noexcept {
         return gl_buffer.handle;
@@ -45,6 +45,7 @@ public:
 
 private:
     OGLBuffer gl_buffer;
+    OGLBuffer read_buffer;
     u64 gpu_address = 0;
 };
 
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index f10f96cd8..2be38d419 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -56,7 +56,7 @@ Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKSchedu
 
 Buffer::~Buffer() = default;
 
-void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     std::memcpy(staging.commit->Map(size), data, size);
 
@@ -81,7 +81,7 @@ void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const
     });
 }
 
-void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     scheduler.RequestOutsideRenderPassOperationContext();
 
@@ -110,7 +110,7 @@ void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
 }
 
 void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                      std::size_t size) const {
+                      std::size_t size) {
     scheduler.RequestOutsideRenderPassOperationContext();
 
     const VkBuffer dst_buffer = Handle();
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 3630aca77..991ee451c 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -29,12 +29,12 @@ public:
                     VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size);
     ~Buffer();
 
-    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+    void Upload(std::size_t offset, std::size_t size, const u8* data);
 
-    void Download(std::size_t offset, std::size_t size, u8* data) const;
+    void Download(std::size_t offset, std::size_t size, u8* data);
 
     void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                  std::size_t size) const;
+                  std::size_t size);
 
     VkBuffer Handle() const {
         return *buffer.handle;
-- 
cgit v1.2.3


From 1d6be9febf7b9613014ec60fc0ec42e40cc073c9 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 26 Jun 2020 19:22:29 -0300
Subject: video_core/compatible_formats: Table to test if two formats are legal
 to view or copy

Add a flat table to test if it's legal to create a texture view between
two formats or copy between them.

This table is based on ARB_copy_image and ARB_texture_view. Copies are
more permissive than views.
---
 src/video_core/CMakeLists.txt         |   2 +
 src/video_core/compatible_formats.cpp | 162 ++++++++++++++++++++++++++++++++++
 src/video_core/compatible_formats.h   |  32 +++++++
 3 files changed, 196 insertions(+)
 create mode 100644 src/video_core/compatible_formats.cpp
 create mode 100644 src/video_core/compatible_formats.h

(limited to 'src/video_core')

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 2dc752aa9..21c46a567 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,8 @@ add_library(video_core STATIC
     buffer_cache/buffer_cache.h
     buffer_cache/map_interval.cpp
     buffer_cache/map_interval.h
+    compatible_formats.cpp
+    compatible_formats.h
     dirty_flags.cpp
     dirty_flags.h
     dma_pusher.cpp
diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp
new file mode 100644
index 000000000..01e5c26ae
--- /dev/null
+++ b/src/video_core/compatible_formats.cpp
@@ -0,0 +1,162 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/compatible_formats.h"
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+namespace {
+
+// Compatibility table taken from Table 3.X.2 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
+
+constexpr std::array VIEW_CLASS_128_BITS = {
+    PixelFormat::RGBA32F,
+    PixelFormat::RGBA32UI,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+
+constexpr std::array VIEW_CLASS_96_BITS = {
+    PixelFormat::RGB32F,
+};
+// Missing formats:
+// PixelFormat::RGB32UI,
+// PixelFormat::RGB32I,
+
+constexpr std::array VIEW_CLASS_64_BITS = {
+    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI, PixelFormat::RG32UI,
+    PixelFormat::RGBA16U, PixelFormat::RGBA16F, PixelFormat::RGBA16S,
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I
+
+// TODO: How should we handle 48 bits?
+
+constexpr std::array VIEW_CLASS_32_BITS = {
+    PixelFormat::RG16F,        PixelFormat::R11FG11FB10F, PixelFormat::R32F,
+    PixelFormat::A2B10G10R10U, PixelFormat::RG16UI,       PixelFormat::R32UI,
+    PixelFormat::RG16I,        PixelFormat::R32I,         PixelFormat::ABGR8U,
+    PixelFormat::RG16,         PixelFormat::ABGR8S,       PixelFormat::RG16S,
+    PixelFormat::RGBA8_SRGB,   PixelFormat::E5B9G9R9F,    PixelFormat::BGRA8,
+    PixelFormat::BGRA8_SRGB,
+};
+// Missing formats:
+// PixelFormat::RGBA8UI
+// PixelFormat::RGBA8I
+// PixelFormat::RGB10_A2_UI
+
+// TODO: How should we handle 24 bits?
+
+constexpr std::array VIEW_CLASS_16_BITS = {
+    PixelFormat::R16F, PixelFormat::RG8UI, PixelFormat::R16UI, PixelFormat::R16I,
+    PixelFormat::RG8U, PixelFormat::R16U,  PixelFormat::RG8S,  PixelFormat::R16S,
+};
+// Missing formats:
+// PixelFormat::RG8I
+
+constexpr std::array VIEW_CLASS_8_BITS = {
+    PixelFormat::R8UI,
+    PixelFormat::R8U,
+};
+// Missing formats:
+// PixelFormat::R8I
+// PixelFormat::R8S
+
+constexpr std::array VIEW_CLASS_RGTC1_RED = {
+    PixelFormat::DXN1,
+};
+// Missing formats:
+// COMPRESSED_SIGNED_RED_RGTC1
+
+constexpr std::array VIEW_CLASS_RGTC2_RG = {
+    PixelFormat::DXN2UNORM,
+    PixelFormat::DXN2SNORM,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_UNORM = {
+    PixelFormat::BC7U,
+    PixelFormat::BC7U_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_FLOAT = {
+    PixelFormat::BC6H_SF16,
+    PixelFormat::BC6H_UF16,
+};
+
+// Compatibility table taken from Table 4.X.1 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
+
+constexpr std::array COPY_CLASS_128_BITS = {
+    PixelFormat::RGBA32UI,   PixelFormat::RGBA32F,   PixelFormat::DXT23,
+    PixelFormat::DXT23_SRGB, PixelFormat::DXT45,     PixelFormat::DXT45_SRGB,
+    PixelFormat::DXN2SNORM,  PixelFormat::BC7U,      PixelFormat::BC7U_SRGB,
+    PixelFormat::BC6H_SF16,  PixelFormat::BC6H_UF16,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+// COMPRESSED_RG_RGTC2
+
+constexpr std::array COPY_CLASS_64_BITS = {
+    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI,  PixelFormat::RG32UI,
+    PixelFormat::RGBA16U, PixelFormat::RGBA16S, PixelFormat::DXT1_SRGB, PixelFormat::DXT1,
+
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I,
+// COMPRESSED_RGB_S3TC_DXT1_EXT
+// COMPRESSED_SRGB_S3TC_DXT1_EXT
+// COMPRESSED_RGBA_S3TC_DXT1_EXT
+// COMPRESSED_SIGNED_RED_RGTC1
+
+void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) {
+    compatiblity[format_a][format_b] = true;
+    compatiblity[format_b][format_a] = true;
+}
+
+void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) {
+    Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
+}
+
+template <typename Range>
+void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
+    for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
+        for (auto it_b = it_a; it_b != range.end(); ++it_b) {
+            Enable(*it_a, *it_b);
+        }
+    }
+}
+
+} // Anonymous namespace
+
+FormatCompatibility::FormatCompatibility() {
+    for (size_t i = 0; i < MaxPixelFormat; ++i) {
+        // Identity is allowed
+        Enable(view, i, i);
+    }
+
+    EnableRange(view, VIEW_CLASS_128_BITS);
+    EnableRange(view, VIEW_CLASS_96_BITS);
+    EnableRange(view, VIEW_CLASS_64_BITS);
+    EnableRange(view, VIEW_CLASS_32_BITS);
+    EnableRange(view, VIEW_CLASS_16_BITS);
+    EnableRange(view, VIEW_CLASS_8_BITS);
+    EnableRange(view, VIEW_CLASS_RGTC1_RED);
+    EnableRange(view, VIEW_CLASS_RGTC2_RG);
+    EnableRange(view, VIEW_CLASS_BPTC_UNORM);
+    EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
+
+    copy = view;
+    EnableRange(copy, COPY_CLASS_128_BITS);
+    EnableRange(copy, COPY_CLASS_64_BITS);
+}
+
+} // namespace VideoCore::Surface
diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h
new file mode 100644
index 000000000..d1082566d
--- /dev/null
+++ b/src/video_core/compatible_formats.h
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+/// Lookup tables telling whether two pixel formats may be used together for views and for copies.
+class FormatCompatibility {
+public:
+    /// Square bit matrix indexed by the numeric values of two PixelFormat enumerators.
+    using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>;
+
+    /// Fills both tables; the definition lives in compatible_formats.cpp.
+    explicit FormatCompatibility();
+
+    /// Returns the view table entry for the pair (format_a, format_b).
+    bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept {
+        return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+    }
+
+    /// Returns the copy table entry for the pair (format_a, format_b).
+    bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept {
+        return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+    }
+
+private:
+    Table view; ///< Pairs accepted by TestView
+    Table copy; ///< Pairs accepted by TestCopy
+};
+
+} // namespace VideoCore::Surface
-- 
cgit v1.2.3


From bb2cbdf7047ed765c236e2da0c04420082d7fd8f Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 26 Jun 2020 19:25:49 -0300
Subject: texture_cache: Test format compatibility before copying

Avoid illegal copies. This intercepts the last step of a copy to avoid
generating validation errors or corrupting the driver on some instances.

We can create views and emit copies accordingly in future commits and
remove this last-step validation.
---
 src/video_core/compatible_formats.cpp        |  2 +-
 src/video_core/texture_cache/texture_cache.h | 25 ++++++++++++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp
index 01e5c26ae..6c426b035 100644
--- a/src/video_core/compatible_formats.cpp
+++ b/src/video_core/compatible_formats.cpp
@@ -130,7 +130,7 @@ template <typename Range>
 void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
     for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
         for (auto it_b = it_a; it_b != range.end(); ++it_b) {
-            Enable(*it_a, *it_b);
+            Enable(compatibility, *it_a, *it_b);
         }
     }
 }
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 85075e868..6207d8dfe 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -24,6 +24,7 @@
 #include "core/core.h"
 #include "core/memory.h"
 #include "core/settings.h"
+#include "video_core/compatible_formats.h"
 #include "video_core/dirty_flags.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -47,8 +48,8 @@ class RasterizerInterface;
 
 namespace VideoCommon {
 
+using VideoCore::Surface::FormatCompatibility;
 using VideoCore::Surface::PixelFormat;
-
 using VideoCore::Surface::SurfaceTarget;
 using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
 
@@ -595,7 +596,7 @@ private:
         } else {
             new_surface = GetUncachedSurface(gpu_addr, params);
         }
-        const auto& final_params = new_surface->GetSurfaceParams();
+        const SurfaceParams& final_params = new_surface->GetSurfaceParams();
         if (cr_params.type != final_params.type) {
             if (Settings::IsGPULevelExtreme()) {
                 BufferCopy(current_surface, new_surface);
@@ -603,7 +604,7 @@ private:
         } else {
             std::vector<CopyParams> bricks = current_surface->BreakDown(final_params);
             for (auto& brick : bricks) {
-                ImageCopy(current_surface, new_surface, brick);
+                TryCopyImage(current_surface, new_surface, brick);
             }
         }
         Unregister(current_surface);
@@ -694,7 +695,7 @@ private:
                 }
                 const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
                                              src_params.depth);
-                ImageCopy(surface, new_surface, copy_params);
+                TryCopyImage(surface, new_surface, copy_params);
             }
         }
         if (passed_tests == 0) {
@@ -791,7 +792,7 @@ private:
             const u32 width = params.width;
             const u32 height = params.height;
             const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1);
-            ImageCopy(surface, new_surface, copy_params);
+            TryCopyImage(surface, new_surface, copy_params);
         }
         for (const auto& surface : overlaps) {
             Unregister(surface);
@@ -1192,6 +1193,19 @@ private:
         return {};
     }
 
+    /// Runs ImageCopy(src, dst, copy) unless TestCopy rejects the format pair; then it only logs.
+    void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) {
+        const SurfaceParams& src_params = src->GetSurfaceParams();
+        const SurfaceParams& dst_params = dst->GetSurfaceParams();
+        if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) {
+            LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}",
+                      static_cast<int>(dst_params.pixel_format),  // Logged first: destination
+                      static_cast<int>(src_params.pixel_format)); // Logged second: source
+            return; // The copy is skipped; nothing else is attempted here
+        }
+        ImageCopy(src, dst, copy);
+    }
+
     constexpr PixelFormat GetSiblingFormat(PixelFormat format) const {
         return siblings_table[static_cast<std::size_t>(format)];
     }
@@ -1241,6 +1255,7 @@ private:
     VideoCore::RasterizerInterface& rasterizer;
 
     FormatLookupTable format_lookup_table;
+    FormatCompatibility format_compatibility;
 
     u64 ticks{};
 
-- 
cgit v1.2.3


From e31425df3877636c098ec7426ebd2067920715cb Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Mon, 24 Feb 2020 22:04:12 -0400
Subject: General: Recover Prometheus project from harddrive failure

This commit: Implements CPU Interrupts, Replaces Cycle Timing for Host
Timing, Reworks the Kernel's Scheduler, Introduce Idle State and
Suspended State, Recreates the bootmanager, Initializes Multicore
system.
---
 src/common/thread.cpp                              |   6 +
 src/core/CMakeLists.txt                            |   4 +-
 src/core/arm/arm_interface.h                       |   5 +-
 src/core/arm/cpu_interrupt_handler.cpp             |  29 ++
 src/core/arm/cpu_interrupt_handler.h               |  39 ++
 src/core/arm/dynarmic/arm_dynarmic_32.cpp          |   6 +-
 src/core/arm/dynarmic/arm_dynarmic_32.h            |   4 +-
 src/core/arm/dynarmic/arm_dynarmic_64.cpp          |  28 +-
 src/core/arm/dynarmic/arm_dynarmic_64.h            |   4 +-
 src/core/arm/unicorn/arm_unicorn.cpp               |  14 +-
 src/core/arm/unicorn/arm_unicorn.h                 |   3 +-
 src/core/core.cpp                                  |  57 +--
 src/core/core.h                                    |  34 +-
 src/core/core_manager.cpp                          |   4 +-
 src/core/core_timing.cpp                           | 208 +++++------
 src/core/core_timing.h                             | 108 +++---
 src/core/cpu_manager.cpp                           | 194 ++++++++--
 src/core/cpu_manager.h                             |  49 ++-
 src/core/hle/kernel/kernel.cpp                     |  84 ++++-
 src/core/hle/kernel/kernel.h                       |  19 +
 src/core/hle/kernel/physical_core.cpp              |  37 +-
 src/core/hle/kernel/physical_core.h                |  21 ++
 src/core/hle/kernel/process.cpp                    |  17 +-
 src/core/hle/kernel/scheduler.cpp                  | 415 +++++++++++++++------
 src/core/hle/kernel/scheduler.h                    |  94 +++--
 src/core/hle/kernel/svc.cpp                        |  21 +-
 src/core/hle/kernel/thread.cpp                     | 232 +++++-------
 src/core/hle/kernel/thread.h                       |  81 +++-
 src/core/hle/kernel/time_manager.cpp               |   2 +-
 src/core/hle/service/hid/controllers/debug_pad.cpp |   2 +-
 src/core/hle/service/hid/controllers/gesture.cpp   |   2 +-
 src/core/hle/service/hid/controllers/keyboard.cpp  |   2 +-
 src/core/hle/service/hid/controllers/mouse.cpp     |   2 +-
 src/core/hle/service/hid/controllers/npad.cpp      |   2 +-
 src/core/hle/service/hid/controllers/stubbed.cpp   |   2 +-
 .../hle/service/hid/controllers/touchscreen.cpp    |   4 +-
 src/core/hle/service/hid/controllers/xpad.cpp      |   2 +-
 src/core/hle/service/hid/hid.cpp                   |  16 +-
 src/core/hle/service/hid/irs.cpp                   |   2 +-
 .../hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp  |   3 +-
 src/core/hle/service/nvflinger/nvflinger.cpp       |  13 +-
 .../service/time/standard_steady_clock_core.cpp    |   5 +-
 .../service/time/tick_based_steady_clock_core.cpp  |   5 +-
 src/core/hle/service/time/time.cpp                 |   5 +-
 src/core/hle/service/time/time_sharedmemory.cpp    |   3 +-
 src/core/memory.cpp                                |  11 +-
 src/core/memory.h                                  |   2 +-
 src/core/memory/cheat_engine.cpp                   |   8 +-
 src/core/tools/freezer.cpp                         |   8 +-
 src/tests/CMakeLists.txt                           |   1 -
 src/tests/core/core_timing.cpp                     | 184 +++++----
 src/video_core/gpu.cpp                             |   5 +-
 src/yuzu/bootmanager.cpp                           |  32 +-
 src/yuzu/bootmanager.h                             |   7 +
 src/yuzu/debugger/wait_tree.cpp                    |   6 +-
 src/yuzu_cmd/yuzu.cpp                              |   2 +-
 src/yuzu_tester/yuzu.cpp                           |   2 +-
 57 files changed, 1341 insertions(+), 816 deletions(-)
 create mode 100644 src/core/arm/cpu_interrupt_handler.cpp
 create mode 100644 src/core/arm/cpu_interrupt_handler.h

(limited to 'src/video_core')

diff --git a/src/common/thread.cpp b/src/common/thread.cpp
index 0cd2d10bf..c9684aed9 100644
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -70,6 +70,12 @@ void SetCurrentThreadName(const char* name) {
 }
 #endif
 
+#if defined(_WIN32)
+void SetCurrentThreadName(const char* name) {
+    // Intentionally a no-op on MinGW: `name` is ignored
+}
+#endif
+
 #endif
 
 } // namespace Common
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index efbad628f..552094ddb 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -7,6 +7,8 @@ endif()
 add_library(core STATIC
     arm/arm_interface.h
     arm/arm_interface.cpp
+    arm/cpu_interrupt_handler.cpp
+    arm/cpu_interrupt_handler.h
     arm/exclusive_monitor.cpp
     arm/exclusive_monitor.h
     arm/unicorn/arm_unicorn.cpp
@@ -547,8 +549,6 @@ add_library(core STATIC
     hle/service/vi/vi_u.h
     hle/service/wlan/wlan.cpp
     hle/service/wlan/wlan.h
-    host_timing.cpp
-    host_timing.h
     loader/deconstructed_rom_directory.cpp
     loader/deconstructed_rom_directory.h
     loader/elf.cpp
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h
index cb2e640e2..87a1c29cc 100644
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -18,11 +18,13 @@ enum class VMAPermission : u8;
 
 namespace Core {
 class System;
+class CPUInterruptHandler;
 
 /// Generic ARMv8 CPU interface
 class ARM_Interface : NonCopyable {
 public:
-    explicit ARM_Interface(System& system_) : system{system_} {}
+    explicit ARM_Interface(System& system_, CPUInterruptHandler& interrupt_handler)
+        : system{system_}, interrupt_handler{interrupt_handler} {}
     virtual ~ARM_Interface() = default;
 
     struct ThreadContext32 {
@@ -175,6 +177,7 @@ public:
 protected:
     /// System context that this ARM interface is running under.
     System& system;
+    CPUInterruptHandler& interrupt_handler;
 };
 
 } // namespace Core
diff --git a/src/core/arm/cpu_interrupt_handler.cpp b/src/core/arm/cpu_interrupt_handler.cpp
new file mode 100644
index 000000000..2f1a1a269
--- /dev/null
+++ b/src/core/arm/cpu_interrupt_handler.cpp
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/thread.h"
+#include "core/arm/cpu_interrupt_handler.h"
+
+namespace Core {
+
+/// Starts in the non-interrupted state (in-class default) and allocates the interrupt event.
+CPUInterruptHandler::CPUInterruptHandler() : interrupt_event{std::make_unique<Common::Event>()} {}
+
+CPUInterruptHandler::~CPUInterruptHandler() = default;
+
+/// Stores the new interrupt state; raising it also signals interrupt_event (see AwaitInterrupt).
+void CPUInterruptHandler::SetInterrupt(bool is_interrupted_) {
+    // Update the flag before signalling, otherwise a woken waiter could still read the old value.
+    is_interrupted = is_interrupted_;
+    if (is_interrupted_) {
+        interrupt_event->Set();
+    }
+}
+
+/// Waits on interrupt_event, the event that SetInterrupt(true) signals.
+void CPUInterruptHandler::AwaitInterrupt() {
+    interrupt_event->Wait();
+}
+
+} // namespace Core
diff --git a/src/core/arm/cpu_interrupt_handler.h b/src/core/arm/cpu_interrupt_handler.h
new file mode 100644
index 000000000..91c31a271
--- /dev/null
+++ b/src/core/arm/cpu_interrupt_handler.h
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+
+namespace Common {
+class Event;
+}
+
+namespace Core {
+/// Holds an interrupt flag together with the event used to wait for an interrupt.
+class CPUInterruptHandler {
+public:
+    CPUInterruptHandler();
+    ~CPUInterruptHandler();
+    // Copying is disabled
+    CPUInterruptHandler(const CPUInterruptHandler&) = delete;
+    CPUInterruptHandler& operator=(const CPUInterruptHandler&) = delete;
+    // Defaulted moves hand the event over; the moved-from handler is left without one
+    CPUInterruptHandler(CPUInterruptHandler&&) = default;
+    CPUInterruptHandler& operator=(CPUInterruptHandler&&) = default;
+    /// Returns the state last stored by SetInterrupt (false if it was never called).
+    constexpr bool IsInterrupted() const {
+        return is_interrupted;
+    }
+    /// Stores is_interrupted as the new state; defined in cpu_interrupt_handler.cpp.
+    void SetInterrupt(bool is_interrupted);
+    /// Waits on the interrupt event; defined in cpu_interrupt_handler.cpp.
+    void AwaitInterrupt();
+
+private:
+    bool is_interrupted{}; // Not atomic. NOTE(review): confirm it is never accessed concurrently
+    std::unique_ptr<Common::Event> interrupt_event; // Signalled and waited on in the .cpp file
+};
+
+} // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index 4c8663d03..0b7aa6a69 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -114,9 +114,9 @@ void ARM_Dynarmic_32::Step() {
     jit->Step();
 }
 
-ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor,
-                                 std::size_t core_index)
-    : ARM_Interface{system}, cb(std::make_unique<DynarmicCallbacks32>(*this)),
+ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, CPUInterruptHandler& interrupt_handler,
+                                 ExclusiveMonitor& exclusive_monitor, std::size_t core_index)
+    : ARM_Interface{system, interrupt_handler}, cb(std::make_unique<DynarmicCallbacks32>(*this)),
       cp15(std::make_shared<DynarmicCP15>(*this)), core_index{core_index},
       exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
 
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.h b/src/core/arm/dynarmic/arm_dynarmic_32.h
index e5b92d7bb..1e7e17e64 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -21,6 +21,7 @@ class Memory;
 
 namespace Core {
 
+class CPUInterruptHandler;
 class DynarmicCallbacks32;
 class DynarmicCP15;
 class DynarmicExclusiveMonitor;
@@ -28,7 +29,8 @@ class System;
 
 class ARM_Dynarmic_32 final : public ARM_Interface {
 public:
-    ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ARM_Dynarmic_32(System& system, CPUInterruptHandler& interrupt_handler,
+                    ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
     ~ARM_Dynarmic_32() override;
 
     void SetPC(u64 pc) override;
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index 5f5e36d94..5e316ffd4 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -9,6 +9,7 @@
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "common/page_table.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/dynarmic/arm_dynarmic_64.h"
 #include "core/core.h"
 #include "core/core_manager.h"
@@ -108,23 +109,16 @@ public:
     }
 
     void AddTicks(u64 ticks) override {
-        // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
-        // rough approximation of the amount of executed ticks in the system, it may be thrown off
-        // if not all cores are doing a similar amount of work. Instead of doing this, we should
-        // device a way so that timing is consistent across all cores without increasing the ticks 4
-        // times.
-        u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES;
-        // Always execute at least one tick.
-        amortized_ticks = std::max<u64>(amortized_ticks, 1);
-
-        parent.system.CoreTiming().AddTicks(amortized_ticks);
-        num_interpreted_instructions = 0;
+        /// We are using host timing, NOP
     }
     u64 GetTicksRemaining() override {
-        return std::max(parent.system.CoreTiming().GetDowncount(), s64{0});
+        if (!parent.interrupt_handler.IsInterrupted()) {
+            return 1000ULL;
+        }
+        return 0ULL;
     }
     u64 GetCNTPCT() override {
-        return Timing::CpuCyclesToClockCycles(parent.system.CoreTiming().GetTicks());
+        return parent.system.CoreTiming().GetClockTicks();
     }
 
     ARM_Dynarmic_64& parent;
@@ -183,10 +177,10 @@ void ARM_Dynarmic_64::Step() {
     cb->InterpreterFallback(jit->GetPC(), 1);
 }
 
-ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor,
-                                 std::size_t core_index)
-    : ARM_Interface{system}, cb(std::make_unique<DynarmicCallbacks64>(*this)),
-      inner_unicorn{system, ARM_Unicorn::Arch::AArch64}, core_index{core_index},
+ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, CPUInterruptHandler& interrupt_handler,
+                                 ExclusiveMonitor& exclusive_monitor, std::size_t core_index)
+    : ARM_Interface{system, interrupt_handler}, cb(std::make_unique<DynarmicCallbacks64>(*this)),
+      inner_unicorn{system, interrupt_handler, ARM_Unicorn::Arch::AArch64}, core_index{core_index},
       exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
 
 ARM_Dynarmic_64::~ARM_Dynarmic_64() = default;
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.h b/src/core/arm/dynarmic/arm_dynarmic_64.h
index 647cecaf0..9e94b58c2 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.h
@@ -22,12 +22,14 @@ class Memory;
 namespace Core {
 
 class DynarmicCallbacks64;
+class CPUInterruptHandler;
 class DynarmicExclusiveMonitor;
 class System;
 
 class ARM_Dynarmic_64 final : public ARM_Interface {
 public:
-    ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ARM_Dynarmic_64(System& system, CPUInterruptHandler& interrupt_handler,
+                    ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
     ~ARM_Dynarmic_64() override;
 
     void SetPC(u64 pc) override;
diff --git a/src/core/arm/unicorn/arm_unicorn.cpp b/src/core/arm/unicorn/arm_unicorn.cpp
index e40e9626a..0393fe641 100644
--- a/src/core/arm/unicorn/arm_unicorn.cpp
+++ b/src/core/arm/unicorn/arm_unicorn.cpp
@@ -6,6 +6,7 @@
 #include <unicorn/arm64.h>
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
 #include "core/core_timing.h"
@@ -62,7 +63,8 @@ static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int si
     return false;
 }
 
-ARM_Unicorn::ARM_Unicorn(System& system, Arch architecture) : ARM_Interface{system} {
+ARM_Unicorn::ARM_Unicorn(System& system, CPUInterruptHandler& interrupt_handler, Arch architecture)
+    : ARM_Interface{system, interrupt_handler} {
     const auto arch = architecture == Arch::AArch32 ? UC_ARCH_ARM : UC_ARCH_ARM64;
     CHECKED(uc_open(arch, UC_MODE_ARM, &uc));
 
@@ -160,8 +162,12 @@ void ARM_Unicorn::Run() {
     if (GDBStub::IsServerEnabled()) {
         ExecuteInstructions(std::max(4000000U, 0U));
     } else {
-        ExecuteInstructions(
-            std::max(std::size_t(system.CoreTiming().GetDowncount()), std::size_t{0}));
+        while (true) {
+            if (interrupt_handler.IsInterrupted()) {
+                return;
+            }
+            ExecuteInstructions(10);
+        }
     }
 }
 
@@ -183,8 +189,6 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) {
                            UC_PROT_READ | UC_PROT_WRITE | UC_PROT_EXEC, page_buffer.data()));
     CHECKED(uc_emu_start(uc, GetPC(), 1ULL << 63, 0, num_instructions));
     CHECKED(uc_mem_unmap(uc, map_addr, page_buffer.size()));
-
-    system.CoreTiming().AddTicks(num_instructions);
     if (GDBStub::IsServerEnabled()) {
         if (last_bkpt_hit && last_bkpt.type == GDBStub::BreakpointType::Execute) {
             uc_reg_write(uc, UC_ARM64_REG_PC, &last_bkpt.address);
diff --git a/src/core/arm/unicorn/arm_unicorn.h b/src/core/arm/unicorn/arm_unicorn.h
index 725c65085..0a4c087cd 100644
--- a/src/core/arm/unicorn/arm_unicorn.h
+++ b/src/core/arm/unicorn/arm_unicorn.h
@@ -11,6 +11,7 @@
 
 namespace Core {
 
+class CPUInterruptHandler;
 class System;
 
 class ARM_Unicorn final : public ARM_Interface {
@@ -20,7 +21,7 @@ public:
         AArch64, // 64-bit ARM
     };
 
-    explicit ARM_Unicorn(System& system, Arch architecture);
+    explicit ARM_Unicorn(System& system, CPUInterruptHandler& interrupt_handler, Arch architecture);
     ~ARM_Unicorn() override;
 
     void SetPC(u64 pc) override;
diff --git a/src/core/core.cpp b/src/core/core.cpp
index f9f8a3000..e8936b09d 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -11,7 +11,6 @@
 #include "common/string_util.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/cpu_manager.h"
 #include "core/device_memory.h"
@@ -117,23 +116,30 @@ struct System::Impl {
         : kernel{system}, fs_controller{system}, memory{system},
           cpu_manager{system}, reporter{system}, applet_manager{system} {}
 
-    CoreManager& CurrentCoreManager() {
-        return cpu_manager.GetCurrentCoreManager();
-    }
-
     Kernel::PhysicalCore& CurrentPhysicalCore() {
-        const auto index = cpu_manager.GetActiveCoreIndex();
-        return kernel.PhysicalCore(index);
+        return kernel.CurrentPhysicalCore();
     }
 
     Kernel::PhysicalCore& GetPhysicalCore(std::size_t index) {
         return kernel.PhysicalCore(index);
     }
 
-    ResultStatus RunLoop(bool tight_loop) {
+    ResultStatus Run() {
         status = ResultStatus::Success;
 
-        cpu_manager.RunLoop(tight_loop);
+        kernel.Suspend(false);
+        core_timing.SyncPause(false);
+        cpu_manager.Pause(false);
+
+        return status;
+    }
+
+    ResultStatus Pause() {
+        status = ResultStatus::Success;
+
+        kernel.Suspend(true);
+        core_timing.SyncPause(true);
+        cpu_manager.Pause(true);
 
         return status;
     }
@@ -143,7 +149,7 @@ struct System::Impl {
 
         device_memory = std::make_unique<Core::DeviceMemory>(system);
 
-        core_timing.Initialize();
+        core_timing.Initialize([&system]() { system.RegisterHostThread(); });
         kernel.Initialize();
         cpu_manager.Initialize();
 
@@ -387,20 +393,24 @@ struct System::Impl {
 System::System() : impl{std::make_unique<Impl>(*this)} {}
 System::~System() = default;
 
-CoreManager& System::CurrentCoreManager() {
-    return impl->CurrentCoreManager();
+CpuManager& System::GetCpuManager() {
+    return impl->cpu_manager;
+}
+
+const CpuManager& System::GetCpuManager() const {
+    return impl->cpu_manager;
 }
 
-const CoreManager& System::CurrentCoreManager() const {
-    return impl->CurrentCoreManager();
+System::ResultStatus System::Run() {
+    return impl->Run();
 }
 
-System::ResultStatus System::RunLoop(bool tight_loop) {
-    return impl->RunLoop(tight_loop);
+System::ResultStatus System::Pause() {
+    return impl->Pause();
 }
 
 System::ResultStatus System::SingleStep() {
-    return RunLoop(false);
+    return ResultStatus::Success;
 }
 
 void System::InvalidateCpuInstructionCaches() {
@@ -444,7 +454,9 @@ const ARM_Interface& System::CurrentArmInterface() const {
 }
 
 std::size_t System::CurrentCoreIndex() const {
-    return impl->cpu_manager.GetActiveCoreIndex();
+    std::size_t core = impl->kernel.GetCurrentHostThreadID();
+    ASSERT(core < Core::Hardware::NUM_CPU_CORES);
+    return core;
 }
 
 Kernel::Scheduler& System::CurrentScheduler() {
@@ -497,15 +509,6 @@ const ARM_Interface& System::ArmInterface(std::size_t core_index) const {
     return impl->GetPhysicalCore(core_index).ArmInterface();
 }
 
-CoreManager& System::GetCoreManager(std::size_t core_index) {
-    return impl->cpu_manager.GetCoreManager(core_index);
-}
-
-const CoreManager& System::GetCoreManager(std::size_t core_index) const {
-    ASSERT(core_index < NUM_CPU_CORES);
-    return impl->cpu_manager.GetCoreManager(core_index);
-}
-
 ExclusiveMonitor& System::Monitor() {
     return impl->kernel.GetExclusiveMonitor();
 }
diff --git a/src/core/core.h b/src/core/core.h
index acc53d6a1..7f170fc54 100644
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -90,7 +90,7 @@ class InterruptManager;
 namespace Core {
 
 class ARM_Interface;
-class CoreManager;
+class CpuManager;
 class DeviceMemory;
 class ExclusiveMonitor;
 class FrameLimiter;
@@ -136,16 +136,18 @@ public:
     };
 
     /**
-     * Run the core CPU loop
-     * This function runs the core for the specified number of CPU instructions before trying to
-     * update hardware. This is much faster than SingleStep (and should be equivalent), as the CPU
-     * is not required to do a full dispatch with each instruction. NOTE: the number of instructions
-     * requested is not guaranteed to run, as this will be interrupted preemptively if a hardware
-     * update is requested (e.g. on a thread switch).
-     * @param tight_loop If false, the CPU single-steps.
-     * @return Result status, indicating whether or not the operation succeeded.
+     * Run the OS and Application
+     * This function will start emulation and run the relevant devices
+     */
+    ResultStatus Run();
+
+    /**
+     * Pause the OS and Application
+     * This function will pause emulation and stop the relevant devices
      */
-    ResultStatus RunLoop(bool tight_loop = true);
+    ResultStatus Pause();
+
+
 
     /**
      * Step the CPU one instruction
@@ -215,11 +217,9 @@ public:
     /// Gets a const reference to an ARM interface from the CPU core with the specified index
     const ARM_Interface& ArmInterface(std::size_t core_index) const;
 
-    /// Gets a CPU interface to the CPU core with the specified index
-    CoreManager& GetCoreManager(std::size_t core_index);
+    CpuManager& GetCpuManager();
 
-    /// Gets a CPU interface to the CPU core with the specified index
-    const CoreManager& GetCoreManager(std::size_t core_index) const;
+    const CpuManager& GetCpuManager() const;
 
     /// Gets a reference to the exclusive monitor
     ExclusiveMonitor& Monitor();
@@ -373,12 +373,6 @@ public:
 private:
     System();
 
-    /// Returns the currently running CPU core
-    CoreManager& CurrentCoreManager();
-
-    /// Returns the currently running CPU core
-    const CoreManager& CurrentCoreManager() const;
-
     /**
      * Initialize the emulated system.
      * @param emu_window Reference to the host-system window used for video output and keyboard
diff --git a/src/core/core_manager.cpp b/src/core/core_manager.cpp
index b6b797c80..45f0bb547 100644
--- a/src/core/core_manager.cpp
+++ b/src/core/core_manager.cpp
@@ -34,7 +34,6 @@ void CoreManager::RunLoop(bool tight_loop) {
     // instead advance to the next event and try to yield to the next thread
     if (Kernel::GetCurrentThread() == nullptr) {
         LOG_TRACE(Core, "Core-{} idling", core_index);
-        core_timing.Idle();
     } else {
         if (tight_loop) {
             physical_core.Run();
@@ -42,7 +41,6 @@ void CoreManager::RunLoop(bool tight_loop) {
             physical_core.Step();
         }
     }
-    core_timing.Advance();
 
     Reschedule();
 }
@@ -59,7 +57,7 @@ void CoreManager::Reschedule() {
     // Lock the global kernel mutex when we manipulate the HLE state
     std::lock_guard lock(HLE::g_hle_lock);
 
-    global_scheduler.SelectThread(core_index);
+    // global_scheduler.SelectThread(core_index);
 
     physical_core.Scheduler().TryDoContextSwitch();
 }
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index 46d4178c4..a3ce69790 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -1,5 +1,5 @@
-// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project
-// Licensed under GPLv2+
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #include "core/core_timing.h"
@@ -10,20 +10,16 @@
 #include <tuple>
 
 #include "common/assert.h"
-#include "common/thread.h"
 #include "core/core_timing_util.h"
-#include "core/hardware_properties.h"
 
 namespace Core::Timing {
 
-constexpr int MAX_SLICE_LENGTH = 10000;
-
 std::shared_ptr<EventType> CreateEvent(std::string name, TimedCallback&& callback) {
     return std::make_shared<EventType>(std::move(callback), std::move(name));
 }
 
 struct CoreTiming::Event {
-    s64 time;
+    u64 time;
     u64 fifo_order;
     u64 userdata;
     std::weak_ptr<EventType> type;
@@ -39,51 +35,74 @@ struct CoreTiming::Event {
     }
 };
 
-CoreTiming::CoreTiming() = default;
-CoreTiming::~CoreTiming() = default;
+CoreTiming::CoreTiming() {
+    clock =
+        Common::CreateBestMatchingClock(Core::Hardware::BASE_CLOCK_RATE, Core::Hardware::CNTFREQ);
+}
 
-void CoreTiming::Initialize() {
-    downcounts.fill(MAX_SLICE_LENGTH);
-    time_slice.fill(MAX_SLICE_LENGTH);
-    slice_length = MAX_SLICE_LENGTH;
-    global_timer = 0;
-    idled_cycles = 0;
-    current_context = 0;
+CoreTiming::~CoreTiming() = default;
 
-    // The time between CoreTiming being initialized and the first call to Advance() is considered
-    // the slice boundary between slice -1 and slice 0. Dispatcher loops must call Advance() before
-    // executing the first cycle of each slice to prepare the slice length and downcount for
-    // that slice.
-    is_global_timer_sane = true;
+void CoreTiming::ThreadEntry(CoreTiming& instance) {
+    std::string name = "yuzu:HostTiming";
+    Common::SetCurrentThreadName(name.c_str());
+    instance.on_thread_init();
+    instance.ThreadLoop();
+}
 
+void CoreTiming::Initialize(std::function<void(void)>&& on_thread_init_) {
+    on_thread_init = std::move(on_thread_init_);
     event_fifo_id = 0;
-
     const auto empty_timed_callback = [](u64, s64) {};
     ev_lost = CreateEvent("_lost_event", empty_timed_callback);
+    timer_thread = std::make_unique<std::thread>(ThreadEntry, std::ref(*this));
 }
 
 void CoreTiming::Shutdown() {
+    paused = true;
+    shutting_down = true;
+    event.Set();
+    timer_thread->join();
     ClearPendingEvents();
+    timer_thread.reset();
+    has_started = false;
 }
 
-void CoreTiming::ScheduleEvent(s64 cycles_into_future, const std::shared_ptr<EventType>& event_type,
-                               u64 userdata) {
-    std::lock_guard guard{inner_mutex};
-    const s64 timeout = GetTicks() + cycles_into_future;
+void CoreTiming::Pause(bool is_paused) {
+    paused = is_paused;
+}
 
-    // If this event needs to be scheduled before the next advance(), force one early
-    if (!is_global_timer_sane) {
-        ForceExceptionCheck(cycles_into_future);
+void CoreTiming::SyncPause(bool is_paused) { // Pause(is_paused), then wait for paused_set to match
+    if (is_paused == paused && paused_set == paused) {
+        return; // Requested state is already set and already reflected in paused_set
     }
+    Pause(is_paused);
+    event.Set();
+    while (paused_set != is_paused) // Busy-wait; paused_set is written outside this function
+        ; // NOTE(review): only terminates if writes to paused_set become visible here (atomic?)
+}
+
+bool CoreTiming::IsRunning() const {
+    return !paused_set;
+}
+
+bool CoreTiming::HasPendingEvents() const {
+    return !(wait_set && event_queue.empty());
+}
+
+void CoreTiming::ScheduleEvent(s64 ns_into_future, const std::shared_ptr<EventType>& event_type,
+                               u64 userdata) {
+    basic_lock.lock();
+    const u64 timeout = static_cast<u64>(GetGlobalTimeNs().count() + ns_into_future);
 
     event_queue.emplace_back(Event{timeout, event_fifo_id++, userdata, event_type});
 
     std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
+    basic_lock.unlock();
+    event.Set();
 }
 
 void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u64 userdata) {
-    std::lock_guard guard{inner_mutex};
-
+    basic_lock.lock();
     const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
         return e.type.lock().get() == event_type.get() && e.userdata == userdata;
     });
@@ -93,23 +112,23 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u
         event_queue.erase(itr, event_queue.end());
         std::make_heap(event_queue.begin(), event_queue.end(), std::greater<>());
     }
+    basic_lock.unlock();
 }
 
-u64 CoreTiming::GetTicks() const {
-    u64 ticks = static_cast<u64>(global_timer);
-    if (!is_global_timer_sane) {
-        ticks += accumulated_ticks;
-    }
-    return ticks;
+void CoreTiming::AddTicks(std::size_t core_index, u64 ticks) {
+    ticks_count[core_index] += ticks;
+}
+
+void CoreTiming::ResetTicks(std::size_t core_index) {
+    ticks_count[core_index] = 0;
 }
 
-u64 CoreTiming::GetIdleTicks() const {
-    return static_cast<u64>(idled_cycles);
+u64 CoreTiming::GetCPUTicks() const {
+    return clock->GetCPUCycles();
 }
 
-void CoreTiming::AddTicks(u64 ticks) {
-    accumulated_ticks += ticks;
-    downcounts[current_context] -= static_cast<s64>(ticks);
+u64 CoreTiming::GetClockTicks() const {
+    return clock->GetClockCycles();
 }
 
 void CoreTiming::ClearPendingEvents() {
@@ -117,7 +136,7 @@ void CoreTiming::ClearPendingEvents() {
 }
 
 void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
-    std::lock_guard guard{inner_mutex};
+    basic_lock.lock();
 
     const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
         return e.type.lock().get() == event_type.get();
@@ -128,99 +147,64 @@ void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
         event_queue.erase(itr, event_queue.end());
         std::make_heap(event_queue.begin(), event_queue.end(), std::greater<>());
     }
+    basic_lock.unlock();
 }
 
-void CoreTiming::ForceExceptionCheck(s64 cycles) {
-    cycles = std::max<s64>(0, cycles);
-    if (downcounts[current_context] <= cycles) {
-        return;
-    }
-
-    // downcount is always (much) smaller than MAX_INT so we can safely cast cycles to an int
-    // here. Account for cycles already executed by adjusting the g.slice_length
-    downcounts[current_context] = static_cast<int>(cycles);
-}
-
-std::optional<u64> CoreTiming::NextAvailableCore(const s64 needed_ticks) const {
-    const u64 original_context = current_context;
-    u64 next_context = (original_context + 1) % num_cpu_cores;
-    while (next_context != original_context) {
-        if (time_slice[next_context] >= needed_ticks) {
-            return {next_context};
-        } else if (time_slice[next_context] >= 0) {
-            return std::nullopt;
-        }
-        next_context = (next_context + 1) % num_cpu_cores;
-    }
-    return std::nullopt;
-}
-
-void CoreTiming::Advance() {
-    std::unique_lock<std::mutex> guard(inner_mutex);
-
-    const u64 cycles_executed = accumulated_ticks;
-    time_slice[current_context] = std::max<s64>(0, time_slice[current_context] - accumulated_ticks);
-    global_timer += cycles_executed;
-
-    is_global_timer_sane = true;
+std::optional<u64> CoreTiming::Advance() {
+    advance_lock.lock();
+    basic_lock.lock();
+    global_timer = GetGlobalTimeNs().count();
 
     while (!event_queue.empty() && event_queue.front().time <= global_timer) {
         Event evt = std::move(event_queue.front());
         std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>());
         event_queue.pop_back();
-        inner_mutex.unlock();
+        basic_lock.unlock();
 
         if (auto event_type{evt.type.lock()}) {
             event_type->callback(evt.userdata, global_timer - evt.time);
         }
 
-        inner_mutex.lock();
+        basic_lock.lock();
     }
 
-    is_global_timer_sane = false;
-
-    // Still events left (scheduled in the future)
     if (!event_queue.empty()) {
-        const s64 needed_ticks =
-            std::min<s64>(event_queue.front().time - global_timer, MAX_SLICE_LENGTH);
-        const auto next_core = NextAvailableCore(needed_ticks);
-        if (next_core) {
-            downcounts[*next_core] = needed_ticks;
-        }
+        const u64 next_time = event_queue.front().time - global_timer;
+        basic_lock.unlock();
+        advance_lock.unlock();
+        return next_time;
+    } else {
+        basic_lock.unlock();
+        advance_lock.unlock();
+        return std::nullopt;
     }
-
-    accumulated_ticks = 0;
-
-    downcounts[current_context] = time_slice[current_context];
 }
 
-void CoreTiming::ResetRun() {
-    downcounts.fill(MAX_SLICE_LENGTH);
-    time_slice.fill(MAX_SLICE_LENGTH);
-    current_context = 0;
-    // Still events left (scheduled in the future)
-    if (!event_queue.empty()) {
-        const s64 needed_ticks =
-            std::min<s64>(event_queue.front().time - global_timer, MAX_SLICE_LENGTH);
-        downcounts[current_context] = needed_ticks;
+void CoreTiming::ThreadLoop() {
+    has_started = true;
+    while (!shutting_down) {
+        while (!paused) {
+            paused_set = false;
+            const auto next_time = Advance();
+            if (next_time) {
+                std::chrono::nanoseconds next_time_ns = std::chrono::nanoseconds(*next_time);
+                event.WaitFor(next_time_ns);
+            } else {
+                wait_set = true;
+                event.Wait();
+            }
+            wait_set = false;
+        }
+        paused_set = true;
     }
-
-    is_global_timer_sane = false;
-    accumulated_ticks = 0;
 }
 
-void CoreTiming::Idle() {
-    accumulated_ticks += downcounts[current_context];
-    idled_cycles += downcounts[current_context];
-    downcounts[current_context] = 0;
+std::chrono::nanoseconds CoreTiming::GetGlobalTimeNs() const {
+    return clock->GetTimeNS();
 }
 
 std::chrono::microseconds CoreTiming::GetGlobalTimeUs() const {
-    return std::chrono::microseconds{GetTicks() * 1000000 / Hardware::BASE_CLOCK_RATE};
-}
-
-s64 CoreTiming::GetDowncount() const {
-    return downcounts[current_context];
+    return clock->GetTimeUS();
 }
 
 } // namespace Core::Timing
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index d50f4eb8a..707c8ef0c 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -1,19 +1,25 @@
-// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project
-// Licensed under GPLv2+
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
 
+#include <atomic>
 #include <chrono>
 #include <functional>
 #include <memory>
 #include <mutex>
 #include <optional>
 #include <string>
+#include <thread>
 #include <vector>
 
 #include "common/common_types.h"
+#include "common/spin_lock.h"
+#include "common/thread.h"
 #include "common/threadsafe_queue.h"
+#include "common/wall_clock.h"
+#include "core/hardware_properties.h"
 
 namespace Core::Timing {
 
@@ -56,58 +62,55 @@ public:
 
     /// CoreTiming begins at the boundary of timing slice -1. An initial call to Advance() is
     /// required to end slice - 1 and start slice 0 before the first cycle of code is executed.
-    void Initialize();
+    void Initialize(std::function<void(void)>&& on_thread_init_);
 
     /// Tears down all timing related functionality.
     void Shutdown();
 
-    /// After the first Advance, the slice lengths and the downcount will be reduced whenever an
-    /// event is scheduled earlier than the current values.
-    ///
-    /// Scheduling from a callback will not update the downcount until the Advance() completes.
-    void ScheduleEvent(s64 cycles_into_future, const std::shared_ptr<EventType>& event_type,
-                       u64 userdata = 0);
+    /// Pauses/Unpauses the execution of the timer thread.
+    void Pause(bool is_paused);
 
-    void UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u64 userdata);
+    /// Pauses/Unpauses the execution of the timer thread and waits until the change takes effect.
+    void SyncPause(bool is_paused);
 
-    /// We only permit one event of each type in the queue at a time.
-    void RemoveEvent(const std::shared_ptr<EventType>& event_type);
+    /// Checks if core timing is running.
+    bool IsRunning() const;
 
-    void ForceExceptionCheck(s64 cycles);
+    /// Checks if the timer thread has started.
+    bool HasStarted() const {
+        return has_started;
+    }
 
-    /// This should only be called from the emu thread, if you are calling it any other thread,
-    /// you are doing something evil
-    u64 GetTicks() const;
+    /// Checks if there are any pending time events.
+    bool HasPendingEvents() const;
 
-    u64 GetIdleTicks() const;
+    /// Schedules an event in core timing
+    void ScheduleEvent(s64 ns_into_future, const std::shared_ptr<EventType>& event_type,
+                       u64 userdata = 0);
 
-    void AddTicks(u64 ticks);
+    void UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u64 userdata);
 
-    /// Advance must be called at the beginning of dispatcher loops, not the end. Advance() ends
-    /// the previous timing slice and begins the next one, you must Advance from the previous
-    /// slice to the current one before executing any cycles. CoreTiming starts in slice -1 so an
-    /// Advance() is required to initialize the slice length before the first cycle of emulated
-    /// instructions is executed.
-    void Advance();
+    /// We only permit one event of each type in the queue at a time.
+    void RemoveEvent(const std::shared_ptr<EventType>& event_type);
 
-    /// Pretend that the main CPU has executed enough cycles to reach the next event.
-    void Idle();
+    void AddTicks(std::size_t core_index, u64 ticks);
 
-    std::chrono::microseconds GetGlobalTimeUs() const;
+    void ResetTicks(std::size_t core_index);
 
-    void ResetRun();
+    /// Returns current time in emulated CPU cycles
+    u64 GetCPUTicks() const;
 
-    s64 GetDowncount() const;
+    /// Returns current time in emulated Clock cycles
+    u64 GetClockTicks() const;
 
-    void SwitchContext(u64 new_context) {
-        current_context = new_context;
-    }
+    /// Returns current time in microseconds.
+    std::chrono::microseconds GetGlobalTimeUs() const;
 
-    bool CanCurrentContextRun() const {
-        return time_slice[current_context] > 0;
-    }
+    /// Returns current time in nanoseconds.
+    std::chrono::nanoseconds GetGlobalTimeNs() const;
 
-    std::optional<u64> NextAvailableCore(const s64 needed_ticks) const;
+    /// Checks for events manually and returns time in nanoseconds for next event, threadsafe.
+    std::optional<u64> Advance();
 
 private:
     struct Event;
@@ -115,21 +118,14 @@ private:
     /// Clear all pending events. This should ONLY be done on exit.
     void ClearPendingEvents();
 
-    static constexpr u64 num_cpu_cores = 4;
+    static void ThreadEntry(CoreTiming& instance);
+    void ThreadLoop();
+
+    std::unique_ptr<Common::WallClock> clock;
 
-    s64 global_timer = 0;
-    s64 idled_cycles = 0;
-    s64 slice_length = 0;
-    u64 accumulated_ticks = 0;
-    std::array<s64, num_cpu_cores> downcounts{};
-    // Slice of time assigned to each core per run.
-    std::array<s64, num_cpu_cores> time_slice{};
-    u64 current_context = 0;
+    u64 global_timer = 0;
 
-    // Are we in a function that has been called from Advance()
-    // If events are scheduled from a function that gets called from Advance(),
-    // don't change slice_length and downcount.
-    bool is_global_timer_sane = false;
+    std::chrono::nanoseconds start_point;
 
     // The queue is a min-heap using std::make_heap/push_heap/pop_heap.
     // We don't use std::priority_queue because we need to be able to serialize, unserialize and
@@ -139,8 +135,18 @@ private:
     u64 event_fifo_id = 0;
 
     std::shared_ptr<EventType> ev_lost;
-
-    std::mutex inner_mutex;
+    Common::Event event{};
+    Common::SpinLock basic_lock{};
+    Common::SpinLock advance_lock{};
+    std::unique_ptr<std::thread> timer_thread;
+    std::atomic<bool> paused{};
+    std::atomic<bool> paused_set{};
+    std::atomic<bool> wait_set{};
+    std::atomic<bool> shutting_down{};
+    std::atomic<bool> has_started{};
+    std::function<void(void)> on_thread_init{};
+
+    std::array<std::atomic<u64>, Core::Hardware::NUM_CPU_CORES> ticks_count{};
 };
 
 /// Creates a core timing event with the given name and callback.
diff --git a/src/core/cpu_manager.cpp b/src/core/cpu_manager.cpp
index 70ddbdcca..494850992 100644
--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@@ -2,80 +2,192 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "common/fiber.h"
+#include "common/thread.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/cpu_manager.h"
 #include "core/gdbstub/gdbstub.h"
+#include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/physical_core.h"
+#include "core/hle/kernel/scheduler.h"
+#include "core/hle/kernel/thread.h"
 
 namespace Core {
 
 CpuManager::CpuManager(System& system) : system{system} {}
 CpuManager::~CpuManager() = default;
 
+void CpuManager::ThreadStart(CpuManager& cpu_manager, std::size_t core) {
+    cpu_manager.RunThread(core);
+}
+
 void CpuManager::Initialize() {
-    for (std::size_t index = 0; index < core_managers.size(); ++index) {
-        core_managers[index] = std::make_unique<CoreManager>(system, index);
+    running_mode = true;
+    for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        core_data[core].host_thread =
+            std::make_unique<std::thread>(ThreadStart, std::ref(*this), core);
     }
 }
 
 void CpuManager::Shutdown() {
-    for (auto& cpu_core : core_managers) {
-        cpu_core.reset();
+    running_mode = false;
+    Pause(false);
+    for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        core_data[core].host_thread->join();
     }
 }
 
-CoreManager& CpuManager::GetCoreManager(std::size_t index) {
-    return *core_managers.at(index);
+void CpuManager::GuestThreadFunction(void* cpu_manager_) {
+    CpuManager* cpu_manager = static_cast<CpuManager*>(cpu_manager_);
+    cpu_manager->RunGuestThread();
 }
 
-const CoreManager& CpuManager::GetCoreManager(std::size_t index) const {
-    return *core_managers.at(index);
+void CpuManager::IdleThreadFunction(void* cpu_manager_) {
+    CpuManager* cpu_manager = static_cast<CpuManager*>(cpu_manager_);
+    cpu_manager->RunIdleThread();
 }
 
-CoreManager& CpuManager::GetCurrentCoreManager() {
-    // Otherwise, use single-threaded mode active_core variable
-    return *core_managers[active_core];
+void CpuManager::SuspendThreadFunction(void* cpu_manager_) {
+    CpuManager* cpu_manager = static_cast<CpuManager*>(cpu_manager_);
+    cpu_manager->RunSuspendThread();
 }
 
-const CoreManager& CpuManager::GetCurrentCoreManager() const {
-    // Otherwise, use single-threaded mode active_core variable
-    return *core_managers[active_core];
+std::function<void(void*)> CpuManager::GetGuestThreadStartFunc() {
+    return std::function<void(void*)>(GuestThreadFunction);
 }
 
-void CpuManager::RunLoop(bool tight_loop) {
-    if (GDBStub::IsServerEnabled()) {
-        GDBStub::HandlePacket();
-
-        // If the loop is halted and we want to step, use a tiny (1) number of instructions to
-        // execute. Otherwise, get out of the loop function.
-        if (GDBStub::GetCpuHaltFlag()) {
-            if (GDBStub::GetCpuStepFlag()) {
-                tight_loop = false;
-            } else {
-                return;
-            }
-        }
+std::function<void(void*)> CpuManager::GetIdleThreadStartFunc() {
+    return std::function<void(void*)>(IdleThreadFunction);
+}
+
+std::function<void(void*)> CpuManager::GetSuspendThreadStartFunc() {
+    return std::function<void(void*)>(SuspendThreadFunction);
+}
+
+void* CpuManager::GetStartFuncParamater() {
+    return static_cast<void*>(this);
+}
+
+void CpuManager::RunGuestThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    while (true) {
+        auto& physical_core = kernel.CurrentPhysicalCore();
+        LOG_CRITICAL(Core_ARM, "Running Guest Thread");
+        physical_core.Idle();
+        LOG_CRITICAL(Core_ARM, "Leaving Guest Thread");
+        // physical_core.Run();
+        auto& scheduler = physical_core.Scheduler();
+        scheduler.TryDoContextSwitch();
     }
+}
 
-    auto& core_timing = system.CoreTiming();
-    core_timing.ResetRun();
-    bool keep_running{};
-    do {
-        keep_running = false;
-        for (active_core = 0; active_core < NUM_CPU_CORES; ++active_core) {
-            core_timing.SwitchContext(active_core);
-            if (core_timing.CanCurrentContextRun()) {
-                core_managers[active_core]->RunLoop(tight_loop);
+void CpuManager::RunIdleThread() {
+    auto& kernel = system.Kernel();
+    while (true) {
+        auto& physical_core = kernel.CurrentPhysicalCore();
+        LOG_CRITICAL(Core_ARM, "Running Idle Thread");
+        physical_core.Idle();
+        auto& scheduler = physical_core.Scheduler();
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::RunSuspendThread() {
+    LOG_CRITICAL(Core_ARM, "Suspending Thread Entered");
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    while (true) {
+        auto core = kernel.GetCurrentHostThreadID();
+        auto& scheduler = kernel.CurrentScheduler();
+        Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+        LOG_CRITICAL(Core_ARM, "Suspending Core {}", core);
+        Common::Fiber::YieldTo(current_thread->GetHostContext(), core_data[core].host_context);
+        LOG_CRITICAL(Core_ARM, "Unsuspending Core {}", core);
+        ASSERT(scheduler.ContextSwitchPending());
+        ASSERT(core == kernel.GetCurrentHostThreadID());
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::Pause(bool paused) {
+    if (!paused) {
+        bool all_not_barrier = false;
+        while (!all_not_barrier) {
+            all_not_barrier = true;
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                all_not_barrier &=
+                    !core_data[core].is_running.load() && core_data[core].initialized.load();
             }
-            keep_running |= core_timing.CanCurrentContextRun();
         }
-    } while (keep_running);
+        for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            core_data[core].enter_barrier->Set();
+        }
+        if (paused_state.load()) {
+            bool all_barrier = false;
+            while (!all_barrier) {
+                all_barrier = true;
+                for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                    all_barrier &=
+                        core_data[core].is_paused.load() && core_data[core].initialized.load();
+                }
+            }
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                core_data[core].exit_barrier->Set();
+            }
+        }
+    } else {
+        /// Wait until all cores are paused.
+        bool all_barrier = false;
+        while (!all_barrier) {
+            all_barrier = true;
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                all_barrier &=
+                    core_data[core].is_paused.load() && core_data[core].initialized.load();
+            }
+        }
+        /// Don't release the barrier
+    }
+    paused_state = paused;
+}
 
-    if (GDBStub::IsServerEnabled()) {
-        GDBStub::SetCpuStepFlag(false);
+void CpuManager::RunThread(std::size_t core) {
+    /// Initialization
+    system.RegisterCoreThread(core);
+    std::string name = "yuzu:CoreHostThread_" + std::to_string(core);
+    Common::SetCurrentThreadName(name.c_str());
+    auto& data = core_data[core];
+    data.enter_barrier = std::make_unique<Common::Event>();
+    data.exit_barrier = std::make_unique<Common::Event>();
+    data.host_context = Common::Fiber::ThreadToFiber();
+    data.is_running = false;
+    data.initialized = true;
+    /// Running
+    while (running_mode) {
+        data.is_running = false;
+        data.enter_barrier->Wait();
+        auto& scheduler = system.Kernel().CurrentScheduler();
+        Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+        data.is_running = true;
+        Common::Fiber::YieldTo(data.host_context, current_thread->GetHostContext());
+        data.is_running = false;
+        data.is_paused = true;
+        data.exit_barrier->Wait();
+        data.is_paused = false;
     }
+    /// Time to cleanup
+    data.host_context->Exit();
+    data.enter_barrier.reset();
+    data.exit_barrier.reset();
+    data.initialized = false;
 }
 
 } // namespace Core
diff --git a/src/core/cpu_manager.h b/src/core/cpu_manager.h
index 97554d1bb..8103ae857 100644
--- a/src/core/cpu_manager.h
+++ b/src/core/cpu_manager.h
@@ -5,12 +5,18 @@
 #pragma once
 
 #include <array>
+#include <functional>
 #include <memory>
+#include <thread>
 #include "core/hardware_properties.h"
 
+namespace Common {
+class Event;
+class Fiber;
+} // namespace Common
+
 namespace Core {
 
-class CoreManager;
 class System;
 
 class CpuManager {
@@ -27,21 +33,40 @@ public:
     void Initialize();
     void Shutdown();
 
-    CoreManager& GetCoreManager(std::size_t index);
-    const CoreManager& GetCoreManager(std::size_t index) const;
+    void Pause(bool paused);
+
+    std::function<void(void*)> GetGuestThreadStartFunc();
+    std::function<void(void*)> GetIdleThreadStartFunc();
+    std::function<void(void*)> GetSuspendThreadStartFunc();
+    void* GetStartFuncParamater();
 
-    CoreManager& GetCurrentCoreManager();
-    const CoreManager& GetCurrentCoreManager() const;
+private:
+    static void GuestThreadFunction(void* cpu_manager);
+    static void IdleThreadFunction(void* cpu_manager);
+    static void SuspendThreadFunction(void* cpu_manager);
 
-    std::size_t GetActiveCoreIndex() const {
-        return active_core;
-    }
+    void RunGuestThread();
+    void RunIdleThread();
+    void RunSuspendThread();
 
-    void RunLoop(bool tight_loop);
+    static void ThreadStart(CpuManager& cpu_manager, std::size_t core);
 
-private:
-    std::array<std::unique_ptr<CoreManager>, Hardware::NUM_CPU_CORES> core_managers;
-    std::size_t active_core{}; ///< Active core, only used in single thread mode
+    void RunThread(std::size_t core);
+
+    struct CoreData {
+        std::shared_ptr<Common::Fiber> host_context;
+        std::unique_ptr<Common::Event> enter_barrier;
+        std::unique_ptr<Common::Event> exit_barrier;
+        std::atomic<bool> is_running;
+        std::atomic<bool> is_paused;
+        std::atomic<bool> initialized;
+        std::unique_ptr<std::thread> host_thread;
+    };
+
+    std::atomic<bool> running_mode{};
+    std::atomic<bool> paused_state{};
+
+    std::array<CoreData, Core::Hardware::NUM_CPU_CORES> core_data{};
 
     System& system;
 };
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index 7655382fa..ba051a7d8 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -13,11 +13,13 @@
 
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/thread.h"
 #include "core/arm/arm_interface.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
+#include "core/cpu_manager.h"
 #include "core/device_memory.h"
 #include "core/hardware_properties.h"
 #include "core/hle/kernel/client_port.h"
@@ -117,7 +119,9 @@ struct KernelCore::Impl {
         InitializeSystemResourceLimit(kernel);
         InitializeMemoryLayout();
         InitializeThreads();
-        InitializePreemption();
+        InitializePreemption(kernel);
+        InitializeSchedulers();
+        InitializeSuspendThreads();
     }
 
     void Shutdown() {
@@ -155,6 +159,12 @@ struct KernelCore::Impl {
         }
     }
 
+    void InitializeSchedulers() {
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            cores[i].Scheduler().Initialize();
+        }
+    }
+
     // Creates the default system resource limit
     void InitializeSystemResourceLimit(KernelCore& kernel) {
         system_resource_limit = ResourceLimit::Create(kernel);
@@ -178,10 +188,13 @@ struct KernelCore::Impl {
             Core::Timing::CreateEvent("ThreadWakeupCallback", ThreadWakeupCallback);
     }
 
-    void InitializePreemption() {
-        preemption_event =
-            Core::Timing::CreateEvent("PreemptionCallback", [this](u64 userdata, s64 cycles_late) {
-                global_scheduler.PreemptThreads();
+    void InitializePreemption(KernelCore& kernel) {
+        preemption_event = Core::Timing::CreateEvent(
+            "PreemptionCallback", [this, &kernel](u64 userdata, s64 cycles_late) {
+                {
+                    SchedulerLock lock(kernel);
+                    global_scheduler.PreemptThreads();
+                }
                 s64 time_interval = Core::Timing::msToCycles(std::chrono::milliseconds(10));
                 system.CoreTiming().ScheduleEvent(time_interval, preemption_event);
             });
@@ -190,6 +203,20 @@ struct KernelCore::Impl {
         system.CoreTiming().ScheduleEvent(time_interval, preemption_event);
     }
 
+    void InitializeSuspendThreads() {
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            std::string name = "Suspend Thread Id:" + std::to_string(i);
+            std::function<void(void*)> init_func =
+                system.GetCpuManager().GetSuspendThreadStartFunc();
+            void* init_func_parameter = system.GetCpuManager().GetStartFuncParamater();
+            ThreadType type =
+                static_cast<ThreadType>(THREADTYPE_KERNEL | THREADTYPE_HLE | THREADTYPE_SUSPEND);
+            auto thread_res = Thread::Create(system, type, name, 0, 0, 0, static_cast<u32>(i), 0,
+                                             nullptr, std::move(init_func), init_func_parameter);
+            suspend_threads[i] = std::move(thread_res).Unwrap();
+        }
+    }
+
     void MakeCurrentProcess(Process* process) {
         current_process = process;
 
@@ -201,7 +228,10 @@ struct KernelCore::Impl {
             core.SetIs64Bit(process->Is64BitProcess());
         }
 
-        system.Memory().SetCurrentPageTable(*process);
+        u32 core_id = GetCurrentHostThreadID();
+        if (core_id < Core::Hardware::NUM_CPU_CORES) {
+            system.Memory().SetCurrentPageTable(*process, core_id);
+        }
     }
 
     void RegisterCoreThread(std::size_t core_id) {
@@ -219,7 +249,9 @@ struct KernelCore::Impl {
         std::unique_lock lock{register_thread_mutex};
         const std::thread::id this_id = std::this_thread::get_id();
         const auto it = host_thread_ids.find(this_id);
-        ASSERT(it == host_thread_ids.end());
+        if (it != host_thread_ids.end()) {
+            return;
+        }
         host_thread_ids[this_id] = registered_thread_ids++;
     }
 
@@ -343,6 +375,8 @@ struct KernelCore::Impl {
     std::shared_ptr<Kernel::SharedMemory> irs_shared_mem;
     std::shared_ptr<Kernel::SharedMemory> time_shared_mem;
 
+    std::array<std::shared_ptr<Thread>, Core::Hardware::NUM_CPU_CORES> suspend_threads{};
+
     // System context
     Core::System& system;
 };
@@ -412,6 +446,26 @@ const Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) const {
     return impl->cores[id];
 }
 
+Kernel::PhysicalCore& KernelCore::CurrentPhysicalCore() {
+    u32 core_id = impl->GetCurrentHostThreadID();
+    ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+    return impl->cores[core_id];
+}
+
+const Kernel::PhysicalCore& KernelCore::CurrentPhysicalCore() const {
+    u32 core_id = impl->GetCurrentHostThreadID();
+    ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+    return impl->cores[core_id];
+}
+
+Kernel::Scheduler& KernelCore::CurrentScheduler() {
+    return CurrentPhysicalCore().Scheduler();
+}
+
+const Kernel::Scheduler& KernelCore::CurrentScheduler() const {
+    return CurrentPhysicalCore().Scheduler();
+}
+
 Kernel::Synchronization& KernelCore::Synchronization() {
     return impl->synchronization;
 }
@@ -557,4 +611,20 @@ const Kernel::SharedMemory& KernelCore::GetTimeSharedMem() const {
     return *impl->time_shared_mem;
 }
 
+void KernelCore::Suspend(bool in_suspention) {
+    const bool should_suspend = exception_exited || in_suspention;
+    {
+        SchedulerLock lock(*this);
+        ThreadStatus status = should_suspend ? ThreadStatus::Ready : ThreadStatus::WaitSleep;
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            impl->suspend_threads[i]->SetStatus(status);
+        }
+    }
+}
+
+void KernelCore::ExceptionalExit() {
+    exception_exited = true;
+    Suspend(true);
+}
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/kernel.h b/src/core/hle/kernel/kernel.h
index 83de1f542..5d32a8329 100644
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -110,6 +110,18 @@ public:
     /// Gets the an instance of the respective physical CPU core.
     const Kernel::PhysicalCore& PhysicalCore(std::size_t id) const;
 
+    /// Gets the sole instance of the Scheduler at the current running core.
+    Kernel::Scheduler& CurrentScheduler();
+
+    /// Gets the sole instance of the Scheduler at the current running core.
+    const Kernel::Scheduler& CurrentScheduler() const;
+
+    /// Gets an instance of the current physical CPU core.
+    Kernel::PhysicalCore& CurrentPhysicalCore();
+
+    /// Gets an instance of the current physical CPU core.
+    const Kernel::PhysicalCore& CurrentPhysicalCore() const;
+
     /// Gets the an instance of the Synchronization Interface.
     Kernel::Synchronization& Synchronization();
 
@@ -191,6 +203,12 @@ public:
     /// Gets the shared memory object for Time services.
     const Kernel::SharedMemory& GetTimeSharedMem() const;
 
+    /// Suspend/unsuspend the OS.
+    void Suspend(bool in_suspention);
+
+    /// Flags an exceptional exit and suspends the OS.
+    void ExceptionalExit();
+
 private:
     friend class Object;
     friend class Process;
@@ -219,6 +237,7 @@ private:
 
     struct Impl;
     std::unique_ptr<Impl> impl;
+    bool exception_exited{};
 };
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/physical_core.cpp b/src/core/hle/kernel/physical_core.cpp
index a15011076..69202540b 100644
--- a/src/core/hle/kernel/physical_core.cpp
+++ b/src/core/hle/kernel/physical_core.cpp
@@ -2,12 +2,15 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/spin_lock.h"
 #include "core/arm/arm_interface.h"
 #ifdef ARCHITECTURE_x86_64
 #include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_64.h"
 #endif
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
@@ -19,21 +22,23 @@ namespace Kernel {
 
 PhysicalCore::PhysicalCore(Core::System& system, std::size_t id,
                            Core::ExclusiveMonitor& exclusive_monitor)
-    : core_index{id} {
+    : interrupt_handler{}, core_index{id} { // Handler is passed to both ARM interfaces.
 #ifdef ARCHITECTURE_x86_64
-    arm_interface_32 =
-        std::make_unique<Core::ARM_Dynarmic_32>(system, exclusive_monitor, core_index);
-    arm_interface_64 =
-        std::make_unique<Core::ARM_Dynarmic_64>(system, exclusive_monitor, core_index);
-
+    arm_interface_32 = std::make_unique<Core::ARM_Dynarmic_32>(system, interrupt_handler,
+                                                               exclusive_monitor, core_index);
+    arm_interface_64 = std::make_unique<Core::ARM_Dynarmic_64>(system, interrupt_handler,
+                                                               exclusive_monitor, core_index);
 #else
     using Core::ARM_Unicorn;
-    arm_interface_32 = std::make_unique<ARM_Unicorn>(system, ARM_Unicorn::Arch::AArch32);
-    arm_interface_64 = std::make_unique<ARM_Unicorn>(system, ARM_Unicorn::Arch::AArch64);
+    arm_interface_32 =
+        std::make_unique<ARM_Unicorn>(system, interrupt_handler, ARM_Unicorn::Arch::AArch32);
+    arm_interface_64 =
+        std::make_unique<ARM_Unicorn>(system, interrupt_handler, ARM_Unicorn::Arch::AArch64);
     LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available");
 #endif
 
     scheduler = std::make_unique<Kernel::Scheduler>(system, core_index);
+    guard = std::make_unique<Common::SpinLock>(); // Held by Interrupt()/ClearInterrupt().
 }
 
 PhysicalCore::~PhysicalCore() = default;
@@ -47,6 +52,10 @@ void PhysicalCore::Step() {
     arm_interface->Step();
 }
 
+void PhysicalCore::Idle() {
+    interrupt_handler.AwaitInterrupt(); // Idling is delegated to the interrupt handler.
+}
+
 void PhysicalCore::Stop() {
     arm_interface->PrepareReschedule();
 }
@@ -63,4 +72,16 @@ void PhysicalCore::SetIs64Bit(bool is_64_bit) {
     }
 }
 
+void PhysicalCore::Interrupt() {
+    guard->lock(); // Hold the spin lock while the interrupt state is raised.
+    interrupt_handler.SetInterrupt(true);
+    guard->unlock();
+}
+
+void PhysicalCore::ClearInterrupt() {
+    guard->lock(); // Hold the spin lock while the interrupt state is cleared.
+    interrupt_handler.SetInterrupt(false);
+    guard->unlock();
+}
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/physical_core.h b/src/core/hle/kernel/physical_core.h
index 3269166be..c3da30b72 100644
--- a/src/core/hle/kernel/physical_core.h
+++ b/src/core/hle/kernel/physical_core.h
@@ -7,6 +7,12 @@
 #include <cstddef>
 #include <memory>
 
+#include "core/arm/cpu_interrupt_handler.h"
+
+namespace Common {
+    class SpinLock;
+}
+
 namespace Kernel {
 class Scheduler;
 } // namespace Kernel
@@ -32,11 +38,24 @@ public:
 
     /// Execute current jit state
     void Run();
+    /// Set this core in IdleState.
+    void Idle();
     /// Execute a single instruction in current jit.
     void Step();
     /// Stop JIT execution/exit
     void Stop();
 
+    /// Interrupt this physical core.
+    void Interrupt();
+
+    /// Clear this core's interrupt
+    void ClearInterrupt();
+
+    /// Check if this core is interrupted
+    bool IsInterrupted() const {
+        return interrupt_handler.IsInterrupted();
+    }
+
     // Shutdown this physical core.
     void Shutdown();
 
@@ -71,11 +90,13 @@ public:
     void SetIs64Bit(bool is_64_bit);
 
 private:
+    Core::CPUInterruptHandler interrupt_handler;
     std::size_t core_index;
     std::unique_ptr<Core::ARM_Interface> arm_interface_32;
     std::unique_ptr<Core::ARM_Interface> arm_interface_64;
     std::unique_ptr<Kernel::Scheduler> scheduler;
     Core::ARM_Interface* arm_interface{};
+    std::unique_ptr<Common::SpinLock> guard;
 };
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index c4c5199b1..7e26a54f4 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -30,14 +30,15 @@ namespace {
 /**
  * Sets up the primary application thread
  *
+ * @param system The system instance to create the main thread under.
  * @param owner_process The parent process for the main thread
- * @param kernel The kernel instance to create the main thread under.
  * @param priority The priority to give the main thread
  */
-void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority, VAddr stack_top) {
+void SetupMainThread(Core::System& system, Process& owner_process, u32 priority, VAddr stack_top) {
     const VAddr entry_point = owner_process.PageTable().GetCodeRegionStart();
-    auto thread_res = Thread::Create(kernel, "main", entry_point, priority, 0,
-                                     owner_process.GetIdealCore(), stack_top, owner_process);
+    ThreadType type = THREADTYPE_USER;
+    auto thread_res = Thread::Create(system, type, "main", entry_point, priority, 0,
+                                     owner_process.GetIdealCore(), stack_top, &owner_process);
 
     std::shared_ptr<Thread> thread = std::move(thread_res).Unwrap();
 
@@ -48,8 +49,12 @@ void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority, V
     thread->GetContext32().cpu_registers[1] = thread_handle;
     thread->GetContext64().cpu_registers[1] = thread_handle;
 
+    auto& kernel = system.Kernel();
     // Threads by default are dormant, wake up the main thread so it runs when the scheduler fires
-    thread->ResumeFromWait();
+    {
+        SchedulerLock lock{kernel};
+        thread->SetStatus(ThreadStatus::Ready);
+    }
 }
 } // Anonymous namespace
 
@@ -294,7 +299,7 @@ void Process::Run(s32 main_thread_priority, u64 stack_size) {
 
     ChangeStatus(ProcessStatus::Running);
 
-    SetupMainThread(*this, kernel, main_thread_priority, main_thread_stack_top);
+    SetupMainThread(system, *this, main_thread_priority, main_thread_stack_top);
     resource_limit->Reserve(ResourceType::Threads, 1);
     resource_limit->Reserve(ResourceType::PhysicalMemory, main_thread_stack_size);
 }
diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp
index 1140c72a3..5166020a0 100644
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -11,11 +11,15 @@
 #include <utility>
 
 #include "common/assert.h"
+#include "common/bit_util.h"
+#include "common/fiber.h"
 #include "common/logging/log.h"
 #include "core/arm/arm_interface.h"
 #include "core/core.h"
 #include "core/core_timing.h"
+#include "core/cpu_manager.h"
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/physical_core.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/time_manager.h"
@@ -27,78 +31,108 @@ GlobalScheduler::GlobalScheduler(KernelCore& kernel) : kernel{kernel} {}
 GlobalScheduler::~GlobalScheduler() = default;
 
 void GlobalScheduler::AddThread(std::shared_ptr<Thread> thread) {
+    global_list_guard.lock(); // Hold the guard while thread_list is modified.
     thread_list.push_back(std::move(thread));
+    global_list_guard.unlock();
 }
 
 void GlobalScheduler::RemoveThread(std::shared_ptr<Thread> thread) {
+    global_list_guard.lock(); // Hold the guard while thread_list is modified.
     thread_list.erase(std::remove(thread_list.begin(), thread_list.end(), thread),
                       thread_list.end());
+    global_list_guard.unlock();
 }
 
-void GlobalScheduler::UnloadThread(std::size_t core) {
-    Scheduler& sched = kernel.Scheduler(core);
-    sched.UnloadThread();
-}
-
-void GlobalScheduler::SelectThread(std::size_t core) {
+u32 GlobalScheduler::SelectThreads() { // Returns a bitmask of cores needing a context switch.
     const auto update_thread = [](Thread* thread, Scheduler& sched) {
+        sched.guard.lock(); // Hold the core scheduler's guard while its selection is updated.
         if (thread != sched.selected_thread.get()) {
             if (thread == nullptr) {
                 ++sched.idle_selection_count;
             }
             sched.selected_thread = SharedFrom(thread);
         }
-        sched.is_context_switch_pending = sched.selected_thread != sched.current_thread;
+        const bool reschedule_pending = sched.selected_thread != sched.current_thread;
+        sched.is_context_switch_pending = reschedule_pending;
         std::atomic_thread_fence(std::memory_order_seq_cst);
+        sched.guard.unlock();
+        return reschedule_pending;
     };
-    Scheduler& sched = kernel.Scheduler(core);
-    Thread* current_thread = nullptr;
-    // Step 1: Get top thread in schedule queue.
-    current_thread = scheduled_queue[core].empty() ? nullptr : scheduled_queue[core].front();
-    if (current_thread) {
-        update_thread(current_thread, sched);
-        return;
+    if (!is_reselection_pending.load()) { // No reselection was requested: nothing to do.
+        return 0;
     }
-    // Step 2: Try selecting a suggested thread.
-    Thread* winner = nullptr;
-    std::set<s32> sug_cores;
-    for (auto thread : suggested_queue[core]) {
-        s32 this_core = thread->GetProcessorID();
-        Thread* thread_on_core = nullptr;
-        if (this_core >= 0) {
-            thread_on_core = scheduled_queue[this_core].front();
-        }
-        if (this_core < 0 || thread != thread_on_core) {
-            winner = thread;
-            break;
+    std::array<Thread*, Core::Hardware::NUM_CPU_CORES> top_threads{};
+
+    u32 idle_cores{}; // Bitmask: bit N is set when core N has an empty scheduled queue.
+
+    // Step 1: Get top thread in schedule queue.
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        Thread* top_thread =
+            scheduled_queue[core].empty() ? nullptr : scheduled_queue[core].front();
+        if (top_thread != nullptr) {
+            // TODO(Blinkhawk): Implement Thread Pinning
+        } else {
+            idle_cores |= (1ul << core);
         }
-        sug_cores.insert(this_core);
+        top_threads[core] = top_thread;
     }
-    // if we got a suggested thread, select it, else do a second pass.
-    if (winner && winner->GetPriority() > 2) {
-        if (winner->IsRunning()) {
-            UnloadThread(static_cast<u32>(winner->GetProcessorID()));
+
+    while (idle_cores != 0) { // Try to find work for each idle core, lowest index first.
+        u32 core_id = Common::CountTrailingZeroes32(idle_cores);
+
+        if (!suggested_queue[core_id].empty()) {
+            std::array<s32, Core::Hardware::NUM_CPU_CORES> migration_candidates{};
+            std::size_t num_candidates = 0;
+            auto iter = suggested_queue[core_id].begin();
+            Thread* suggested = nullptr;
+            // Step 2: Try selecting a suggested thread.
+            while (iter != suggested_queue[core_id].end()) {
+                suggested = *iter;
+                iter++;
+                s32 suggested_core_id = suggested->GetProcessorID(); // Core 0 is a valid id.
+                Thread* top_thread =
+                    suggested_core_id >= 0 ? top_threads[suggested_core_id] : nullptr;
+                if (top_thread != suggested) {
+                    if (top_thread != nullptr &&
+                        top_thread->GetPriority() < THREADPRIO_MAX_CORE_MIGRATION) {
+                        // That core's top thread has too high a priority: cancel migration.
+                        suggested = nullptr;
+                        break;
+                    }
+                    TransferToCore(suggested->GetPriority(), static_cast<s32>(core_id), suggested);
+                    break;
+                }
+                migration_candidates[num_candidates++] = suggested_core_id;
+            }
+            // Step 3: Select a suggested thread from another core
+            if (suggested == nullptr) {
+                for (std::size_t i = 0; i < num_candidates; i++) {
+                    s32 candidate_core = migration_candidates[i];
+                    suggested = top_threads[candidate_core];
+                    auto it = scheduled_queue[candidate_core].begin();
+                    it++;
+                    Thread* next = it != scheduled_queue[candidate_core].end() ? *it : nullptr;
+                    if (next != nullptr) {
+                        TransferToCore(suggested->GetPriority(), static_cast<s32>(core_id),
+                                       suggested);
+                        top_threads[candidate_core] = next;
+                        break;
+                    }
+                }
+            }
+            top_threads[core_id] = suggested;
         }
-        TransferToCore(winner->GetPriority(), static_cast<s32>(core), winner);
-        update_thread(winner, sched);
-        return;
+
+        idle_cores &= ~(1ul << core_id);
     }
-    // Step 3: Select a suggested thread from another core
-    for (auto& src_core : sug_cores) {
-        auto it = scheduled_queue[src_core].begin();
-        it++;
-        if (it != scheduled_queue[src_core].end()) {
-            Thread* thread_on_core = scheduled_queue[src_core].front();
-            Thread* to_change = *it;
-            if (thread_on_core->IsRunning() || to_change->IsRunning()) {
-                UnloadThread(static_cast<u32>(src_core));
-            }
-            TransferToCore(thread_on_core->GetPriority(), static_cast<s32>(core), thread_on_core);
-            current_thread = thread_on_core;
-            break;
+    u32 cores_needing_context_switch{}; // Bit N is set when core N's selection changed.
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        Scheduler& sched = kernel.Scheduler(core);
+        if (update_thread(top_threads[core], sched)) {
+            cores_needing_context_switch |= (1ul << core);
         }
     }
-    update_thread(current_thread, sched);
+    return cores_needing_context_switch;
 }
 
 bool GlobalScheduler::YieldThread(Thread* yielding_thread) {
@@ -153,9 +187,6 @@ bool GlobalScheduler::YieldThreadAndBalanceLoad(Thread* yielding_thread) {
 
     if (winner != nullptr) {
         if (winner != yielding_thread) {
-            if (winner->IsRunning()) {
-                UnloadThread(static_cast<u32>(winner->GetProcessorID()));
-            }
             TransferToCore(winner->GetPriority(), s32(core_id), winner);
         }
     } else {
@@ -195,9 +226,6 @@ bool GlobalScheduler::YieldThreadAndWaitForLoadBalancing(Thread* yielding_thread
         }
         if (winner != nullptr) {
             if (winner != yielding_thread) {
-                if (winner->IsRunning()) {
-                    UnloadThread(static_cast<u32>(winner->GetProcessorID()));
-                }
                 TransferToCore(winner->GetPriority(), static_cast<s32>(core_id), winner);
             }
         } else {
@@ -213,7 +241,9 @@ void GlobalScheduler::PreemptThreads() {
         const u32 priority = preemption_priorities[core_id];
 
         if (scheduled_queue[core_id].size(priority) > 0) {
-            scheduled_queue[core_id].front(priority)->IncrementYieldCount();
+            if (scheduled_queue[core_id].size(priority) > 1) {
+                scheduled_queue[core_id].front(priority)->IncrementYieldCount();
+            }
             scheduled_queue[core_id].yield(priority);
             if (scheduled_queue[core_id].size(priority) > 1) {
                 scheduled_queue[core_id].front(priority)->IncrementYieldCount();
@@ -247,9 +277,6 @@ void GlobalScheduler::PreemptThreads() {
         }
 
         if (winner != nullptr) {
-            if (winner->IsRunning()) {
-                UnloadThread(static_cast<u32>(winner->GetProcessorID()));
-            }
             TransferToCore(winner->GetPriority(), s32(core_id), winner);
             current_thread =
                 winner->GetPriority() <= current_thread->GetPriority() ? winner : current_thread;
@@ -280,9 +307,6 @@ void GlobalScheduler::PreemptThreads() {
             }
 
             if (winner != nullptr) {
-                if (winner->IsRunning()) {
-                    UnloadThread(static_cast<u32>(winner->GetProcessorID()));
-                }
                 TransferToCore(winner->GetPriority(), s32(core_id), winner);
                 current_thread = winner;
             }
@@ -292,6 +316,28 @@ void GlobalScheduler::PreemptThreads() {
     }
 }
 
+void GlobalScheduler::EnableInterruptAndSchedule(u32 cores_pending_reschedule,
+                                                 Core::EmuThreadHandle global_thread) {
+    u32 current_core = global_thread.host_handle; // Used below as the caller's core index.
+    bool must_context_switch = global_thread.guest_handle != InvalidHandle &&
+                               (current_core < Core::Hardware::NUM_CPU_CORES);
+    while (cores_pending_reschedule != 0) { // Visit every set bit, lowest core first.
+        u32 core = Common::CountTrailingZeroes32(cores_pending_reschedule);
+        ASSERT(core < Core::Hardware::NUM_CPU_CORES);
+        if (!must_context_switch || core != current_core) {
+            auto& phys_core = kernel.PhysicalCore(core);
+            phys_core.Interrupt(); // Signal the core instead of switching in place.
+        } else {
+            must_context_switch = true; // NOTE(review): already true on this branch (no-op).
+        }
+        cores_pending_reschedule &= ~(1ul << core); // Clear the bit that was just handled.
+    }
+    if (must_context_switch) {
+        auto& core_scheduler = kernel.CurrentScheduler();
+        core_scheduler.TryDoContextSwitch(); // The calling core switches context directly.
+    }
+}
+
 void GlobalScheduler::Suggest(u32 priority, std::size_t core, Thread* thread) {
     suggested_queue[core].add(thread, priority);
 }
@@ -349,6 +395,108 @@ bool GlobalScheduler::AskForReselectionOrMarkRedundant(Thread* current_thread,
     }
 }
 
+void GlobalScheduler::AdjustSchedulingOnStatus(Thread* thread, u32 old_flags) {
+    if (old_flags == thread->scheduling_state) { // State unchanged: leave the queues alone.
+        return;
+    }
+
+    if (static_cast<ThreadSchedStatus>(old_flags & static_cast<u32>(ThreadSchedMasks::LowMask)) ==
+        ThreadSchedStatus::Runnable) {
+        // In this case the thread was runnable, now it's pausing/exiting
+        if (thread->processor_id >= 0) { // Only a thread with an assigned core is scheduled.
+            Unschedule(thread->current_priority, static_cast<u32>(thread->processor_id), thread);
+        }
+
+        for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            if (core != static_cast<u32>(thread->processor_id) &&
+                ((thread->affinity_mask >> core) & 1) != 0) {
+                Unsuggest(thread->current_priority, core, thread); // Other cores in the mask.
+            }
+        }
+    } else if (thread->GetSchedulingStatus() == ThreadSchedStatus::Runnable) {
+        // The thread is now set to runnable from being stopped
+        if (thread->processor_id >= 0) {
+            Schedule(thread->current_priority, static_cast<u32>(thread->processor_id), thread);
+        }
+
+        for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            if (core != static_cast<u32>(thread->processor_id) &&
+                ((thread->affinity_mask >> core) & 1) != 0) {
+                Suggest(thread->current_priority, core, thread); // Other cores in the mask.
+            }
+        }
+    }
+
+    SetReselectionPending(); // Reached whenever the scheduling state differs from old_flags.
+}
+
+void GlobalScheduler::AdjustSchedulingOnPriority(Thread* thread, u32 old_priority) {
+    if (thread->GetSchedulingStatus() != ThreadSchedStatus::Runnable) {
+        return; // Threads that are not runnable are skipped.
+    }
+    if (thread->processor_id >= 0) { // Remove the entry stored under the old priority.
+        Unschedule(old_priority, static_cast<u32>(thread->processor_id), thread);
+    }
+
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        if (core != static_cast<u32>(thread->processor_id) &&
+            ((thread->affinity_mask >> core) & 1) != 0) {
+            Unsuggest(old_priority, core, thread); // Same for the other cores in the mask.
+        }
+    }
+
+    if (thread->processor_id >= 0) {
+        // TODO(Blinkhawk): compare it with current thread running on current core, instead of
+        // checking running
+        if (thread->IsRunning()) {
+            SchedulePrepend(thread->current_priority, static_cast<u32>(thread->processor_id),
+                            thread); // A running thread is re-added via SchedulePrepend.
+        } else {
+            Schedule(thread->current_priority, static_cast<u32>(thread->processor_id), thread);
+        }
+    }
+
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        if (core != static_cast<u32>(thread->processor_id) &&
+            ((thread->affinity_mask >> core) & 1) != 0) {
+            Suggest(thread->current_priority, core, thread); // Re-add under the new priority.
+        }
+    }
+    thread->IncrementYieldCount();
+    SetReselectionPending();
+}
+
+void GlobalScheduler::AdjustSchedulingOnAffinity(Thread* thread, u64 old_affinity_mask,
+                                                 s32 old_core) {
+    if (thread->GetSchedulingStatus() != ThreadSchedStatus::Runnable ||
+        thread->current_priority >= THREADPRIO_COUNT) {
+        return; // Skip threads that are not runnable or have priority >= THREADPRIO_COUNT.
+    }
+
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { // Undo the old placement.
+        if (((old_affinity_mask >> core) & 1) != 0) {
+            if (core == static_cast<u32>(old_core)) {
+                Unschedule(thread->current_priority, core, thread);
+            } else {
+                Unsuggest(thread->current_priority, core, thread);
+            }
+        }
+    }
+
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { // Apply the new placement.
+        if (((thread->affinity_mask >> core) & 1) != 0) {
+            if (core == static_cast<u32>(thread->processor_id)) {
+                Schedule(thread->current_priority, core, thread);
+            } else {
+                Suggest(thread->current_priority, core, thread);
+            }
+        }
+    }
+
+    thread->IncrementYieldCount();
+    SetReselectionPending();
+}
+
 void GlobalScheduler::Shutdown() {
     for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
         scheduled_queue[core].clear();
@@ -374,13 +522,12 @@ void GlobalScheduler::Unlock() {
         ASSERT(scope_lock > 0);
         return;
     }
-    for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
-        SelectThread(i);
-    }
+    u32 cores_pending_reschedule = SelectThreads();
+    Core::EmuThreadHandle leaving_thread = current_owner;
     current_owner = Core::EmuThreadHandle::InvalidHandle();
     scope_lock = 1;
     inner_lock.unlock();
-    // TODO(Blinkhawk): Setup the interrupts and change context on current core.
+    EnableInterruptAndSchedule(cores_pending_reschedule, leaving_thread);
 }
 
 Scheduler::Scheduler(Core::System& system, std::size_t core_id)
@@ -393,56 +540,83 @@ bool Scheduler::HaveReadyThreads() const {
 }
 
 Thread* Scheduler::GetCurrentThread() const {
-    return current_thread.get();
+    if (current_thread) {
+        return current_thread.get();
+    }
+    return idle_thread.get(); // Fall back to the idle thread when no thread is current.
 }
 
 Thread* Scheduler::GetSelectedThread() const {
     return selected_thread.get();
 }
 
-void Scheduler::SelectThreads() {
-    system.GlobalScheduler().SelectThread(core_id);
-}
-
 u64 Scheduler::GetLastContextSwitchTicks() const {
     return last_context_switch_time;
 }
 
 void Scheduler::TryDoContextSwitch() {
+    auto& phys_core = system.Kernel().CurrentPhysicalCore();
+    if (phys_core.IsInterrupted()) { // Clear a pending interrupt on the current core first.
+        phys_core.ClearInterrupt();
+    }
+    guard.lock(); // Released by the else branch below, or on SwitchContext()'s call path.
     if (is_context_switch_pending) {
         SwitchContext();
+    } else {
+        guard.unlock();
     }
 }
 
-void Scheduler::UnloadThread() {
-    Thread* const previous_thread = GetCurrentThread();
-    Process* const previous_process = system.Kernel().CurrentProcess();
+void Scheduler::OnThreadStart() {
+    SwitchContextStep2(); // Runs the second half of the context switch.
+}
 
-    UpdateLastContextSwitchTime(previous_thread, previous_process);
+void Scheduler::SwitchContextStep2() {
+    Thread* previous_thread = current_thread.get();
+    Thread* new_thread = selected_thread.get();
 
-    // Save context for previous thread
-    if (previous_thread) {
-        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32());
-        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64());
-        // Save the TPIDR_EL0 system register in case it was modified.
-        previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0());
+    // Load context of new thread; first note the process that owned the previous one.
+    Process* const previous_process =
+        previous_thread != nullptr ? previous_thread->GetOwnerProcess() : nullptr;
 
-        if (previous_thread->GetStatus() == ThreadStatus::Running) {
-            // This is only the case when a reschedule is triggered without the current thread
-            // yielding execution (i.e. an event triggered, system core time-sliced, etc)
-            previous_thread->SetStatus(ThreadStatus::Ready);
+    if (new_thread) {
+        new_thread->context_guard.lock(); // Unlocked in SwitchContext() when switched out.
+        ASSERT_MSG(new_thread->GetProcessorID() == s32(this->core_id),
+                   "Thread must be assigned to this core.");
+        ASSERT_MSG(new_thread->GetStatus() == ThreadStatus::Ready,
+                   "Thread must be ready to become running.");
+
+        // Cancel any outstanding wakeup events for this thread
+        current_thread = SharedFrom(new_thread);
+        new_thread->SetStatus(ThreadStatus::Running);
+        new_thread->SetIsRunning(true);
+
+        auto* const thread_owner_process = current_thread->GetOwnerProcess();
+        if (previous_process != thread_owner_process && thread_owner_process != nullptr) {
+            system.Kernel().MakeCurrentProcess(thread_owner_process);
         }
-        previous_thread->SetIsRunning(false);
+        if (!new_thread->IsHLEThread()) { // CPU state is only loaded for non-HLE threads.
+            auto& cpu_core = system.ArmInterface(core_id);
+            cpu_core.LoadContext(new_thread->GetContext32());
+            cpu_core.LoadContext(new_thread->GetContext64());
+            cpu_core.SetTlsAddress(new_thread->GetTLSAddress());
+            cpu_core.SetTPIDR_EL0(new_thread->GetTPIDR_EL0());
+        }
+    } else {
+        current_thread = nullptr;
+        // Note: We do not reset the current process and current page table when idling because
+        // technically we haven't changed processes, our threads are just paused.
     }
-    current_thread = nullptr;
+    guard.unlock(); // Releases this scheduler's guard (locked in TryDoContextSwitch()).
 }
 
 void Scheduler::SwitchContext() {
-    Thread* const previous_thread = GetCurrentThread();
-    Thread* const new_thread = GetSelectedThread();
+    Thread* previous_thread = current_thread.get();
+    Thread* new_thread = selected_thread.get();
 
     is_context_switch_pending = false;
     if (new_thread == previous_thread) {
+        guard.unlock();
         return;
     }
 
@@ -452,51 +626,44 @@ void Scheduler::SwitchContext() {
 
     // Save context for previous thread
     if (previous_thread) {
-        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32());
-        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64());
-        // Save the TPIDR_EL0 system register in case it was modified.
-        previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0());
+        if (!previous_thread->IsHLEThread()) {
+            auto& cpu_core = system.ArmInterface(core_id);
+            cpu_core.SaveContext(previous_thread->GetContext32());
+            cpu_core.SaveContext(previous_thread->GetContext64());
+            // Save the TPIDR_EL0 system register in case it was modified.
+            previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0());
 
+        }
         if (previous_thread->GetStatus() == ThreadStatus::Running) {
-            // This is only the case when a reschedule is triggered without the current thread
-            // yielding execution (i.e. an event triggered, system core time-sliced, etc)
             previous_thread->SetStatus(ThreadStatus::Ready);
         }
         previous_thread->SetIsRunning(false);
+        previous_thread->context_guard.unlock();
     }
 
-    // Load context of new thread
-    if (new_thread) {
-        ASSERT_MSG(new_thread->GetProcessorID() == s32(this->core_id),
-                   "Thread must be assigned to this core.");
-        ASSERT_MSG(new_thread->GetStatus() == ThreadStatus::Ready,
-                   "Thread must be ready to become running.");
-
-        // Cancel any outstanding wakeup events for this thread
-        new_thread->CancelWakeupTimer();
-        current_thread = SharedFrom(new_thread);
-        new_thread->SetStatus(ThreadStatus::Running);
-        new_thread->SetIsRunning(true);
-
-        auto* const thread_owner_process = current_thread->GetOwnerProcess();
-        if (previous_process != thread_owner_process) {
-            system.Kernel().MakeCurrentProcess(thread_owner_process);
-        }
+    std::shared_ptr<Common::Fiber> old_context;
+    if (previous_thread != nullptr) {
+        old_context = previous_thread->GetHostContext();
+    } else {
+        old_context = idle_thread->GetHostContext();
+    }
 
-        system.ArmInterface(core_id).LoadContext(new_thread->GetContext32());
-        system.ArmInterface(core_id).LoadContext(new_thread->GetContext64());
-        system.ArmInterface(core_id).SetTlsAddress(new_thread->GetTLSAddress());
-        system.ArmInterface(core_id).SetTPIDR_EL0(new_thread->GetTPIDR_EL0());
+    std::shared_ptr<Common::Fiber> next_context;
+    if (new_thread != nullptr) {
+        next_context = new_thread->GetHostContext();
     } else {
-        current_thread = nullptr;
-        // Note: We do not reset the current process and current page table when idling because
-        // technically we haven't changed processes, our threads are just paused.
+        next_context = idle_thread->GetHostContext();
     }
+
+    Common::Fiber::YieldTo(old_context, next_context);
+    /// When a thread wakes up, it may be on another core, so use that core's scheduler.
+    auto& next_scheduler = system.Kernel().CurrentScheduler();
+    next_scheduler.SwitchContextStep2();
 }
 
 void Scheduler::UpdateLastContextSwitchTime(Thread* thread, Process* process) {
     const u64 prev_switch_ticks = last_context_switch_time;
-    const u64 most_recent_switch_ticks = system.CoreTiming().GetTicks();
+    const u64 most_recent_switch_ticks = system.CoreTiming().GetCPUTicks();
     const u64 update_ticks = most_recent_switch_ticks - prev_switch_ticks;
 
     if (thread != nullptr) {
@@ -510,6 +677,16 @@ void Scheduler::UpdateLastContextSwitchTime(Thread* thread, Process* process) {
     last_context_switch_time = most_recent_switch_ticks;
 }
 
+void Scheduler::Initialize() { // Creates the idle thread for this scheduler's core.
+    std::string name = "Idle Thread Id:" + std::to_string(core_id);
+    std::function<void(void*)> init_func = system.GetCpuManager().GetIdleThreadStartFunc();
+    void* init_func_parameter = system.GetCpuManager().GetStartFuncParamater();
+    ThreadType type = static_cast<ThreadType>(THREADTYPE_KERNEL | THREADTYPE_HLE | THREADTYPE_IDLE);
+    auto thread_res = Thread::Create(system, type, name, 0, 64, 0, static_cast<u32>(core_id), 0,
+                                     nullptr, std::move(init_func), init_func_parameter);
+    idle_thread = std::move(thread_res).Unwrap(); // GetCurrentThread() falls back to this.
+}
+
 void Scheduler::Shutdown() {
     current_thread = nullptr;
     selected_thread = nullptr;
diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h
index 07df33f9c..16655b03f 100644
--- a/src/core/hle/kernel/scheduler.h
+++ b/src/core/hle/kernel/scheduler.h
@@ -11,6 +11,7 @@
 
 #include "common/common_types.h"
 #include "common/multi_level_queue.h"
+#include "common/spin_lock.h"
 #include "core/hardware_properties.h"
 #include "core/hle/kernel/thread.h"
 
@@ -41,41 +42,17 @@ public:
         return thread_list;
     }
 
-    /**
-     * Add a thread to the suggested queue of a cpu core. Suggested threads may be
-     * picked if no thread is scheduled to run on the core.
-     */
-    void Suggest(u32 priority, std::size_t core, Thread* thread);
-
-    /**
-     * Remove a thread to the suggested queue of a cpu core. Suggested threads may be
-     * picked if no thread is scheduled to run on the core.
-     */
-    void Unsuggest(u32 priority, std::size_t core, Thread* thread);
-
-    /**
-     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
-     * back the queue in its priority level.
-     */
-    void Schedule(u32 priority, std::size_t core, Thread* thread);
+    /// Notify the scheduler a thread's status has changed.
+    void AdjustSchedulingOnStatus(Thread* thread, u32 old_flags);
 
-    /**
-     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
-     * front the queue in its priority level.
-     */
-    void SchedulePrepend(u32 priority, std::size_t core, Thread* thread);
+    /// Notify the scheduler a thread's priority has changed.
+    void AdjustSchedulingOnPriority(Thread* thread, u32 old_priority);
 
-    /// Reschedule an already scheduled thread based on a new priority
-    void Reschedule(u32 priority, std::size_t core, Thread* thread);
-
-    /// Unschedules a thread.
-    void Unschedule(u32 priority, std::size_t core, Thread* thread);
-
-    /// Selects a core and forces it to unload its current thread's context
-    void UnloadThread(std::size_t core);
+    /// Notify the scheduler a thread's core and/or affinity mask has changed.
+    void AdjustSchedulingOnAffinity(Thread* thread, u64 old_affinity_mask, s32 old_core);
 
     /**
-     * Takes care of selecting the new scheduled thread in three steps:
+     * Takes care of selecting the new scheduled threads in three steps:
      *
      * 1. First a thread is selected from the top of the priority queue. If no thread
      *    is obtained then we move to step two, else we are done.
@@ -85,8 +62,10 @@ public:
      *
      * 3. Third is no suggested thread is found, we do a second pass and pick a running
      *    thread in another core and swap it with its current thread.
+     *
+     * returns the cores needing scheduling.
      */
-    void SelectThread(std::size_t core);
+    u32 SelectThreads();
 
     bool HaveReadyThreads(std::size_t core_id) const {
         return !scheduled_queue[core_id].empty();
@@ -149,6 +128,39 @@ private:
     /// Unlocks the scheduler, reselects threads, interrupts cores for rescheduling
     /// and reschedules current core if needed.
     void Unlock();
+
+    void EnableInterruptAndSchedule(u32 cores_pending_reschedule, Core::EmuThreadHandle global_thread);
+
+    /**
+     * Add a thread to the suggested queue of a cpu core. Suggested threads may be
+     * picked if no thread is scheduled to run on the core.
+     */
+    void Suggest(u32 priority, std::size_t core, Thread* thread);
+
+    /**
+     * Remove a thread from the suggested queue of a cpu core. Suggested threads may be
+     * picked if no thread is scheduled to run on the core.
+     */
+    void Unsuggest(u32 priority, std::size_t core, Thread* thread);
+
+    /**
+     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
+     * back of the queue in its priority level.
+     */
+    void Schedule(u32 priority, std::size_t core, Thread* thread);
+
+    /**
+     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
+     * front of the queue in its priority level.
+     */
+    void SchedulePrepend(u32 priority, std::size_t core, Thread* thread);
+
+    /// Reschedule an already scheduled thread based on a new priority
+    void Reschedule(u32 priority, std::size_t core, Thread* thread);
+
+    /// Unschedules a thread.
+    void Unschedule(u32 priority, std::size_t core, Thread* thread);
+
     /**
      * Transfers a thread into an specific core. If the destination_core is -1
      * it will be unscheduled from its source code and added into its suggested
@@ -174,6 +186,8 @@ private:
     std::atomic<s64> scope_lock{};
     Core::EmuThreadHandle current_owner{Core::EmuThreadHandle::InvalidHandle()};
 
+    Common::SpinLock global_list_guard{};
+
     /// Lists all thread ids that aren't deleted/etc.
     std::vector<std::shared_ptr<Thread>> thread_list;
     KernelCore& kernel;
@@ -190,12 +204,6 @@ public:
     /// Reschedules to the next available thread (call after current thread is suspended)
     void TryDoContextSwitch();
 
-    /// Unloads currently running thread
-    void UnloadThread();
-
-    /// Select the threads in top of the scheduling multilist.
-    void SelectThreads();
-
     /// Gets the current running thread
     Thread* GetCurrentThread() const;
 
@@ -209,15 +217,22 @@ public:
         return is_context_switch_pending;
     }
 
+    void Initialize();
+
     /// Shutdowns the scheduler.
     void Shutdown();
 
+    void OnThreadStart();
+
 private:
     friend class GlobalScheduler;
 
     /// Switches the CPU's active thread context to that of the specified thread
     void SwitchContext();
 
+    /// When a thread wakes up, it must run this through its new scheduler
+    void SwitchContextStep2();
+
     /**
      * Called on every context switch to update the internal timestamp
      * This also updates the running time ticks for the given thread and
@@ -233,12 +248,15 @@ private:
 
     std::shared_ptr<Thread> current_thread = nullptr;
     std::shared_ptr<Thread> selected_thread = nullptr;
+    std::shared_ptr<Thread> idle_thread = nullptr;
 
     Core::System& system;
     u64 last_context_switch_time = 0;
     u64 idle_selection_count = 0;
     const std::size_t core_id;
 
+    Common::SpinLock guard{};
+
     bool is_context_switch_pending = false;
 };
 
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index 4ae4529f5..d7f0dcabd 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -863,9 +863,9 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
         if (same_thread && info_sub_id == 0xFFFFFFFFFFFFFFFF) {
             const u64 thread_ticks = current_thread->GetTotalCPUTimeTicks();
 
-            out_ticks = thread_ticks + (core_timing.GetTicks() - prev_ctx_ticks);
+            out_ticks = thread_ticks + (core_timing.GetCPUTicks() - prev_ctx_ticks);
         } else if (same_thread && info_sub_id == system.CurrentCoreIndex()) {
-            out_ticks = core_timing.GetTicks() - prev_ctx_ticks;
+            out_ticks = core_timing.GetCPUTicks() - prev_ctx_ticks;
         }
 
         *result = out_ticks;
@@ -1428,9 +1428,10 @@ static ResultCode CreateThread(Core::System& system, Handle* out_handle, VAddr e
 
     ASSERT(kernel.CurrentProcess()->GetResourceLimit()->Reserve(ResourceType::Threads, 1));
 
+    ThreadType type = THREADTYPE_USER;
     CASCADE_RESULT(std::shared_ptr<Thread> thread,
-                   Thread::Create(kernel, "", entry_point, priority, arg, processor_id, stack_top,
-                                  *current_process));
+                   Thread::Create(system, type, "", entry_point, priority, arg, processor_id, stack_top,
+                                  current_process));
 
     const auto new_thread_handle = current_process->GetHandleTable().Create(thread);
     if (new_thread_handle.Failed()) {
@@ -1513,13 +1514,6 @@ static void SleepThread(Core::System& system, s64 nanoseconds) {
     } else {
         current_thread->Sleep(nanoseconds);
     }
-
-    if (is_redundant) {
-        // If it's redundant, the core is pretty much idle. Some games keep idling
-        // a core while it's doing nothing, we advance timing to avoid costly continuous
-        // calls.
-        system.CoreTiming().AddTicks(2000);
-    }
     system.PrepareReschedule(current_thread->GetProcessorID());
 }
 
@@ -1725,10 +1719,7 @@ static u64 GetSystemTick(Core::System& system) {
     auto& core_timing = system.CoreTiming();
 
     // Returns the value of cntpct_el0 (https://switchbrew.org/wiki/SVC#svcGetSystemTick)
-    const u64 result{Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks())};
-
-    // Advance time to defeat dumb games that busy-wait for the frame to end.
-    core_timing.AddTicks(400);
+    const u64 result{core_timing.GetClockTicks()};
 
     return result;
 }
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index db7f379ac..8cb3593db 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -9,12 +9,14 @@
 
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "common/fiber.h"
 #include "common/logging/log.h"
 #include "common/thread_queue_list.h"
 #include "core/arm/arm_interface.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
+#include "core/cpu_manager.h"
 #include "core/hardware_properties.h"
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
@@ -23,6 +25,7 @@
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/result.h"
 #include "core/memory.h"
 
@@ -44,6 +47,7 @@ Thread::Thread(KernelCore& kernel) : SynchronizationObject{kernel} {}
 Thread::~Thread() = default;
 
 void Thread::Stop() {
+    SchedulerLock lock(kernel);
     // Cancel any outstanding wakeup events for this thread
     Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(),
                                                              global_handle);
@@ -71,9 +75,8 @@ void Thread::WakeAfterDelay(s64 nanoseconds) {
 
     // This function might be called from any thread so we have to be cautious and use the
     // thread-safe version of ScheduleEvent.
-    const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds});
     Core::System::GetInstance().CoreTiming().ScheduleEvent(
-        cycles, kernel.ThreadWakeupCallbackEventType(), global_handle);
+        nanoseconds, kernel.ThreadWakeupCallbackEventType(), global_handle);
 }
 
 void Thread::CancelWakeupTimer() {
@@ -125,6 +128,16 @@ void Thread::ResumeFromWait() {
     SetStatus(ThreadStatus::Ready);
 }
 
+void Thread::OnWakeUp() {
+    SchedulerLock lock(kernel);
+    if (activity == ThreadActivity::Paused) {
+        SetStatus(ThreadStatus::Paused);
+        return;
+    }
+
+    SetStatus(ThreadStatus::Ready);
+}
+
 void Thread::CancelWait() {
     if (GetSchedulingStatus() != ThreadSchedStatus::Paused) {
         is_sync_cancelled = true;
@@ -153,12 +166,29 @@ static void ResetThreadContext64(Core::ARM_Interface::ThreadContext64& context,
     context.fpcr = 0;
 }
 
-ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::string name,
-                                                  VAddr entry_point, u32 priority, u64 arg,
-                                                  s32 processor_id, VAddr stack_top,
-                                                  Process& owner_process) {
+std::shared_ptr<Common::Fiber> Thread::GetHostContext() const {
+    return host_context;
+}
+
+ResultVal<std::shared_ptr<Thread>> Thread::Create(Core::System& system, ThreadType type_flags,
+                                                  std::string name, VAddr entry_point, u32 priority,
+                                                  u64 arg, s32 processor_id, VAddr stack_top,
+                                                  Process* owner_process) {
+    std::function<void(void*)> init_func = system.GetCpuManager().GetGuestThreadStartFunc();
+    void* init_func_parameter = system.GetCpuManager().GetStartFuncParamater();
+    return Create(system, type_flags, std::move(name), entry_point, priority, arg, processor_id,
+                  stack_top, owner_process, std::move(init_func), init_func_parameter);
+}
+
+ResultVal<std::shared_ptr<Thread>> Thread::Create(Core::System& system, ThreadType type_flags,
+                                                  std::string name, VAddr entry_point, u32 priority,
+                                                  u64 arg, s32 processor_id, VAddr stack_top,
+                                                  Process* owner_process,
+                                                  std::function<void(void*)>&& thread_start_func,
+                                                  void* thread_start_parameter) {
+    auto& kernel = system.Kernel();
     // Check if priority is in ranged. Lowest priority -> highest priority id.
-    if (priority > THREADPRIO_LOWEST) {
+    if (priority > THREADPRIO_LOWEST && ((type_flags & THREADTYPE_IDLE) == 0)) {
         LOG_ERROR(Kernel_SVC, "Invalid thread priority: {}", priority);
         return ERR_INVALID_THREAD_PRIORITY;
     }
@@ -168,11 +198,12 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin
         return ERR_INVALID_PROCESSOR_ID;
     }
 
-    auto& system = Core::System::GetInstance();
-    if (!system.Memory().IsValidVirtualAddress(owner_process, entry_point)) {
-        LOG_ERROR(Kernel_SVC, "(name={}): invalid entry {:016X}", name, entry_point);
-        // TODO (bunnei): Find the correct error code to use here
-        return RESULT_UNKNOWN;
+    if (owner_process) {
+        if (!system.Memory().IsValidVirtualAddress(*owner_process, entry_point)) {
+            LOG_ERROR(Kernel_SVC, "(name={}): invalid entry {:016X}", name, entry_point);
+            // TODO (bunnei): Find the correct error code to use here
+            return RESULT_UNKNOWN;
+        }
     }
 
     std::shared_ptr<Thread> thread = std::make_shared<Thread>(kernel);
@@ -183,7 +214,7 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin
     thread->stack_top = stack_top;
     thread->tpidr_el0 = 0;
     thread->nominal_priority = thread->current_priority = priority;
-    thread->last_running_ticks = system.CoreTiming().GetTicks();
+    thread->last_running_ticks = 0;
     thread->processor_id = processor_id;
     thread->ideal_core = processor_id;
     thread->affinity_mask = 1ULL << processor_id;
@@ -193,16 +224,27 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin
     thread->wait_handle = 0;
     thread->name = std::move(name);
     thread->global_handle = kernel.GlobalHandleTable().Create(thread).Unwrap();
-    thread->owner_process = &owner_process;
-    auto& scheduler = kernel.GlobalScheduler();
-    scheduler.AddThread(thread);
-    thread->tls_address = thread->owner_process->CreateTLSRegion();
-
-    thread->owner_process->RegisterThread(thread.get());
-
-    ResetThreadContext32(thread->context_32, static_cast<u32>(stack_top),
-                         static_cast<u32>(entry_point), static_cast<u32>(arg));
-    ResetThreadContext64(thread->context_64, stack_top, entry_point, arg);
+    thread->owner_process = owner_process;
+    thread->type = type_flags;
+    if ((type_flags & THREADTYPE_IDLE) == 0) {
+        auto& scheduler = kernel.GlobalScheduler();
+        scheduler.AddThread(thread);
+    }
+    if (owner_process) {
+        thread->tls_address = thread->owner_process->CreateTLSRegion();
+        thread->owner_process->RegisterThread(thread.get());
+    } else {
+        thread->tls_address = 0;
+    }
+    // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
+    // to initialize the context
+    if ((type_flags & THREADTYPE_HLE) == 0) {
+        ResetThreadContext32(thread->context_32, static_cast<u32>(stack_top),
+                             static_cast<u32>(entry_point), static_cast<u32>(arg));
+        ResetThreadContext64(thread->context_64, stack_top, entry_point, arg);
+    }
+    thread->host_context =
+        std::make_shared<Common::Fiber>(std::move(thread_start_func), thread_start_parameter);
 
     return MakeResult<std::shared_ptr<Thread>>(std::move(thread));
 }
@@ -258,7 +300,7 @@ void Thread::SetStatus(ThreadStatus new_status) {
     }
 
     if (status == ThreadStatus::Running) {
-        last_running_ticks = Core::System::GetInstance().CoreTiming().GetTicks();
+        last_running_ticks = Core::System::GetInstance().CoreTiming().GetCPUTicks();
     }
 
     status = new_status;
@@ -375,38 +417,55 @@ void Thread::SetActivity(ThreadActivity value) {
 }
 
 void Thread::Sleep(s64 nanoseconds) {
-    // Sleep current thread and check for next thread to schedule
-    SetStatus(ThreadStatus::WaitSleep);
+    Handle event_handle{};
+    {
+        SchedulerLockAndSleep lock(kernel, event_handle, this, nanoseconds);
+        SetStatus(ThreadStatus::WaitSleep);
+    }
 
-    // Create an event to wake the thread up after the specified nanosecond delay has passed
-    WakeAfterDelay(nanoseconds);
+    if (event_handle != InvalidHandle) {
+        auto& time_manager = kernel.TimeManager();
+        time_manager.UnscheduleTimeEvent(event_handle);
+    }
 }
 
 bool Thread::YieldSimple() {
-    auto& scheduler = kernel.GlobalScheduler();
-    return scheduler.YieldThread(this);
+    bool result{};
+    {
+        SchedulerLock lock(kernel);
+        result = kernel.GlobalScheduler().YieldThread(this);
+    }
+    return result;
 }
 
 bool Thread::YieldAndBalanceLoad() {
-    auto& scheduler = kernel.GlobalScheduler();
-    return scheduler.YieldThreadAndBalanceLoad(this);
+    bool result{};
+    {
+        SchedulerLock lock(kernel);
+        result = kernel.GlobalScheduler().YieldThreadAndBalanceLoad(this);
+    }
+    return result;
 }
 
 bool Thread::YieldAndWaitForLoadBalancing() {
-    auto& scheduler = kernel.GlobalScheduler();
-    return scheduler.YieldThreadAndWaitForLoadBalancing(this);
+    bool result{};
+    {
+        SchedulerLock lock(kernel);
+        result = kernel.GlobalScheduler().YieldThreadAndWaitForLoadBalancing(this);
+    }
+    return result;
 }
 
 void Thread::SetSchedulingStatus(ThreadSchedStatus new_status) {
     const u32 old_flags = scheduling_state;
     scheduling_state = (scheduling_state & static_cast<u32>(ThreadSchedMasks::HighMask)) |
                        static_cast<u32>(new_status);
-    AdjustSchedulingOnStatus(old_flags);
+    kernel.GlobalScheduler().AdjustSchedulingOnStatus(this, old_flags);
 }
 
 void Thread::SetCurrentPriority(u32 new_priority) {
     const u32 old_priority = std::exchange(current_priority, new_priority);
-    AdjustSchedulingOnPriority(old_priority);
+    kernel.GlobalScheduler().AdjustSchedulingOnPriority(this, old_priority);
 }
 
 ResultCode Thread::SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask) {
@@ -443,111 +502,12 @@ ResultCode Thread::SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask) {
                     processor_id = ideal_core;
                 }
             }
-            AdjustSchedulingOnAffinity(old_affinity_mask, old_core);
+            kernel.GlobalScheduler().AdjustSchedulingOnAffinity(this, old_affinity_mask, old_core);
         }
     }
     return RESULT_SUCCESS;
 }
 
-void Thread::AdjustSchedulingOnStatus(u32 old_flags) {
-    if (old_flags == scheduling_state) {
-        return;
-    }
-
-    auto& scheduler = kernel.GlobalScheduler();
-    if (static_cast<ThreadSchedStatus>(old_flags & static_cast<u32>(ThreadSchedMasks::LowMask)) ==
-        ThreadSchedStatus::Runnable) {
-        // In this case the thread was running, now it's pausing/exitting
-        if (processor_id >= 0) {
-            scheduler.Unschedule(current_priority, static_cast<u32>(processor_id), this);
-        }
-
-        for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-            if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) {
-                scheduler.Unsuggest(current_priority, core, this);
-            }
-        }
-    } else if (GetSchedulingStatus() == ThreadSchedStatus::Runnable) {
-        // The thread is now set to running from being stopped
-        if (processor_id >= 0) {
-            scheduler.Schedule(current_priority, static_cast<u32>(processor_id), this);
-        }
-
-        for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-            if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) {
-                scheduler.Suggest(current_priority, core, this);
-            }
-        }
-    }
-
-    scheduler.SetReselectionPending();
-}
-
-void Thread::AdjustSchedulingOnPriority(u32 old_priority) {
-    if (GetSchedulingStatus() != ThreadSchedStatus::Runnable) {
-        return;
-    }
-    auto& scheduler = kernel.GlobalScheduler();
-    if (processor_id >= 0) {
-        scheduler.Unschedule(old_priority, static_cast<u32>(processor_id), this);
-    }
-
-    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-        if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) {
-            scheduler.Unsuggest(old_priority, core, this);
-        }
-    }
-
-    // Add thread to the new priority queues.
-    Thread* current_thread = GetCurrentThread();
-
-    if (processor_id >= 0) {
-        if (current_thread == this) {
-            scheduler.SchedulePrepend(current_priority, static_cast<u32>(processor_id), this);
-        } else {
-            scheduler.Schedule(current_priority, static_cast<u32>(processor_id), this);
-        }
-    }
-
-    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-        if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) {
-            scheduler.Suggest(current_priority, core, this);
-        }
-    }
-
-    scheduler.SetReselectionPending();
-}
-
-void Thread::AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core) {
-    auto& scheduler = kernel.GlobalScheduler();
-    if (GetSchedulingStatus() != ThreadSchedStatus::Runnable ||
-        current_priority >= THREADPRIO_COUNT) {
-        return;
-    }
-
-    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-        if (((old_affinity_mask >> core) & 1) != 0) {
-            if (core == static_cast<u32>(old_core)) {
-                scheduler.Unschedule(current_priority, core, this);
-            } else {
-                scheduler.Unsuggest(current_priority, core, this);
-            }
-        }
-    }
-
-    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-        if (((affinity_mask >> core) & 1) != 0) {
-            if (core == static_cast<u32>(processor_id)) {
-                scheduler.Schedule(current_priority, core, this);
-            } else {
-                scheduler.Suggest(current_priority, core, this);
-            }
-        }
-    }
-
-    scheduler.SetReselectionPending();
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /**
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 23fdef8a4..33d340b47 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -9,23 +9,42 @@
 #include <vector>
 
 #include "common/common_types.h"
+#include "common/spin_lock.h"
 #include "core/arm/arm_interface.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/synchronization_object.h"
 #include "core/hle/result.h"
 
+namespace Common {
+class Fiber;
+}
+
+namespace Core {
+class System;
+}
+
 namespace Kernel {
 
+class GlobalScheduler;
 class KernelCore;
 class Process;
 class Scheduler;
 
 enum ThreadPriority : u32 {
-    THREADPRIO_HIGHEST = 0,       ///< Highest thread priority
-    THREADPRIO_USERLAND_MAX = 24, ///< Highest thread priority for userland apps
-    THREADPRIO_DEFAULT = 44,      ///< Default thread priority for userland apps
-    THREADPRIO_LOWEST = 63,       ///< Lowest thread priority
-    THREADPRIO_COUNT = 64,        ///< Total number of possible thread priorities.
+    THREADPRIO_HIGHEST = 0,             ///< Highest thread priority
+    THREADPRIO_MAX_CORE_MIGRATION = 2,  ///< Highest priority for a core migration
+    THREADPRIO_USERLAND_MAX = 24,       ///< Highest thread priority for userland apps
+    THREADPRIO_DEFAULT = 44,            ///< Default thread priority for userland apps
+    THREADPRIO_LOWEST = 63,             ///< Lowest thread priority
+    THREADPRIO_COUNT = 64,              ///< Total number of possible thread priorities.
+};
+
+enum ThreadType : u32 {
+    THREADTYPE_USER = 0x1,
+    THREADTYPE_KERNEL = 0x2,
+    THREADTYPE_HLE = 0x4,
+    THREADTYPE_IDLE = 0x8,
+    THREADTYPE_SUSPEND = 0x10,
 };
 
 enum ThreadProcessorId : s32 {
@@ -111,22 +130,43 @@ public:
         std::function<bool(ThreadWakeupReason reason, std::shared_ptr<Thread> thread,
                            std::shared_ptr<SynchronizationObject> object, std::size_t index)>;
 
+   /**
+    * Creates and returns a new thread. The new thread is immediately scheduled
+    * @param system The instance of the whole system
+    * @param name The friendly name desired for the thread
+    * @param entry_point The address at which the thread should start execution
+    * @param priority The thread's priority
+    * @param arg User data to pass to the thread
+    * @param processor_id The ID(s) of the processors on which the thread is desired to be run
+    * @param stack_top The address of the thread's stack top
+    * @param owner_process The parent process for the thread, if null, it's a kernel thread
+    * @return A shared pointer to the newly created thread
+    */
+   static ResultVal<std::shared_ptr<Thread>> Create(Core::System& system, ThreadType type_flags, std::string name,
+                                                    VAddr entry_point, u32 priority, u64 arg,
+                                                    s32 processor_id, VAddr stack_top,
+                                                    Process* owner_process);
+
     /**
      * Creates and returns a new thread. The new thread is immediately scheduled
-     * @param kernel The kernel instance this thread will be created under.
+     * @param system The instance of the whole system
      * @param name The friendly name desired for the thread
      * @param entry_point The address at which the thread should start execution
      * @param priority The thread's priority
      * @param arg User data to pass to the thread
      * @param processor_id The ID(s) of the processors on which the thread is desired to be run
      * @param stack_top The address of the thread's stack top
-     * @param owner_process The parent process for the thread
+     * @param owner_process The parent process for the thread, if null, it's a kernel thread
+     * @param thread_start_func The function where the host context will start.
+     * @param thread_start_parameter The parameter which will be passed to host context on init
      * @return A shared pointer to the newly created thread
      */
-    static ResultVal<std::shared_ptr<Thread>> Create(KernelCore& kernel, std::string name,
+    static ResultVal<std::shared_ptr<Thread>> Create(Core::System& system, ThreadType type_flags, std::string name,
                                                      VAddr entry_point, u32 priority, u64 arg,
                                                      s32 processor_id, VAddr stack_top,
-                                                     Process& owner_process);
+                                                     Process* owner_process,
+                                                     std::function<void(void*)>&& thread_start_func,
+                                                     void* thread_start_parameter);
 
     std::string GetName() const override {
         return name;
@@ -192,7 +232,9 @@ public:
     }
 
     /// Resumes a thread from waiting
-    void ResumeFromWait();
+    void /* deprecated */ ResumeFromWait();
+
+    void OnWakeUp();
 
     /// Cancels a waiting operation that this thread may or may not be within.
     ///
@@ -206,10 +248,10 @@ public:
      * Schedules an event to wake up the specified thread after the specified delay
      * @param nanoseconds The time this thread will be allowed to sleep for
      */
-    void WakeAfterDelay(s64 nanoseconds);
+    void /* deprecated */ WakeAfterDelay(s64 nanoseconds);
 
     /// Cancel any outstanding wakeup events for this thread
-    void CancelWakeupTimer();
+    void /* deprecated */ CancelWakeupTimer();
 
     /**
      * Sets the result after the thread awakens (from svcWaitSynchronization)
@@ -290,6 +332,12 @@ public:
         return context_64;
     }
 
+    bool IsHLEThread() const {
+        return (type & THREADTYPE_HLE) != 0;
+    }
+
+    std::shared_ptr<Common::Fiber> GetHostContext() const;
+
     ThreadStatus GetStatus() const {
         return status;
     }
@@ -467,16 +515,19 @@ public:
     }
 
 private:
+    friend class GlobalScheduler;
+    friend class Scheduler;
+
     void SetSchedulingStatus(ThreadSchedStatus new_status);
     void SetCurrentPriority(u32 new_priority);
     ResultCode SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask);
 
-    void AdjustSchedulingOnStatus(u32 old_flags);
-    void AdjustSchedulingOnPriority(u32 old_priority);
     void AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core);
 
     ThreadContext32 context_32{};
     ThreadContext64 context_64{};
+    Common::SpinLock context_guard{};
+    std::shared_ptr<Common::Fiber> host_context{};
 
     u64 thread_id = 0;
 
@@ -485,6 +536,8 @@ private:
     VAddr entry_point = 0;
     VAddr stack_top = 0;
 
+    ThreadType type;
+
     /// Nominal thread priority, as set by the emulated application.
     /// The nominal priority is the thread priority without priority
     /// inheritance taken into account.
diff --git a/src/core/hle/kernel/time_manager.cpp b/src/core/hle/kernel/time_manager.cpp
index 21b290468..0b8f0d993 100644
--- a/src/core/hle/kernel/time_manager.cpp
+++ b/src/core/hle/kernel/time_manager.cpp
@@ -19,7 +19,7 @@ TimeManager::TimeManager(Core::System& system) : system{system} {
             Handle proper_handle = static_cast<Handle>(thread_handle);
             std::shared_ptr<Thread> thread =
                 this->system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle);
-            thread->ResumeFromWait();
+            thread->OnWakeUp();
         });
 }
 
diff --git a/src/core/hle/service/hid/controllers/debug_pad.cpp b/src/core/hle/service/hid/controllers/debug_pad.cpp
index 1f2131ec8..cb35919e9 100644
--- a/src/core/hle/service/hid/controllers/debug_pad.cpp
+++ b/src/core/hle/service/hid/controllers/debug_pad.cpp
@@ -23,7 +23,7 @@ void Controller_DebugPad::OnRelease() {}
 
 void Controller_DebugPad::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                    std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/controllers/gesture.cpp b/src/core/hle/service/hid/controllers/gesture.cpp
index 6e990dd00..b7b7bfeae 100644
--- a/src/core/hle/service/hid/controllers/gesture.cpp
+++ b/src/core/hle/service/hid/controllers/gesture.cpp
@@ -19,7 +19,7 @@ void Controller_Gesture::OnRelease() {}
 
 void Controller_Gesture::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                   std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/controllers/keyboard.cpp b/src/core/hle/service/hid/controllers/keyboard.cpp
index 9a8d354ba..feae89525 100644
--- a/src/core/hle/service/hid/controllers/keyboard.cpp
+++ b/src/core/hle/service/hid/controllers/keyboard.cpp
@@ -21,7 +21,7 @@ void Controller_Keyboard::OnRelease() {}
 
 void Controller_Keyboard::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                    std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/controllers/mouse.cpp b/src/core/hle/service/hid/controllers/mouse.cpp
index 93d88ea50..ac40989c5 100644
--- a/src/core/hle/service/hid/controllers/mouse.cpp
+++ b/src/core/hle/service/hid/controllers/mouse.cpp
@@ -19,7 +19,7 @@ void Controller_Mouse::OnRelease() {}
 
 void Controller_Mouse::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                 std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/controllers/npad.cpp b/src/core/hle/service/hid/controllers/npad.cpp
index 6fbee7efa..ef67ad690 100644
--- a/src/core/hle/service/hid/controllers/npad.cpp
+++ b/src/core/hle/service/hid/controllers/npad.cpp
@@ -328,7 +328,7 @@ void Controller_NPad::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8*
             const auto& last_entry =
                 main_controller->npad[main_controller->common.last_entry_index];
 
-            main_controller->common.timestamp = core_timing.GetTicks();
+            main_controller->common.timestamp = core_timing.GetCPUTicks();
             main_controller->common.last_entry_index =
                 (main_controller->common.last_entry_index + 1) % 17;
 
diff --git a/src/core/hle/service/hid/controllers/stubbed.cpp b/src/core/hle/service/hid/controllers/stubbed.cpp
index 9e527d176..e7483bfa2 100644
--- a/src/core/hle/service/hid/controllers/stubbed.cpp
+++ b/src/core/hle/service/hid/controllers/stubbed.cpp
@@ -23,7 +23,7 @@ void Controller_Stubbed::OnUpdate(const Core::Timing::CoreTiming& core_timing, u
     }
 
     CommonHeader header{};
-    header.timestamp = core_timing.GetTicks();
+    header.timestamp = core_timing.GetCPUTicks();
     header.total_entry_count = 17;
     header.entry_count = 0;
     header.last_entry_index = 0;
diff --git a/src/core/hle/service/hid/controllers/touchscreen.cpp b/src/core/hle/service/hid/controllers/touchscreen.cpp
index 1c6e55566..e326f8f5c 100644
--- a/src/core/hle/service/hid/controllers/touchscreen.cpp
+++ b/src/core/hle/service/hid/controllers/touchscreen.cpp
@@ -22,7 +22,7 @@ void Controller_Touchscreen::OnRelease() {}
 
 void Controller_Touchscreen::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                       std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
@@ -49,7 +49,7 @@ void Controller_Touchscreen::OnUpdate(const Core::Timing::CoreTiming& core_timin
         touch_entry.diameter_x = Settings::values.touchscreen.diameter_x;
         touch_entry.diameter_y = Settings::values.touchscreen.diameter_y;
         touch_entry.rotation_angle = Settings::values.touchscreen.rotation_angle;
-        const u64 tick = core_timing.GetTicks();
+        const u64 tick = core_timing.GetCPUTicks();
         touch_entry.delta_time = tick - last_touch;
         last_touch = tick;
         touch_entry.finger = Settings::values.touchscreen.finger;
diff --git a/src/core/hle/service/hid/controllers/xpad.cpp b/src/core/hle/service/hid/controllers/xpad.cpp
index 27511b27b..2503ef241 100644
--- a/src/core/hle/service/hid/controllers/xpad.cpp
+++ b/src/core/hle/service/hid/controllers/xpad.cpp
@@ -20,7 +20,7 @@ void Controller_XPad::OnRelease() {}
 void Controller_XPad::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                std::size_t size) {
     for (auto& xpad_entry : shared_memory.shared_memory_entries) {
-        xpad_entry.header.timestamp = core_timing.GetTicks();
+        xpad_entry.header.timestamp = core_timing.GetCPUTicks();
         xpad_entry.header.total_entry_count = 17;
 
         if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index 57d5edea7..e9020e0dc 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -39,11 +39,9 @@ namespace Service::HID {
 
 // Updating period for each HID device.
 // TODO(ogniK): Find actual polling rate of hid
-constexpr s64 pad_update_ticks = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 66);
-[[maybe_unused]] constexpr s64 accelerometer_update_ticks =
-    static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 100);
-[[maybe_unused]] constexpr s64 gyroscope_update_ticks =
-    static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 100);
+constexpr s64 pad_update_ticks = static_cast<s64>(1000000000 / 66);
+[[maybe_unused]] constexpr s64 accelerometer_update_ticks = static_cast<s64>(1000000000 / 100);
+[[maybe_unused]] constexpr s64 gyroscope_update_ticks = static_cast<s64>(1000000000 / 100);
 constexpr std::size_t SHARED_MEMORY_SIZE = 0x40000;
 
 IAppletResource::IAppletResource(Core::System& system)
@@ -78,8 +76,8 @@ IAppletResource::IAppletResource(Core::System& system)
 
     // Register update callbacks
     pad_update_event =
-        Core::Timing::CreateEvent("HID::UpdatePadCallback", [this](u64 userdata, s64 cycles_late) {
-            UpdateControllers(userdata, cycles_late);
+        Core::Timing::CreateEvent("HID::UpdatePadCallback", [this](u64 userdata, s64 ns_late) {
+            UpdateControllers(userdata, ns_late);
         });
 
     // TODO(shinyquagsire23): Other update callbacks? (accel, gyro?)
@@ -109,7 +107,7 @@ void IAppletResource::GetSharedMemoryHandle(Kernel::HLERequestContext& ctx) {
     rb.PushCopyObjects(shared_mem);
 }
 
-void IAppletResource::UpdateControllers(u64 userdata, s64 cycles_late) {
+void IAppletResource::UpdateControllers(u64 userdata, s64 ns_late) {
     auto& core_timing = system.CoreTiming();
 
     const bool should_reload = Settings::values.is_device_reload_pending.exchange(false);
@@ -120,7 +118,7 @@ void IAppletResource::UpdateControllers(u64 userdata, s64 cycles_late) {
         controller->OnUpdate(core_timing, shared_mem->GetPointer(), SHARED_MEMORY_SIZE);
     }
 
-    core_timing.ScheduleEvent(pad_update_ticks - cycles_late, pad_update_event);
+    core_timing.ScheduleEvent(pad_update_ticks - ns_late, pad_update_event);
 }
 
 class IActiveVibrationDeviceList final : public ServiceFramework<IActiveVibrationDeviceList> {
diff --git a/src/core/hle/service/hid/irs.cpp b/src/core/hle/service/hid/irs.cpp
index 36ed6f7da..e82fd031b 100644
--- a/src/core/hle/service/hid/irs.cpp
+++ b/src/core/hle/service/hid/irs.cpp
@@ -98,7 +98,7 @@ void IRS::GetImageTransferProcessorState(Kernel::HLERequestContext& ctx) {
 
     IPC::ResponseBuilder rb{ctx, 5};
     rb.Push(RESULT_SUCCESS);
-    rb.PushRaw<u64>(system.CoreTiming().GetTicks());
+    rb.PushRaw<u64>(system.CoreTiming().GetCPUTicks());
     rb.PushRaw<u32>(0);
 }
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
index 0d913334e..fba89e7a6 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
@@ -200,8 +200,7 @@ u32 nvhost_ctrl_gpu::GetGpuTime(const std::vector<u8>& input, std::vector<u8>& o
 
     IoctlGetGpuTime params{};
     std::memcpy(&params, input.data(), input.size());
-    const auto ns = Core::Timing::CyclesToNs(system.CoreTiming().GetTicks());
-    params.gpu_time = static_cast<u64_le>(ns.count());
+    params.gpu_time = static_cast<u64_le>(system.CoreTiming().GetGlobalTimeNs().count());
     std::memcpy(output.data(), &params, output.size());
     return 0;
 }
diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp
index 437bc5dee..aaf28995d 100644
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -27,8 +27,8 @@
 
 namespace Service::NVFlinger {
 
-constexpr s64 frame_ticks = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 60);
-constexpr s64 frame_ticks_30fps = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 30);
+constexpr s64 frame_ticks = static_cast<s64>(1000000000 / 60);
+constexpr s64 frame_ticks_30fps = static_cast<s64>(1000000000 / 30);
 
 NVFlinger::NVFlinger(Core::System& system) : system(system) {
     displays.emplace_back(0, "Default", system);
@@ -39,11 +39,10 @@ NVFlinger::NVFlinger(Core::System& system) : system(system) {
 
     // Schedule the screen composition events
     composition_event =
-        Core::Timing::CreateEvent("ScreenComposition", [this](u64 userdata, s64 cycles_late) {
+        Core::Timing::CreateEvent("ScreenComposition", [this](u64 userdata, s64 ns_late) {
             Compose();
-            const auto ticks =
-                Settings::values.force_30fps_mode ? frame_ticks_30fps : GetNextTicks();
-            this->system.CoreTiming().ScheduleEvent(std::max<s64>(0LL, ticks - cycles_late),
+            const auto ticks = GetNextTicks();
+            this->system.CoreTiming().ScheduleEvent(std::max<s64>(0LL, ticks - ns_late),
                                                     composition_event);
         });
 
@@ -223,7 +222,7 @@ void NVFlinger::Compose() {
 
 s64 NVFlinger::GetNextTicks() const {
     constexpr s64 max_hertz = 120LL;
-    return (Core::Hardware::BASE_CLOCK_RATE * (1LL << swap_interval)) / max_hertz;
+    return (1000000000 * (1LL << swap_interval)) / max_hertz;
 }
 
 } // namespace Service::NVFlinger
diff --git a/src/core/hle/service/time/standard_steady_clock_core.cpp b/src/core/hle/service/time/standard_steady_clock_core.cpp
index 1575f0b49..59a272f4a 100644
--- a/src/core/hle/service/time/standard_steady_clock_core.cpp
+++ b/src/core/hle/service/time/standard_steady_clock_core.cpp
@@ -11,9 +11,8 @@
 namespace Service::Time::Clock {
 
 TimeSpanType StandardSteadyClockCore::GetCurrentRawTimePoint(Core::System& system) {
-    const TimeSpanType ticks_time_span{TimeSpanType::FromTicks(
-        Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()),
-        Core::Hardware::CNTFREQ)};
+    const TimeSpanType ticks_time_span{
+        TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)};
     TimeSpanType raw_time_point{setup_value.nanoseconds + ticks_time_span.nanoseconds};
 
     if (raw_time_point.nanoseconds < cached_raw_time_point.nanoseconds) {
diff --git a/src/core/hle/service/time/tick_based_steady_clock_core.cpp b/src/core/hle/service/time/tick_based_steady_clock_core.cpp
index 44d5bc651..8baaa2a6a 100644
--- a/src/core/hle/service/time/tick_based_steady_clock_core.cpp
+++ b/src/core/hle/service/time/tick_based_steady_clock_core.cpp
@@ -11,9 +11,8 @@
 namespace Service::Time::Clock {
 
 SteadyClockTimePoint TickBasedSteadyClockCore::GetTimePoint(Core::System& system) {
-    const TimeSpanType ticks_time_span{TimeSpanType::FromTicks(
-        Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()),
-        Core::Hardware::CNTFREQ)};
+    const TimeSpanType ticks_time_span{
+        TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)};
 
     return {ticks_time_span.ToSeconds(), GetClockSourceId()};
 }
diff --git a/src/core/hle/service/time/time.cpp b/src/core/hle/service/time/time.cpp
index 67f1bbcf3..4cf58a61a 100644
--- a/src/core/hle/service/time/time.cpp
+++ b/src/core/hle/service/time/time.cpp
@@ -234,9 +234,8 @@ void Module::Interface::CalculateMonotonicSystemClockBaseTimePoint(Kernel::HLERe
     const auto current_time_point{steady_clock_core.GetCurrentTimePoint(system)};
 
     if (current_time_point.clock_source_id == context.steady_time_point.clock_source_id) {
-        const auto ticks{Clock::TimeSpanType::FromTicks(
-            Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()),
-            Core::Hardware::CNTFREQ)};
+        const auto ticks{Clock::TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(),
+                                                        Core::Hardware::CNTFREQ)};
         const s64 base_time_point{context.offset + current_time_point.time_point -
                                   ticks.ToSeconds()};
         IPC::ResponseBuilder rb{ctx, (sizeof(s64) / 4) + 2};
diff --git a/src/core/hle/service/time/time_sharedmemory.cpp b/src/core/hle/service/time/time_sharedmemory.cpp
index 999ec1e51..e0ae9f874 100644
--- a/src/core/hle/service/time/time_sharedmemory.cpp
+++ b/src/core/hle/service/time/time_sharedmemory.cpp
@@ -30,8 +30,7 @@ void SharedMemory::SetupStandardSteadyClock(Core::System& system,
                                             const Common::UUID& clock_source_id,
                                             Clock::TimeSpanType current_time_point) {
     const Clock::TimeSpanType ticks_time_span{Clock::TimeSpanType::FromTicks(
-        Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()),
-        Core::Hardware::CNTFREQ)};
+        system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)};
     const Clock::SteadyClockContext context{
         static_cast<u64>(current_time_point.nanoseconds - ticks_time_span.nanoseconds),
         clock_source_id};
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 9d87045a0..66634596d 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -29,15 +29,12 @@ namespace Core::Memory {
 struct Memory::Impl {
     explicit Impl(Core::System& system_) : system{system_} {}
 
-    void SetCurrentPageTable(Kernel::Process& process) {
+    void SetCurrentPageTable(Kernel::Process& process, u32 core_id) {
         current_page_table = &process.PageTable().PageTableImpl();
 
         const std::size_t address_space_width = process.PageTable().GetAddressSpaceWidth();
 
-        system.ArmInterface(0).PageTableChanged(*current_page_table, address_space_width);
-        system.ArmInterface(1).PageTableChanged(*current_page_table, address_space_width);
-        system.ArmInterface(2).PageTableChanged(*current_page_table, address_space_width);
-        system.ArmInterface(3).PageTableChanged(*current_page_table, address_space_width);
+        system.ArmInterface(core_id).PageTableChanged(*current_page_table, address_space_width);
     }
 
     void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, PAddr target) {
@@ -689,8 +686,8 @@ struct Memory::Impl {
 Memory::Memory(Core::System& system) : impl{std::make_unique<Impl>(system)} {}
 Memory::~Memory() = default;
 
-void Memory::SetCurrentPageTable(Kernel::Process& process) {
-    impl->SetCurrentPageTable(process);
+void Memory::SetCurrentPageTable(Kernel::Process& process, u32 core_id) {
+    impl->SetCurrentPageTable(process, core_id);
 }
 
 void Memory::MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, PAddr target) {
diff --git a/src/core/memory.h b/src/core/memory.h
index 9292f3b0a..93f0c1d6c 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -64,7 +64,7 @@ public:
      *
      * @param process The process to use the page table of.
      */
-    void SetCurrentPageTable(Kernel::Process& process);
+    void SetCurrentPageTable(Kernel::Process& process, u32 core_id);
 
     /**
      * Maps an allocated buffer onto a region of the emulated process address space.
diff --git a/src/core/memory/cheat_engine.cpp b/src/core/memory/cheat_engine.cpp
index b139e8465..53d27859b 100644
--- a/src/core/memory/cheat_engine.cpp
+++ b/src/core/memory/cheat_engine.cpp
@@ -20,7 +20,7 @@
 
 namespace Core::Memory {
 
-constexpr s64 CHEAT_ENGINE_TICKS = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 12);
+constexpr s64 CHEAT_ENGINE_TICKS = static_cast<s64>(1000000000 / 12);
 constexpr u32 KEYPAD_BITMASK = 0x3FFFFFF;
 
 StandardVmCallbacks::StandardVmCallbacks(Core::System& system, const CheatProcessMetadata& metadata)
@@ -190,7 +190,7 @@ CheatEngine::~CheatEngine() {
 void CheatEngine::Initialize() {
     event = Core::Timing::CreateEvent(
         "CheatEngine::FrameCallback::" + Common::HexToString(metadata.main_nso_build_id),
-        [this](u64 userdata, s64 cycles_late) { FrameCallback(userdata, cycles_late); });
+        [this](u64 userdata, s64 ns_late) { FrameCallback(userdata, ns_late); });
     core_timing.ScheduleEvent(CHEAT_ENGINE_TICKS, event);
 
     metadata.process_id = system.CurrentProcess()->GetProcessID();
@@ -217,7 +217,7 @@ void CheatEngine::Reload(std::vector<CheatEntry> cheats) {
 
 MICROPROFILE_DEFINE(Cheat_Engine, "Add-Ons", "Cheat Engine", MP_RGB(70, 200, 70));
 
-void CheatEngine::FrameCallback(u64 userdata, s64 cycles_late) {
+void CheatEngine::FrameCallback(u64 userdata, s64 ns_late) {
     if (is_pending_reload.exchange(false)) {
         vm.LoadProgram(cheats);
     }
@@ -230,7 +230,7 @@ void CheatEngine::FrameCallback(u64 userdata, s64 cycles_late) {
 
     vm.Execute(metadata);
 
-    core_timing.ScheduleEvent(CHEAT_ENGINE_TICKS - cycles_late, event);
+    core_timing.ScheduleEvent(CHEAT_ENGINE_TICKS - ns_late, event);
 }
 
 } // namespace Core::Memory
diff --git a/src/core/tools/freezer.cpp b/src/core/tools/freezer.cpp
index b2c6c537e..8b0c50d11 100644
--- a/src/core/tools/freezer.cpp
+++ b/src/core/tools/freezer.cpp
@@ -14,7 +14,7 @@
 namespace Tools {
 namespace {
 
-constexpr s64 MEMORY_FREEZER_TICKS = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 60);
+constexpr s64 MEMORY_FREEZER_TICKS = static_cast<s64>(1000000000 / 60);
 
 u64 MemoryReadWidth(Core::Memory::Memory& memory, u32 width, VAddr addr) {
     switch (width) {
@@ -57,7 +57,7 @@ Freezer::Freezer(Core::Timing::CoreTiming& core_timing_, Core::Memory::Memory& m
     : core_timing{core_timing_}, memory{memory_} {
     event = Core::Timing::CreateEvent(
         "MemoryFreezer::FrameCallback",
-        [this](u64 userdata, s64 cycles_late) { FrameCallback(userdata, cycles_late); });
+        [this](u64 userdata, s64 ns_late) { FrameCallback(userdata, ns_late); });
     core_timing.ScheduleEvent(MEMORY_FREEZER_TICKS, event);
 }
 
@@ -158,7 +158,7 @@ std::vector<Freezer::Entry> Freezer::GetEntries() const {
     return entries;
 }
 
-void Freezer::FrameCallback(u64 userdata, s64 cycles_late) {
+void Freezer::FrameCallback(u64 userdata, s64 ns_late) {
     if (!IsActive()) {
         LOG_DEBUG(Common_Memory, "Memory freezer has been deactivated, ending callback events.");
         return;
@@ -173,7 +173,7 @@ void Freezer::FrameCallback(u64 userdata, s64 cycles_late) {
         MemoryWriteWidth(memory, entry.width, entry.address, entry.value);
     }
 
-    core_timing.ScheduleEvent(MEMORY_FREEZER_TICKS - cycles_late, event);
+    core_timing.ScheduleEvent(MEMORY_FREEZER_TICKS - ns_late, event);
 }
 
 void Freezer::FillEntryReads() {
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 3f750b51c..47ef30aa9 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -8,7 +8,6 @@ add_executable(tests
     core/arm/arm_test_common.cpp
     core/arm/arm_test_common.h
     core/core_timing.cpp
-    core/host_timing.cpp
     tests.cpp
 )
 
diff --git a/src/tests/core/core_timing.cpp b/src/tests/core/core_timing.cpp
index ff2d11cc8..795f3da09 100644
--- a/src/tests/core/core_timing.cpp
+++ b/src/tests/core/core_timing.cpp
@@ -16,31 +16,30 @@
 
 namespace {
 // Numbers are chosen randomly to make sure the correct one is given.
-constexpr std::array<u64, 5> CB_IDS{{42, 144, 93, 1026, UINT64_C(0xFFFF7FFFF7FFFF)}};
-constexpr int MAX_SLICE_LENGTH = 10000; // Copied from CoreTiming internals
+static constexpr std::array<u64, 5> CB_IDS{{42, 144, 93, 1026, UINT64_C(0xFFFF7FFFF7FFFF)}};
+static constexpr int MAX_SLICE_LENGTH = 10000; // Copied from CoreTiming internals
+static constexpr std::array<u64, 5> calls_order{{2, 0, 1, 4, 3}};
+static std::array<s64, 5> delays{};
 
 std::bitset<CB_IDS.size()> callbacks_ran_flags;
 u64 expected_callback = 0;
 s64 lateness = 0;
 
 template <unsigned int IDX>
-void CallbackTemplate(u64 userdata, s64 cycles_late) {
+void HostCallbackTemplate(u64 userdata, s64 nanoseconds_late) {
     static_assert(IDX < CB_IDS.size(), "IDX out of range");
     callbacks_ran_flags.set(IDX);
     REQUIRE(CB_IDS[IDX] == userdata);
-    REQUIRE(CB_IDS[IDX] == expected_callback);
-    REQUIRE(lateness == cycles_late);
+    REQUIRE(CB_IDS[IDX] == CB_IDS[calls_order[expected_callback]]);
+    delays[IDX] = nanoseconds_late;
+    ++expected_callback;
 }
 
 u64 callbacks_done = 0;
 
-void EmptyCallback(u64 userdata, s64 cycles_late) {
-    ++callbacks_done;
-}
-
 struct ScopeInit final {
     ScopeInit() {
-        core_timing.Initialize();
+        core_timing.Initialize([]() {});
     }
     ~ScopeInit() {
         core_timing.Shutdown();
@@ -49,110 +48,97 @@ struct ScopeInit final {
     Core::Timing::CoreTiming core_timing;
 };
 
-void AdvanceAndCheck(Core::Timing::CoreTiming& core_timing, u32 idx, u32 context = 0,
-                     int expected_lateness = 0, int cpu_downcount = 0) {
-    callbacks_ran_flags = 0;
-    expected_callback = CB_IDS[idx];
-    lateness = expected_lateness;
-
-    // Pretend we executed X cycles of instructions.
-    core_timing.SwitchContext(context);
-    core_timing.AddTicks(core_timing.GetDowncount() - cpu_downcount);
-    core_timing.Advance();
-    core_timing.SwitchContext((context + 1) % 4);
-
-    REQUIRE(decltype(callbacks_ran_flags)().set(idx) == callbacks_ran_flags);
-}
-} // Anonymous namespace
-
 TEST_CASE("CoreTiming[BasicOrder]", "[core]") {
     ScopeInit guard;
     auto& core_timing = guard.core_timing;
+    std::vector<std::shared_ptr<Core::Timing::EventType>> events{
+        Core::Timing::CreateEvent("callbackA", HostCallbackTemplate<0>),
+        Core::Timing::CreateEvent("callbackB", HostCallbackTemplate<1>),
+        Core::Timing::CreateEvent("callbackC", HostCallbackTemplate<2>),
+        Core::Timing::CreateEvent("callbackD", HostCallbackTemplate<3>),
+        Core::Timing::CreateEvent("callbackE", HostCallbackTemplate<4>),
+    };
+
+    expected_callback = 0;
+
+    core_timing.SyncPause(true);
+
+    u64 one_micro = 1000U;
+    for (std::size_t i = 0; i < events.size(); i++) {
+        u64 order = calls_order[i];
+        core_timing.ScheduleEvent(i * one_micro + 100U, events[order], CB_IDS[order]);
+    }
+    /// test pause
+    REQUIRE(callbacks_ran_flags.none());
 
-    std::shared_ptr<Core::Timing::EventType> cb_a =
-        Core::Timing::CreateEvent("callbackA", CallbackTemplate<0>);
-    std::shared_ptr<Core::Timing::EventType> cb_b =
-        Core::Timing::CreateEvent("callbackB", CallbackTemplate<1>);
-    std::shared_ptr<Core::Timing::EventType> cb_c =
-        Core::Timing::CreateEvent("callbackC", CallbackTemplate<2>);
-    std::shared_ptr<Core::Timing::EventType> cb_d =
-        Core::Timing::CreateEvent("callbackD", CallbackTemplate<3>);
-    std::shared_ptr<Core::Timing::EventType> cb_e =
-        Core::Timing::CreateEvent("callbackE", CallbackTemplate<4>);
-
-    // Enter slice 0
-    core_timing.ResetRun();
-
-    // D -> B -> C -> A -> E
-    core_timing.SwitchContext(0);
-    core_timing.ScheduleEvent(1000, cb_a, CB_IDS[0]);
-    REQUIRE(1000 == core_timing.GetDowncount());
-    core_timing.ScheduleEvent(500, cb_b, CB_IDS[1]);
-    REQUIRE(500 == core_timing.GetDowncount());
-    core_timing.ScheduleEvent(800, cb_c, CB_IDS[2]);
-    REQUIRE(500 == core_timing.GetDowncount());
-    core_timing.ScheduleEvent(100, cb_d, CB_IDS[3]);
-    REQUIRE(100 == core_timing.GetDowncount());
-    core_timing.ScheduleEvent(1200, cb_e, CB_IDS[4]);
-    REQUIRE(100 == core_timing.GetDowncount());
-
-    AdvanceAndCheck(core_timing, 3, 0);
-    AdvanceAndCheck(core_timing, 1, 1);
-    AdvanceAndCheck(core_timing, 2, 2);
-    AdvanceAndCheck(core_timing, 0, 3);
-    AdvanceAndCheck(core_timing, 4, 0);
-}
-
-TEST_CASE("CoreTiming[FairSharing]", "[core]") {
+    core_timing.Pause(false); // No need to sync
 
-    ScopeInit guard;
-    auto& core_timing = guard.core_timing;
+    while (core_timing.HasPendingEvents())
+        ;
 
-    std::shared_ptr<Core::Timing::EventType> empty_callback =
-        Core::Timing::CreateEvent("empty_callback", EmptyCallback);
+    REQUIRE(callbacks_ran_flags.all());
 
-    callbacks_done = 0;
-    u64 MAX_CALLBACKS = 10;
-    for (std::size_t i = 0; i < 10; i++) {
-        core_timing.ScheduleEvent(i * 3333U, empty_callback, 0);
+    for (std::size_t i = 0; i < delays.size(); i++) {
+        const double delay = static_cast<double>(delays[i]);
+        const double micro = delay / 1000.0f;
+        const double mili = micro / 1000.0f;
+        printf("HostTimer Pausing Delay[%zu]: %.3f %.6f\n", i, micro, mili);
     }
+}
 
-    const s64 advances = MAX_SLICE_LENGTH / 10;
-    core_timing.ResetRun();
-    u64 current_time = core_timing.GetTicks();
-    bool keep_running{};
-    do {
-        keep_running = false;
-        for (u32 active_core = 0; active_core < 4; ++active_core) {
-            core_timing.SwitchContext(active_core);
-            if (core_timing.CanCurrentContextRun()) {
-                core_timing.AddTicks(std::min<s64>(advances, core_timing.GetDowncount()));
-                core_timing.Advance();
-            }
-            keep_running |= core_timing.CanCurrentContextRun();
-        }
-    } while (keep_running);
-    u64 current_time_2 = core_timing.GetTicks();
-
-    REQUIRE(MAX_CALLBACKS == callbacks_done);
-    REQUIRE(current_time_2 == current_time + MAX_SLICE_LENGTH * 4);
+#pragma optimize("", off)
+u64 TestTimerSpeed(Core::Timing::CoreTiming& core_timing) {
+    u64 start = core_timing.GetGlobalTimeNs().count();
+    u64 placebo = 0;
+    for (std::size_t i = 0; i < 1000; i++) {
+        placebo += core_timing.GetGlobalTimeNs().count();
+    }
+    u64 end = core_timing.GetGlobalTimeNs().count();
+    return (end - start);
 }
+#pragma optimize("", on)
 
-TEST_CASE("Core::Timing[PredictableLateness]", "[core]") {
+TEST_CASE("CoreTiming[BasicOrderNoPausing]", "[core]") {
     ScopeInit guard;
     auto& core_timing = guard.core_timing;
+    std::vector<std::shared_ptr<Core::Timing::EventType>> events{
+        Core::Timing::CreateEvent("callbackA", HostCallbackTemplate<0>),
+        Core::Timing::CreateEvent("callbackB", HostCallbackTemplate<1>),
+        Core::Timing::CreateEvent("callbackC", HostCallbackTemplate<2>),
+        Core::Timing::CreateEvent("callbackD", HostCallbackTemplate<3>),
+        Core::Timing::CreateEvent("callbackE", HostCallbackTemplate<4>),
+    };
+
+    core_timing.SyncPause(true);
+    core_timing.SyncPause(false);
+
+    expected_callback = 0;
+
+    u64 start = core_timing.GetGlobalTimeNs().count();
+    u64 one_micro = 1000U;
+    for (std::size_t i = 0; i < events.size(); i++) {
+        u64 order = calls_order[i];
+        core_timing.ScheduleEvent(i * one_micro + 100U, events[order], CB_IDS[order]);
+    }
+    u64 end = core_timing.GetGlobalTimeNs().count();
+    const double scheduling_time = static_cast<double>(end - start);
+    const double timer_time = static_cast<double>(TestTimerSpeed(core_timing));
 
-    std::shared_ptr<Core::Timing::EventType> cb_a =
-        Core::Timing::CreateEvent("callbackA", CallbackTemplate<0>);
-    std::shared_ptr<Core::Timing::EventType> cb_b =
-        Core::Timing::CreateEvent("callbackB", CallbackTemplate<1>);
+    while (core_timing.HasPendingEvents())
+        ;
 
-    // Enter slice 0
-    core_timing.ResetRun();
+    REQUIRE(callbacks_ran_flags.all());
 
-    core_timing.ScheduleEvent(100, cb_a, CB_IDS[0]);
-    core_timing.ScheduleEvent(200, cb_b, CB_IDS[1]);
+    for (std::size_t i = 0; i < delays.size(); i++) {
+        const double delay = static_cast<double>(delays[i]);
+        const double micro = delay / 1000.0f;
+        const double mili = micro / 1000.0f;
+        printf("HostTimer No Pausing Delay[%zu]: %.3f %.6f\n", i, micro, mili);
+    }
 
-    AdvanceAndCheck(core_timing, 0, 0, 10, -10); // (100 - 10)
-    AdvanceAndCheck(core_timing, 1, 1, 50, -50);
+    const double micro = scheduling_time / 1000.0f;
+    const double mili = micro / 1000.0f;
+    printf("HostTimer No Pausing Scheduling Time: %.3f %.6f\n", micro, mili);
+    printf("HostTimer No Pausing Timer Time: %.3f %.6f\n", timer_time / 1000.f,
+           timer_time / 1000000.f);
 }
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 8eb017f65..482e49711 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <chrono>
+
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/core.h"
@@ -154,8 +156,7 @@ u64 GPU::GetTicks() const {
     constexpr u64 gpu_ticks_num = 384;
     constexpr u64 gpu_ticks_den = 625;
 
-    const u64 cpu_ticks = system.CoreTiming().GetTicks();
-    u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count();
+    u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
     if (Settings::values.use_fast_gpu_time) {
         nanoseconds /= 256;
     }
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index bfeb16458..9ceb6c8d7 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -52,6 +52,8 @@ void EmuThread::run() {
 
     emit LoadProgress(VideoCore::LoadCallbackStage::Prepare, 0, 0);
 
+    Core::System::GetInstance().RegisterHostThread();
+
     Core::System::GetInstance().Renderer().Rasterizer().LoadDiskResources(
         stop_run, [this](VideoCore::LoadCallbackStage stage, std::size_t value, std::size_t total) {
             emit LoadProgress(stage, value, total);
@@ -65,28 +67,30 @@ void EmuThread::run() {
     bool was_active = false;
     while (!stop_run) {
         if (running) {
-            if (!was_active)
+            if (was_active) {
                 emit DebugModeLeft();
+            }
 
-            Core::System::ResultStatus result = Core::System::GetInstance().RunLoop();
+            running_guard = true;
+            Core::System::ResultStatus result = Core::System::GetInstance().Run();
             if (result != Core::System::ResultStatus::Success) {
+                running_guard = false;
                 this->SetRunning(false);
                 emit ErrorThrown(result, Core::System::GetInstance().GetStatusDetails());
             }
+            running_wait.Wait();
+            result = Core::System::GetInstance().Pause();
+            if (result != Core::System::ResultStatus::Success) {
+                running_guard = false;
+                this->SetRunning(false);
+                emit ErrorThrown(result, Core::System::GetInstance().GetStatusDetails());
+            }
+            running_guard = false;
 
-            was_active = running || exec_step;
-            if (!was_active && !stop_run)
-                emit DebugModeEntered();
-        } else if (exec_step) {
-            if (!was_active)
-                emit DebugModeLeft();
-
-            exec_step = false;
-            Core::System::GetInstance().SingleStep();
+            was_active = true;
             emit DebugModeEntered();
-            yieldCurrentThread();
-
-            was_active = false;
+        } else if (exec_step) {
+            UNIMPLEMENTED();
         } else {
             std::unique_lock lock{running_mutex};
             running_cv.wait(lock, [this] { return IsRunning() || exec_step || stop_run; });
diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h
index 3626604ca..768568b3e 100644
--- a/src/yuzu/bootmanager.h
+++ b/src/yuzu/bootmanager.h
@@ -59,6 +59,11 @@ public:
         this->running = running;
         lock.unlock();
         running_cv.notify_all();
+        if (!running) {
+            running_wait.Set();
+            /// Wait until effectively paused
+            while (running_guard);
+        }
     }
 
     /**
@@ -84,6 +89,8 @@ private:
     std::atomic_bool stop_run{false};
     std::mutex running_mutex;
     std::condition_variable running_cv;
+    Common::Event running_wait{};
+    std::atomic_bool running_guard{false};
 
 signals:
     /**
diff --git a/src/yuzu/debugger/wait_tree.cpp b/src/yuzu/debugger/wait_tree.cpp
index c1ea25fb8..765908c5a 100644
--- a/src/yuzu/debugger/wait_tree.cpp
+++ b/src/yuzu/debugger/wait_tree.cpp
@@ -59,8 +59,10 @@ std::vector<std::unique_ptr<WaitTreeThread>> WaitTreeItem::MakeThreadItemList()
     std::size_t row = 0;
     auto add_threads = [&](const std::vector<std::shared_ptr<Kernel::Thread>>& threads) {
         for (std::size_t i = 0; i < threads.size(); ++i) {
-            item_list.push_back(std::make_unique<WaitTreeThread>(*threads[i]));
-            item_list.back()->row = row;
+            if (!threads[i]->IsHLEThread()) {
+                item_list.push_back(std::make_unique<WaitTreeThread>(*threads[i]));
+                item_list.back()->row = row;
+            }
             ++row;
         }
     };
diff --git a/src/yuzu_cmd/yuzu.cpp b/src/yuzu_cmd/yuzu.cpp
index 4d2ea7e9e..1e5377840 100644
--- a/src/yuzu_cmd/yuzu.cpp
+++ b/src/yuzu_cmd/yuzu.cpp
@@ -237,7 +237,7 @@ int main(int argc, char** argv) {
 
     std::thread render_thread([&emu_window] { emu_window->Present(); });
     while (emu_window->IsOpen()) {
-        system.RunLoop();
+        //system.RunLoop();
     }
     render_thread.join();
 
diff --git a/src/yuzu_tester/yuzu.cpp b/src/yuzu_tester/yuzu.cpp
index 676e70ebd..1a45506d4 100644
--- a/src/yuzu_tester/yuzu.cpp
+++ b/src/yuzu_tester/yuzu.cpp
@@ -256,7 +256,7 @@ int main(int argc, char** argv) {
     system.Renderer().Rasterizer().LoadDiskResources();
 
     while (!finished) {
-        system.RunLoop();
+        //system.RunLoop();
     }
 
     detached_tasks.WaitForAllTasks();
-- 
cgit v1.2.3


From dc580582034fb5937aa53176fdaa4bd0fc4acce8 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Tue, 25 Feb 2020 11:12:46 -0400
Subject: General: Set up yuzu threads' microprofile, naming and registry.

---
 src/core/core_timing.cpp      | 5 +++--
 src/core/cpu_manager.cpp      | 5 ++++-
 src/video_core/gpu_thread.cpp | 6 +++++-
 src/yuzu/bootmanager.cpp      | 4 +++-
 src/yuzu/main.cpp             | 2 ++
 src/yuzu_cmd/yuzu.cpp         | 6 ++++--
 src/yuzu_tester/yuzu.cpp      | 6 ++++--
 7 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'src/video_core')

diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index a3ce69790..cc32a853b 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -2,14 +2,14 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include "core/core_timing.h"
-
 #include <algorithm>
 #include <mutex>
 #include <string>
 #include <tuple>
 
 #include "common/assert.h"
+#include "common/microprofile.h"
+#include "core/core_timing.h"
 #include "core/core_timing_util.h"
 
 namespace Core::Timing {
@@ -44,6 +44,7 @@ CoreTiming::~CoreTiming() = default;
 
 void CoreTiming::ThreadEntry(CoreTiming& instance) {
     std::string name = "yuzu:HostTiming";
+    MicroProfileOnThreadCreate(name.c_str());
     Common::SetCurrentThreadName(name.c_str());
     instance.on_thread_init();
     instance.ThreadLoop();
diff --git a/src/core/cpu_manager.cpp b/src/core/cpu_manager.cpp
index ff2fe8ead..9b9337131 100644
--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include "common/fiber.h"
+#include "common/microprofile.h"
 #include "common/thread.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
@@ -36,6 +37,7 @@ void CpuManager::Shutdown() {
     Pause(false);
     for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
         core_data[core].host_thread->join();
+        core_data[core].host_thread.reset();
     }
 }
 
@@ -80,7 +82,7 @@ void CpuManager::RunGuestThread() {
         auto& physical_core = kernel.CurrentPhysicalCore();
         if (!physical_core.IsInterrupted()) {
             physical_core.Idle();
-            //physical_core.Run();
+            // physical_core.Run();
         }
         auto& scheduler = physical_core.Scheduler();
         scheduler.TryDoContextSwitch();
@@ -159,6 +161,7 @@ void CpuManager::RunThread(std::size_t core) {
     /// Initialization
     system.RegisterCoreThread(core);
     std::string name = "yuzu:CoreHostThread_" + std::to_string(core);
+    MicroProfileOnThreadCreate(name.c_str());
     Common::SetCurrentThreadName(name.c_str());
     auto& data = core_data[core];
     data.enter_barrier = std::make_unique<Common::Event>();
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c3bb4fe06..323185bfc 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -4,6 +4,7 @@
 
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/core.h"
 #include "core/frontend/emu_window.h"
 #include "core/settings.h"
@@ -18,7 +19,10 @@ namespace VideoCommon::GPUThread {
 static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
                       Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
                       SynchState& state) {
-    MicroProfileOnThreadCreate("GpuThread");
+    std::string name = "yuzu:GPU";
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
+    system.RegisterHostThread();
 
     // Wait for first GPU command before acquiring the window context
     while (state.queue.Empty())
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index 9ceb6c8d7..468dde782 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -44,7 +44,9 @@ EmuThread::EmuThread() = default;
 EmuThread::~EmuThread() = default;
 
 void EmuThread::run() {
-    MicroProfileOnThreadCreate("EmuThread");
+    std::string name = "yuzu:EmuControlThread";
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
 
     // Main process has been loaded. Make the context current to this thread and begin GPU and CPU
     // execution.
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index ba69139e5..de0c7fe8c 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -925,6 +925,8 @@ bool GMainWindow::LoadROM(const QString& filename) {
         nullptr,                                     // E-Commerce
     });
 
+    system.RegisterHostThread();
+
     const Core::System::ResultStatus result{system.Load(*render_window, filename.toStdString())};
 
     const auto drd_callout =
diff --git a/src/yuzu_cmd/yuzu.cpp b/src/yuzu_cmd/yuzu.cpp
index 38ffdfbd3..e6c6a839d 100644
--- a/src/yuzu_cmd/yuzu.cpp
+++ b/src/yuzu_cmd/yuzu.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <chrono>
 #include <iostream>
 #include <memory>
 #include <string>
@@ -237,8 +238,9 @@ int main(int argc, char** argv) {
 
     std::thread render_thread([&emu_window] { emu_window->Present(); });
     system.Run();
-    while (emu_window->IsOpen())
-        ;
+    while (emu_window->IsOpen()) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
     system.Pause();
     render_thread.join();
 
diff --git a/src/yuzu_tester/yuzu.cpp b/src/yuzu_tester/yuzu.cpp
index d62686dd2..083667baf 100644
--- a/src/yuzu_tester/yuzu.cpp
+++ b/src/yuzu_tester/yuzu.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <chrono>
 #include <iostream>
 #include <memory>
 #include <string>
@@ -256,8 +257,9 @@ int main(int argc, char** argv) {
     system.Renderer().Rasterizer().LoadDiskResources();
 
     system.Run();
-    while (!finished)
-        ;
+    while (!finished) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
     system.Pause();
 
     detached_tasks.WaitForAllTasks();
-- 
cgit v1.2.3


From ad92865497f83fe4c19cd9ab78cce9da1a8c3a6c Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Fri, 3 Apr 2020 11:58:43 -0400
Subject: General: Correct rebase, sync gpu and context management.

---
 src/core/core.cpp             |  3 +--
 src/core/cpu_manager.cpp      | 11 ++---------
 src/core/cpu_manager.h        |  7 -------
 src/video_core/gpu.h          |  6 ++++++
 src/video_core/gpu_asynch.cpp |  9 ++++++++-
 src/video_core/gpu_asynch.h   |  2 ++
 src/video_core/gpu_synch.cpp  |  8 +++++++-
 src/video_core/gpu_synch.h    |  2 ++
 src/yuzu/bootmanager.cpp      | 29 +++++++++++++++++------------
 9 files changed, 45 insertions(+), 32 deletions(-)

(limited to 'src/video_core')

diff --git a/src/core/core.cpp b/src/core/core.cpp
index 40eea297e..3393c33eb 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -151,7 +151,6 @@ struct System::Impl {
         cpu_manager.SetMulticore(is_multicore);
         cpu_manager.SetAsyncGpu(is_async_gpu);
         core_timing.SetMulticore(is_multicore);
-        cpu_manager.SetRenderWindow(emu_window);
 
         core_timing.Initialize([&system]() { system.RegisterHostThread(); });
         kernel.Initialize();
@@ -435,7 +434,7 @@ bool System::IsPoweredOn() const {
 }
 
 void System::PrepareReschedule() {
-    //impl->CurrentPhysicalCore().Stop();
+    // impl->CurrentPhysicalCore().Stop();
 }
 
 void System::PrepareReschedule(const u32 core_index) {
diff --git a/src/core/cpu_manager.cpp b/src/core/cpu_manager.cpp
index b7c2a7832..63c578852 100644
--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@@ -9,12 +9,12 @@
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/cpu_manager.h"
-#include "core/frontend/emu_window.h"
 #include "core/gdbstub/gdbstub.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/physical_core.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
+#include "video_core/gpu.h"
 
 namespace Core {
 
@@ -25,10 +25,6 @@ void CpuManager::ThreadStart(CpuManager& cpu_manager, std::size_t core) {
     cpu_manager.RunThread(core);
 }
 
-void CpuManager::SetRenderWindow(Core::Frontend::EmuWindow& render_window) {
-    this->render_window = &render_window;
-}
-
 void CpuManager::Initialize() {
     running_mode = true;
     if (is_multicore) {
@@ -354,7 +350,7 @@ void CpuManager::RunThread(std::size_t core) {
         data.is_running = false;
         data.enter_barrier->Wait();
         if (sc_sync_first_use) {
-            render_window->MakeCurrent();
+            system.GPU().ObtainContext();
             sc_sync_first_use = false;
         }
         auto& scheduler = system.Kernel().CurrentScheduler();
@@ -366,9 +362,6 @@ void CpuManager::RunThread(std::size_t core) {
         data.exit_barrier->Wait();
         data.is_paused = false;
     }
-    if (sc_sync) {
-        render_window->DoneCurrent();
-    }
     /// Time to cleanup
     data.host_context->Exit();
     data.enter_barrier.reset();
diff --git a/src/core/cpu_manager.h b/src/core/cpu_manager.h
index ae55d6427..35929ed94 100644
--- a/src/core/cpu_manager.h
+++ b/src/core/cpu_manager.h
@@ -16,10 +16,6 @@ class Event;
 class Fiber;
 } // namespace Common
 
-namespace Core::Frontend {
-class EmuWindow;
-} // namespace Core::Frontend
-
 namespace Core {
 
 class System;
@@ -61,8 +57,6 @@ public:
         return current_core.load();
     }
 
-    void SetRenderWindow(Core::Frontend::EmuWindow& render_window);
-
 private:
     static void GuestThreadFunction(void* cpu_manager);
     static void GuestRewindFunction(void* cpu_manager);
@@ -106,7 +100,6 @@ private:
     std::size_t preemption_count{};
     std::size_t idle_count{};
     static constexpr std::size_t max_cycle_runs = 5;
-    Core::Frontend::EmuWindow* render_window;
 
     System& system;
 };
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a1b4c305c..2c42483bd 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -284,6 +284,12 @@ public:
     /// core timing events.
     virtual void Start() = 0;
 
+    /// Obtain the CPU Context
+    virtual void ObtainContext() = 0;
+
+    /// Release the CPU Context
+    virtual void ReleaseContext() = 0;
+
     /// Push GPU command entries to be processed
     virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
 
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index 53305ab43..7b855f63e 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -19,10 +19,17 @@ GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBa
 GPUAsynch::~GPUAsynch() = default;
 
 void GPUAsynch::Start() {
-    cpu_context->MakeCurrent();
     gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher);
 }
 
+void GPUAsynch::ObtainContext() {
+    cpu_context->MakeCurrent();
+}
+
+void GPUAsynch::ReleaseContext() {
+    cpu_context->DoneCurrent();
+}
+
 void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
     gpu_thread.SubmitList(std::move(entries));
 }
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 517658612..15e9f1d38 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -25,6 +25,8 @@ public:
     ~GPUAsynch() override;
 
     void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 6f38a672a..aaeb9811d 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -13,10 +13,16 @@ GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase
 
 GPUSynch::~GPUSynch() = default;
 
-void GPUSynch::Start() {
+void GPUSynch::Start() {}
+
+void GPUSynch::ObtainContext() {
     context->MakeCurrent();
 }
 
+void GPUSynch::ReleaseContext() {
+    context->DoneCurrent();
+}
+
 void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
     dma_pusher->Push(std::move(entries));
     dma_pusher->DispatchCalls();
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 4a6e9a01d..762c20aa5 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -24,6 +24,8 @@ public:
     ~GPUSynch() override;
 
     void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index 6aa161e99..5f93bd432 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -48,24 +48,29 @@ void EmuThread::run() {
     MicroProfileOnThreadCreate(name.c_str());
     Common::SetCurrentThreadName(name.c_str());
 
+    auto& system = Core::System::GetInstance();
+
+    system.RegisterHostThread();
+
+    auto& gpu = system.GPU();
+
     // Main process has been loaded. Make the context current to this thread and begin GPU and CPU
     // execution.
-    Core::System::GetInstance().GPU().Start();
+    gpu.Start();
 
-    emit LoadProgress(VideoCore::LoadCallbackStage::Prepare, 0, 0);
+    gpu.ObtainContext();
 
-    Core::System::GetInstance().RegisterHostThread();
-
-    context.MakeCurrent();
+    emit LoadProgress(VideoCore::LoadCallbackStage::Prepare, 0, 0);
 
-    Core::System::GetInstance().Renderer().Rasterizer().LoadDiskResources(
+    system.Renderer().Rasterizer().LoadDiskResources(
         stop_run, [this](VideoCore::LoadCallbackStage stage, std::size_t value, std::size_t total) {
             emit LoadProgress(stage, value, total);
         });
 
     emit LoadProgress(VideoCore::LoadCallbackStage::Complete, 0, 0);
 
-    context.DoneCurrent();
+    gpu.ReleaseContext();
+
 
     // Holds whether the cpu was running during the last iteration,
     // so that the DebugModeLeft signal can be emitted before the
@@ -78,18 +83,18 @@ void EmuThread::run() {
             }
 
             running_guard = true;
-            Core::System::ResultStatus result = Core::System::GetInstance().Run();
+            Core::System::ResultStatus result = system.Run();
             if (result != Core::System::ResultStatus::Success) {
                 running_guard = false;
                 this->SetRunning(false);
-                emit ErrorThrown(result, Core::System::GetInstance().GetStatusDetails());
+                emit ErrorThrown(result, system.GetStatusDetails());
             }
             running_wait.Wait();
-            result = Core::System::GetInstance().Pause();
+            result = system.Pause();
             if (result != Core::System::ResultStatus::Success) {
                 running_guard = false;
                 this->SetRunning(false);
-                emit ErrorThrown(result, Core::System::GetInstance().GetStatusDetails());
+                emit ErrorThrown(result, system.GetStatusDetails());
             }
             running_guard = false;
 
@@ -106,7 +111,7 @@ void EmuThread::run() {
     }
 
     // Shutdown the core emulation
-    Core::System::GetInstance().Shutdown();
+    system.Shutdown();
 
 #if MICROPROFILE_ENABLED
     MicroProfileOnThreadExit();
-- 
cgit v1.2.3


From 528b19a84287167d7699465e495b196d216b99db Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sun, 5 Apr 2020 09:48:53 -0400
Subject: General: Tune the priority of main emulation threads so they have
 higher priority than less important helper threads.

---
 src/common/thread.cpp                           | 46 +++++++++++++++++++++++++
 src/common/thread.h                             |  9 +++++
 src/core/core_timing.cpp                        |  1 +
 src/core/cpu_manager.cpp                        |  1 +
 src/video_core/gpu_thread.cpp                   |  1 +
 src/video_core/renderer_vulkan/vk_scheduler.cpp |  2 ++
 6 files changed, 60 insertions(+)

(limited to 'src/video_core')

diff --git a/src/common/thread.cpp b/src/common/thread.cpp
index c9684aed9..33c8437f5 100644
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -25,6 +25,52 @@
 
 namespace Common {
 
+#ifdef _WIN32
+
+void SetCurrentThreadPriority(ThreadPriority new_priority) {
+    auto handle = GetCurrentThread();
+    int windows_priority = 0;
+    switch (new_priority) {
+        case ThreadPriority::Low:
+            windows_priority = THREAD_PRIORITY_BELOW_NORMAL;
+            break;
+        case ThreadPriority::Normal:
+            windows_priority = THREAD_PRIORITY_NORMAL;
+            break;
+        case ThreadPriority::High:
+            windows_priority = THREAD_PRIORITY_ABOVE_NORMAL;
+            break;
+        case ThreadPriority::VeryHigh:
+            windows_priority = THREAD_PRIORITY_HIGHEST;
+            break;
+        default:
+            windows_priority = THREAD_PRIORITY_NORMAL;
+            break;
+    }
+    SetThreadPriority(handle, windows_priority);
+}
+
+#else
+
+void SetCurrentThreadPriority(ThreadPriority new_priority) {
+    pthread_t this_thread = pthread_self();
+
+    s32 max_prio = sched_get_priority_max(SCHED_OTHER);
+    s32 min_prio = sched_get_priority_min(SCHED_OTHER);
+    u32 level = static_cast<u32>(new_priority) + 1;
+
+    struct sched_param params;
+    if (max_prio > min_prio) {
+        params.sched_priority = min_prio + ((max_prio - min_prio) * level) / 4;
+    } else {
+        params.sched_priority = min_prio - ((min_prio - max_prio) * level) / 4;
+    }
+
+    pthread_setschedparam(this_thread, SCHED_OTHER, &params);
+}
+
+#endif
+
 #ifdef _MSC_VER
 
 // Sets the debugger-visible name of the current thread.
diff --git a/src/common/thread.h b/src/common/thread.h
index 127cc7e23..52b359413 100644
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -86,6 +86,15 @@ private:
     std::size_t generation = 0; // Incremented once each time the barrier is used
 };
 
+enum class ThreadPriority : u32 {
+    Low = 0,
+    Normal = 1,
+    High = 2,
+    VeryHigh = 3,
+};
+
+void SetCurrentThreadPriority(ThreadPriority new_priority);
+
 void SetCurrentThreadName(const char* name);
 
 } // namespace Common
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index b02119494..032b29e33 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -48,6 +48,7 @@ void CoreTiming::ThreadEntry(CoreTiming& instance) {
     std::string name = "yuzu:HostTiming";
     MicroProfileOnThreadCreate(name.c_str());
     Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::VeryHigh);
     instance.on_thread_init();
     instance.ThreadLoop();
 }
diff --git a/src/core/cpu_manager.cpp b/src/core/cpu_manager.cpp
index 63c578852..32afcf3ae 100644
--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@@ -337,6 +337,7 @@ void CpuManager::RunThread(std::size_t core) {
     }
     MicroProfileOnThreadCreate(name.c_str());
     Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
     auto& data = core_data[core];
     data.enter_barrier = std::make_unique<Common::Event>();
     data.exit_barrier = std::make_unique<Common::Event>();
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 323185bfc..738c6f0c1 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -22,6 +22,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
     std::string name = "yuzu:GPU";
     MicroProfileOnThreadCreate(name.c_str());
     Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
     system.RegisterHostThread();
 
     // Wait for first GPU command before acquiring the window context
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 82ec9180e..56524e6f3 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -9,6 +9,7 @@
 #include <utility>
 
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
@@ -133,6 +134,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) {
 }
 
 void VKScheduler::WorkerThread() {
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
     std::unique_lock lock{mutex};
     do {
         cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; });
-- 
cgit v1.2.3


From 78d80d99a07893b79cecefbb613bf326c8e783eb Mon Sep 17 00:00:00 2001
From: Morph <39850852+Morph1984@users.noreply.github.com>
Date: Sun, 28 Jun 2020 02:48:14 -0400
Subject: maxwell_to_gl: Add 32-bit component sizes to (un)signed scaled
 formats

Add 32-bit component sizes to (un)signed scaled formats and group (un)signed normalized, scaled, and integer formats together.
---
 src/video_core/renderer_opengl/maxwell_to_gl.h | 34 +++-----------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 35e329240..8f3871e90 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -26,8 +26,9 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
     switch (attrib.type) {
-    case Maxwell::VertexAttribute::Type::UnsignedInt:
     case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::UnsignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -48,8 +49,9 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_INT_2_10_10_10_REV;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedInt:
     case Maxwell::VertexAttribute::Type::SignedNorm:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -84,34 +86,6 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_FLOAT;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_UNSIGNED_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_UNSIGNED_SHORT;
-        }
-        break;
-    case Maxwell::VertexAttribute::Type::SignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_SHORT;
-        }
-        break;
     }
     UNIMPLEMENTED_MSG("Unimplemented vertex type={} and size={}", attrib.TypeString(),
                       attrib.SizeString());
-- 
cgit v1.2.3


From 4a35df337b7aaa3d4056a5b10da471bff11b4b2f Mon Sep 17 00:00:00 2001
From: Morph <39850852+Morph1984@users.noreply.github.com>
Date: Sun, 28 Jun 2020 02:49:17 -0400
Subject: maxwell_to_vk: Reorder vertex formats and add A2B10G10R10 for all
 types except float

---
 src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 144 +++++++++++------------
 1 file changed, 69 insertions(+), 75 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 1f2b6734b..d7f1ae89f 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -294,6 +294,28 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device,
 
 VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {
     switch (type) {
+    case Maxwell::VertexAttribute::Type::UnsignedNorm:
+        switch (size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+            return VK_FORMAT_R8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8:
+            return VK_FORMAT_R8G8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+            return VK_FORMAT_R8G8B8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return VK_FORMAT_R8G8B8A8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16:
+            return VK_FORMAT_R16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+            return VK_FORMAT_R16G16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+            return VK_FORMAT_R16G16B16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return VK_FORMAT_R16G16B16A16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
+        }
+        break;
     case Maxwell::VertexAttribute::Type::SignedNorm:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -314,62 +336,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R16G16B16A16_SNORM;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return VK_FORMAT_A2B10G10R10_SNORM_PACK32;
-        default:
-            break;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_UNORM;
+            return VK_FORMAT_R8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_UNORM;
+            return VK_FORMAT_R8G8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_UNORM;
+            return VK_FORMAT_R8G8B8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_UNORM;
+            return VK_FORMAT_R8G8B8A8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_UNORM;
+            return VK_FORMAT_R16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_UNORM;
+            return VK_FORMAT_R16G16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_UNORM;
+            return VK_FORMAT_R16G16B16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_UNORM;
+            return VK_FORMAT_R16G16B16A16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
-            return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
-        default:
-            break;
+            return VK_FORMAT_A2B10G10R10_USCALED_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedInt:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_SINT;
+            return VK_FORMAT_R8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_SINT;
+            return VK_FORMAT_R8G8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_SINT;
+            return VK_FORMAT_R8G8B8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_SINT;
+            return VK_FORMAT_R8G8B8A8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SINT;
+            return VK_FORMAT_R16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SINT;
+            return VK_FORMAT_R16G16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SINT;
+            return VK_FORMAT_R16G16B16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32:
-            return VK_FORMAT_R32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32:
-            return VK_FORMAT_R32G32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32_32:
-            return VK_FORMAT_R32G32B32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
-            return VK_FORMAT_R32G32B32A32_SINT;
-        default:
-            break;
+            return VK_FORMAT_R16G16B16A16_SSCALED;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_SSCALED_PACK32;
         }
         break;
     case Maxwell::VertexAttribute::Type::UnsignedInt:
@@ -398,56 +408,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R32G32B32_UINT;
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return VK_FORMAT_R32G32B32A32_UINT;
-        default:
-            break;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_UINT_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_USCALED;
+            return VK_FORMAT_R8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_USCALED;
+            return VK_FORMAT_R8G8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_USCALED;
+            return VK_FORMAT_R8G8B8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_USCALED;
+            return VK_FORMAT_R8G8B8A8_SINT;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_USCALED;
+            return VK_FORMAT_R16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_USCALED;
+            return VK_FORMAT_R16G16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_USCALED;
+            return VK_FORMAT_R16G16B16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_USCALED;
-        default:
-            break;
+            return VK_FORMAT_R16G16B16A16_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32:
+            return VK_FORMAT_R32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32:
+            return VK_FORMAT_R32G32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32:
+            return VK_FORMAT_R32G32B32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return VK_FORMAT_R32G32B32A32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_SINT_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::Float:
         switch (size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SSCALED;
+            return VK_FORMAT_R16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SSCALED;
+            return VK_FORMAT_R16G16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SSCALED;
+            return VK_FORMAT_R16G16B16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SSCALED;
-        default:
-            break;
-        }
-        break;
-    case Maxwell::VertexAttribute::Type::Float:
-        switch (size) {
+            return VK_FORMAT_R16G16B16A16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32:
             return VK_FORMAT_R32_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32_32:
@@ -456,16 +460,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R32G32B32_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return VK_FORMAT_R32G32B32A32_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SFLOAT;
-        default:
-            break;
         }
         break;
     }
-- 
cgit v1.2.3


From 10eca7f651d0dc407c7c4076d11e0b960d9dedd4 Mon Sep 17 00:00:00 2001
From: Morph <39850852+Morph1984@users.noreply.github.com>
Date: Mon, 29 Jun 2020 11:48:38 -0400
Subject: maxwell_to_gl: Rename VertexType() to VertexFormat()

---
 src/video_core/renderer_opengl/gl_rasterizer.cpp | 5 +++--
 src/video_core/renderer_opengl/maxwell_to_gl.h   | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 362457ffe..e960a0ef1 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -213,9 +213,10 @@ void RasterizerOpenGL::SetupVertexFormat() {
         if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||
             attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {
             glVertexAttribIFormat(gl_index, attrib.ComponentCount(),
-                                  MaxwellToGL::VertexType(attrib), attrib.offset);
+                                  MaxwellToGL::VertexFormat(attrib), attrib.offset);
         } else {
-            glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
+            glVertexAttribFormat(gl_index, attrib.ComponentCount(),
+                                 MaxwellToGL::VertexFormat(attrib),
                                  attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
         }
         glVertexAttribBinding(gl_index, attrib.buffer);
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 8f3871e90..774e70a5b 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -24,7 +24,7 @@ namespace MaxwellToGL {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
-inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
+inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) {
     switch (attrib.type) {
     case Maxwell::VertexAttribute::Type::UnsignedNorm:
     case Maxwell::VertexAttribute::Type::UnsignedScaled:
@@ -87,7 +87,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         }
         break;
     }
-    UNIMPLEMENTED_MSG("Unimplemented vertex type={} and size={}", attrib.TypeString(),
+    UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(),
                       attrib.SizeString());
     return {};
 }
-- 
cgit v1.2.3


From 7c970132b5dd6eaa40f114355e0125091ceb8142 Mon Sep 17 00:00:00 2001
From: David <25727384+ogniK5377@users.noreply.github.com>
Date: Tue, 30 Jun 2020 15:32:24 +1000
Subject: macro: Add support for "middle methods" on the code cache (#4112)

Macro code is just uploaded sequentially from a starting address; however, that does not mean the entry point for the macro is at that address. This PR adds preliminary support for executing macros whose entry point lies in the middle of our cached code.
---
 src/video_core/macro/macro.cpp | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
index ef7dad349..a50e7b4e0 100644
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <optional>
 #include <boost/container_hash/hash.hpp>
 #include "common/assert.h"
 #include "common/logging/log.h"
@@ -35,22 +36,40 @@ void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
         }
     } else {
         // Macro not compiled, check if it's uploaded and if so, compile it
-        auto macro_code = uploaded_macro_code.find(method);
+        std::optional<u32> mid_method = std::nullopt;
+        const auto macro_code = uploaded_macro_code.find(method);
         if (macro_code == uploaded_macro_code.end()) {
-            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
-            return;
+            for (const auto& [method_base, code] : uploaded_macro_code) {
+                if (method >= method_base && (method - method_base) < code.size()) {
+                    mid_method = method_base;
+                    break;
+                }
+            }
+            if (!mid_method.has_value()) {
+                UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+                return;
+            }
         }
         auto& cache_info = macro_cache[method];
-        cache_info.hash = boost::hash_value(macro_code->second);
-        cache_info.lle_program = Compile(macro_code->second);
+
+        if (!mid_method.has_value()) {
+            cache_info.lle_program = Compile(macro_code->second);
+            cache_info.hash = boost::hash_value(macro_code->second);
+        } else {
+            const auto& macro_cached = uploaded_macro_code[mid_method.value()];
+            const auto rebased_method = method - mid_method.value();
+            auto& code = uploaded_macro_code[method];
+            code.resize(macro_cached.size() - rebased_method);
+            std::memcpy(code.data(), macro_cached.data() + rebased_method,
+                        code.size() * sizeof(u32));
+            cache_info.hash = boost::hash_value(code);
+            cache_info.lle_program = Compile(code);
+        }
 
         auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
         if (hle_program.has_value()) {
             cache_info.has_hle_program = true;
             cache_info.hle_program = std::move(hle_program.value());
-        }
-
-        if (cache_info.has_hle_program) {
             cache_info.hle_program->Execute(parameters, method);
         } else {
             cache_info.lle_program->Execute(parameters, method);
-- 
cgit v1.2.3


From 1b31755ba6eb3940d2ec0661337ef21913f9a756 Mon Sep 17 00:00:00 2001
From: Morph <39850852+Morph1984@users.noreply.github.com>
Date: Sat, 13 Jun 2020 11:21:27 -0400
Subject: maxwell_to_gl: Implement MirrorOnceClampOGL using GL_MIRROR_CLAMP_EXT

Like MirrorOnceBorder, this requires the GL_EXT_texture_mirror_clamp extension. This extension is unfortunately not available on Intel's drivers (both Windows proprietary and Linux Mesa). Use GL_MIRROR_CLAMP_TO_EDGE as a fallback if the extension is unavailable.
---
 src/video_core/renderer_opengl/maxwell_to_gl.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 774e70a5b..fe9bd4b5a 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -191,6 +191,12 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         } else {
             return GL_MIRROR_CLAMP_TO_EDGE;
         }
+    case Tegra::Texture::WrapMode::MirrorOnceClampOGL:
+        if (GL_EXT_texture_mirror_clamp) {
+            return GL_MIRROR_CLAMP_EXT;
+        } else {
+            return GL_MIRROR_CLAMP_TO_EDGE;
+        }
     }
     UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
     return GL_REPEAT;
-- 
cgit v1.2.3