diff options
Diffstat (limited to 'src/video_core')
| -rw-r--r-- | src/video_core/gpu_thread.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.cpp | 54 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.h | 6 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.cpp | 66 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.h | 1 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 61 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_gen.cpp | 19 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_device.cpp | 136 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_device.h | 58 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 104 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_shader_decompiler.h | 8 |
11 files changed, 356 insertions, 159 deletions
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 1e2ff46b0..3f0939ec9 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -75,7 +75,7 @@ void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPus void ThreadManager::SubmitList(Tegra::CommandList&& entries) { const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))}; - const s64 synchronization_ticks{Core::Timing::usToCycles(9000)}; + const s64 synchronization_ticks{Core::Timing::usToCycles(std::chrono::microseconds{9000})}; system.CoreTiming().ScheduleEvent(synchronization_ticks, synchronization_event, fence); } diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 1d1581f49..65a88b06c 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -2,11 +2,14 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <array> #include <cstddef> #include <glad/glad.h> #include "common/logging/log.h" +#include "common/scope_exit.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { @@ -24,6 +27,7 @@ Device::Device() { max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); has_variable_aoffi = TestVariableAoffi(); + has_component_indexing_bug = TestComponentIndexingBug(); } Device::Device(std::nullptr_t) { @@ -31,6 +35,7 @@ Device::Device(std::nullptr_t) { max_vertex_attributes = 16; max_varyings = 15; has_variable_aoffi = true; + has_component_indexing_bug = false; } bool Device::TestVariableAoffi() { @@ -52,4 +57,53 @@ void main() { return supported; } +bool Device::TestComponentIndexingBug() { + constexpr char log_message[] = "Renderer_ComponentIndexingBug: {}"; + const GLchar* COMPONENT_TEST = R"(#version 430 core +layout (std430, binding = 0) buffer OutputBuffer { + uint output_value; +}; +layout (std140, binding = 0) uniform InputBuffer { + uvec4 input_value[4096]; +}; +layout (location = 0) uniform uint idx; +void main() { + output_value = input_value[idx >> 2][idx & 3]; +})"; + const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &COMPONENT_TEST)}; + SCOPE_EXIT({ glDeleteProgram(shader); }); + glUseProgram(shader); + + OGLVertexArray vao; + vao.Create(); + glBindVertexArray(vao.handle); + + constexpr std::array<GLuint, 8> values{0, 0, 0, 0, 0x1236327, 0x985482, 0x872753, 0x2378432}; + OGLBuffer ubo; + ubo.Create(); + glNamedBufferData(ubo.handle, sizeof(values), values.data(), GL_STATIC_DRAW); + glBindBufferBase(GL_UNIFORM_BUFFER, 0, ubo.handle); + + OGLBuffer ssbo; + ssbo.Create(); + glNamedBufferStorage(ssbo.handle, sizeof(GLuint), nullptr, GL_CLIENT_STORAGE_BIT); + + for (GLuint index = 4; index < 8; ++index) { + glInvalidateBufferData(ssbo.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo.handle); + + glProgramUniform1ui(shader, 0, index); + glDrawArrays(GL_POINTS, 0, 1); + + GLuint result; + glGetNamedBufferSubData(ssbo.handle, 0, sizeof(result), &result); + if (result != values.at(index)) { + LOG_INFO(Render_OpenGL, log_message, true); + return true; + } + } + LOG_INFO(Render_OpenGL, log_message, false); + return false; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index de8490682..8c8c93760 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -30,13 +30,19 @@ public: return has_variable_aoffi; } + bool HasComponentIndexingBug() const { + return has_component_indexing_bug; + } + private: static bool TestVariableAoffi(); + static bool TestComponentIndexingBug(); std::size_t uniform_buffer_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; bool has_variable_aoffi{}; + bool has_component_indexing_bug{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index d66252224..ac8a9e6b7 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -35,8 +35,8 @@ struct UnspecializedShader { namespace { /// Gets the address for the specified shader stage program -GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) { - const auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()}; +GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) { + const auto& gpu{system.GPU().Maxwell3D()}; const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]}; return gpu.regs.code_address.CodeAddress() + shader_config.offset; } @@ -350,7 +350,8 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode, ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, Core::Frontend::EmuWindow& emu_window, const Device& device) - : RasterizerCache{rasterizer}, emu_window{emu_window}, device{device}, disk_cache{system} {} + : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device}, + disk_cache{system} {} void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { @@ -546,42 +547,45 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia } Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.shaders) { - return last_shaders[static_cast<u32>(program)]; + if (!system.GPU().Maxwell3D().dirty_flags.shaders) { + return last_shaders[static_cast<std::size_t>(program)]; } - auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - const GPUVAddr program_addr{GetShaderAddress(program)}; + auto& memory_manager{system.GPU().MemoryManager()}; + const GPUVAddr program_addr{GetShaderAddress(system, program)}; // Look up shader in the cache based on address - const auto& host_ptr{memory_manager.GetPointer(program_addr)}; + const auto host_ptr{memory_manager.GetPointer(program_addr)}; Shader shader{TryGet(host_ptr)}; + if (shader) { + return last_shaders[static_cast<std::size_t>(program)] = shader; + } - if (!shader) { - // No shader found - create a new one - ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; - ProgramCode program_code_b; - if (program == Maxwell::ShaderProgram::VertexA) { - const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)}; - program_code_b = GetShaderCode(memory_manager, program_addr_b, - memory_manager.GetPointer(program_addr_b)); - } - const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); - const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; - const auto found = precompiled_shaders.find(unique_identifier); - if (found != precompiled_shaders.end()) { - shader = - std::make_shared<CachedShader>(cpu_addr, unique_identifier, program, disk_cache, - precompiled_programs, found->second, host_ptr); - } else { - shader = std::make_shared<CachedShader>( - device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, - std::move(program_code), std::move(program_code_b), host_ptr); - } - Register(shader); + // No shader found - create a new one + ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; + ProgramCode program_code_b; + if (program == Maxwell::ShaderProgram::VertexA) { + const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; + program_code_b = GetShaderCode(memory_manager, program_addr_b, + memory_manager.GetPointer(program_addr_b)); + } + + const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); + const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; + const auto found = precompiled_shaders.find(unique_identifier); + if (found != precompiled_shaders.end()) { + // Create a shader from the cache + shader = std::make_shared<CachedShader>(cpu_addr, unique_identifier, program, disk_cache, + precompiled_programs, found->second, host_ptr); + } else { + // Create a shader from guest memory + shader = std::make_shared<CachedShader>( + device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, + std::move(program_code), std::move(program_code_b), host_ptr); } + Register(shader); - return last_shaders[static_cast<u32>(program)] = shader; + return last_shaders[static_cast<std::size_t>(program)] = shader; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 64e5a5594..09bd0761d 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -137,6 +137,7 @@ private: CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump, const std::set<GLenum>& supported_formats); + Core::System& system; Core::Frontend::EmuWindow& emu_window; const Device& device; ShaderDiskCacheOpenGL disk_cache; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index e9f8d40db..29de5c9db 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -45,7 +45,6 @@ struct TextureAoffi {}; using TextureArgument = std::pair<Type, Node>; using TextureIR = std::variant<TextureAoffi, TextureArgument>; -enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 }; constexpr u32 MAX_CONSTBUFFER_ELEMENTS = static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); @@ -247,6 +246,12 @@ private: code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices); code.AddNewLine(); + code.AddLine("in gl_PerVertex {{"); + ++code.scope; + code.AddLine("vec4 gl_Position;"); + --code.scope; + code.AddLine("}} gl_in[];"); + DeclareVertexRedeclarations(); } @@ -349,7 +354,7 @@ private: } void DeclareInputAttribute(Attribute::Index index, bool skip_unused) { - const u32 generic_index{GetGenericAttributeIndex(index)}; + const u32 location{GetGenericAttributeIndex(index)}; std::string name{GetInputAttribute(index)}; if (stage == ShaderStage::Geometry) { @@ -358,19 +363,13 @@ private: std::string suffix; if (stage == ShaderStage::Fragment) { - const auto input_mode{header.ps.GetAttributeUse(generic_index)}; + const auto input_mode{header.ps.GetAttributeUse(location)}; if (skip_unused && input_mode == AttributeUse::Unused) { return; } suffix = GetInputFlags(input_mode); } - u32 location = generic_index; - if (stage != ShaderStage::Vertex) { - // If inputs are varyings, add an offset - location += GENERIC_VARYING_START_LOCATION; - } - code.AddLine("layout (location = {}) {} in vec4 {};", location, suffix, name); } @@ -395,7 +394,7 @@ private: } void DeclareOutputAttribute(Attribute::Index index) { - const u32 location{GetGenericAttributeIndex(index) + GENERIC_VARYING_START_LOCATION}; + const u32 location{GetGenericAttributeIndex(index)}; code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index)); } @@ -577,9 +576,26 @@ private: if (std::holds_alternative<OperationNode>(*offset)) { // Indirect access const std::string final_offset = code.GenerateTemporary(); - code.AddLine("uint {} = (ftou({}) / 4);", final_offset, Visit(offset)); - return fmt::format("{}[{} / 4][{} % 4]", GetConstBuffer(cbuf->GetIndex()), - final_offset, final_offset); + code.AddLine("uint {} = ftou({}) >> 2;", final_offset, Visit(offset)); + + if (!device.HasComponentIndexingBug()) { + return fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), + final_offset, final_offset); + } + + // AMD's proprietary GLSL compiler emits ill code for variable component access. + // To bypass this driver bug generate 4 ifs, one per each component. + const std::string pack = code.GenerateTemporary(); + code.AddLine("vec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), + final_offset); + + const std::string result = code.GenerateTemporary(); + code.AddLine("float {};", result); + for (u32 swizzle = 0; swizzle < 4; ++swizzle) { + code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, + pack, GetSwizzle(swizzle)); + } + return result; } UNREACHABLE_MSG("Unmanaged offset node type"); @@ -633,10 +649,14 @@ private: switch (attribute) { case Attribute::Index::Position: - if (stage != ShaderStage::Fragment) { - return GeometryPass("position") + GetSwizzle(element); - } else { + switch (stage) { + case ShaderStage::Geometry: + return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer), + GetSwizzle(element)); + case ShaderStage::Fragment: return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element)); + default: + UNREACHABLE(); } case Attribute::Index::PointCoord: switch (element) { @@ -921,7 +941,7 @@ private: target = [&]() -> std::string { switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) { case Attribute::Index::Position: - return "position"s + GetSwizzle(abuf->GetElement()); + return "gl_Position"s + GetSwizzle(abuf->GetElement()); case Attribute::Index::PointSize: return "gl_PointSize"; case Attribute::Index::ClipDistances0123: @@ -1506,9 +1526,7 @@ private: // If a geometry shader is attached, it will always flip (it's the last stage before // fragment). For more info about flipping, refer to gl_shader_gen.cpp. - code.AddLine("position.xy *= viewport_flip.xy;"); - code.AddLine("gl_Position = position;"); - code.AddLine("position.w = 1.0;"); + code.AddLine("gl_Position.xy *= viewport_flip.xy;"); code.AddLine("EmitVertex();"); return {}; } @@ -1746,8 +1764,7 @@ private: } u32 GetNumPhysicalVaryings() const { - return std::min<u32>(device.GetMaxVaryings() - GENERIC_VARYING_START_LOCATION, - Maxwell::NumVaryings); + return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings); } const Device& device; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index d2bb705a9..c845b29aa 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -23,8 +23,6 @@ ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setu out += GetCommonDeclarations(); out += R"( -layout (location = 0) out vec4 position; - layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config { vec4 viewport_flip; uvec4 config_pack; // instance_id, flip_stage, y_direction, padding @@ -48,7 +46,6 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config { out += R"( void main() { - position = vec4(0.0, 0.0, 0.0, 0.0); execute_vertex(); )"; @@ -59,19 +56,12 @@ void main() { out += R"( // Set Position Y direction - position.y *= utof(config_pack[2]); + gl_Position.y *= utof(config_pack[2]); // Check if the flip stage is VertexB // Config pack's second value is flip_stage if (config_pack[1] == 1) { // Viewport can be flipped, which is unsupported by glViewport - position.xy *= viewport_flip.xy; - } - gl_Position = position; - - // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0 - // For now, this is here to bring order in lieu of proper emulation - if (config_pack[1] == 1) { - position.w = 1.0; + gl_Position.xy *= viewport_flip.xy; } })"; @@ -85,9 +75,6 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se out += GetCommonDeclarations(); out += R"( -layout (location = 0) in vec4 gs_position[]; -layout (location = 0) out vec4 position; - layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config { vec4 viewport_flip; uvec4 config_pack; // instance_id, flip_stage, y_direction, padding @@ -124,8 +111,6 @@ layout (location = 5) out vec4 FragColor5; layout (location = 6) out vec4 FragColor6; layout (location = 7) out vec4 FragColor7; -layout (location = 0) in noperspective vec4 position; - layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config { vec4 viewport_flip; uvec4 config_pack; // instance_id, flip_stage, y_direction, padding diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 00242ecbe..3b966ddc3 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -18,6 +18,7 @@ constexpr std::array<vk::Format, 3> Depth24UnormS8Uint = { vk::Format::eD32SfloatS8Uint, vk::Format::eD16UnormS8Uint, {}}; constexpr std::array<vk::Format, 3> Depth16UnormS8Uint = { vk::Format::eD24UnormS8Uint, vk::Format::eD32SfloatS8Uint, {}}; +constexpr std::array<vk::Format, 2> Astc = {vk::Format::eA8B8G8R8UnormPack32, {}}; } // namespace Alternatives @@ -51,15 +52,19 @@ VKDevice::VKDevice(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice phy : physical{physical}, format_properties{GetFormatProperties(dldi, physical)} { SetupFamilies(dldi, surface); SetupProperties(dldi); + SetupFeatures(dldi); } VKDevice::~VKDevice() = default; bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instance) { - const auto queue_cis = GetDeviceQueueCreateInfos(); - vk::PhysicalDeviceFeatures device_features{}; + vk::PhysicalDeviceFeatures device_features; + device_features.vertexPipelineStoresAndAtomics = true; + device_features.independentBlend = true; + device_features.textureCompressionASTC_LDR = is_optimal_astc_supported; - const std::vector<const char*> extensions = {VK_KHR_SWAPCHAIN_EXTENSION_NAME}; + const auto queue_cis = GetDeviceQueueCreateInfos(); + const std::vector<const char*> extensions = LoadExtensions(dldi); const vk::DeviceCreateInfo device_ci({}, static_cast<u32>(queue_cis.size()), queue_cis.data(), 0, nullptr, static_cast<u32>(extensions.size()), extensions.data(), &device_features); @@ -90,7 +95,7 @@ vk::Format VKDevice::GetSupportedFormat(vk::Format wanted_format, LOG_CRITICAL(Render_Vulkan, "Format={} with usage={} and type={} has no defined alternatives and host " "hardware does not support it", - static_cast<u32>(wanted_format), static_cast<u32>(wanted_usage), + vk::to_string(wanted_format), vk::to_string(wanted_usage), static_cast<u32>(format_type)); UNREACHABLE(); return wanted_format; @@ -118,6 +123,30 @@ vk::Format VKDevice::GetSupportedFormat(vk::Format wanted_format, return wanted_format; } +bool VKDevice::IsOptimalAstcSupported(const vk::PhysicalDeviceFeatures& features, + const vk::DispatchLoaderDynamic& dldi) const { + if (!features.textureCompressionASTC_LDR) { + return false; + } + const auto format_feature_usage{ + vk::FormatFeatureFlagBits::eSampledImage | vk::FormatFeatureFlagBits::eBlitSrc | + vk::FormatFeatureFlagBits::eBlitDst | vk::FormatFeatureFlagBits::eTransferSrc | + vk::FormatFeatureFlagBits::eTransferDst}; + constexpr std::array<vk::Format, 9> astc_formats = { + vk::Format::eAstc4x4UnormBlock, vk::Format::eAstc4x4SrgbBlock, + vk::Format::eAstc8x8SrgbBlock, vk::Format::eAstc8x6SrgbBlock, + vk::Format::eAstc5x4SrgbBlock, vk::Format::eAstc5x5UnormBlock, + vk::Format::eAstc5x5SrgbBlock, vk::Format::eAstc10x8UnormBlock, + vk::Format::eAstc10x8SrgbBlock}; + for (const auto format : astc_formats) { + const auto format_properties{physical.getFormatProperties(format, dldi)}; + if (!(format_properties.optimalTilingFeatures & format_feature_usage)) { + return false; + } + } + return true; +} + bool VKDevice::IsFormatSupported(vk::Format wanted_format, vk::FormatFeatureFlags wanted_usage, FormatType format_type) const { const auto it = format_properties.find(wanted_format); @@ -132,11 +161,9 @@ bool VKDevice::IsFormatSupported(vk::Format wanted_format, vk::FormatFeatureFlag bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical, vk::SurfaceKHR surface) { - const std::string swapchain_extension = VK_KHR_SWAPCHAIN_EXTENSION_NAME; - bool has_swapchain{}; for (const auto& prop : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) { - has_swapchain |= prop.extensionName == swapchain_extension; + has_swapchain |= prop.extensionName == std::string(VK_KHR_SWAPCHAIN_EXTENSION_NAME); } if (!has_swapchain) { // The device doesn't support creating swapchains. @@ -160,8 +187,14 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev } // TODO(Rodrigo): Check if the device matches all requeriments. - const vk::PhysicalDeviceProperties props = physical.getProperties(dldi); - if (props.limits.maxUniformBufferRange < 65536) { + const auto properties{physical.getProperties(dldi)}; + const auto limits{properties.limits}; + if (limits.maxUniformBufferRange < 65536) { + return false; + } + + const vk::PhysicalDeviceFeatures features{physical.getFeatures(dldi)}; + if (!features.vertexPipelineStoresAndAtomics || !features.independentBlend) { return false; } @@ -169,6 +202,30 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev return true; } +std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynamic& dldi) { + std::vector<const char*> extensions; + extensions.reserve(2); + extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + + const auto Test = [&](const vk::ExtensionProperties& extension, + std::optional<std::reference_wrapper<bool>> status, const char* name, + u32 revision) { + if (extension.extensionName != std::string(name)) { + return; + } + extensions.push_back(name); + if (status) { + status->get() = true; + } + }; + + for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) { + Test(extension, ext_scalar_block_layout, VK_EXT_SCALAR_BLOCK_LAYOUT_EXTENSION_NAME, 1); + } + + return extensions; +} + void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceKHR surface) { std::optional<u32> graphics_family_, present_family_; @@ -196,10 +253,16 @@ void VKDevice::SetupProperties(const vk::DispatchLoaderDynamic& dldi) { const vk::PhysicalDeviceProperties props = physical.getProperties(dldi); device_type = props.deviceType; uniform_buffer_alignment = static_cast<u64>(props.limits.minUniformBufferOffsetAlignment); + max_storage_buffer_range = static_cast<u64>(props.limits.maxStorageBufferRange); +} + +void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { + const auto supported_features{physical.getFeatures(dldi)}; + is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); } std::vector<vk::DeviceQueueCreateInfo> VKDevice::GetDeviceQueueCreateInfos() const { - static const float QUEUE_PRIORITY = 1.f; + static const float QUEUE_PRIORITY = 1.0f; std::set<u32> unique_queue_families = {graphics_family, present_family}; std::vector<vk::DeviceQueueCreateInfo> queue_cis; @@ -212,26 +275,43 @@ std::vector<vk::DeviceQueueCreateInfo> VKDevice::GetDeviceQueueCreateInfos() con std::map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperties( const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical) { + static constexpr std::array formats{vk::Format::eA8B8G8R8UnormPack32, + vk::Format::eB5G6R5UnormPack16, + vk::Format::eA2B10G10R10UnormPack32, + vk::Format::eR32G32B32A32Sfloat, + vk::Format::eR16G16Unorm, + vk::Format::eR16G16Snorm, + vk::Format::eR8G8B8A8Srgb, + vk::Format::eR8Unorm, + vk::Format::eB10G11R11UfloatPack32, + vk::Format::eR32Sfloat, + vk::Format::eR16Sfloat, + vk::Format::eR16G16B16A16Sfloat, + vk::Format::eD32Sfloat, + vk::Format::eD16Unorm, + vk::Format::eD16UnormS8Uint, + vk::Format::eD24UnormS8Uint, + vk::Format::eD32SfloatS8Uint, + vk::Format::eBc1RgbaUnormBlock, + vk::Format::eBc2UnormBlock, + vk::Format::eBc3UnormBlock, + vk::Format::eBc4UnormBlock, + vk::Format::eBc5UnormBlock, + vk::Format::eBc5SnormBlock, + vk::Format::eBc7UnormBlock, + vk::Format::eAstc4x4UnormBlock, + vk::Format::eAstc4x4SrgbBlock, + vk::Format::eAstc8x8SrgbBlock, + vk::Format::eAstc8x6SrgbBlock, + vk::Format::eAstc5x4SrgbBlock, + vk::Format::eAstc5x5UnormBlock, + vk::Format::eAstc5x5SrgbBlock, + vk::Format::eAstc10x8UnormBlock, + vk::Format::eAstc10x8SrgbBlock}; std::map<vk::Format, vk::FormatProperties> format_properties; - - const auto AddFormatQuery = [&format_properties, &dldi, physical](vk::Format format) { + for (const auto format : formats) { format_properties.emplace(format, physical.getFormatProperties(format, dldi)); - }; - AddFormatQuery(vk::Format::eA8B8G8R8UnormPack32); - AddFormatQuery(vk::Format::eB5G6R5UnormPack16); - AddFormatQuery(vk::Format::eA2B10G10R10UnormPack32); - AddFormatQuery(vk::Format::eR8G8B8A8Srgb); - AddFormatQuery(vk::Format::eR8Unorm); - AddFormatQuery(vk::Format::eD32Sfloat); - AddFormatQuery(vk::Format::eD16Unorm); - AddFormatQuery(vk::Format::eD16UnormS8Uint); - AddFormatQuery(vk::Format::eD24UnormS8Uint); - AddFormatQuery(vk::Format::eD32SfloatS8Uint); - AddFormatQuery(vk::Format::eBc1RgbaUnormBlock); - AddFormatQuery(vk::Format::eBc2UnormBlock); - AddFormatQuery(vk::Format::eBc3UnormBlock); - AddFormatQuery(vk::Format::eBc4UnormBlock); - + } return format_properties; } diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index e87c7a508..537825d8b 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -11,7 +11,7 @@ namespace Vulkan { -/// Format usage descriptor +/// Format usage descriptor. enum class FormatType { Linear, Optimal, Buffer }; /// Handles data specific to a physical device. @@ -34,12 +34,12 @@ public: vk::Format GetSupportedFormat(vk::Format wanted_format, vk::FormatFeatureFlags wanted_usage, FormatType format_type) const; - /// Returns the dispatch loader with direct function pointers of the device + /// Returns the dispatch loader with direct function pointers of the device. const vk::DispatchLoaderDynamic& GetDispatchLoader() const { return dld; } - /// Returns the logical device + /// Returns the logical device. vk::Device GetLogical() const { return logical.get(); } @@ -69,30 +69,55 @@ public: return present_family; } - /// Returns if the device is integrated with the host CPU + /// Returns if the device is integrated with the host CPU. bool IsIntegrated() const { return device_type == vk::PhysicalDeviceType::eIntegratedGpu; } - /// Returns uniform buffer alignment requeriment + /// Returns uniform buffer alignment requeriment. u64 GetUniformBufferAlignment() const { return uniform_buffer_alignment; } + /// Returns the maximum range for storage buffers. + u64 GetMaxStorageBufferRange() const { + return max_storage_buffer_range; + } + + /// Returns true if ASTC is natively supported. + bool IsOptimalAstcSupported() const { + return is_optimal_astc_supported; + } + + /// Returns true if the device supports VK_EXT_scalar_block_layout. + bool IsExtScalarBlockLayoutSupported() const { + return ext_scalar_block_layout; + } + /// Checks if the physical device is suitable. static bool IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical, vk::SurfaceKHR surface); private: + /// Loads extensions into a vector and stores available ones in this object. + std::vector<const char*> LoadExtensions(const vk::DispatchLoaderDynamic& dldi); + /// Sets up queue families. void SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceKHR surface); /// Sets up device properties. void SetupProperties(const vk::DispatchLoaderDynamic& dldi); + /// Sets up device features. + void SetupFeatures(const vk::DispatchLoaderDynamic& dldi); + /// Returns a list of queue initialization descriptors. std::vector<vk::DeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const; + /// Returns true if ASTC textures are natively supported. + bool IsOptimalAstcSupported(const vk::PhysicalDeviceFeatures& features, + const vk::DispatchLoaderDynamic& dldi) const; + /// Returns true if a format is supported. bool IsFormatSupported(vk::Format wanted_format, vk::FormatFeatureFlags wanted_usage, FormatType format_type) const; @@ -101,16 +126,19 @@ private: static std::map<vk::Format, vk::FormatProperties> GetFormatProperties( const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical); - const vk::PhysicalDevice physical; ///< Physical device - vk::DispatchLoaderDynamic dld; ///< Device function pointers - UniqueDevice logical; ///< Logical device - vk::Queue graphics_queue; ///< Main graphics queue - vk::Queue present_queue; ///< Main present queue - u32 graphics_family{}; ///< Main graphics queue family index - u32 present_family{}; ///< Main present queue family index - vk::PhysicalDeviceType device_type; ///< Physical device type - u64 uniform_buffer_alignment{}; ///< Uniform buffer alignment requeriment - std::map<vk::Format, vk::FormatProperties> format_properties; ///< Format properties dictionary + const vk::PhysicalDevice physical; ///< Physical device. + vk::DispatchLoaderDynamic dld; ///< Device function pointers. + UniqueDevice logical; ///< Logical device. + vk::Queue graphics_queue; ///< Main graphics queue. + vk::Queue present_queue; ///< Main present queue. + u32 graphics_family{}; ///< Main graphics queue family index. + u32 present_family{}; ///< Main present queue family index. + vk::PhysicalDeviceType device_type; ///< Physical device type. + u64 uniform_buffer_alignment{}; ///< Uniform buffer alignment requeriment. + u64 max_storage_buffer_range{}; ///< Max storage buffer size. + bool is_optimal_astc_supported{}; ///< Support for native ASTC. + bool ext_scalar_block_layout{}; ///< Support for VK_EXT_scalar_block_layout. + std::map<vk::Format, vk::FormatProperties> format_properties; ///< Format properties dictionary. }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index a5b25aeff..a85fcae5a 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -17,6 +17,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/engines/shader_header.h" +#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/shader/shader_ir.h" @@ -33,7 +34,8 @@ using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage; using Operation = const OperationNode&; // TODO(Rodrigo): Use rasterizer's value -constexpr u32 MAX_CONSTBUFFER_ELEMENTS = 0x1000; +constexpr u32 MAX_CONSTBUFFER_FLOATS = 0x4000; +constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_FLOATS / 4; constexpr u32 STAGE_BINDING_STRIDE = 0x100; enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat }; @@ -87,8 +89,8 @@ bool IsPrecise(Operation operand) { class SPIRVDecompiler : public Sirit::Module { public: - explicit SPIRVDecompiler(const ShaderIR& ir, ShaderStage stage) - : Module(0x00010300), ir{ir}, stage{stage}, header{ir.GetHeader()} { + explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderStage stage) + : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()} { AddCapability(spv::Capability::Shader); AddExtension("SPV_KHR_storage_buffer_storage_class"); AddExtension("SPV_KHR_variable_pointers"); @@ -195,7 +197,9 @@ public: entries.samplers.emplace_back(sampler); } for (const auto& attribute : ir.GetInputAttributes()) { - entries.attributes.insert(GetGenericAttributeLocation(attribute)); + if (IsGenericAttribute(attribute)) { + entries.attributes.insert(GetGenericAttributeLocation(attribute)); + } } entries.clip_distances = ir.GetClipDistances(); entries.shader_length = ir.GetLength(); @@ -210,7 +214,6 @@ private: std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount); - static constexpr u32 CBUF_STRIDE = 16; void AllocateBindings() { const u32 binding_base = static_cast<u32>(stage) * STAGE_BINDING_STRIDE; @@ -315,6 +318,7 @@ private: constexpr std::array<const char*, INTERNAL_FLAGS_COUNT> names = {"zero", "sign", "carry", "overflow"}; for (std::size_t flag = 0; flag < INTERNAL_FLAGS_COUNT; ++flag) { + const auto flag_code = static_cast<InternalFlag>(flag); const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false); internal_flags[flag] = AddGlobalVariable(Name(id, names[flag])); } @@ -374,7 +378,9 @@ private: u32 binding = const_buffers_base_binding; for (const auto& entry : ir.GetConstantBuffers()) { const auto [index, size] = entry; - const Id id = OpVariable(t_cbuf_ubo, spv::StorageClass::Uniform); + const Id type = + device.IsExtScalarBlockLayoutSupported() ? t_cbuf_scalar_ubo : t_cbuf_std140_ubo; + const Id id = OpVariable(type, spv::StorageClass::Uniform); AddGlobalVariable(Name(id, fmt::format("cbuf_{}", index))); Decorate(id, spv::Decoration::Binding, binding++); @@ -569,33 +575,35 @@ private: const Node offset = cbuf->GetOffset(); const Id buffer_id = constant_buffers.at(cbuf->GetIndex()); - Id buffer_index{}; - Id buffer_element{}; - - if (const auto immediate = std::get_if<ImmediateNode>(offset)) { - // Direct access - const u32 offset_imm = immediate->GetValue(); - ASSERT(offset_imm % 4 == 0); - buffer_index = Constant(t_uint, offset_imm / 16); - buffer_element = Constant(t_uint, (offset_imm / 4) % 4); - - } else if (std::holds_alternative<OperationNode>(*offset)) { - // Indirect access - // TODO(Rodrigo): Use a uniform buffer stride of 4 and drop this slow math (which - // emits sub-optimal code on GLSL from my testing). - const Id offset_id = BitcastTo<Type::Uint>(Visit(offset)); - const Id unsafe_offset = Emit(OpUDiv(t_uint, offset_id, Constant(t_uint, 4))); - const Id final_offset = Emit( - OpUMod(t_uint, unsafe_offset, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS - 1))); - buffer_index = Emit(OpUDiv(t_uint, final_offset, Constant(t_uint, 4))); - buffer_element = Emit(OpUMod(t_uint, final_offset, Constant(t_uint, 4))); - + Id pointer{}; + if (device.IsExtScalarBlockLayoutSupported()) { + const Id buffer_offset = Emit(OpShiftRightLogical( + t_uint, BitcastTo<Type::Uint>(Visit(offset)), Constant(t_uint, 2u))); + pointer = Emit( + OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0u), buffer_offset)); } else { - UNREACHABLE_MSG("Unmanaged offset node type"); + Id buffer_index{}; + Id buffer_element{}; + if (const auto immediate = std::get_if<ImmediateNode>(offset)) { + // Direct access + const u32 offset_imm = immediate->GetValue(); + ASSERT(offset_imm % 4 == 0); + buffer_index = Constant(t_uint, offset_imm / 16); + buffer_element = Constant(t_uint, (offset_imm / 4) % 4); + } else if (std::holds_alternative<OperationNode>(*offset)) { + // Indirect access + const Id offset_id = BitcastTo<Type::Uint>(Visit(offset)); + const Id unsafe_offset = Emit(OpUDiv(t_uint, offset_id, Constant(t_uint, 4))); + const Id final_offset = Emit(OpUMod( + t_uint, unsafe_offset, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS - 1))); + buffer_index = Emit(OpUDiv(t_uint, final_offset, Constant(t_uint, 4))); + buffer_element = Emit(OpUMod(t_uint, final_offset, Constant(t_uint, 4))); + } else { + UNREACHABLE_MSG("Unmanaged offset node type"); + } + pointer = Emit(OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), + buffer_index, buffer_element)); } - - const Id pointer = Emit(OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), - buffer_index, buffer_element)); return Emit(OpLoad(t_float, pointer)); } else if (const auto gmem = std::get_if<GmemNode>(node)) { @@ -612,7 +620,9 @@ private: // It's invalid to call conditional on nested nodes, use an operation instead const Id true_label = OpLabel(); const Id skip_label = OpLabel(); - Emit(OpBranchConditional(Visit(conditional->GetCondition()), true_label, skip_label)); + const Id condition = Visit(conditional->GetCondition()); + Emit(OpSelectionMerge(skip_label, spv::SelectionControlMask::MaskNone)); + Emit(OpBranchConditional(condition, true_label, skip_label)); Emit(true_label); VisitBasicBlock(conditional->GetCode()); @@ -968,11 +978,11 @@ private: case ShaderStage::Vertex: { // TODO(Rodrigo): We should use VK_EXT_depth_range_unrestricted instead, but it doesn't // seem to be working on Nvidia's drivers and Intel (mesa and blob) doesn't support it. - const Id position = AccessElement(t_float4, per_vertex, position_index); - Id depth = Emit(OpLoad(t_float, AccessElement(t_out_float, position, 2))); + const Id z_pointer = AccessElement(t_out_float, per_vertex, position_index, 2u); + Id depth = Emit(OpLoad(t_float, z_pointer)); depth = Emit(OpFAdd(t_float, depth, Constant(t_float, 1.0f))); depth = Emit(OpFMul(t_float, depth, Constant(t_float, 0.5f))); - Emit(OpStore(AccessElement(t_out_float, position, 2), depth)); + Emit(OpStore(z_pointer, depth)); break; } case ShaderStage::Fragment: { @@ -1311,6 +1321,7 @@ private: &SPIRVDecompiler::WorkGroupId<2>, }; + const VKDevice& device; const ShaderIR& ir; const ShaderStage stage; const Tegra::Shader::Header header; @@ -1349,12 +1360,18 @@ private: const Id t_out_float4 = Name(TypePointer(spv::StorageClass::Output, t_float4), "out_float4"); const Id t_cbuf_float = TypePointer(spv::StorageClass::Uniform, t_float); - const Id t_cbuf_array = - Decorate(Name(TypeArray(t_float4, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS)), "CbufArray"), - spv::Decoration::ArrayStride, CBUF_STRIDE); - const Id t_cbuf_struct = MemberDecorate( - Decorate(TypeStruct(t_cbuf_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); - const Id t_cbuf_ubo = TypePointer(spv::StorageClass::Uniform, t_cbuf_struct); + const Id t_cbuf_std140 = Decorate( + Name(TypeArray(t_float4, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS)), "CbufStd140Array"), + spv::Decoration::ArrayStride, 16u); + const Id t_cbuf_scalar = Decorate( + Name(TypeArray(t_float, Constant(t_uint, MAX_CONSTBUFFER_FLOATS)), "CbufScalarArray"), + spv::Decoration::ArrayStride, 4u); + const Id t_cbuf_std140_struct = MemberDecorate( + Decorate(TypeStruct(t_cbuf_std140), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); + const Id t_cbuf_scalar_struct = MemberDecorate( + Decorate(TypeStruct(t_cbuf_scalar), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); + const Id t_cbuf_std140_ubo = TypePointer(spv::StorageClass::Uniform, t_cbuf_std140_struct); + const Id t_cbuf_scalar_ubo = TypePointer(spv::StorageClass::Uniform, t_cbuf_scalar_struct); const Id t_gmem_float = TypePointer(spv::StorageClass::StorageBuffer, t_float); const Id t_gmem_array = @@ -1403,8 +1420,9 @@ private: std::map<u32, Id> labels; }; -DecompilerResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage) { - auto decompiler = std::make_unique<SPIRVDecompiler>(ir, stage); +DecompilerResult Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, + Maxwell::ShaderStage stage) { + auto decompiler = std::make_unique<SPIRVDecompiler>(device, ir, stage); decompiler->Decompile(); return {std::move(decompiler), decompiler->GetShaderEntries()}; } diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index 329d8fa38..f90541cc1 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -20,10 +20,13 @@ namespace VideoCommon::Shader { class ShaderIR; } +namespace Vulkan { +class VKDevice; +} + namespace Vulkan::VKShader { using Maxwell = Tegra::Engines::Maxwell3D::Regs; - using SamplerEntry = VideoCommon::Shader::Sampler; constexpr u32 DESCRIPTOR_SET = 0; @@ -75,6 +78,7 @@ struct ShaderEntries { using DecompilerResult = std::pair<std::unique_ptr<Sirit::Module>, ShaderEntries>; -DecompilerResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage); +DecompilerResult Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, + Maxwell::ShaderStage stage); } // namespace Vulkan::VKShader |
