Diffstat (limited to 'src/video_core')
37 files changed, 3330 insertions, 210 deletions
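One detail worth calling out before the hunks: the shader_bytecode.h change below adds an `atoms` union whose `GetImmediateOffset()` returns `offset << 2`, suggesting the 22-bit field counts 32-bit words and is converted to a byte offset. A minimal standalone sketch of that decode (a plain integer stands in for the BitField; the word-offset interpretation is an inference from the shift, not stated in the patch):

#include <cstdint>

// ATOMS appears to encode a signed word offset; GetImmediateOffset()
// mirrors the patch's `offset << 2`, i.e. words * 4 = bytes.
int32_t DecodeAtomsOffset(int64_t raw_field) {
    return static_cast<int32_t>(raw_field << 2);
}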
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 142852082..ccfed4f2e 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -153,6 +153,9 @@ if (ENABLE_VULKAN) renderer_vulkan/fixed_pipeline_state.h renderer_vulkan/maxwell_to_vk.cpp renderer_vulkan/maxwell_to_vk.h + renderer_vulkan/renderer_vulkan.h + renderer_vulkan/vk_blit_screen.cpp + renderer_vulkan/vk_blit_screen.h renderer_vulkan/vk_buffer_cache.cpp renderer_vulkan/vk_buffer_cache.h renderer_vulkan/vk_compute_pass.cpp @@ -171,6 +174,7 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_memory_manager.h renderer_vulkan/vk_pipeline_cache.cpp renderer_vulkan/vk_pipeline_cache.h + renderer_vulkan/vk_rasterizer.cpp renderer_vulkan/vk_rasterizer.h renderer_vulkan/vk_renderpass_cache.cpp renderer_vulkan/vk_renderpass_cache.h @@ -190,8 +194,11 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_stream_buffer.h renderer_vulkan/vk_swapchain.cpp renderer_vulkan/vk_swapchain.h + renderer_vulkan/vk_texture_cache.cpp + renderer_vulkan/vk_texture_cache.h renderer_vulkan/vk_update_descriptor.cpp - renderer_vulkan/vk_update_descriptor.h) + renderer_vulkan/vk_update_descriptor.h + ) target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) target_compile_definitions(video_core PRIVATE HAS_VULKAN) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 1d1f780e7..58dfa8033 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -91,6 +91,7 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rasterize_enable = 1; regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; + regs.cull.front_face = Maxwell3D::Regs::Cull::FrontFace::ClockWise; mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_end_gl)] = true; mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)] = true; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index a35e7a195..ee79260fc 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1018,7 +1018,14 @@ public: } } instanced_arrays; - INSERT_UNION_PADDING_WORDS(0x6); + INSERT_UNION_PADDING_WORDS(0x4); + + union { + BitField<0, 1, u32> enable; + BitField<4, 8, u32> unk4; + } vp_point_size; + + INSERT_UNION_PADDING_WORDS(1); Cull cull; @@ -1271,8 +1278,6 @@ public: } dirty{}; - std::array<u8, Regs::NUM_REGS> dirty_pointers{}; - /// Reads a register value located at the input method address u32 GetRegisterValue(u32 method) const; @@ -1367,6 +1372,8 @@ private: bool execute_on{true}; + std::array<u8, Regs::NUM_REGS> dirty_pointers{}; + /// Retrieves information about a specific TIC entry from the TIC buffer. 
Texture::TICEntry GetTICEntry(u32 tic_index) const; @@ -1503,6 +1510,7 @@ ASSERT_REG_POSITION(primitive_restart, 0x591); ASSERT_REG_POSITION(index_array, 0x5F2); ASSERT_REG_POSITION(polygon_offset_clamp, 0x61F); ASSERT_REG_POSITION(instanced_arrays, 0x620); +ASSERT_REG_POSITION(vp_point_size, 0x644); ASSERT_REG_POSITION(cull, 0x646); ASSERT_REG_POSITION(pixel_center_integer, 0x649); ASSERT_REG_POSITION(viewport_transform_enabled, 0x64B); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 57b57c647..f443ec0fe 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -215,6 +215,40 @@ enum class F2fRoundingOp : u64 { Trunc = 11, }; +enum class AtomicOp : u64 { + Add = 0, + Min = 1, + Max = 2, + Inc = 3, + Dec = 4, + And = 5, + Or = 6, + Xor = 7, + Exch = 8, +}; + +enum class GlobalAtomicOp : u64 { + Add = 0, + Min = 1, + Max = 2, + Inc = 3, + Dec = 4, + And = 5, + Or = 6, + Xor = 7, + Exch = 8, + SafeAdd = 10, +}; + +enum class GlobalAtomicType : u64 { + U32 = 0, + S32 = 1, + U64 = 2, + F32_FTZ_RN = 3, + F16x2_FTZ_RN = 4, + S64 = 5, +}; + enum class UniformType : u64 { UnsignedByte = 0, SignedByte = 1, @@ -236,6 +270,13 @@ enum class StoreType : u64 { Bits128 = 6, }; +enum class AtomicType : u64 { + U32 = 0, + S32 = 1, + U64 = 2, + S64 = 3, +}; + enum class IMinMaxExchange : u64 { None = 0, XLo = 1, @@ -939,6 +980,22 @@ union Instruction { } stg; union { + BitField<52, 4, GlobalAtomicOp> operation; + BitField<49, 3, GlobalAtomicType> type; + BitField<28, 20, s64> offset; + } atom; + + union { + BitField<52, 4, AtomicOp> operation; + BitField<28, 2, AtomicType> type; + BitField<30, 22, s64> offset; + + s32 GetImmediateOffset() const { + return static_cast<s32>(offset << 2); + } + } atoms; + + union { BitField<32, 1, PhysicalAttributeDirection> direction; BitField<47, 3, AttributeSize> size; BitField<20, 11, u64> address; @@ -1659,9 +1716,11 @@ public: ST_A, ST_L, ST_S, - ST, // Store in generic memory - STG, // Store in global memory - AL2P, // Transforms attribute memory into physical memory + ST, // Store in generic memory + STG, // Store in global memory + ATOM, // Atomic operation on global memory + ATOMS, // Atomic operation on shared memory + AL2P, // Transforms attribute memory into physical memory TEX, TEX_B, // Texture Load Bindless TXQ, // Texture Query @@ -1964,6 +2023,8 @@ private: INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"), INST("101-------------", Id::ST, Type::Memory, "ST"), INST("1110111011011---", Id::STG, Type::Memory, "STG"), + INST("11101101--------", Id::ATOM, Type::Memory, "ATOM"), + INST("11101100--------", Id::ATOMS, Type::Memory, "ATOMS"), INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"), INST("110000----111---", Id::TEX, Type::Texture, "TEX"), INST("1101111010111---", Id::TEX_B, Type::Texture, "TEX_B"), diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 672051102..c428f06e4 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -1272,6 +1272,7 @@ void RasterizerOpenGL::SyncPointState() { const auto& regs = system.GPU().Maxwell3D().regs; // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). 
+ state.point.program_control = regs.vp_point_size.enable != 0; state.point.size = std::max(1.0f, regs.point_size); } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index de742d11c..3c5bdd377 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -34,9 +34,6 @@ using VideoCommon::Shader::ShaderIR; namespace { -// One UBO is always reserved for emulation values on staged shaders -constexpr u32 STAGE_RESERVED_UBOS = 1; - constexpr u32 STAGE_MAIN_OFFSET = 10; constexpr u32 KERNEL_MAIN_OFFSET = 0; @@ -243,7 +240,6 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp if (!code_b.empty()) { ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker); } - const auto entries = GLShader::GetEntries(ir); std::string source = fmt::format(R"(// {} #version 430 core @@ -264,6 +260,10 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp "#extension GL_NV_shader_thread_group : require\n" "#extension GL_NV_shader_thread_shuffle : require\n"; } + // This pragma stops Nvidia's driver from over optimizing math (probably using fp16 operations) + // on places where we don't want to. + // Thanks to Ryujinx for finding this workaround. + source += "#pragma optionNV(fastmath off)\n"; if (shader_type == ShaderType::Geometry) { const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(variant.primitive_mode); @@ -314,9 +314,10 @@ std::unordered_set<GLenum> GetSupportedFormats() { CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type, GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b) - : RasterizerCacheObject{params.host_ptr}, system{params.system}, disk_cache{params.disk_cache}, - device{params.device}, cpu_addr{params.cpu_addr}, unique_identifier{params.unique_identifier}, - shader_type{shader_type}, entries{entries}, code{std::move(code)}, code_b{std::move(code_b)} { + : RasterizerCacheObject{params.host_ptr}, system{params.system}, + disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr}, + unique_identifier{params.unique_identifier}, shader_type{shader_type}, + entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} { if (!params.precompiled_variants) { return; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index f9f7a97b5..a1ac3d7a9 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -1019,7 +1019,6 @@ private: } return {{"gl_ViewportIndex", Type::Int}}; case 3: - UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader"); return {{"gl_PointSize", Type::Float}}; } return {}; @@ -1856,6 +1855,13 @@ private: Type::Uint}; } + template <const std::string_view& opname, Type type> + Expression Atomic(Operation operation) { + return {fmt::format("atomic{}({}, {})", opname, Visit(operation[0]).GetCode(), + Visit(operation[1]).As(type)), + type}; + } + Expression Branch(Operation operation) { const auto target = std::get_if<ImmediateNode>(&*operation[0]); UNIMPLEMENTED_IF(!target); @@ -2194,6 +2200,8 @@ private: &GLSLDecompiler::AtomicImage<Func::Xor>, &GLSLDecompiler::AtomicImage<Func::Exchange>, + &GLSLDecompiler::Atomic<Func::Add, Type::Uint>, + &GLSLDecompiler::Branch, &GLSLDecompiler::BranchIndirect, 
&GLSLDecompiler::PushFlowStack, @@ -2313,7 +2321,7 @@ public: explicit ExprDecompiler(GLSLDecompiler& decomp) : decomp{decomp} {} void operator()(const ExprAnd& expr) { - inner += "( "; + inner += '('; std::visit(*this, *expr.operand1); inner += " && "; std::visit(*this, *expr.operand2); @@ -2321,7 +2329,7 @@ public: } void operator()(const ExprOr& expr) { - inner += "( "; + inner += '('; std::visit(*this, *expr.operand1); inner += " || "; std::visit(*this, *expr.operand2); @@ -2339,28 +2347,7 @@ public: } void operator()(const ExprCondCode& expr) { - const Node cc = decomp.ir.GetConditionCode(expr.cc); - std::string target; - - if (const auto pred = std::get_if<PredicateNode>(&*cc)) { - const auto index = pred->GetIndex(); - switch (index) { - case Tegra::Shader::Pred::NeverExecute: - target = "false"; - break; - case Tegra::Shader::Pred::UnusedIndex: - target = "true"; - break; - default: - target = decomp.GetPredicate(index); - break; - } - } else if (const auto flag = std::get_if<InternalFlagNode>(&*cc)) { - target = decomp.GetInternalFlag(flag->GetFlag()); - } else { - UNREACHABLE(); - } - inner += target; + inner += decomp.Visit(decomp.ir.GetConditionCode(expr.cc)).AsBool(); } void operator()(const ExprVar& expr) { @@ -2372,8 +2359,7 @@ public: } void operator()(VideoCommon::Shader::ExprGprEqual& expr) { - inner += - "( ftou(" + decomp.GetRegister(expr.gpr) + ") == " + std::to_string(expr.value) + ')'; + inner += fmt::format("(ftou({}) == {})", decomp.GetRegister(expr.gpr), expr.value); } const std::string& GetResult() const { @@ -2381,8 +2367,8 @@ public: } private: - std::string inner; GLSLDecompiler& decomp; + std::string inner; }; class ASTDecompiler { diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index df2e2395a..cc185e9e1 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -127,6 +127,7 @@ void OpenGLState::ApplyClipDistances() { } void OpenGLState::ApplyPointSize() { + Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control); if (UpdateValue(cur_state.point.size, point.size)) { glPointSize(point.size); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index fb180f302..678e5cd89 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -131,7 +131,8 @@ public: std::array<Viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports; struct { - float size = 1.0f; // GL_POINT_SIZE + bool program_control = false; // GL_PROGRAM_POINT_SIZE + GLfloat size = 1.0f; // GL_POINT_SIZE } point; struct { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b790b0ef4..d4b81cd87 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -44,7 +44,7 @@ struct FormatTuple { constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, false}, // ABGR8U - {GL_RGBA8, GL_RGBA, GL_BYTE, false}, // ABGR8S + {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE, false}, // ABGR8S {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, false}, // ABGR8UI {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, false}, // B5G6R5U {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, false}, // A2B10G10R10U @@ -83,9 +83,9 @@ constexpr std::array<FormatTuple, 
VideoCore::Surface::MaxPixelFormat> tex_format {GL_RGB32F, GL_RGB, GL_FLOAT, false}, // RGB32F {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, false}, // RGBA8_SRGB {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, false}, // RG8U - {GL_RG8, GL_RG, GL_BYTE, false}, // RG8S + {GL_RG8_SNORM, GL_RG, GL_BYTE, false}, // RG8S {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, false}, // RG32UI - {GL_RGB16F, GL_RGBA16, GL_HALF_FLOAT, false}, // RGBX16F + {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBX16F {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, false}, // R32UI {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X5 @@ -176,6 +176,19 @@ GLint GetSwizzleSource(SwizzleSource source) { return GL_NONE; } +GLenum GetComponent(PixelFormat format, bool is_first) { + switch (format) { + case PixelFormat::Z24S8: + case PixelFormat::Z32FS8: + return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; + case PixelFormat::S8Z24: + return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; + default: + UNREACHABLE(); + return GL_DEPTH_COMPONENT; + } +} + void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { if (params.IsBuffer()) { return; @@ -184,7 +197,7 @@ void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTextureParameteri(texture, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, params.num_levels - 1); + glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, static_cast<GLint>(params.num_levels - 1)); if (params.num_levels == 1) { glTextureParameterf(texture, GL_TEXTURE_LOD_BIAS, 1000.0f); } @@ -253,14 +266,12 @@ void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { glPixelStorei(GL_PACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level))); glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level); + u8* const mip_data = staging_buffer.data() + mip_offset; + const GLsizei size = static_cast<GLsizei>(params.GetHostMipmapSize(level)); if (is_compressed) { - glGetCompressedTextureImage(texture.handle, level, - static_cast<GLsizei>(params.GetHostMipmapSize(level)), - staging_buffer.data() + mip_offset); + glGetCompressedTextureImage(texture.handle, level, size, mip_data); } else { - glGetTextureImage(texture.handle, level, format, type, - static_cast<GLsizei>(params.GetHostMipmapSize(level)), - staging_buffer.data() + mip_offset); + glGetTextureImage(texture.handle, level, format, type, size, mip_data); } } } @@ -418,11 +429,21 @@ void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_sou if (new_swizzle == swizzle) return; swizzle = new_swizzle; - const std::array<GLint, 4> gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), - GetSwizzleSource(z_source), - GetSwizzleSource(w_source)}; + const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), + GetSwizzleSource(z_source), GetSwizzleSource(w_source)}; const GLuint handle = GetTexture(); - glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + const PixelFormat format = surface.GetSurfaceParams().pixel_format; + switch (format) { + case PixelFormat::Z24S8: + case PixelFormat::Z32FS8: + case PixelFormat::S8Z24: + glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, + GetComponent(format, 
x_source == SwizzleSource::R)); + break; + default: + glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + break; + } } OGLTextureView CachedSurfaceView::CreateTextureView() const { @@ -531,8 +552,11 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Common::Rectangle<u32>& dst_rect = copy_config.dst_rect; const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; - glBlitFramebuffer(src_rect.left, src_rect.top, src_rect.right, src_rect.bottom, dst_rect.left, - dst_rect.top, dst_rect.right, dst_rect.bottom, buffers, + glBlitFramebuffer(static_cast<GLint>(src_rect.left), static_cast<GLint>(src_rect.top), + static_cast<GLint>(src_rect.right), static_cast<GLint>(src_rect.bottom), + static_cast<GLint>(dst_rect.left), static_cast<GLint>(dst_rect.top), + static_cast<GLint>(dst_rect.right), static_cast<GLint>(dst_rect.bottom), + buffers, is_linear && (buffers == GL_COLOR_BUFFER_BIT) ? GL_LINEAR : GL_NEAREST); } diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index 9770dda1c..ac99e6385 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -6,16 +6,20 @@ #include <vector> #include <fmt/format.h> - #include <glad/glad.h> -#include "common/assert.h" #include "common/common_types.h" -#include "common/scope_exit.h" #include "video_core/renderer_opengl/utils.h" namespace OpenGL { +struct VertexArrayPushBuffer::Entry { + GLuint binding_index{}; + const GLuint* buffer{}; + GLintptr offset{}; + GLsizei stride{}; +}; + VertexArrayPushBuffer::VertexArrayPushBuffer() = default; VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; @@ -47,6 +51,13 @@ void VertexArrayPushBuffer::Bind() { } } +struct BindBuffersRangePushBuffer::Entry { + GLuint binding; + const GLuint* buffer; + GLintptr offset; + GLsizeiptr size; +}; + BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index d56153fe7..3ad7c02d4 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -26,12 +26,7 @@ public: void Bind(); private: - struct Entry { - GLuint binding_index{}; - const GLuint* buffer{}; - GLintptr offset{}; - GLsizei stride{}; - }; + struct Entry; GLuint vao{}; const GLuint* index_buffer{}; @@ -50,12 +45,7 @@ public: void Bind(); private: - struct Entry { - GLuint binding; - const GLuint* buffer; - GLintptr offset; - GLsizeiptr size; - }; + struct Entry; GLenum target; std::vector<Entry> entries; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 000e3616d..331808113 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -44,7 +44,7 @@ vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filt return {}; } -vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, +vk::SamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter) { switch (wrap_mode) { case Tegra::Texture::WrapMode::Wrap: @@ -56,7 +56,12 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, case Tegra::Texture::WrapMode::Border: return vk::SamplerAddressMode::eClampToBorder; case 
Tegra::Texture::WrapMode::Clamp: - // TODO(Rodrigo): Emulate GL_CLAMP properly + if (device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) { + // Nvidia's Vulkan driver defaults to GL_CLAMP on invalid enumerations, we can hack this + // by sending an invalid enumeration. + return static_cast<vk::SamplerAddressMode>(0xcafe); + } + // TODO(Rodrigo): Emulate GL_CLAMP properly on other vendors switch (filter) { case Tegra::Texture::TextureFilter::Nearest: return vk::SamplerAddressMode::eClampToEdge; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 1534b738b..7e9678b7b 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -22,7 +22,7 @@ vk::Filter Filter(Tegra::Texture::TextureFilter filter); vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter); -vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, +vk::SamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter); vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h new file mode 100644 index 000000000..a472c5dc9 --- /dev/null +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -0,0 +1,72 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <optional> +#include <vector> +#include "video_core/renderer_base.h" +#include "video_core/renderer_vulkan/declarations.h" + +namespace Core { +class System; +} + +namespace Vulkan { + +class VKBlitScreen; +class VKDevice; +class VKFence; +class VKMemoryManager; +class VKResourceManager; +class VKSwapchain; +class VKScheduler; +class VKImage; + +struct VKScreenInfo { + VKImage* image{}; + u32 width{}; + u32 height{}; + bool is_srgb{}; +}; + +class RendererVulkan final : public VideoCore::RendererBase { +public: + explicit RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system); + ~RendererVulkan() override; + + /// Swap buffers (render frame) + void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; + + /// Initialize the renderer + bool Init() override; + + /// Shutdown the renderer + void ShutDown() override; + +private: + std::optional<vk::DebugUtilsMessengerEXT> CreateDebugCallback( + const vk::DispatchLoaderDynamic& dldi); + + bool PickDevices(const vk::DispatchLoaderDynamic& dldi); + + void Report() const; + + Core::System& system; + + vk::Instance instance; + vk::SurfaceKHR surface; + + VKScreenInfo screen_info; + + UniqueDebugUtilsMessengerEXT debug_callback; + std::unique_ptr<VKDevice> device; + std::unique_ptr<VKSwapchain> swapchain; + std::unique_ptr<VKMemoryManager> memory_manager; + std::unique_ptr<VKResourceManager> resource_manager; + std::unique_ptr<VKScheduler> scheduler; + std::unique_ptr<VKBlitScreen> blit_screen; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp new file mode 100644 index 000000000..855cfc883 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -0,0 +1,627 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
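As an aside, the GL_CLAMP workaround added to maxwell_to_vk.cpp above reduces to a small dispatch on driver ID and filter mode. Here is a standalone sketch of that logic (plain enums stand in for the vulkan.hpp and Tegra::Texture types; the Linear branch is assumed, since the hunk is truncated after the Nearest case):

#include <cstdint>

enum class DriverID { NvidiaProprietary, Other };
enum class TextureFilter { Nearest, Linear };
enum class SamplerAddressMode : std::uint32_t {
    ClampToEdge = 2,
    ClampToBorder = 3,
};

// Mirrors the WrapMode::Clamp branch: Nvidia's Vulkan driver falls back to
// GL_CLAMP semantics when handed an out-of-range address mode, so the patch
// deliberately sends the invalid value 0xcafe. Other vendors get the closest
// valid approximation instead.
SamplerAddressMode ClampAddressMode(DriverID driver, TextureFilter filter) {
    if (driver == DriverID::NvidiaProprietary) {
        return static_cast<SamplerAddressMode>(0xcafe);
    }
    return filter == TextureFilter::Nearest ? SamplerAddressMode::ClampToEdge
                                            : SamplerAddressMode::ClampToBorder;
}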
+ +#include <algorithm> +#include <array> +#include <cstring> +#include <memory> +#include <tuple> +#include <vector> + +#include "common/assert.h" +#include "common/common_types.h" +#include "common/math_util.h" + +#include "core/core.h" +#include "core/frontend/emu_window.h" +#include "core/memory.h" + +#include "video_core/gpu.h" +#include "video_core/morton.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/renderer_vulkan.h" +#include "video_core/renderer_vulkan/vk_blit_screen.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_image.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_shader_util.h" +#include "video_core/renderer_vulkan/vk_swapchain.h" +#include "video_core/surface.h" + +namespace Vulkan { + +namespace { + +// Generated from the "shaders/" directory, read the instructions there. +constexpr u8 blit_vertex_code[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x11, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 
0x08, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x04, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x20, 0x00, 0x04, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x07, 0x00, 0x07, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x91, 0x00, 0x05, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x21, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x22, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0x24, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 
0xfd, 0x00, 0x01, 0x00, + 0x38, 0x00, 0x01, 0x00}; + +constexpr u8 blit_fragment_code[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x07, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x03, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x11, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x03, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x57, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; + +struct ScreenRectVertex { + ScreenRectVertex() = default; + explicit ScreenRectVertex(f32 x, f32 y, f32 u, f32 v) : position{{x, y}}, tex_coord{{u, v}} {} + + std::array<f32, 2> position; + std::array<f32, 2> tex_coord; + + static vk::VertexInputBindingDescription GetDescription() { + return vk::VertexInputBindingDescription(0, sizeof(ScreenRectVertex), + vk::VertexInputRate::eVertex); + } + + static std::array<vk::VertexInputAttributeDescription, 2> GetAttributes() { + return {vk::VertexInputAttributeDescription(0, 0, 
vk::Format::eR32G32Sfloat, + offsetof(ScreenRectVertex, position)), + vk::VertexInputAttributeDescription(1, 0, vk::Format::eR32G32Sfloat, + offsetof(ScreenRectVertex, tex_coord))}; + } +}; + +constexpr std::array<f32, 4 * 4> MakeOrthographicMatrix(f32 width, f32 height) { + // clang-format off + return { 2.f / width, 0.f, 0.f, 0.f, + 0.f, 2.f / height, 0.f, 0.f, + 0.f, 0.f, 1.f, 0.f, + -1.f, -1.f, 0.f, 1.f}; + // clang-format on +} + +std::size_t GetBytesPerPixel(const Tegra::FramebufferConfig& framebuffer) { + using namespace VideoCore::Surface; + return GetBytesPerPixel(PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)); +} + +std::size_t GetSizeInBytes(const Tegra::FramebufferConfig& framebuffer) { + return static_cast<std::size_t>(framebuffer.stride) * + static_cast<std::size_t>(framebuffer.height) * GetBytesPerPixel(framebuffer); +} + +vk::Format GetFormat(const Tegra::FramebufferConfig& framebuffer) { + switch (framebuffer.pixel_format) { + case Tegra::FramebufferConfig::PixelFormat::ABGR8: + return vk::Format::eA8B8G8R8UnormPack32; + case Tegra::FramebufferConfig::PixelFormat::RGB565: + return vk::Format::eR5G6B5UnormPack16; + default: + UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}", + static_cast<u32>(framebuffer.pixel_format)); + return vk::Format::eA8B8G8R8UnormPack32; + } +} + +} // Anonymous namespace + +struct VKBlitScreen::BufferData { + struct { + std::array<f32, 4 * 4> modelview_matrix; + } uniform; + + std::array<ScreenRectVertex, 4> vertices; + + // Unaligned image data goes here +}; + +VKBlitScreen::VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, + VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKSwapchain& swapchain, VKScheduler& scheduler, + const VKScreenInfo& screen_info) + : system{system}, render_window{render_window}, rasterizer{rasterizer}, device{device}, + resource_manager{resource_manager}, memory_manager{memory_manager}, swapchain{swapchain}, + scheduler{scheduler}, image_count{swapchain.GetImageCount()}, screen_info{screen_info} { + watches.resize(image_count); + std::generate(watches.begin(), watches.end(), + []() { return std::make_unique<VKFenceWatch>(); }); + + CreateStaticResources(); + CreateDynamicResources(); +} + +VKBlitScreen::~VKBlitScreen() = default; + +void VKBlitScreen::Recreate() { + CreateDynamicResources(); +} + +std::tuple<VKFence&, vk::Semaphore> VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated) { + RefreshResources(framebuffer); + + // Finish any pending renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + const std::size_t image_index = swapchain.GetImageIndex(); + watches[image_index]->Watch(scheduler.GetFence()); + + VKImage* blit_image = use_accelerated ? 
screen_info.image : raw_images[image_index].get(); + + UpdateDescriptorSet(image_index, blit_image->GetPresentView()); + + BufferData data; + SetUniformData(data, framebuffer); + SetVertexData(data, framebuffer); + + auto map = buffer_commit->Map(); + std::memcpy(map.GetAddress(), &data, sizeof(data)); + + if (!use_accelerated) { + const u64 image_offset = GetRawImageOffset(framebuffer, image_index); + + const auto pixel_format = + VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format); + const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; + const auto host_ptr = system.Memory().GetPointer(framebuffer_addr); + rasterizer.FlushRegion(ToCacheAddr(host_ptr), GetSizeInBytes(framebuffer)); + + // TODO(Rodrigo): Read this from HLE + constexpr u32 block_height_log2 = 4; + VideoCore::MortonSwizzle(VideoCore::MortonSwizzleMode::MortonToLinear, pixel_format, + framebuffer.stride, block_height_log2, framebuffer.height, 0, 1, 1, + map.GetAddress() + image_offset, host_ptr); + + blit_image->Transition(0, 1, 0, 1, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferWrite, + vk::ImageLayout::eTransferDstOptimal); + + const vk::BufferImageCopy copy(image_offset, 0, 0, + {vk::ImageAspectFlagBits::eColor, 0, 0, 1}, {0, 0, 0}, + {framebuffer.width, framebuffer.height, 1}); + scheduler.Record([buffer_handle = *buffer, image = blit_image->GetHandle(), + copy](auto cmdbuf, auto& dld) { + cmdbuf.copyBufferToImage(buffer_handle, image, vk::ImageLayout::eTransferDstOptimal, + {copy}, dld); + }); + } + map.Release(); + + blit_image->Transition(0, 1, 0, 1, vk::PipelineStageFlagBits::eFragmentShader, + vk::AccessFlagBits::eShaderRead, + vk::ImageLayout::eShaderReadOnlyOptimal); + + scheduler.Record([renderpass = *renderpass, framebuffer = *framebuffers[image_index], + descriptor_set = descriptor_sets[image_index], buffer = *buffer, + size = swapchain.GetSize(), pipeline = *pipeline, + layout = *pipeline_layout](auto cmdbuf, auto& dld) { + const vk::ClearValue clear_color{std::array{0.0f, 0.0f, 0.0f, 1.0f}}; + const vk::RenderPassBeginInfo renderpass_bi(renderpass, framebuffer, {{0, 0}, size}, 1, + &clear_color); + + cmdbuf.beginRenderPass(renderpass_bi, vk::SubpassContents::eInline, dld); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline, dld); + cmdbuf.setViewport( + 0, + {{0.0f, 0.0f, static_cast<f32>(size.width), static_cast<f32>(size.height), 0.0f, 1.0f}}, + dld); + cmdbuf.setScissor(0, {{{0, 0}, size}}, dld); + + cmdbuf.bindVertexBuffers(0, {buffer}, {offsetof(BufferData, vertices)}, dld); + cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, layout, 0, {descriptor_set}, {}, + dld); + cmdbuf.draw(4, 1, 0, 0, dld); + cmdbuf.endRenderPass(dld); + }); + + return {scheduler.GetFence(), *semaphores[image_index]}; +} + +void VKBlitScreen::CreateStaticResources() { + CreateShaders(); + CreateSemaphores(); + CreateDescriptorPool(); + CreateDescriptorSetLayout(); + CreateDescriptorSets(); + CreatePipelineLayout(); + CreateSampler(); +} + +void VKBlitScreen::CreateDynamicResources() { + CreateRenderPass(); + CreateFramebuffers(); + CreateGraphicsPipeline(); +} + +void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) { + if (framebuffer.width == raw_width && framebuffer.height == raw_height && !raw_images.empty()) { + return; + } + raw_width = framebuffer.width; + raw_height = framebuffer.height; + ReleaseRawImages(); + + CreateStagingBuffer(framebuffer); + CreateRawImages(framebuffer); +} + +void 
VKBlitScreen::CreateShaders() { + vertex_shader = BuildShader(device, sizeof(blit_vertex_code), blit_vertex_code); + fragment_shader = BuildShader(device, sizeof(blit_fragment_code), blit_fragment_code); +} + +void VKBlitScreen::CreateSemaphores() { + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + + semaphores.resize(image_count); + for (std::size_t i = 0; i < image_count; ++i) { + semaphores[i] = dev.createSemaphoreUnique({}, nullptr, dld); + } +} + +void VKBlitScreen::CreateDescriptorPool() { + const std::array<vk::DescriptorPoolSize, 2> pool_sizes{ + vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, static_cast<u32>(image_count)}, + vk::DescriptorPoolSize{vk::DescriptorType::eCombinedImageSampler, + static_cast<u32>(image_count)}}; + const vk::DescriptorPoolCreateInfo pool_ci( + {}, static_cast<u32>(image_count), static_cast<u32>(pool_sizes.size()), pool_sizes.data()); + const auto dev = device.GetLogical(); + descriptor_pool = dev.createDescriptorPoolUnique(pool_ci, nullptr, device.GetDispatchLoader()); +} + +void VKBlitScreen::CreateRenderPass() { + const vk::AttachmentDescription color_attachment( + {}, swapchain.GetImageFormat(), vk::SampleCountFlagBits::e1, vk::AttachmentLoadOp::eClear, + vk::AttachmentStoreOp::eStore, vk::AttachmentLoadOp::eDontCare, + vk::AttachmentStoreOp::eDontCare, vk::ImageLayout::eUndefined, + vk::ImageLayout::ePresentSrcKHR); + + const vk::AttachmentReference color_attachment_ref(0, vk::ImageLayout::eColorAttachmentOptimal); + + const vk::SubpassDescription subpass_description({}, vk::PipelineBindPoint::eGraphics, 0, + nullptr, 1, &color_attachment_ref, nullptr, + nullptr, 0, nullptr); + + const vk::SubpassDependency dependency( + VK_SUBPASS_EXTERNAL, 0, vk::PipelineStageFlagBits::eColorAttachmentOutput, + vk::PipelineStageFlagBits::eColorAttachmentOutput, {}, + vk::AccessFlagBits::eColorAttachmentRead | vk::AccessFlagBits::eColorAttachmentWrite, {}); + + const vk::RenderPassCreateInfo renderpass_ci({}, 1, &color_attachment, 1, &subpass_description, + 1, &dependency); + + const auto dev = device.GetLogical(); + renderpass = dev.createRenderPassUnique(renderpass_ci, nullptr, device.GetDispatchLoader()); +} + +void VKBlitScreen::CreateDescriptorSetLayout() { + const std::array<vk::DescriptorSetLayoutBinding, 2> layout_bindings{ + vk::DescriptorSetLayoutBinding(0, vk::DescriptorType::eUniformBuffer, 1, + vk::ShaderStageFlagBits::eVertex, nullptr), + vk::DescriptorSetLayoutBinding(1, vk::DescriptorType::eCombinedImageSampler, 1, + vk::ShaderStageFlagBits::eFragment, nullptr)}; + const vk::DescriptorSetLayoutCreateInfo descriptor_layout_ci( + {}, static_cast<u32>(layout_bindings.size()), layout_bindings.data()); + + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + descriptor_set_layout = dev.createDescriptorSetLayoutUnique(descriptor_layout_ci, nullptr, dld); +} + +void VKBlitScreen::CreateDescriptorSets() { + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + + descriptor_sets.resize(image_count); + for (std::size_t i = 0; i < image_count; ++i) { + const vk::DescriptorSetLayout layout = *descriptor_set_layout; + const vk::DescriptorSetAllocateInfo descriptor_set_ai(*descriptor_pool, 1, &layout); + const vk::Result result = + dev.allocateDescriptorSets(&descriptor_set_ai, &descriptor_sets[i], dld); + ASSERT(result == vk::Result::eSuccess); + } +} + +void VKBlitScreen::CreatePipelineLayout() { + const vk::PipelineLayoutCreateInfo 
pipeline_layout_ci({}, 1, &descriptor_set_layout.get(), 0, + nullptr); + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + pipeline_layout = dev.createPipelineLayoutUnique(pipeline_layout_ci, nullptr, dld); +} + +void VKBlitScreen::CreateGraphicsPipeline() { + const std::array shader_stages = { + vk::PipelineShaderStageCreateInfo({}, vk::ShaderStageFlagBits::eVertex, *vertex_shader, + "main", nullptr), + vk::PipelineShaderStageCreateInfo({}, vk::ShaderStageFlagBits::eFragment, *fragment_shader, + "main", nullptr)}; + + const auto vertex_binding_description = ScreenRectVertex::GetDescription(); + const auto vertex_attrs_description = ScreenRectVertex::GetAttributes(); + const vk::PipelineVertexInputStateCreateInfo vertex_input( + {}, 1, &vertex_binding_description, static_cast<u32>(vertex_attrs_description.size()), + vertex_attrs_description.data()); + + const vk::PipelineInputAssemblyStateCreateInfo input_assembly( + {}, vk::PrimitiveTopology::eTriangleStrip, false); + + // Set a dummy viewport, it's going to be replaced by dynamic states. + const vk::Viewport viewport(0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f); + const vk::Rect2D scissor({0, 0}, {1, 1}); + + const vk::PipelineViewportStateCreateInfo viewport_state({}, 1, &viewport, 1, &scissor); + + const vk::PipelineRasterizationStateCreateInfo rasterizer( + {}, false, false, vk::PolygonMode::eFill, vk::CullModeFlagBits::eNone, + vk::FrontFace::eClockwise, false, 0.0f, 0.0f, 0.0f, 1.0f); + + const vk::PipelineMultisampleStateCreateInfo multisampling({}, vk::SampleCountFlagBits::e1, + false, 0.0f, nullptr, false, false); + + const vk::PipelineColorBlendAttachmentState color_blend_attachment( + false, vk::BlendFactor::eZero, vk::BlendFactor::eZero, vk::BlendOp::eAdd, + vk::BlendFactor::eZero, vk::BlendFactor::eZero, vk::BlendOp::eAdd, + vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | + vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA); + + const vk::PipelineColorBlendStateCreateInfo color_blending( + {}, false, vk::LogicOp::eCopy, 1, &color_blend_attachment, {0.0f, 0.0f, 0.0f, 0.0f}); + + const std::array<vk::DynamicState, 2> dynamic_states = {vk::DynamicState::eViewport, + vk::DynamicState::eScissor}; + + const vk::PipelineDynamicStateCreateInfo dynamic_state( + {}, static_cast<u32>(dynamic_states.size()), dynamic_states.data()); + + const vk::GraphicsPipelineCreateInfo pipeline_ci( + {}, static_cast<u32>(shader_stages.size()), shader_stages.data(), &vertex_input, + &input_assembly, nullptr, &viewport_state, &rasterizer, &multisampling, nullptr, + &color_blending, &dynamic_state, *pipeline_layout, *renderpass, 0, nullptr, 0); + + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + pipeline = dev.createGraphicsPipelineUnique({}, pipeline_ci, nullptr, dld); +} + +void VKBlitScreen::CreateSampler() { + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + const vk::SamplerCreateInfo sampler_ci( + {}, vk::Filter::eLinear, vk::Filter::eLinear, vk::SamplerMipmapMode::eLinear, + vk::SamplerAddressMode::eClampToBorder, vk::SamplerAddressMode::eClampToBorder, + vk::SamplerAddressMode::eClampToBorder, 0.0f, false, 0.0f, false, vk::CompareOp::eNever, + 0.0f, 0.0f, vk::BorderColor::eFloatOpaqueBlack, false); + sampler = dev.createSamplerUnique(sampler_ci, nullptr, dld); +} + +void VKBlitScreen::CreateFramebuffers() { + const vk::Extent2D size{swapchain.GetSize()}; + framebuffers.clear(); + 
framebuffers.resize(image_count); + + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + + for (std::size_t i = 0; i < image_count; ++i) { + const vk::ImageView image_view{swapchain.GetImageViewIndex(i)}; + const vk::FramebufferCreateInfo framebuffer_ci({}, *renderpass, 1, &image_view, size.width, + size.height, 1); + framebuffers[i] = dev.createFramebufferUnique(framebuffer_ci, nullptr, dld); + } +} + +void VKBlitScreen::ReleaseRawImages() { + for (std::size_t i = 0; i < raw_images.size(); ++i) { + watches[i]->Wait(); + } + raw_images.clear(); + raw_buffer_commits.clear(); + buffer.reset(); + buffer_commit.reset(); +} + +void VKBlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer) { + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + + const vk::BufferCreateInfo buffer_ci({}, CalculateBufferSize(framebuffer), + vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eVertexBuffer | + vk::BufferUsageFlagBits::eUniformBuffer, + vk::SharingMode::eExclusive, 0, nullptr); + buffer = dev.createBufferUnique(buffer_ci, nullptr, dld); + buffer_commit = memory_manager.Commit(*buffer, true); +} + +void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { + raw_images.resize(image_count); + raw_buffer_commits.resize(image_count); + + const auto format = GetFormat(framebuffer); + for (std::size_t i = 0; i < image_count; ++i) { + const vk::ImageCreateInfo image_ci( + {}, vk::ImageType::e2D, format, {framebuffer.width, framebuffer.height, 1}, 1, 1, + vk::SampleCountFlagBits::e1, vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled, + vk::SharingMode::eExclusive, 0, nullptr, vk::ImageLayout::eUndefined); + + raw_images[i] = + std::make_unique<VKImage>(device, scheduler, image_ci, vk::ImageAspectFlagBits::eColor); + raw_buffer_commits[i] = memory_manager.Commit(raw_images[i]->GetHandle(), false); + } +} + +void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, vk::ImageView image_view) const { + const vk::DescriptorSet descriptor_set = descriptor_sets[image_index]; + + const vk::DescriptorBufferInfo buffer_info(*buffer, offsetof(BufferData, uniform), + sizeof(BufferData::uniform)); + const vk::WriteDescriptorSet ubo_write(descriptor_set, 0, 0, 1, + vk::DescriptorType::eUniformBuffer, nullptr, + &buffer_info, nullptr); + + const vk::DescriptorImageInfo image_info(*sampler, image_view, + vk::ImageLayout::eShaderReadOnlyOptimal); + const vk::WriteDescriptorSet sampler_write(descriptor_set, 1, 0, 1, + vk::DescriptorType::eCombinedImageSampler, + &image_info, nullptr, nullptr); + + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + dev.updateDescriptorSets({ubo_write, sampler_write}, {}, dld); +} + +void VKBlitScreen::SetUniformData(BufferData& data, + const Tegra::FramebufferConfig& framebuffer) const { + const auto& layout = render_window.GetFramebufferLayout(); + data.uniform.modelview_matrix = + MakeOrthographicMatrix(static_cast<f32>(layout.width), static_cast<f32>(layout.height)); +} + +void VKBlitScreen::SetVertexData(BufferData& data, + const Tegra::FramebufferConfig& framebuffer) const { + const auto& framebuffer_transform_flags = framebuffer.transform_flags; + const auto& framebuffer_crop_rect = framebuffer.crop_rect; + + static constexpr Common::Rectangle<f32> texcoords{0.f, 0.f, 1.f, 1.f}; + auto left = texcoords.left; + auto right = texcoords.right; + + switch 
(framebuffer_transform_flags) { + case Tegra::FramebufferConfig::TransformFlags::Unset: + break; + case Tegra::FramebufferConfig::TransformFlags::FlipV: + // Flip the framebuffer vertically + left = texcoords.right; + right = texcoords.left; + break; + default: + UNIMPLEMENTED_MSG("Unsupported framebuffer_transform_flags={}", + static_cast<u32>(framebuffer_transform_flags)); + break; + } + + UNIMPLEMENTED_IF(framebuffer_crop_rect.top != 0); + UNIMPLEMENTED_IF(framebuffer_crop_rect.left != 0); + + // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering + // (e.g. handheld mode) on a 1920x1080 framebuffer. + f32 scale_u = 1.0f; + f32 scale_v = 1.0f; + if (framebuffer_crop_rect.GetWidth() > 0) { + scale_u = static_cast<f32>(framebuffer_crop_rect.GetWidth()) / + static_cast<f32>(screen_info.width); + } + if (framebuffer_crop_rect.GetHeight() > 0) { + scale_v = static_cast<f32>(framebuffer_crop_rect.GetHeight()) / + static_cast<f32>(screen_info.height); + } + + const auto& screen = render_window.GetFramebufferLayout().screen; + const auto x = static_cast<f32>(screen.left); + const auto y = static_cast<f32>(screen.top); + const auto w = static_cast<f32>(screen.GetWidth()); + const auto h = static_cast<f32>(screen.GetHeight()); + data.vertices[0] = ScreenRectVertex(x, y, texcoords.top * scale_u, left * scale_v); + data.vertices[1] = ScreenRectVertex(x + w, y, texcoords.bottom * scale_u, left * scale_v); + data.vertices[2] = ScreenRectVertex(x, y + h, texcoords.top * scale_u, right * scale_v); + data.vertices[3] = ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v); +} + +u64 VKBlitScreen::CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const { + return sizeof(BufferData) + GetSizeInBytes(framebuffer) * image_count; +} + +u64 VKBlitScreen::GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, + std::size_t image_index) const { + constexpr auto first_image_offset = static_cast<u64>(sizeof(BufferData)); + return first_image_offset + GetSizeInBytes(framebuffer) * image_index; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h new file mode 100644 index 000000000..ea680b3f5 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -0,0 +1,119 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
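To make the crop-rect scaling in SetVertexData above concrete: for the handheld-mode case named in its comment (a 1280x720 crop presented from a 1920x1080 backing image), the UV scale factors work out as below. This is a standalone sketch with assumed example values, not code from the patch:

#include <cstdio>

int main() {
    // Assumed example: 1280x720 crop rect (handheld mode) on a
    // 1920x1080 screen_info image.
    const float crop_w = 1280.0f, crop_h = 720.0f;
    const float image_w = 1920.0f, image_h = 1080.0f;

    // Same arithmetic as SetVertexData: shrink the sampled texture
    // coordinates so only the rendered crop region covers the quad.
    const float scale_u = crop_w / image_w; // = 0.6667
    const float scale_v = crop_h / image_h; // = 0.6667
    std::printf("scale_u=%.4f scale_v=%.4f\n", scale_u, scale_v);
    return 0;
}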
+ +#pragma once + +#include <array> +#include <memory> +#include <tuple> + +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" + +namespace Core { +class System; +} + +namespace Core::Frontend { +class EmuWindow; +} + +namespace Tegra { +struct FramebufferConfig; +} + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +struct ScreenInfo; +class RasterizerVulkan; +class VKDevice; +class VKFence; +class VKImage; +class VKScheduler; +class VKSwapchain; + +class VKBlitScreen final { +public: + explicit VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, + VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKSwapchain& swapchain, VKScheduler& scheduler, + const VKScreenInfo& screen_info); + ~VKBlitScreen(); + + void Recreate(); + + std::tuple<VKFence&, vk::Semaphore> Draw(const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated); + +private: + struct BufferData; + + void CreateStaticResources(); + void CreateShaders(); + void CreateSemaphores(); + void CreateDescriptorPool(); + void CreateRenderPass(); + void CreateDescriptorSetLayout(); + void CreateDescriptorSets(); + void CreatePipelineLayout(); + void CreateGraphicsPipeline(); + void CreateSampler(); + + void CreateDynamicResources(); + void CreateFramebuffers(); + + void RefreshResources(const Tegra::FramebufferConfig& framebuffer); + void ReleaseRawImages(); + void CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer); + void CreateRawImages(const Tegra::FramebufferConfig& framebuffer); + + void UpdateDescriptorSet(std::size_t image_index, vk::ImageView image_view) const; + void SetUniformData(BufferData& data, const Tegra::FramebufferConfig& framebuffer) const; + void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer) const; + + u64 CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const; + u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, + std::size_t image_index) const; + + Core::System& system; + Core::Frontend::EmuWindow& render_window; + VideoCore::RasterizerInterface& rasterizer; + const VKDevice& device; + VKResourceManager& resource_manager; + VKMemoryManager& memory_manager; + VKSwapchain& swapchain; + VKScheduler& scheduler; + const std::size_t image_count; + const VKScreenInfo& screen_info; + + UniqueShaderModule vertex_shader; + UniqueShaderModule fragment_shader; + UniqueDescriptorPool descriptor_pool; + UniqueDescriptorSetLayout descriptor_set_layout; + UniquePipelineLayout pipeline_layout; + UniquePipeline pipeline; + UniqueRenderPass renderpass; + std::vector<UniqueFramebuffer> framebuffers; + std::vector<vk::DescriptorSet> descriptor_sets; + UniqueSampler sampler; + + UniqueBuffer buffer; + VKMemoryCommit buffer_commit; + + std::vector<std::unique_ptr<VKFenceWatch>> watches; + + std::vector<UniqueSemaphore> semaphores; + std::vector<std::unique_ptr<VKImage>> raw_images; + std::vector<VKMemoryCommit> raw_buffer_commits; + u32 raw_width = 0; + u32 raw_height = 0; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 2e0536bf6..b155dfb49 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -191,8 
+191,7 @@ UniquePipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& render const vk::PipelineRasterizationStateCreateInfo rasterizer_ci( {}, rs.depth_clamp_enable, false, vk::PolygonMode::eFill, rs.cull_enable ? MaxwellToVK::CullFace(rs.cull_face) : vk::CullModeFlagBits::eNone, - rs.cull_enable ? MaxwellToVK::FrontFace(rs.front_face) : vk::FrontFace::eCounterClockwise, - rs.depth_bias_enable, 0.0f, 0.0f, 0.0f, 1.0f); + MaxwellToVK::FrontFace(rs.front_face), rs.depth_bias_enable, 0.0f, 0.0f, 0.0f, 1.0f); const vk::PipelineMultisampleStateCreateInfo multisampling_ci( {}, vk::SampleCountFlagBits::e1, false, 0.0f, nullptr, false, false); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 48e23d4cd..7ddf7d3ee 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -325,9 +325,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { specialization.tessellation.primitive = fixed_state.tessellation.primitive; specialization.tessellation.spacing = fixed_state.tessellation.spacing; specialization.tessellation.clockwise = fixed_state.tessellation.clockwise; - for (const auto& rt : key.renderpass_params.color_attachments) { - specialization.enabled_rendertargets.set(rt.index); - } SPIRVProgram program; std::vector<vk::DescriptorSetLayoutBinding> bindings; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp new file mode 100644 index 000000000..d2c6b1189 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -0,0 +1,1141 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
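+//
+// Overview of the draw path implemented below (a summary of RasterizerVulkan::Draw,
+// not additional behaviour):
+//   1. FlushWork()              - every few draws, hand recorded work to the worker
+//      thread and eventually submit to the driver.
+//   2. buffer_cache.Map()       - reserve stream-buffer space for geometry and uniforms.
+//   3. SetupGeometry()          - upload vertex arrays and, if indexed, the index buffer.
+//   4. SetupShaderDescriptors() - queue const/global/texel buffer, texture and image
+//      descriptor updates for each enabled shader stage.
+//   5. UpdateAttachments()      - fetch render targets and flag "texceptions" (targets
+//      that are simultaneously sampled, forcing vk::ImageLayout::eGeneral).
+//   6. ConfigureFramebuffers()  - build or reuse a framebuffer for the renderpass.
+//   7. Record dynamic state, bindings and the final draw through the scheduler.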
+ +#include <algorithm> +#include <array> +#include <memory> +#include <mutex> +#include <vector> + +#include <boost/container/static_vector.hpp> +#include <boost/functional/hash.hpp> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "core/core.h" +#include "core/memory.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/fixed_pipeline_state.h" +#include "video_core/renderer_vulkan/maxwell_to_vk.h" +#include "video_core/renderer_vulkan/renderer_vulkan.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_compute_pass.h" +#include "video_core/renderer_vulkan/vk_compute_pipeline.h" +#include "video_core/renderer_vulkan/vk_descriptor_pool.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_graphics_pipeline.h" +#include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_renderpass_cache.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_sampler_cache.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/renderer_vulkan/vk_update_descriptor.h" + +namespace Vulkan { + +using Maxwell = Tegra::Engines::Maxwell3D::Regs; + +MICROPROFILE_DEFINE(Vulkan_WaitForWorker, "Vulkan", "Wait for worker", MP_RGB(255, 192, 192)); +MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_Compute, "Vulkan", "Record compute", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_Clearing, "Vulkan", "Record clearing", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_Geometry, "Vulkan", "Setup geometry", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_ConstBuffers, "Vulkan", "Setup constant buffers", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_GlobalBuffers, "Vulkan", "Setup global buffers", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_RenderTargets, "Vulkan", "Setup render targets", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_Textures, "Vulkan", "Setup textures", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_Images, "Vulkan", "Setup images", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192, 128, 128)); + +namespace { + +constexpr auto ComputeShaderIndex = static_cast<std::size_t>(Tegra::Engines::ShaderType::Compute); + +vk::Viewport GetViewportState(const VKDevice& device, const Maxwell& regs, std::size_t index) { + const auto& viewport = regs.viewport_transform[index]; + const float x = viewport.translate_x - viewport.scale_x; + const float y = viewport.translate_y - viewport.scale_y; + const float width = viewport.scale_x * 2.0f; + const float height = viewport.scale_y * 2.0f; + + const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; + float near = viewport.translate_z - viewport.scale_z * reduce_z; + float far = viewport.translate_z + viewport.scale_z; + if (!device.IsExtDepthRangeUnrestrictedSupported()) { + near = std::clamp(near, 0.0f, 1.0f); + far = std::clamp(far, 0.0f, 1.0f); + } + + return vk::Viewport(x, y, width != 0 ? 
width : 1.0f, height != 0 ? height : 1.0f, near, far); +} + +constexpr vk::Rect2D GetScissorState(const Maxwell& regs, std::size_t index) { + const auto& scissor = regs.scissor_test[index]; + if (!scissor.enable) { + return {{0, 0}, {INT32_MAX, INT32_MAX}}; + } + const u32 width = scissor.max_x - scissor.min_x; + const u32 height = scissor.max_y - scissor.min_y; + return {{static_cast<s32>(scissor.min_x), static_cast<s32>(scissor.min_y)}, {width, height}}; +} + +std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses( + const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses; + for (std::size_t i = 0; i < std::size(addresses); ++i) { + addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0; + } + return addresses; +} + +void TransitionImages(const std::vector<ImageView>& views, vk::PipelineStageFlags pipeline_stage, + vk::AccessFlags access) { + for (auto& [view, layout] : views) { + view->Transition(*layout, pipeline_stage, access); + } +} + +template <typename Engine, typename Entry> +Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, + std::size_t stage) { + const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); + if (entry.IsBindless()) { + const Tegra::Texture::TextureHandle tex_handle = + engine.AccessConstBuffer32(stage_type, entry.GetBuffer(), entry.GetOffset()); + return engine.GetTextureInfo(tex_handle); + } + if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { + return engine.GetStageTexture(stage_type, entry.GetOffset()); + } else { + return engine.GetTexture(entry.GetOffset()); + } +} + +} // Anonymous namespace + +class BufferBindings final { +public: + void AddVertexBinding(const vk::Buffer* buffer, vk::DeviceSize offset) { + vertex.buffer_ptrs[vertex.num_buffers] = buffer; + vertex.offsets[vertex.num_buffers] = offset; + ++vertex.num_buffers; + } + + void SetIndexBinding(const vk::Buffer* buffer, vk::DeviceSize offset, vk::IndexType type) { + index.buffer = buffer; + index.offset = offset; + index.type = type; + } + + void Bind(VKScheduler& scheduler) const { + // Use this large switch case to avoid dispatching more memory in the record lambda than + // what we need. It looks horrible, but it's the best we can do on standard C++. 
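+        // For illustration: BindStatic<2>() below records a lambda that captures
+        // exactly std::array<vk::Buffer, 2> and std::array<vk::DeviceSize, 2> by
+        // value, so no heap allocation or unused bytes travel through
+        // VKScheduler::Record. A generic implementation would capture a runtime-sized
+        // std::vector instead, at the cost of an allocation per bind.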
+ switch (vertex.num_buffers) { + case 0: + return BindStatic<0>(scheduler); + case 1: + return BindStatic<1>(scheduler); + case 2: + return BindStatic<2>(scheduler); + case 3: + return BindStatic<3>(scheduler); + case 4: + return BindStatic<4>(scheduler); + case 5: + return BindStatic<5>(scheduler); + case 6: + return BindStatic<6>(scheduler); + case 7: + return BindStatic<7>(scheduler); + case 8: + return BindStatic<8>(scheduler); + case 9: + return BindStatic<9>(scheduler); + case 10: + return BindStatic<10>(scheduler); + case 11: + return BindStatic<11>(scheduler); + case 12: + return BindStatic<12>(scheduler); + case 13: + return BindStatic<13>(scheduler); + case 14: + return BindStatic<14>(scheduler); + case 15: + return BindStatic<15>(scheduler); + case 16: + return BindStatic<16>(scheduler); + case 17: + return BindStatic<17>(scheduler); + case 18: + return BindStatic<18>(scheduler); + case 19: + return BindStatic<19>(scheduler); + case 20: + return BindStatic<20>(scheduler); + case 21: + return BindStatic<21>(scheduler); + case 22: + return BindStatic<22>(scheduler); + case 23: + return BindStatic<23>(scheduler); + case 24: + return BindStatic<24>(scheduler); + case 25: + return BindStatic<25>(scheduler); + case 26: + return BindStatic<26>(scheduler); + case 27: + return BindStatic<27>(scheduler); + case 28: + return BindStatic<28>(scheduler); + case 29: + return BindStatic<29>(scheduler); + case 30: + return BindStatic<30>(scheduler); + case 31: + return BindStatic<31>(scheduler); + case 32: + return BindStatic<32>(scheduler); + } + UNREACHABLE(); + } + +private: + // Some of these fields are intentionally left uninitialized to avoid initializing them twice. + struct { + std::size_t num_buffers = 0; + std::array<const vk::Buffer*, Maxwell::NumVertexArrays> buffer_ptrs; + std::array<vk::DeviceSize, Maxwell::NumVertexArrays> offsets; + } vertex; + + struct { + const vk::Buffer* buffer = nullptr; + vk::DeviceSize offset; + vk::IndexType type; + } index; + + template <std::size_t N> + void BindStatic(VKScheduler& scheduler) const { + if (index.buffer != nullptr) { + BindStatic<N, true>(scheduler); + } else { + BindStatic<N, false>(scheduler); + } + } + + template <std::size_t N, bool is_indexed> + void BindStatic(VKScheduler& scheduler) const { + static_assert(N <= Maxwell::NumVertexArrays); + if constexpr (N == 0) { + return; + } + + std::array<vk::Buffer, N> buffers; + std::transform(vertex.buffer_ptrs.begin(), vertex.buffer_ptrs.begin() + N, buffers.begin(), + [](const auto ptr) { return *ptr; }); + + std::array<vk::DeviceSize, N> offsets; + std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin()); + + if constexpr (is_indexed) { + // Indexed draw + scheduler.Record([buffers, offsets, index_buffer = *index.buffer, + index_offset = index.offset, + index_type = index.type](auto cmdbuf, auto& dld) { + cmdbuf.bindIndexBuffer(index_buffer, index_offset, index_type, dld); + cmdbuf.bindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data(), + dld); + }); + } else { + // Array draw + scheduler.Record([buffers, offsets](auto cmdbuf, auto& dld) { + cmdbuf.bindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data(), + dld); + }); + } + } +}; + +void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf, + const vk::DispatchLoaderDynamic& dld) const { + if (is_indexed) { + cmdbuf.drawIndexed(num_vertices, num_instances, 0, base_vertex, base_instance, dld); + } else { + cmdbuf.draw(num_vertices, num_instances, base_vertex, 
base_instance, dld); + } +} + +RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& renderer, + VKScreenInfo& screen_info, const VKDevice& device, + VKResourceManager& resource_manager, + VKMemoryManager& memory_manager, VKScheduler& scheduler) + : RasterizerAccelerated{system.Memory()}, system{system}, render_window{renderer}, + screen_info{screen_info}, device{device}, resource_manager{resource_manager}, + memory_manager{memory_manager}, scheduler{scheduler}, + staging_pool(device, memory_manager, scheduler), descriptor_pool(device), + update_descriptor_queue(device, scheduler), + quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), + uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), + texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, + staging_pool), + pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue), + buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), + sampler_cache(device) {} + +RasterizerVulkan::~RasterizerVulkan() = default; + +bool RasterizerVulkan::DrawBatch(bool is_indexed) { + Draw(is_indexed, false); + return true; +} + +bool RasterizerVulkan::DrawMultiBatch(bool is_indexed) { + Draw(is_indexed, true); + return true; +} + +void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { + MICROPROFILE_SCOPE(Vulkan_Drawing); + + FlushWork(); + + const auto& gpu = system.GPU().Maxwell3D(); + GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)}; + + buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed)); + + BufferBindings buffer_bindings; + const DrawParameters draw_params = + SetupGeometry(key.fixed_state, buffer_bindings, is_indexed, is_instanced); + + update_descriptor_queue.Acquire(); + sampled_views.clear(); + image_views.clear(); + + const auto shaders = pipeline_cache.GetShaders(); + key.shaders = GetShaderAddresses(shaders); + SetupShaderDescriptors(shaders); + + buffer_cache.Unmap(); + + const auto texceptions = UpdateAttachments(); + SetupImageTransitions(texceptions, color_attachments, zeta_attachment); + + key.renderpass_params = GetRenderPassParams(texceptions); + + auto& pipeline = pipeline_cache.GetGraphicsPipeline(key); + scheduler.BindGraphicsPipeline(pipeline.GetHandle()); + + const auto renderpass = pipeline.GetRenderPass(); + const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass); + scheduler.RequestRenderpass({renderpass, framebuffer, {{0, 0}, render_area}, 0, nullptr}); + + UpdateDynamicStates(); + + buffer_bindings.Bind(scheduler); + + if (device.IsNvDeviceDiagnosticCheckpoints()) { + scheduler.Record( + [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); }); + } + + const auto pipeline_layout = pipeline.GetLayout(); + const auto descriptor_set = pipeline.CommitDescriptorSet(); + scheduler.Record([pipeline_layout, descriptor_set, draw_params](auto cmdbuf, auto& dld) { + if (descriptor_set) { + cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, pipeline_layout, + DESCRIPTOR_SET, 1, &descriptor_set, 0, nullptr, dld); + } + draw_params.Draw(cmdbuf, dld); + }); +} + +void RasterizerVulkan::Clear() { + MICROPROFILE_SCOPE(Vulkan_Clearing); + + const auto& gpu = system.GPU().Maxwell3D(); + if (!system.GPU().Maxwell3D().ShouldExecute()) { + return; + } + + const auto& regs = gpu.regs; + const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || + 
regs.clear_buffers.A;
+    const bool use_depth = regs.clear_buffers.Z;
+    const bool use_stencil = regs.clear_buffers.S;
+    if (!use_color && !use_depth && !use_stencil) {
+        return;
+    }
+    // Clearing images requires being outside of a renderpass
+    scheduler.RequestOutsideRenderPassOperationContext();
+
+    // TODO(Rodrigo): Implement clears by rendering a quad or by beginning a renderpass.
+
+    if (use_color) {
+        View color_view;
+        {
+            MICROPROFILE_SCOPE(Vulkan_RenderTargets);
+            color_view = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT.Value(), false);
+        }
+
+        color_view->Transition(vk::ImageLayout::eTransferDstOptimal,
+                               vk::PipelineStageFlagBits::eTransfer,
+                               vk::AccessFlagBits::eTransferWrite);
+
+        const std::array clear_color = {regs.clear_color[0], regs.clear_color[1],
+                                        regs.clear_color[2], regs.clear_color[3]};
+        const vk::ClearColorValue clear(clear_color);
+        scheduler.Record([image = color_view->GetImage(),
+                          subresource = color_view->GetImageSubresourceRange(),
+                          clear](auto cmdbuf, auto& dld) {
+            cmdbuf.clearColorImage(image, vk::ImageLayout::eTransferDstOptimal, clear, subresource,
+                                   dld);
+        });
+    }
+    if (use_depth || use_stencil) {
+        View zeta_surface;
+        {
+            MICROPROFILE_SCOPE(Vulkan_RenderTargets);
+            zeta_surface = texture_cache.GetDepthBufferSurface(false);
+        }
+
+        zeta_surface->Transition(vk::ImageLayout::eTransferDstOptimal,
+                                 vk::PipelineStageFlagBits::eTransfer,
+                                 vk::AccessFlagBits::eTransferWrite);
+
+        const vk::ClearDepthStencilValue clear(regs.clear_depth,
+                                               static_cast<u32>(regs.clear_stencil));
+        scheduler.Record([image = zeta_surface->GetImage(),
+                          subresource = zeta_surface->GetImageSubresourceRange(),
+                          clear](auto cmdbuf, auto& dld) {
+            cmdbuf.clearDepthStencilImage(image, vk::ImageLayout::eTransferDstOptimal, clear,
+                                          subresource, dld);
+        });
+    }
+}
+
+void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
+    MICROPROFILE_SCOPE(Vulkan_Compute);
+    update_descriptor_queue.Acquire();
+    sampled_views.clear();
+    image_views.clear();
+
+    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const ComputePipelineCacheKey key{
+        code_addr,
+        launch_desc.shared_alloc,
+        {launch_desc.block_dim_x, launch_desc.block_dim_y, launch_desc.block_dim_z}};
+    auto& pipeline = pipeline_cache.GetComputePipeline(key);
+
+    // Compute dispatches can't be executed inside a renderpass
+    scheduler.RequestOutsideRenderPassOperationContext();
+
+    buffer_cache.Map(CalculateComputeStreamBufferSize());
+
+    const auto& entries = pipeline.GetEntries();
+    SetupComputeConstBuffers(entries);
+    SetupComputeGlobalBuffers(entries);
+    SetupComputeTexelBuffers(entries);
+    SetupComputeTextures(entries);
+    SetupComputeImages(entries);
+
+    buffer_cache.Unmap();
+
+    TransitionImages(sampled_views, vk::PipelineStageFlagBits::eComputeShader,
+                     vk::AccessFlagBits::eShaderRead);
+    TransitionImages(image_views, vk::PipelineStageFlagBits::eComputeShader,
+                     vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite);
+
+    if (device.IsNvDeviceDiagnosticCheckpoints()) {
+        scheduler.Record(
+            [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); });
+    }
+
+    scheduler.Record([grid_x = launch_desc.grid_dim_x, grid_y = launch_desc.grid_dim_y,
+                      grid_z = launch_desc.grid_dim_z, pipeline_handle = pipeline.GetHandle(),
+                      layout = pipeline.GetLayout(),
+                      descriptor_set = pipeline.CommitDescriptorSet()](auto cmdbuf, auto& dld) {
+        cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline_handle, dld);
+        cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
layout, DESCRIPTOR_SET, 1, + &descriptor_set, 0, nullptr, dld); + cmdbuf.dispatch(grid_x, grid_y, grid_z, dld); + }); +} + +void RasterizerVulkan::FlushAll() {} + +void RasterizerVulkan::FlushRegion(CacheAddr addr, u64 size) { + texture_cache.FlushRegion(addr, size); + buffer_cache.FlushRegion(addr, size); +} + +void RasterizerVulkan::InvalidateRegion(CacheAddr addr, u64 size) { + texture_cache.InvalidateRegion(addr, size); + pipeline_cache.InvalidateRegion(addr, size); + buffer_cache.InvalidateRegion(addr, size); +} + +void RasterizerVulkan::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { + FlushRegion(addr, size); + InvalidateRegion(addr, size); +} + +void RasterizerVulkan::FlushCommands() { + if (draw_counter > 0) { + draw_counter = 0; + scheduler.Flush(); + } +} + +void RasterizerVulkan::TickFrame() { + draw_counter = 0; + update_descriptor_queue.TickFrame(); + buffer_cache.TickFrame(); + staging_pool.TickFrame(); +} + +bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, + const Tegra::Engines::Fermi2D::Regs::Surface& dst, + const Tegra::Engines::Fermi2D::Config& copy_config) { + texture_cache.DoFermiCopy(src, dst, copy_config); + return true; +} + +bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, + VAddr framebuffer_addr, u32 pixel_stride) { + if (!framebuffer_addr) { + return false; + } + + const u8* host_ptr{system.Memory().GetPointer(framebuffer_addr)}; + const auto surface{texture_cache.TryFindFramebufferSurface(host_ptr)}; + if (!surface) { + return false; + } + + // Verify that the cached surface is the same size and format as the requested framebuffer + const auto& params{surface->GetSurfaceParams()}; + const auto& pixel_format{ + VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)}; + ASSERT_MSG(params.width == config.width, "Framebuffer width is different"); + ASSERT_MSG(params.height == config.height, "Framebuffer height is different"); + + screen_info.image = &surface->GetImage(); + screen_info.width = params.width; + screen_info.height = params.height; + screen_info.is_srgb = surface->GetSurfaceParams().srgb_conversion; + return true; +} + +void RasterizerVulkan::FlushWork() { + static constexpr u32 DRAWS_TO_DISPATCH = 4096; + + // Only check multiples of 8 draws + static_assert(DRAWS_TO_DISPATCH % 8 == 0); + if ((++draw_counter & 7) != 7) { + return; + } + + if (draw_counter < DRAWS_TO_DISPATCH) { + // Send recorded tasks to the worker thread + scheduler.DispatchWork(); + return; + } + + // Otherwise (every certain number of draws) flush execution. + // This submits commands to the Vulkan driver. 
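+    // Worked through with the constants above: the counter is tested on every 8th draw
+    // (7, 15, 23, ...); each test below DRAWS_TO_DISPATCH hands recorded work to the
+    // worker thread, and the first test at or past it (draw_counter == 4103) falls
+    // through to the Flush() below and resets the counter.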
+ scheduler.Flush(); + draw_counter = 0; +} + +RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { + MICROPROFILE_SCOPE(Vulkan_RenderTargets); + auto& dirty = system.GPU().Maxwell3D().dirty; + const bool update_rendertargets = dirty.render_settings; + dirty.render_settings = false; + + texture_cache.GuardRenderTargets(true); + + Texceptions texceptions; + for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { + if (update_rendertargets) { + color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true); + } + if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) { + texceptions.set(rt); + } + } + + if (update_rendertargets) { + zeta_attachment = texture_cache.GetDepthBufferSurface(true); + } + if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) { + texceptions.set(ZETA_TEXCEPTION_INDEX); + } + + texture_cache.GuardRenderTargets(false); + + return texceptions; +} + +bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachment) { + bool overlap = false; + for (auto& [view, layout] : sampled_views) { + if (!attachment.IsSameSurface(*view)) { + continue; + } + overlap = true; + *layout = vk::ImageLayout::eGeneral; + } + return overlap; +} + +std::tuple<vk::Framebuffer, vk::Extent2D> RasterizerVulkan::ConfigureFramebuffers( + vk::RenderPass renderpass) { + FramebufferCacheKey key{renderpass, std::numeric_limits<u32>::max(), + std::numeric_limits<u32>::max()}; + + const auto MarkAsModifiedAndPush = [&](const View& view) { + if (view == nullptr) { + return false; + } + key.views.push_back(view->GetHandle()); + key.width = std::min(key.width, view->GetWidth()); + key.height = std::min(key.height, view->GetHeight()); + return true; + }; + + for (std::size_t index = 0; index < std::size(color_attachments); ++index) { + if (MarkAsModifiedAndPush(color_attachments[index])) { + texture_cache.MarkColorBufferInUse(index); + } + } + if (MarkAsModifiedAndPush(zeta_attachment)) { + texture_cache.MarkDepthBufferInUse(); + } + + const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key); + auto& framebuffer = fbentry->second; + if (is_cache_miss) { + const vk::FramebufferCreateInfo framebuffer_ci({}, key.renderpass, + static_cast<u32>(key.views.size()), + key.views.data(), key.width, key.height, 1); + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + framebuffer = dev.createFramebufferUnique(framebuffer_ci, nullptr, dld); + } + + return {*framebuffer, vk::Extent2D{key.width, key.height}}; +} + +RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineState& fixed_state, + BufferBindings& buffer_bindings, + bool is_indexed, + bool is_instanced) { + MICROPROFILE_SCOPE(Vulkan_Geometry); + + const auto& gpu = system.GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + SetupVertexArrays(fixed_state.vertex_input, buffer_bindings); + + const u32 base_instance = regs.vb_base_instance; + const u32 num_instances = is_instanced ? gpu.mme_draw.instance_count : 1; + const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first; + const u32 num_vertices = is_indexed ? 
regs.index_array.count : regs.vertex_buffer.count; + + DrawParameters params{base_instance, num_instances, base_vertex, num_vertices, is_indexed}; + SetupIndexBuffer(buffer_bindings, params, is_indexed); + + return params; +} + +void RasterizerVulkan::SetupShaderDescriptors( + const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + texture_cache.GuardSamplers(true); + + for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { + // Skip VertexA stage + const auto& shader = shaders[stage + 1]; + if (!shader) { + continue; + } + const auto& entries = shader->GetEntries(); + SetupGraphicsConstBuffers(entries, stage); + SetupGraphicsGlobalBuffers(entries, stage); + SetupGraphicsTexelBuffers(entries, stage); + SetupGraphicsTextures(entries, stage); + SetupGraphicsImages(entries, stage); + } + texture_cache.GuardSamplers(false); +} + +void RasterizerVulkan::SetupImageTransitions( + Texceptions texceptions, const std::array<View, Maxwell::NumRenderTargets>& color_attachments, + const View& zeta_attachment) { + TransitionImages(sampled_views, vk::PipelineStageFlagBits::eAllGraphics, + vk::AccessFlagBits::eShaderRead); + TransitionImages(image_views, vk::PipelineStageFlagBits::eAllGraphics, + vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite); + + for (std::size_t rt = 0; rt < std::size(color_attachments); ++rt) { + const auto color_attachment = color_attachments[rt]; + if (color_attachment == nullptr) { + continue; + } + const auto image_layout = + texceptions[rt] ? vk::ImageLayout::eGeneral : vk::ImageLayout::eColorAttachmentOptimal; + color_attachment->Transition( + image_layout, vk::PipelineStageFlagBits::eColorAttachmentOutput, + vk::AccessFlagBits::eColorAttachmentRead | vk::AccessFlagBits::eColorAttachmentWrite); + } + + if (zeta_attachment != nullptr) { + const auto image_layout = texceptions[ZETA_TEXCEPTION_INDEX] + ? 
vk::ImageLayout::eGeneral + : vk::ImageLayout::eDepthStencilAttachmentOptimal; + zeta_attachment->Transition(image_layout, vk::PipelineStageFlagBits::eLateFragmentTests, + vk::AccessFlagBits::eDepthStencilAttachmentRead | + vk::AccessFlagBits::eDepthStencilAttachmentWrite); + } +} + +void RasterizerVulkan::UpdateDynamicStates() { + auto& gpu = system.GPU().Maxwell3D(); + UpdateViewportsState(gpu); + UpdateScissorsState(gpu); + UpdateDepthBias(gpu); + UpdateBlendConstants(gpu); + UpdateDepthBounds(gpu); + UpdateStencilFaces(gpu); +} + +void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, + BufferBindings& buffer_bindings) { + const auto& regs = system.GPU().Maxwell3D().regs; + + for (u32 index = 0; index < static_cast<u32>(Maxwell::NumVertexAttributes); ++index) { + const auto& attrib = regs.vertex_attrib_format[index]; + if (!attrib.IsValid()) { + continue; + } + + const auto& buffer = regs.vertex_array[attrib.buffer]; + ASSERT(buffer.IsEnabled()); + + vertex_input.attributes[vertex_input.num_attributes++] = + FixedPipelineState::VertexAttribute(index, attrib.buffer, attrib.type, attrib.size, + attrib.offset); + } + + for (u32 index = 0; index < static_cast<u32>(Maxwell::NumVertexArrays); ++index) { + const auto& vertex_array = regs.vertex_array[index]; + if (!vertex_array.IsEnabled()) { + continue; + } + + const GPUVAddr start{vertex_array.StartAddress()}; + const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; + + ASSERT(end > start); + const std::size_t size{end - start + 1}; + const auto [buffer, offset] = buffer_cache.UploadMemory(start, size); + + vertex_input.bindings[vertex_input.num_bindings++] = FixedPipelineState::VertexBinding( + index, vertex_array.stride, + regs.instanced_arrays.IsInstancingEnabled(index) ? 
vertex_array.divisor : 0); + buffer_bindings.AddVertexBinding(buffer, offset); + } +} + +void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, + bool is_indexed) { + const auto& regs = system.GPU().Maxwell3D().regs; + switch (regs.draw.topology) { + case Maxwell::PrimitiveTopology::Quads: + if (params.is_indexed) { + UNIMPLEMENTED(); + } else { + const auto [buffer, offset] = + quad_array_pass.Assemble(params.num_vertices, params.base_vertex); + buffer_bindings.SetIndexBinding(&buffer, offset, vk::IndexType::eUint32); + params.base_vertex = 0; + params.num_vertices = params.num_vertices * 6 / 4; + params.is_indexed = true; + } + break; + default: { + if (!is_indexed) { + break; + } + const GPUVAddr gpu_addr = regs.index_array.IndexStart(); + auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + + auto format = regs.index_array.format; + const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte; + if (is_uint8 && !device.IsExtIndexTypeUint8Supported()) { + std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, *buffer, offset); + format = Maxwell::IndexFormat::UnsignedShort; + } + + buffer_bindings.SetIndexBinding(buffer, offset, MaxwellToVK::IndexFormat(device, format)); + break; + } + } +} + +void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_ConstBuffers); + const auto& gpu = system.GPU().Maxwell3D(); + const auto& shader_stage = gpu.state.shader_stages[stage]; + for (const auto& entry : entries.const_buffers) { + SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]); + } +} + +void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); + auto& gpu{system.GPU()}; + const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage]}; + + for (const auto& entry : entries.global_buffers) { + const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset(); + SetupGlobalBuffer(entry, addr); + } +} + +void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_Textures); + const auto& gpu = system.GPU().Maxwell3D(); + for (const auto& entry : entries.texel_buffers) { + const auto image = GetTextureInfo(gpu, entry, stage).tic; + SetupTexelBuffer(image, entry); + } +} + +void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_Textures); + const auto& gpu = system.GPU().Maxwell3D(); + for (const auto& entry : entries.samplers) { + const auto texture = GetTextureInfo(gpu, entry, stage); + SetupTexture(texture, entry); + } +} + +void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_Images); + const auto& gpu = system.GPU().KeplerCompute(); + for (const auto& entry : entries.images) { + const auto tic = GetTextureInfo(gpu, entry, stage).tic; + SetupImage(tic, entry); + } +} + +void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_ConstBuffers); + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + for (const auto& entry : entries.const_buffers) { + const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; + const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); + Tegra::Engines::ConstBufferInfo 
buffer; + buffer.address = config.Address(); + buffer.size = config.size; + buffer.enabled = mask[entry.GetIndex()]; + SetupConstBuffer(entry, buffer); + } +} + +void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); + const auto cbufs{system.GPU().KeplerCompute().launch_description.const_buffer_config}; + for (const auto& entry : entries.global_buffers) { + const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; + SetupGlobalBuffer(entry, addr); + } +} + +void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_Textures); + const auto& gpu = system.GPU().KeplerCompute(); + for (const auto& entry : entries.texel_buffers) { + const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + SetupTexelBuffer(image, entry); + } +} + +void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_Textures); + const auto& gpu = system.GPU().KeplerCompute(); + for (const auto& entry : entries.samplers) { + const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex); + SetupTexture(texture, entry); + } +} + +void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_Images); + const auto& gpu = system.GPU().KeplerCompute(); + for (const auto& entry : entries.images) { + const auto tic = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + SetupImage(tic, entry); + } +} + +void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, + const Tegra::Engines::ConstBufferInfo& buffer) { + // Align the size to avoid bad std140 interactions + const std::size_t size = + Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); + ASSERT(size <= MaxConstbufferSize); + + const auto [buffer_handle, offset] = + buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); + + update_descriptor_queue.AddBuffer(buffer_handle, offset, size); +} + +void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { + auto& memory_manager{system.GPU().MemoryManager()}; + const auto actual_addr = memory_manager.Read<u64>(address); + const auto size = memory_manager.Read<u32>(address + 8); + + if (size == 0) { + // Sometimes global memory pointers don't have a proper size. Upload a dummy entry because + // Vulkan doesn't like empty buffers. 
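+        // (A buffer descriptor's range must be at least one byte unless VK_WHOLE_SIZE
+        // is used, so a zero-sized binding cannot be expressed directly.)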
+ constexpr std::size_t dummy_size = 4; + const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size); + update_descriptor_queue.AddBuffer(buffer, 0, dummy_size); + return; + } + + const auto [buffer, offset] = buffer_cache.UploadMemory( + actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten()); + update_descriptor_queue.AddBuffer(buffer, offset, size); +} + +void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic, + const TexelBufferEntry& entry) { + const auto view = texture_cache.GetTextureSurface(tic, entry); + ASSERT(view->IsBufferView()); + + update_descriptor_queue.AddTexelBuffer(view->GetBufferView()); +} + +void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& texture, + const SamplerEntry& entry) { + auto view = texture_cache.GetTextureSurface(texture.tic, entry); + ASSERT(!view->IsBufferView()); + + const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); + const auto sampler = sampler_cache.GetSampler(texture.tsc); + update_descriptor_queue.AddSampledImage(sampler, image_view); + + const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + *image_layout = vk::ImageLayout::eShaderReadOnlyOptimal; + sampled_views.push_back(ImageView{std::move(view), image_layout}); +} + +void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) { + auto view = texture_cache.GetImageSurface(tic, entry); + + if (entry.IsWritten()) { + view->MarkAsModified(texture_cache.Tick()); + } + + UNIMPLEMENTED_IF(tic.IsBuffer()); + + const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + update_descriptor_queue.AddImage(image_view); + + const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + *image_layout = vk::ImageLayout::eGeneral; + image_views.push_back(ImageView{std::move(view), image_layout}); +} + +void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D& gpu) { + if (!gpu.dirty.viewport_transform && scheduler.TouchViewports()) { + return; + } + gpu.dirty.viewport_transform = false; + const auto& regs = gpu.regs; + const std::array viewports{ + GetViewportState(device, regs, 0), GetViewportState(device, regs, 1), + GetViewportState(device, regs, 2), GetViewportState(device, regs, 3), + GetViewportState(device, regs, 4), GetViewportState(device, regs, 5), + GetViewportState(device, regs, 6), GetViewportState(device, regs, 7), + GetViewportState(device, regs, 8), GetViewportState(device, regs, 9), + GetViewportState(device, regs, 10), GetViewportState(device, regs, 11), + GetViewportState(device, regs, 12), GetViewportState(device, regs, 13), + GetViewportState(device, regs, 14), GetViewportState(device, regs, 15)}; + scheduler.Record([viewports](auto cmdbuf, auto& dld) { + cmdbuf.setViewport(0, static_cast<u32>(viewports.size()), viewports.data(), dld); + }); +} + +void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D& gpu) { + if (!gpu.dirty.scissor_test && scheduler.TouchScissors()) { + return; + } + gpu.dirty.scissor_test = false; + const auto& regs = gpu.regs; + const std::array scissors = { + GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), + GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), + GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8), + GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11), + 
GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14), + GetScissorState(regs, 15)}; + scheduler.Record([scissors](auto cmdbuf, auto& dld) { + cmdbuf.setScissor(0, static_cast<u32>(scissors.size()), scissors.data(), dld); + }); +} + +void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D& gpu) { + if (!gpu.dirty.polygon_offset && scheduler.TouchDepthBias()) { + return; + } + gpu.dirty.polygon_offset = false; + const auto& regs = gpu.regs; + scheduler.Record([constant = regs.polygon_offset_units, clamp = regs.polygon_offset_clamp, + factor = regs.polygon_offset_factor](auto cmdbuf, auto& dld) { + cmdbuf.setDepthBias(constant, clamp, factor / 2.0f, dld); + }); +} + +void RasterizerVulkan::UpdateBlendConstants(Tegra::Engines::Maxwell3D& gpu) { + if (!gpu.dirty.blend_state && scheduler.TouchBlendConstants()) { + return; + } + gpu.dirty.blend_state = false; + const std::array blend_color = {gpu.regs.blend_color.r, gpu.regs.blend_color.g, + gpu.regs.blend_color.b, gpu.regs.blend_color.a}; + scheduler.Record([blend_color](auto cmdbuf, auto& dld) { + cmdbuf.setBlendConstants(blend_color.data(), dld); + }); +} + +void RasterizerVulkan::UpdateDepthBounds(Tegra::Engines::Maxwell3D& gpu) { + if (!gpu.dirty.depth_bounds_values && scheduler.TouchDepthBounds()) { + return; + } + gpu.dirty.depth_bounds_values = false; + const auto& regs = gpu.regs; + scheduler.Record([min = regs.depth_bounds[0], max = regs.depth_bounds[1]]( + auto cmdbuf, auto& dld) { cmdbuf.setDepthBounds(min, max, dld); }); +} + +void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D& gpu) { + if (!gpu.dirty.stencil_test && scheduler.TouchStencilValues()) { + return; + } + gpu.dirty.stencil_test = false; + const auto& regs = gpu.regs; + if (regs.stencil_two_side_enable) { + // Separate values per face + scheduler.Record( + [front_ref = regs.stencil_front_func_ref, front_write_mask = regs.stencil_front_mask, + front_test_mask = regs.stencil_front_func_mask, back_ref = regs.stencil_back_func_ref, + back_write_mask = regs.stencil_back_mask, + back_test_mask = regs.stencil_back_func_mask](auto cmdbuf, auto& dld) { + // Front face + cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eFront, front_ref, dld); + cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eFront, front_write_mask, dld); + cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eFront, front_test_mask, dld); + + // Back face + cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eBack, back_ref, dld); + cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eBack, back_write_mask, dld); + cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eBack, back_test_mask, dld); + }); + } else { + // Front face defines both faces + scheduler.Record([ref = regs.stencil_back_func_ref, write_mask = regs.stencil_back_mask, + test_mask = regs.stencil_back_func_mask](auto cmdbuf, auto& dld) { + cmdbuf.setStencilReference(vk::StencilFaceFlagBits::eFrontAndBack, ref, dld); + cmdbuf.setStencilWriteMask(vk::StencilFaceFlagBits::eFrontAndBack, write_mask, dld); + cmdbuf.setStencilCompareMask(vk::StencilFaceFlagBits::eFrontAndBack, test_mask, dld); + }); + } +} + +std::size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const { + std::size_t size = CalculateVertexArraysSize(); + if (is_indexed) { + size = Common::AlignUp(size, 4) + CalculateIndexBufferSize(); + } + size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); + return size; +} + +std::size_t 
RasterizerVulkan::CalculateComputeStreamBufferSize() const { + return Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); +} + +std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { + const auto& regs = system.GPU().Maxwell3D().regs; + + std::size_t size = 0; + for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { + // This implementation assumes that all attributes are used in the shader. + const GPUVAddr start{regs.vertex_array[index].StartAddress()}; + const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; + DEBUG_ASSERT(end > start); + + size += (end - start + 1) * regs.vertex_array[index].enable; + } + return size; +} + +std::size_t RasterizerVulkan::CalculateIndexBufferSize() const { + const auto& regs = system.GPU().Maxwell3D().regs; + return static_cast<std::size_t>(regs.index_array.count) * + static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); +} + +std::size_t RasterizerVulkan::CalculateConstBufferSize( + const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) const { + if (entry.IsIndirect()) { + // Buffer is accessed indirectly, so upload the entire thing + return buffer.size; + } else { + // Buffer is accessed directly, upload just what we use + return entry.GetSize(); + } +} + +RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions) const { + using namespace VideoCore::Surface; + + const auto& regs = system.GPU().Maxwell3D().regs; + RenderPassParams renderpass_params; + + for (std::size_t rt = 0; rt < static_cast<std::size_t>(regs.rt_control.count); ++rt) { + const auto& rendertarget = regs.rt[rt]; + if (rendertarget.Address() == 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) + continue; + renderpass_params.color_attachments.push_back(RenderPassParams::ColorAttachment{ + static_cast<u32>(rt), PixelFormatFromRenderTargetFormat(rendertarget.format), + texceptions.test(rt)}); + } + + renderpass_params.has_zeta = regs.zeta_enable; + if (renderpass_params.has_zeta) { + renderpass_params.zeta_pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); + renderpass_params.zeta_texception = texceptions[ZETA_TEXCEPTION_INDEX]; + } + + return renderpass_params; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index fc324952b..7be71e734 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -4,10 +4,260 @@ #pragma once +#include <array> +#include <bitset> +#include <memory> +#include <utility> +#include <vector> + +#include <boost/container/static_vector.hpp> +#include <boost/functional/hash.hpp> + +#include "common/common_types.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_accelerated.h" #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/fixed_pipeline_state.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_compute_pass.h" +#include "video_core/renderer_vulkan/vk_descriptor_pool.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_renderpass_cache.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_sampler_cache.h" +#include 
"video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/renderer_vulkan/vk_update_descriptor.h" + +namespace Core { +class System; +} + +namespace Core::Frontend { +class EmuWindow; +} + +namespace Tegra::Engines { +class Maxwell3D; +} + +namespace Vulkan { + +struct VKScreenInfo; + +using ImageViewsPack = + boost::container::static_vector<vk::ImageView, Maxwell::NumRenderTargets + 1>; + +struct FramebufferCacheKey { + vk::RenderPass renderpass{}; + u32 width = 0; + u32 height = 0; + ImageViewsPack views; + + std::size_t Hash() const noexcept { + std::size_t hash = 0; + boost::hash_combine(hash, static_cast<VkRenderPass>(renderpass)); + for (const auto& view : views) { + boost::hash_combine(hash, static_cast<VkImageView>(view)); + } + boost::hash_combine(hash, width); + boost::hash_combine(hash, height); + return hash; + } + + bool operator==(const FramebufferCacheKey& rhs) const noexcept { + return std::tie(renderpass, views, width, height) == + std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height); + } +}; + +} // namespace Vulkan + +namespace std { + +template <> +struct hash<Vulkan::FramebufferCacheKey> { + std::size_t operator()(const Vulkan::FramebufferCacheKey& k) const noexcept { + return k.Hash(); + } +}; + +} // namespace std namespace Vulkan { -class RasterizerVulkan : public VideoCore::RasterizerInterface {}; +class BufferBindings; + +struct ImageView { + View view; + vk::ImageLayout* layout = nullptr; +}; + +class RasterizerVulkan : public VideoCore::RasterizerAccelerated { +public: + explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, + VKScreenInfo& screen_info, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKScheduler& scheduler); + ~RasterizerVulkan() override; + + bool DrawBatch(bool is_indexed) override; + bool DrawMultiBatch(bool is_indexed) override; + void Clear() override; + void DispatchCompute(GPUVAddr code_addr) override; + void FlushAll() override; + void FlushRegion(CacheAddr addr, u64 size) override; + void InvalidateRegion(CacheAddr addr, u64 size) override; + void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; + void FlushCommands() override; + void TickFrame() override; + bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, + const Tegra::Engines::Fermi2D::Regs::Surface& dst, + const Tegra::Engines::Fermi2D::Config& copy_config) override; + bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, + u32 pixel_stride) override; + + /// Maximum supported size that a constbuffer can have in bytes. 
+    static constexpr std::size_t MaxConstbufferSize = 0x10000;
+    static_assert(MaxConstbufferSize % (4 * sizeof(float)) == 0,
+                  "The maximum size of a constbuffer must be a multiple of the size of a vec4");
+
+private:
+    struct DrawParameters {
+        void Draw(vk::CommandBuffer cmdbuf, const vk::DispatchLoaderDynamic& dld) const;
+
+        u32 base_instance = 0;
+        u32 num_instances = 0;
+        u32 base_vertex = 0;
+        u32 num_vertices = 0;
+        bool is_indexed = false;
+    };
+
+    using Texceptions = std::bitset<Maxwell::NumRenderTargets + 1>;
+
+    static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8;
+
+    void Draw(bool is_indexed, bool is_instanced);
+
+    void FlushWork();
+
+    Texceptions UpdateAttachments();
+
+    std::tuple<vk::Framebuffer, vk::Extent2D> ConfigureFramebuffers(vk::RenderPass renderpass);
+
+    /// Sets up geometry buffers and state.
+    DrawParameters SetupGeometry(FixedPipelineState& fixed_state, BufferBindings& buffer_bindings,
+                                 bool is_indexed, bool is_instanced);
+
+    /// Sets up descriptors in the graphics pipeline.
+    void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders);
+
+    void SetupImageTransitions(Texceptions texceptions,
+                               const std::array<View, Maxwell::NumRenderTargets>& color_attachments,
+                               const View& zeta_attachment);
+
+    void UpdateDynamicStates();
+
+    bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment);
+
+    void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,
+                           BufferBindings& buffer_bindings);
+
+    void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed);
+
+    /// Sets up constant buffers in the graphics pipeline.
+    void SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage);
+
+    /// Sets up global buffers in the graphics pipeline.
+    void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
+
+    /// Sets up texel buffers in the graphics pipeline.
+    void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage);
+
+    /// Sets up textures in the graphics pipeline.
+    void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage);
+
+    /// Sets up images in the graphics pipeline.
+    void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
+
+    /// Sets up constant buffers in the compute pipeline.
+    void SetupComputeConstBuffers(const ShaderEntries& entries);
+
+    /// Sets up global buffers in the compute pipeline.
+    void SetupComputeGlobalBuffers(const ShaderEntries& entries);
+
+    /// Sets up texel buffers in the compute pipeline.
+    void SetupComputeTexelBuffers(const ShaderEntries& entries);
+
+    /// Sets up textures in the compute pipeline.
+    void SetupComputeTextures(const ShaderEntries& entries);
+
+    /// Sets up images in the compute pipeline.
+ void SetupComputeImages(const ShaderEntries& entries); + + void SetupConstBuffer(const ConstBufferEntry& entry, + const Tegra::Engines::ConstBufferInfo& buffer); + + void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); + + void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry); + + void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); + + void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); + + void UpdateViewportsState(Tegra::Engines::Maxwell3D& gpu); + void UpdateScissorsState(Tegra::Engines::Maxwell3D& gpu); + void UpdateDepthBias(Tegra::Engines::Maxwell3D& gpu); + void UpdateBlendConstants(Tegra::Engines::Maxwell3D& gpu); + void UpdateDepthBounds(Tegra::Engines::Maxwell3D& gpu); + void UpdateStencilFaces(Tegra::Engines::Maxwell3D& gpu); + + std::size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; + + std::size_t CalculateComputeStreamBufferSize() const; + + std::size_t CalculateVertexArraysSize() const; + + std::size_t CalculateIndexBufferSize() const; + + std::size_t CalculateConstBufferSize(const ConstBufferEntry& entry, + const Tegra::Engines::ConstBufferInfo& buffer) const; + + RenderPassParams GetRenderPassParams(Texceptions texceptions) const; + + Core::System& system; + Core::Frontend::EmuWindow& render_window; + VKScreenInfo& screen_info; + const VKDevice& device; + VKResourceManager& resource_manager; + VKMemoryManager& memory_manager; + VKScheduler& scheduler; + + VKStagingBufferPool staging_pool; + VKDescriptorPool descriptor_pool; + VKUpdateDescriptorQueue update_descriptor_queue; + QuadArrayPass quad_array_pass; + Uint8Pass uint8_pass; + + VKTextureCache texture_cache; + VKPipelineCache pipeline_cache; + VKBufferCache buffer_cache; + VKSamplerCache sampler_cache; + + std::array<View, Maxwell::NumRenderTargets> color_attachments; + View zeta_attachment; + + std::vector<ImageView> sampled_views; + std::vector<ImageView> image_views; + + u32 draw_counter = 0; + + // TODO(Rodrigo): Invalidate on image destruction + std::unordered_map<FramebufferCacheKey, UniqueFramebuffer> framebuffer_cache; +}; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index 1ce583f75..0a8ec8398 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -46,9 +46,9 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) {}, MaxwellToVK::Sampler::Filter(tsc.mag_filter), MaxwellToVK::Sampler::Filter(tsc.min_filter), MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_u, tsc.mag_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_v, tsc.mag_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(), has_anisotropy, max_anisotropy, tsc.depth_compare_enabled, MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), tsc.GetMinLod(), tsc.GetMaxLod(), vk_border_color.value_or(vk::BorderColor::eFloatTransparentBlack), diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 8fe852ce8..1ab22251e 100644 --- 
a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -542,11 +542,10 @@ private: return; } - for (u32 rt = 0; rt < static_cast<u32>(frag_colors.size()); ++rt) { - if (!specialization.enabled_rendertargets[rt]) { + for (u32 rt = 0; rt < static_cast<u32>(std::size(frag_colors)); ++rt) { + if (!IsRenderTargetEnabled(rt)) { continue; } - const Id id = AddGlobalVariable(OpVariable(t_out_float4, spv::StorageClass::Output)); Name(id, fmt::format("frag_color{}", rt)); Decorate(id, spv::Decoration::Location, rt); @@ -852,6 +851,15 @@ private: return binding; } + bool IsRenderTargetEnabled(u32 rt) const { + for (u32 component = 0; component < 4; ++component) { + if (header.ps.IsColorComponentOutputEnabled(rt, component)) { + return true; + } + } + return false; + } + bool IsInputAttributeArray() const { return stage == ShaderType::TesselationControl || stage == ShaderType::TesselationEval || stage == ShaderType::Geometry; @@ -1115,15 +1123,7 @@ private: } if (const auto gmem = std::get_if<GmemNode>(&*node)) { - const Id gmem_buffer = global_buffers.at(gmem->GetDescriptor()); - const Id real = AsUint(Visit(gmem->GetRealAddress())); - const Id base = AsUint(Visit(gmem->GetBaseAddress())); - - Id offset = OpISub(t_uint, real, base); - offset = OpUDiv(t_uint, offset, Constant(t_uint, 4U)); - return {OpLoad(t_float, - OpAccessChain(t_gmem_float, gmem_buffer, Constant(t_uint, 0U), offset)), - Type::Float}; + return {OpLoad(t_uint, GetGlobalMemoryPointer(*gmem)), Type::Uint}; } if (const auto lmem = std::get_if<LmemNode>(&*node)) { @@ -1134,10 +1134,7 @@ private: } if (const auto smem = std::get_if<SmemNode>(&*node)) { - Id address = AsUint(Visit(smem->GetAddress())); - address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); - const Id pointer = OpAccessChain(t_smem_uint, shared_memory, address); - return {OpLoad(t_uint, pointer), Type::Uint}; + return {OpLoad(t_uint, GetSharedMemoryPointer(*smem)), Type::Uint}; } if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { @@ -1331,20 +1328,10 @@ private: target = {OpAccessChain(t_prv_float, local_memory, address), Type::Float}; } else if (const auto smem = std::get_if<SmemNode>(&*dest)) { - ASSERT(stage == ShaderType::Compute); - Id address = AsUint(Visit(smem->GetAddress())); - address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); - target = {OpAccessChain(t_smem_uint, shared_memory, address), Type::Uint}; + target = {GetSharedMemoryPointer(*smem), Type::Uint}; } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { - const Id real = AsUint(Visit(gmem->GetRealAddress())); - const Id base = AsUint(Visit(gmem->GetBaseAddress())); - const Id diff = OpISub(t_uint, real, base); - const Id offset = OpShiftRightLogical(t_uint, diff, Constant(t_uint, 2)); - - const Id gmem_buffer = global_buffers.at(gmem->GetDescriptor()); - target = {OpAccessChain(t_gmem_float, gmem_buffer, Constant(t_uint, 0), offset), - Type::Float}; + target = {GetGlobalMemoryPointer(*gmem), Type::Uint}; } else { UNIMPLEMENTED(); @@ -1796,6 +1783,24 @@ private: return {}; } + Expression AtomicAdd(Operation operation) { + Id pointer; + if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { + pointer = GetSharedMemoryPointer(*smem); + } else if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { + pointer = GetGlobalMemoryPointer(*gmem); + } else { + UNREACHABLE(); + return {Constant(t_uint, 0), Type::Uint}; + } + + const Id scope = Constant(t_uint, 
static_cast<u32>(spv::Scope::Device)); + const Id semantics = Constant(t_uint, 0U); + + const Id value = AsUint(Visit(operation[1])); + return {OpAtomicIAdd(t_uint, pointer, scope, semantics, value), Type::Uint}; + } + Expression Branch(Operation operation) { const auto& target = std::get<ImmediateNode>(*operation[0]); OpStore(jmp_to, Constant(t_uint, target.GetValue())); @@ -1876,19 +1881,14 @@ private: // rendertargets/components are skipped in the register assignment. u32 current_reg = 0; for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { - if (!specialization.enabled_rendertargets[rt]) { - // Skip rendertargets that are not enabled - continue; - } // TODO(Subv): Figure out how dual-source blending is configured in the Switch. for (u32 component = 0; component < 4; ++component) { - const Id pointer = AccessElement(t_out_float, frag_colors.at(rt), component); - if (header.ps.IsColorComponentOutputEnabled(rt, component)) { - OpStore(pointer, SafeGetRegister(current_reg)); - ++current_reg; - } else { - OpStore(pointer, component == 3 ? v_float_one : v_float_zero); + if (!header.ps.IsColorComponentOutputEnabled(rt, component)) { + continue; } + const Id pointer = AccessElement(t_out_float, frag_colors[rt], component); + OpStore(pointer, SafeGetRegister(current_reg)); + ++current_reg; } } if (header.ps.omap.depth) { @@ -2227,6 +2227,22 @@ private: return {}; } + Id GetGlobalMemoryPointer(const GmemNode& gmem) { + const Id real = AsUint(Visit(gmem.GetRealAddress())); + const Id base = AsUint(Visit(gmem.GetBaseAddress())); + const Id diff = OpISub(t_uint, real, base); + const Id offset = OpShiftRightLogical(t_uint, diff, Constant(t_uint, 2)); + const Id buffer = global_buffers.at(gmem.GetDescriptor()); + return OpAccessChain(t_gmem_uint, buffer, Constant(t_uint, 0), offset); + } + + Id GetSharedMemoryPointer(const SmemNode& smem) { + ASSERT(stage == ShaderType::Compute); + Id address = AsUint(Visit(smem.GetAddress())); + address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); + return OpAccessChain(t_smem_uint, shared_memory, address); + } + static constexpr std::array operation_decompilers = { &SPIRVDecompiler::Assign, @@ -2373,6 +2389,8 @@ private: &SPIRVDecompiler::AtomicImageXor, &SPIRVDecompiler::AtomicImageExchange, + &SPIRVDecompiler::AtomicAdd, + &SPIRVDecompiler::Branch, &SPIRVDecompiler::BranchIndirect, &SPIRVDecompiler::PushFlowStack, @@ -2467,9 +2485,9 @@ private: Id t_smem_uint{}; - const Id t_gmem_float = TypePointer(spv::StorageClass::StorageBuffer, t_float); + const Id t_gmem_uint = TypePointer(spv::StorageClass::StorageBuffer, t_uint); const Id t_gmem_array = - Name(Decorate(TypeRuntimeArray(t_float), spv::Decoration::ArrayStride, 4U), "GmemArray"); + Name(Decorate(TypeRuntimeArray(t_uint), spv::Decoration::ArrayStride, 4U), "GmemArray"); const Id t_gmem_struct = MemberDecorate( Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index 10794be1c..f5dc14d9e 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -102,9 +102,6 @@ struct Specialization final { Maxwell::TessellationSpacing spacing{}; bool clockwise{}; } tessellation; - - // Fragment specific - std::bitset<8> enabled_rendertargets; }; // Old gcc versions don't consider this 
trivially copyable. // static_assert(std::is_trivially_copyable_v<Specialization>); diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 02310375f..4d9488f49 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -13,6 +13,7 @@ #include "video_core/renderer_vulkan/declarations.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" namespace Vulkan { diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index ebc68f030..f47b691a8 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -123,7 +123,7 @@ bool VKSwapchain::Present(vk::Semaphore render_semaphore, VKFence& fence) { ASSERT(fences[image_index] == nullptr); fences[image_index] = &fence; - frame_index = (frame_index + 1) % image_count; + frame_index = (frame_index + 1) % static_cast<u32>(image_count); return recreated; } diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index a1e7938d2..2f3b2ccd5 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -40,19 +40,19 @@ public: return extent; } - u32 GetImageCount() const { + std::size_t GetImageCount() const { return image_count; } - u32 GetImageIndex() const { + std::size_t GetImageIndex() const { return image_index; } - vk::Image GetImageIndex(u32 index) const { + vk::Image GetImageIndex(std::size_t index) const { return images[index]; } - vk::ImageView GetImageViewIndex(u32 index) const { + vk::ImageView GetImageViewIndex(std::size_t index) const { return *image_views[index]; } @@ -77,7 +77,7 @@ private: UniqueSwapchainKHR swapchain; - u32 image_count{}; + std::size_t image_count{}; std::vector<vk::Image> images; std::vector<UniqueImageView> image_views; std::vector<UniqueFramebuffer> framebuffers; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp new file mode 100644 index 000000000..51b0d38a6 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -0,0 +1,475 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <array> +#include <cstddef> +#include <cstring> +#include <memory> +#include <variant> +#include <vector> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "core/core.h" +#include "core/memory.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/morton.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/maxwell_to_vk.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/surface.h" +#include "video_core/textures/convert.h" + +namespace Vulkan { + +using VideoCore::MortonSwizzle; +using VideoCore::MortonSwizzleMode; + +using Tegra::Texture::SwizzleSource; +using VideoCore::Surface::PixelFormat; +using VideoCore::Surface::SurfaceCompression; +using VideoCore::Surface::SurfaceTarget; + +namespace { + +vk::ImageType SurfaceTargetToImage(SurfaceTarget target) { + switch (target) { + case SurfaceTarget::Texture1D: + case SurfaceTarget::Texture1DArray: + return vk::ImageType::e1D; + case SurfaceTarget::Texture2D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubeArray: + return vk::ImageType::e2D; + case SurfaceTarget::Texture3D: + return vk::ImageType::e3D; + } + UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target)); + return {}; +} + +vk::ImageAspectFlags PixelFormatToImageAspect(PixelFormat pixel_format) { + if (pixel_format < PixelFormat::MaxColorFormat) { + return vk::ImageAspectFlagBits::eColor; + } else if (pixel_format < PixelFormat::MaxDepthFormat) { + return vk::ImageAspectFlagBits::eDepth; + } else if (pixel_format < PixelFormat::MaxDepthStencilFormat) { + return vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil; + } else { + UNREACHABLE_MSG("Invalid pixel format={}", static_cast<u32>(pixel_format)); + return vk::ImageAspectFlagBits::eColor; + } +} + +vk::ImageViewType GetImageViewType(SurfaceTarget target) { + switch (target) { + case SurfaceTarget::Texture1D: + return vk::ImageViewType::e1D; + case SurfaceTarget::Texture2D: + return vk::ImageViewType::e2D; + case SurfaceTarget::Texture3D: + return vk::ImageViewType::e3D; + case SurfaceTarget::Texture1DArray: + return vk::ImageViewType::e1DArray; + case SurfaceTarget::Texture2DArray: + return vk::ImageViewType::e2DArray; + case SurfaceTarget::TextureCubemap: + return vk::ImageViewType::eCube; + case SurfaceTarget::TextureCubeArray: + return vk::ImageViewType::eCubeArray; + case SurfaceTarget::TextureBuffer: + break; + } + UNREACHABLE(); + return {}; +} + +UniqueBuffer CreateBuffer(const VKDevice& device, const SurfaceParams& params) { + // TODO(Rodrigo): Move texture buffer creation to the buffer cache + const vk::BufferCreateInfo buffer_ci({}, params.GetHostSizeInBytes(), + vk::BufferUsageFlagBits::eUniformTexelBuffer | + vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst, + vk::SharingMode::eExclusive, 0, nullptr); + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + return dev.createBufferUnique(buffer_ci, nullptr, dld); +} + +vk::BufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, + const SurfaceParams& params, + vk::Buffer buffer) { + 
ASSERT(params.IsBuffer()); + + const auto format = + MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format; + return vk::BufferViewCreateInfo({}, buffer, format, 0, params.GetHostSizeInBytes()); +} + +vk::ImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceParams& params) { + constexpr auto sample_count = vk::SampleCountFlagBits::e1; + constexpr auto tiling = vk::ImageTiling::eOptimal; + + ASSERT(!params.IsBuffer()); + + const auto [format, attachable, storage] = + MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.pixel_format); + + auto image_usage = vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferDst | + vk::ImageUsageFlagBits::eTransferSrc; + if (attachable) { + image_usage |= params.IsPixelFormatZeta() ? vk::ImageUsageFlagBits::eDepthStencilAttachment + : vk::ImageUsageFlagBits::eColorAttachment; + } + if (storage) { + image_usage |= vk::ImageUsageFlagBits::eStorage; + } + + vk::ImageCreateFlags flags; + vk::Extent3D extent; + switch (params.target) { + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubeArray: + flags |= vk::ImageCreateFlagBits::eCubeCompatible; + [[fallthrough]]; + case SurfaceTarget::Texture1D: + case SurfaceTarget::Texture1DArray: + case SurfaceTarget::Texture2D: + case SurfaceTarget::Texture2DArray: + extent = vk::Extent3D(params.width, params.height, 1); + break; + case SurfaceTarget::Texture3D: + extent = vk::Extent3D(params.width, params.height, params.depth); + break; + case SurfaceTarget::TextureBuffer: + UNREACHABLE(); + } + + return vk::ImageCreateInfo(flags, SurfaceTargetToImage(params.target), format, extent, + params.num_levels, static_cast<u32>(params.GetNumLayers()), + sample_count, tiling, image_usage, vk::SharingMode::eExclusive, 0, + nullptr, vk::ImageLayout::eUndefined); +} + +} // Anonymous namespace + +CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool, + GPUVAddr gpu_addr, const SurfaceParams& params) + : SurfaceBase<View>{gpu_addr, params}, system{system}, device{device}, + resource_manager{resource_manager}, memory_manager{memory_manager}, scheduler{scheduler}, + staging_pool{staging_pool} { + if (params.IsBuffer()) { + buffer = CreateBuffer(device, params); + commit = memory_manager.Commit(*buffer, false); + + const auto buffer_view_ci = GenerateBufferViewCreateInfo(device, params, *buffer); + format = buffer_view_ci.format; + + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + buffer_view = dev.createBufferViewUnique(buffer_view_ci, nullptr, dld); + } else { + const auto image_ci = GenerateImageCreateInfo(device, params); + format = image_ci.format; + + image.emplace(device, scheduler, image_ci, PixelFormatToImageAspect(params.pixel_format)); + commit = memory_manager.Commit(image->GetHandle(), false); + } + + // TODO(Rodrigo): Move this to a virtual function. 
+ main_view = CreateViewInner( + ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels), + true); +} + +CachedSurface::~CachedSurface() = default; + +void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { + // To upload data we have to be outside of a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + if (params.IsBuffer()) { + UploadBuffer(staging_buffer); + } else { + UploadImage(staging_buffer); + } +} + +void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { + UNIMPLEMENTED_IF(params.IsBuffer()); + + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + LOG_WARNING(Render_Vulkan, "A1B5G5R5 flushing is stubbed"); + } + + // We can't copy images to buffers inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + FullTransition(vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferRead, + vk::ImageLayout::eTransferSrcOptimal); + + const auto& buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + // TODO(Rodrigo): Do this in a single copy + for (u32 level = 0; level < params.num_levels; ++level) { + scheduler.Record([image = image->GetHandle(), buffer = *buffer.handle, + copy = GetBufferImageCopy(level)](auto cmdbuf, auto& dld) { + cmdbuf.copyImageToBuffer(image, vk::ImageLayout::eTransferSrcOptimal, buffer, {copy}, + dld); + }); + } + scheduler.Finish(); + + // TODO(Rodrigo): Use an intern buffer for staging buffers and avoid this unnecessary memcpy. + std::memcpy(staging_buffer.data(), buffer.commit->Map(host_memory_size), host_memory_size); +} + +void CachedSurface::DecorateSurfaceName() { + // TODO(Rodrigo): Add name decorations +} + +View CachedSurface::CreateView(const ViewParams& params) { + return CreateViewInner(params, false); +} + +View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) { + // TODO(Rodrigo): Add name decorations + return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy); +} + +void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { + const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); + + scheduler.Record([src_buffer = *src_buffer.handle, dst_buffer = *buffer, + size = params.GetHostSizeInBytes()](auto cmdbuf, auto& dld) { + const vk::BufferCopy copy(0, 0, size); + cmdbuf.copyBuffer(src_buffer, dst_buffer, {copy}, dld); + + cmdbuf.pipelineBarrier( + vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eVertexShader, {}, {}, + {vk::BufferMemoryBarrier(vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eShaderRead, 0, 0, dst_buffer, 0, size)}, + {}, dld); + }); +} + +void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { + const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); + + FullTransition(vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferWrite, + vk::ImageLayout::eTransferDstOptimal); + + for (u32 level = 0; level < params.num_levels; ++level) { + vk::BufferImageCopy copy = GetBufferImageCopy(level); + const auto& dld = device.GetDispatchLoader(); + if (image->GetAspectMask() == + (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { + vk::BufferImageCopy depth = copy; + vk::BufferImageCopy stencil = copy; + 
depth.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eDepth; + stencil.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eStencil; + scheduler.Record([buffer = *src_buffer.handle, image = image->GetHandle(), depth, + stencil](auto cmdbuf, auto& dld) { + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + {depth, stencil}, dld); + }); + } else { + scheduler.Record([buffer = *src_buffer.handle, image = image->GetHandle(), + copy](auto cmdbuf, auto& dld) { + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + {copy}, dld); + }); + } + } +} + +vk::BufferImageCopy CachedSurface::GetBufferImageCopy(u32 level) const { + const u32 vk_depth = params.target == SurfaceTarget::Texture3D ? params.GetMipDepth(level) : 1; + const auto compression_type = params.GetCompressionType(); + const std::size_t mip_offset = compression_type == SurfaceCompression::Converted + ? params.GetConvertedMipmapOffset(level) + : params.GetHostMipmapLevelOffset(level); + + return vk::BufferImageCopy( + mip_offset, 0, 0, + {image->GetAspectMask(), level, 0, static_cast<u32>(params.GetNumLayers())}, {0, 0, 0}, + {params.GetMipWidth(level), params.GetMipHeight(level), vk_depth}); +} + +vk::ImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { + return {image->GetAspectMask(), 0, params.num_levels, 0, + static_cast<u32>(params.GetNumLayers())}; +} + +CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, + const ViewParams& params, bool is_proxy) + : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, + image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, + aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, + base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, + num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) : vk::ImageViewType{}} {} + +CachedSurfaceView::~CachedSurfaceView() = default; + +vk::ImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { + const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (last_image_view && last_swizzle == swizzle) { + return last_image_view; + } + last_swizzle = swizzle; + + const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle); + auto& image_view = entry->second; + if (!is_cache_miss) { + return last_image_view = *image_view; + } + + auto swizzle_x = MaxwellToVK::SwizzleSource(x_source); + auto swizzle_y = MaxwellToVK::SwizzleSource(y_source); + auto swizzle_z = MaxwellToVK::SwizzleSource(z_source); + auto swizzle_w = MaxwellToVK::SwizzleSource(w_source); + + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + // A1B5G5R5 is implemented as A1R5G5B5, so we have to change the swizzle here. + std::swap(swizzle_x, swizzle_z); + } + + // Games can sample depth or stencil values from a texture; the swizzle value decides which on + // hardware. To emulate this on Vulkan we encode the choice in the image aspect. 
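+ // For example (reviewer annotation, not part of the change): sampling Z24S8 or Z32FS8 with an
+ // R swizzle reads depth, so the view gets the eDepth aspect, while a G swizzle reads stencil
+ // (eStencil); S8Z24 flips that mapping, as the switch below handles.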
+ vk::ImageAspectFlags aspect = aspect_mask; + if (aspect == (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { + UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); + const bool is_first = x_source == SwizzleSource::R; + switch (params.pixel_format) { + case VideoCore::Surface::PixelFormat::Z24S8: + case VideoCore::Surface::PixelFormat::Z32FS8: + aspect = is_first ? vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eStencil; + break; + case VideoCore::Surface::PixelFormat::S8Z24: + aspect = is_first ? vk::ImageAspectFlagBits::eStencil : vk::ImageAspectFlagBits::eDepth; + break; + default: + aspect = vk::ImageAspectFlagBits::eDepth; + UNIMPLEMENTED(); + } + + // Vulkan doesn't seem to understand swizzling of a depth-stencil image; use the identity + // swizzle. + swizzle_x = vk::ComponentSwizzle::eR; + swizzle_y = vk::ComponentSwizzle::eG; + swizzle_z = vk::ComponentSwizzle::eB; + swizzle_w = vk::ComponentSwizzle::eA; + } + + const vk::ImageViewCreateInfo image_view_ci( + {}, surface.GetImageHandle(), image_view_type, surface.GetImage().GetFormat(), + {swizzle_x, swizzle_y, swizzle_z, swizzle_w}, + {aspect, base_level, num_levels, base_layer, num_layers}); + + const auto dev = device.GetLogical(); + image_view = dev.createImageViewUnique(image_view_ci, nullptr, device.GetDispatchLoader()); + return last_image_view = *image_view; +} + +VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKResourceManager& resource_manager, + VKMemoryManager& memory_manager, VKScheduler& scheduler, + VKStagingBufferPool& staging_pool) + : TextureCache(system, rasterizer), device{device}, resource_manager{resource_manager}, + memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{staging_pool} {} + +VKTextureCache::~VKTextureCache() = default; + +Surface VKTextureCache::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { + return std::make_shared<CachedSurface>(system, device, resource_manager, memory_manager, + scheduler, staging_pool, gpu_addr, params); +} + +void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, + const VideoCommon::CopyParams& copy_params) { + const bool src_3d = src_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; + const bool dst_3d = dst_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; + UNIMPLEMENTED_IF(src_3d); + + // The texture cache handles depth in OpenGL terms; we have to treat it as a layer subresource + // for array copies and as an image dimension for 3D copies. + const u32 dst_base_layer = dst_3d ? 0 : copy_params.dest_z; + const u32 dst_offset_z = dst_3d ? copy_params.dest_z : 0; + + const u32 extent_z = dst_3d ? copy_params.depth : 1; + const u32 num_layers = dst_3d ? 
1 : copy_params.depth; + + // We can't copy inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + src_surface->Transition(copy_params.source_z, copy_params.depth, copy_params.source_level, 1, + vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferRead, + vk::ImageLayout::eTransferSrcOptimal); + dst_surface->Transition( + dst_base_layer, num_layers, copy_params.dest_level, 1, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferWrite, vk::ImageLayout::eTransferDstOptimal); + + const auto& dld{device.GetDispatchLoader()}; + const vk::ImageSubresourceLayers src_subresource( + src_surface->GetAspectMask(), copy_params.source_level, copy_params.source_z, num_layers); + const vk::ImageSubresourceLayers dst_subresource( + dst_surface->GetAspectMask(), copy_params.dest_level, dst_base_layer, num_layers); + const vk::Offset3D src_offset(copy_params.source_x, copy_params.source_y, 0); + const vk::Offset3D dst_offset(copy_params.dest_x, copy_params.dest_y, dst_offset_z); + const vk::Extent3D extent(copy_params.width, copy_params.height, extent_z); + const vk::ImageCopy copy(src_subresource, src_offset, dst_subresource, dst_offset, extent); + const vk::Image src_image = src_surface->GetImageHandle(); + const vk::Image dst_image = dst_surface->GetImageHandle(); + scheduler.Record([src_image, dst_image, copy](auto cmdbuf, auto& dld) { + cmdbuf.copyImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, + vk::ImageLayout::eTransferDstOptimal, {copy}, dld); + }); +} + +void VKTextureCache::ImageBlit(View& src_view, View& dst_view, + const Tegra::Engines::Fermi2D::Config& copy_config) { + // We can't blit inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + src_view->Transition(vk::ImageLayout::eTransferSrcOptimal, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferRead); + dst_view->Transition(vk::ImageLayout::eTransferDstOptimal, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferWrite); + + const auto& cfg = copy_config; + const auto src_top_left = vk::Offset3D(cfg.src_rect.left, cfg.src_rect.top, 0); + const auto src_bot_right = vk::Offset3D(cfg.src_rect.right, cfg.src_rect.bottom, 1); + const auto dst_top_left = vk::Offset3D(cfg.dst_rect.left, cfg.dst_rect.top, 0); + const auto dst_bot_right = vk::Offset3D(cfg.dst_rect.right, cfg.dst_rect.bottom, 1); + const vk::ImageBlit blit(src_view->GetImageSubresourceLayers(), {src_top_left, src_bot_right}, + dst_view->GetImageSubresourceLayers(), {dst_top_left, dst_bot_right}); + const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; + + const auto& dld{device.GetDispatchLoader()}; + scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit, + is_linear](auto cmdbuf, auto& dld) { + cmdbuf.blitImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, + vk::ImageLayout::eTransferDstOptimal, {blit}, + is_linear ? vk::Filter::eLinear : vk::Filter::eNearest, dld); + }); +} + +void VKTextureCache::BufferCopy(Surface& src_surface, Surface& dst_surface) { + // Currently unimplemented. PBO copies should be dropped and we should use a render pass to + // convert from color to depth and vice versa. 
+ LOG_WARNING(Render_Vulkan, "Unimplemented"); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h new file mode 100644 index 000000000..d3edbe80c --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -0,0 +1,239 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> + +#include "common/assert.h" +#include "common/common_types.h" +#include "common/logging/log.h" +#include "common/math_util.h" +#include "video_core/gpu.h" +#include "video_core/rasterizer_cache.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_image.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/texture_cache/surface_base.h" +#include "video_core/texture_cache/texture_cache.h" +#include "video_core/textures/decoders.h" + +namespace Core { +class System; +} + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class RasterizerVulkan; +class VKDevice; +class VKResourceManager; +class VKScheduler; +class VKStagingBufferPool; + +class CachedSurfaceView; +class CachedSurface; + +using Surface = std::shared_ptr<CachedSurface>; +using View = std::shared_ptr<CachedSurfaceView>; +using TextureCacheBase = VideoCommon::TextureCache<Surface, View>; + +using VideoCommon::SurfaceParams; +using VideoCommon::ViewParams; + +class CachedSurface final : public VideoCommon::SurfaceBase<View> { + friend CachedSurfaceView; + +public: + explicit CachedSurface(Core::System& system, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool, + GPUVAddr gpu_addr, const SurfaceParams& params); + ~CachedSurface(); + + void UploadTexture(const std::vector<u8>& staging_buffer) override; + void DownloadTexture(std::vector<u8>& staging_buffer) override; + + void FullTransition(vk::PipelineStageFlags new_stage_mask, vk::AccessFlags new_access, + vk::ImageLayout new_layout) { + image->Transition(0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels, + new_stage_mask, new_access, new_layout); + } + + void Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, + vk::PipelineStageFlags new_stage_mask, vk::AccessFlags new_access, + vk::ImageLayout new_layout) { + image->Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask, + new_access, new_layout); + } + + VKImage& GetImage() { + return *image; + } + + const VKImage& GetImage() const { + return *image; + } + + vk::Image GetImageHandle() const { + return image->GetHandle(); + } + + vk::ImageAspectFlags GetAspectMask() const { + return image->GetAspectMask(); + } + + vk::BufferView GetBufferViewHandle() const { + return *buffer_view; + } + +protected: + void DecorateSurfaceName(); + + View CreateView(const ViewParams& params) override; + View CreateViewInner(const ViewParams& params, bool is_proxy); + +private: + void UploadBuffer(const std::vector<u8>& staging_buffer); + + void UploadImage(const std::vector<u8>& staging_buffer); + + vk::BufferImageCopy GetBufferImageCopy(u32 level) const; + + vk::ImageSubresourceRange GetImageSubresourceRange() const; + + Core::System& system; + const VKDevice& device; + VKResourceManager& resource_manager; + 
resource_manager; +
VKMemoryManager& memory_manager; + VKScheduler& scheduler; + VKStagingBufferPool& staging_pool; + + std::optional<VKImage> image; + UniqueBuffer buffer; + UniqueBufferView buffer_view; + VKMemoryCommit commit; + + vk::Format format; +}; + +class CachedSurfaceView final : public VideoCommon::ViewBase { +public: + explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, + const ViewParams& params, bool is_proxy); + ~CachedSurfaceView(); + + vk::ImageView GetHandle(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source); + + bool IsSameSurface(const CachedSurfaceView& rhs) const { + return &surface == &rhs.surface; + } + + vk::ImageView GetHandle() { + return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G, + Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A); + } + + u32 GetWidth() const { + return params.GetMipWidth(base_level); + } + + u32 GetHeight() const { + return params.GetMipHeight(base_level); + } + + bool IsBufferView() const { + return buffer_view; + } + + vk::Image GetImage() const { + return image; + } + + vk::BufferView GetBufferView() const { + return buffer_view; + } + + vk::ImageSubresourceRange GetImageSubresourceRange() const { + return {aspect_mask, base_level, num_levels, base_layer, num_layers}; + } + + vk::ImageSubresourceLayers GetImageSubresourceLayers() const { + return {surface.GetAspectMask(), base_level, base_layer, num_layers}; + } + + void Transition(vk::ImageLayout new_layout, vk::PipelineStageFlags new_stage_mask, + vk::AccessFlags new_access) const { + surface.Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask, + new_access, new_layout); + } + + void MarkAsModified(u64 tick) { + surface.MarkAsModified(true, tick); + } + +private: + static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source) { + return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | + (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); + } + + // Store a copy of these values to avoid double dereference when reading them + const SurfaceParams params; + const vk::Image image; + const vk::BufferView buffer_view; + const vk::ImageAspectFlags aspect_mask; + + const VKDevice& device; + CachedSurface& surface; + const u32 base_layer; + const u32 num_layers; + const u32 base_level; + const u32 num_levels; + const vk::ImageViewType image_view_type; + + vk::ImageView last_image_view; + u32 last_swizzle{}; + + std::unordered_map<u32, UniqueImageView> view_cache; +}; + +class VKTextureCache final : public TextureCacheBase { +public: + explicit VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKResourceManager& resource_manager, + VKMemoryManager& memory_manager, VKScheduler& scheduler, + VKStagingBufferPool& staging_pool); + ~VKTextureCache(); + +private: + Surface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) override; + + void ImageCopy(Surface& src_surface, Surface& dst_surface, + const VideoCommon::CopyParams& copy_params) override; + + void ImageBlit(View& src_view, View& dst_view, + const Tegra::Engines::Fermi2D::Config& copy_config) override; + + void BufferCopy(Surface& src_surface, Surface& dst_surface) override; + + const VKDevice& device; + VKResourceManager& 
resource_manager; + VKMemoryManager& memory_manager; + VKScheduler& scheduler; + VKStagingBufferPool& staging_pool; +}; + +} // namespace Vulkan diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index b427ac873..0229733b6 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -65,7 +65,7 @@ struct BlockInfo { struct CFGRebuildState { explicit CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker) - : program_code{program_code}, start{start}, locker{locker} {} + : program_code{program_code}, locker{locker}, start{start} {} const ProgramCode& program_code; ConstBufferLocker& locker; diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index c934d0719..b5fbc4d58 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -6,6 +6,7 @@ #include <vector> #include <fmt/format.h> +#include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" #include "common/logging/log.h" @@ -15,44 +16,75 @@ namespace VideoCommon::Shader { +using Tegra::Shader::AtomicOp; +using Tegra::Shader::AtomicType; using Tegra::Shader::Attribute; +using Tegra::Shader::GlobalAtomicOp; +using Tegra::Shader::GlobalAtomicType; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; using Tegra::Shader::Register; +using Tegra::Shader::StoreType; namespace { -u32 GetLdgMemorySize(Tegra::Shader::UniformType uniform_type) { +bool IsUnaligned(Tegra::Shader::UniformType uniform_type) { + return uniform_type == Tegra::Shader::UniformType::UnsignedByte || + uniform_type == Tegra::Shader::UniformType::UnsignedShort; +} + +u32 GetUnalignedMask(Tegra::Shader::UniformType uniform_type) { switch (uniform_type) { case Tegra::Shader::UniformType::UnsignedByte: - case Tegra::Shader::UniformType::Single: - return 1; - case Tegra::Shader::UniformType::Double: - return 2; - case Tegra::Shader::UniformType::Quad: - case Tegra::Shader::UniformType::UnsignedQuad: - return 4; + return 0b11; + case Tegra::Shader::UniformType::UnsignedShort: + return 0b10; default: - UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type)); - return 1; + UNREACHABLE(); + return 0; } } -u32 GetStgMemorySize(Tegra::Shader::UniformType uniform_type) { +u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) { switch (uniform_type) { + case Tegra::Shader::UniformType::UnsignedByte: + return 8; + case Tegra::Shader::UniformType::UnsignedShort: + return 16; case Tegra::Shader::UniformType::Single: - return 1; + return 32; case Tegra::Shader::UniformType::Double: - return 2; + return 64; case Tegra::Shader::UniformType::Quad: case Tegra::Shader::UniformType::UnsignedQuad: - return 4; + return 128; default: UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type)); - return 1; + return 32; } } +Node ExtractUnaligned(Node value, Node address, u32 mask, u32 size) { + Node offset = Operation(OperationCode::UBitwiseAnd, address, Immediate(mask)); + offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3)); + return Operation(OperationCode::UBitfieldExtract, std::move(value), std::move(offset), + Immediate(size)); +} + +Node InsertUnaligned(Node dest, Node value, Node address, u32 mask, u32 size) { + Node offset = Operation(OperationCode::UBitwiseAnd, std::move(address), Immediate(mask)); + offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3)); + 
return Operation(OperationCode::UBitfieldInsert, std::move(dest), std::move(value), + std::move(offset), Immediate(size)); +} + +Node Sign16Extend(Node value) { + Node sign = Operation(OperationCode::UBitwiseAnd, value, Immediate(1U << 15)); + Node is_sign = Operation(OperationCode::LogicalUEqual, std::move(sign), Immediate(1U << 15)); + Node extend = Operation(OperationCode::Select, is_sign, Immediate(0xFFFF0000), Immediate(0)); + return Operation(OperationCode::UBitwiseOr, std::move(value), std::move(extend)); +} + } // Anonymous namespace u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { @@ -128,26 +160,31 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown)); [[fallthrough]]; case OpCode::Id::LD_S: { - const auto GetMemory = [&](s32 offset) { + const auto GetAddress = [&](s32 offset) { ASSERT(offset % 4 == 0); const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset); - const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), - immediate_offset); - return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(address) - : GetLocalMemory(address); + return Operation(OperationCode::IAdd, GetRegister(instr.gpr8), immediate_offset); + }; + const auto GetMemory = [&](s32 offset) { + return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(GetAddress(offset)) + : GetLocalMemory(GetAddress(offset)); }; switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits32: - case Tegra::Shader::StoreType::Bits64: - case Tegra::Shader::StoreType::Bits128: { - const u32 count = [&]() { + case StoreType::Signed16: + SetRegister(bb, instr.gpr0, + Sign16Extend(ExtractUnaligned(GetMemory(0), GetAddress(0), 0b10, 16))); + break; + case StoreType::Bits32: + case StoreType::Bits64: + case StoreType::Bits128: { + const u32 count = [&] { switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits32: + case StoreType::Bits32: return 1; - case Tegra::Shader::StoreType::Bits64: + case StoreType::Bits64: return 2; - case Tegra::Shader::StoreType::Bits128: + case StoreType::Bits128: return 4; default: UNREACHABLE(); @@ -184,9 +221,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { }(); const auto [real_address_base, base_address, descriptor] = - TrackGlobalMemory(bb, instr, false); + TrackGlobalMemory(bb, instr, true, false); - const u32 count = GetLdgMemorySize(type); + const u32 size = GetMemorySize(type); + const u32 count = Common::AlignUp(size, 32) / 32; if (!real_address_base || !base_address) { // Tracking failed, load zeroes. for (u32 i = 0; i < count; ++i) { @@ -200,14 +238,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset); Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); - if (type == Tegra::Shader::UniformType::UnsignedByte) { - // To handle unaligned loads get the byte used to dereferenced global memory - // and extract that byte from the loaded uint32. - Node byte = Operation(OperationCode::UBitwiseAnd, real_address, Immediate(3)); - byte = Operation(OperationCode::ULogicalShiftLeft, std::move(byte), Immediate(3)); - - gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem), std::move(byte), - Immediate(8)); + // To handle unaligned loads get the bytes used to dereference global memory and extract + // those bytes from the loaded u32. 
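+ // For example (reviewer annotation, not part of the change): an 8-bit LDG from address A yields
+ // offset = (A & 0b11) << 3 and a UBitfieldExtract of 8 bits at that offset, so A = 0x1003
+ // selects bits [24, 32) of the aligned 32-bit word.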
+ if (IsUnaligned(type)) { + gmem = ExtractUnaligned(gmem, real_address, GetUnalignedMask(type), size); } SetTemporary(bb, i, gmem); @@ -259,21 +293,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate); }; - const auto set_memory = opcode->get().GetId() == OpCode::Id::ST_L - ? &ShaderIR::SetLocalMemory - : &ShaderIR::SetSharedMemory; + const bool is_local = opcode->get().GetId() == OpCode::Id::ST_L; + const auto set_memory = is_local ? &ShaderIR::SetLocalMemory : &ShaderIR::SetSharedMemory; + const auto get_memory = is_local ? &ShaderIR::GetLocalMemory : &ShaderIR::GetSharedMemory; switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits128: + case StoreType::Bits128: (this->*set_memory)(bb, GetAddress(12), GetRegister(instr.gpr0.Value() + 3)); (this->*set_memory)(bb, GetAddress(8), GetRegister(instr.gpr0.Value() + 2)); [[fallthrough]]; - case Tegra::Shader::StoreType::Bits64: + case StoreType::Bits64: (this->*set_memory)(bb, GetAddress(4), GetRegister(instr.gpr0.Value() + 1)); [[fallthrough]]; - case Tegra::Shader::StoreType::Bits32: + case StoreType::Bits32: (this->*set_memory)(bb, GetAddress(0), GetRegister(instr.gpr0)); break; + case StoreType::Signed16: { + Node address = GetAddress(0); + Node memory = (this->*get_memory)(address); + (this->*set_memory)( + bb, address, InsertUnaligned(memory, GetRegister(instr.gpr0), address, 0b10, 16)); + break; + } default: UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(), static_cast<u32>(instr.ldst_sl.type.Value())); @@ -295,23 +336,67 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } }(); + // For unaligned stores we have to read memory too (read-modify-write). + const bool is_read = IsUnaligned(type); const auto [real_address_base, base_address, descriptor] = - TrackGlobalMemory(bb, instr, true); + TrackGlobalMemory(bb, instr, is_read, true); if (!real_address_base || !base_address) { // Tracking failed, skip the store. break; } - const u32 count = GetStgMemorySize(type); + const u32 size = GetMemorySize(type); + const u32 count = Common::AlignUp(size, 32) / 32; for (u32 i = 0; i < count; ++i) { const Node it_offset = Immediate(i * 4); const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset); const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); - const Node value = GetRegister(instr.gpr0.Value() + i); + Node value = GetRegister(instr.gpr0.Value() + i); + + if (IsUnaligned(type)) { + const u32 mask = GetUnalignedMask(type); + value = InsertUnaligned(gmem, std::move(value), real_address, mask, size); + } + bb.push_back(Operation(OperationCode::Assign, gmem, value)); } break; } + case OpCode::Id::ATOM: { + UNIMPLEMENTED_IF_MSG(instr.atom.operation != GlobalAtomicOp::Add, "operation={}", + static_cast<int>(instr.atom.operation.Value())); + UNIMPLEMENTED_IF_MSG(instr.atom.type != GlobalAtomicType::S32, "type={}", + static_cast<int>(instr.atom.type.Value())); + + const auto [real_address, base_address, descriptor] = + TrackGlobalMemory(bb, instr, true, true); + if (!real_address || !base_address) { + // Tracking failed, skip atomic. 
+ break; + } + + Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); + Node value = Operation(OperationCode::AtomicAdd, std::move(gmem), GetRegister(instr.gpr20)); + SetRegister(bb, instr.gpr0, std::move(value)); + break; + } + case OpCode::Id::ATOMS: { + UNIMPLEMENTED_IF_MSG(instr.atoms.operation != AtomicOp::Add, "operation={}", + static_cast<int>(instr.atoms.operation.Value())); + UNIMPLEMENTED_IF_MSG(instr.atoms.type != AtomicType::U32, "type={}", + static_cast<int>(instr.atoms.type.Value())); + + const s32 offset = instr.atoms.GetImmediateOffset(); + Node address = GetRegister(instr.gpr8); + address = Operation(OperationCode::IAdd, std::move(address), Immediate(offset)); + + Node memory = GetSharedMemory(std::move(address)); + Node data = GetRegister(instr.gpr20); + + Node value = Operation(OperationCode::AtomicAdd, std::move(memory), std::move(data)); + SetRegister(bb, instr.gpr0, std::move(value)); + break; + } case OpCode::Id::AL2P: { // Ignore al2p.direction since we don't care about it. @@ -336,7 +421,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock& bb, Instruction instr, - bool is_write) { + bool is_read, bool is_write) { const auto addr_register{GetRegister(instr.gmem.gpr)}; const auto immediate_offset{static_cast<u32>(instr.gmem.offset)}; @@ -351,11 +436,8 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock& const GlobalMemoryBase descriptor{index, offset}; const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor); auto& usage = entry->second; - if (is_write) { - usage.is_written = true; - } else { - usage.is_read = true; - } + usage.is_written |= is_write; + usage.is_read |= is_read; const auto real_address = Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register); diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 4b14cdf58..0b567e39d 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -161,16 +161,16 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { case OpCode::Id::TXD: { UNIMPLEMENTED_IF_MSG(instr.txd.UsesMiscMode(TextureMiscMode::AOFFI), "AOFFI is not implemented"); - UNIMPLEMENTED_IF_MSG(instr.txd.is_array != 0, "TXD Array is not implemented"); + const bool is_array = instr.txd.is_array != 0; u64 base_reg = instr.gpr8.Value(); const auto derivate_reg = instr.gpr20.Value(); const auto texture_type = instr.txd.texture_type.Value(); const auto coord_count = GetCoordCount(texture_type); - const Sampler* sampler = is_bindless - ? GetBindlessSampler(base_reg, {{texture_type, false, false}}) - : GetSampler(instr.sampler, {{texture_type, false, false}}); + const Sampler* sampler = + is_bindless ? 
GetBindlessSampler(base_reg, {{texture_type, is_array, false}}) + : GetSampler(instr.sampler, {{texture_type, is_array, false}}); Node4 values; if (sampler == nullptr) { for (u32 element = 0; element < values.size(); ++element) { @@ -179,6 +179,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { WriteTexInstructionFloat(bb, instr, values); break; } + if (is_bindless) { base_reg++; } @@ -192,8 +193,14 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { derivates.push_back(GetRegister(derivate_reg + derivate + 1)); } + Node array_node = {}; + if (is_array) { + const Node info_reg = GetRegister(base_reg + coord_count); + array_node = BitfieldExtract(info_reg, 0, 16); + } + for (u32 element = 0; element < values.size(); ++element) { - MetaTexture meta{*sampler, {}, {}, {}, {}, derivates, {}, {}, {}, element}; + MetaTexture meta{*sampler, array_node, {}, {}, {}, derivates, {}, {}, {}, element}; values[element] = Operation(OperationCode::TextureGradient, std::move(meta), coords); } @@ -794,14 +801,10 @@ std::tuple<std::size_t, std::size_t> ShaderIR::ValidateAndGetCoordinateElement( std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count, bool is_tld4) { - const auto [coord_offsets, size, wrap_value, - diff_value] = [is_tld4]() -> std::tuple<std::array<u32, 3>, u32, s32, s32> { - if (is_tld4) { - return {{0, 8, 16}, 6, 32, 64}; - } else { - return {{0, 4, 8}, 4, 8, 16}; - } - }(); + const std::array coord_offsets = is_tld4 ? std::array{0U, 8U, 16U} : std::array{0U, 4U, 8U}; + const u32 size = is_tld4 ? 6 : 4; + const s32 wrap_value = is_tld4 ? 32 : 8; + const s32 diff_value = is_tld4 ? 64 : 16; const u32 mask = (1U << size) - 1; std::vector<Node> aoffi; @@ -814,7 +817,7 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor LOG_WARNING(HW_GPU, "AOFFI constant folding failed, some hardware might have graphical issues"); for (std::size_t coord = 0; coord < coord_count; ++coord) { - const Node value = BitfieldExtract(aoffi_reg, coord_offsets.at(coord), size); + const Node value = BitfieldExtract(aoffi_reg, coord_offsets[coord], size); const Node condition = Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(wrap_value)); const Node negative = Operation(OperationCode::IAdd, value, Immediate(-diff_value)); @@ -824,7 +827,7 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor } for (std::size_t coord = 0; coord < coord_count; ++coord) { - s32 value = (*aoffi_immediate >> coord_offsets.at(coord)) & mask; + s32 value = (*aoffi_immediate >> coord_offsets[coord]) & mask; if (value >= wrap_value) { value -= diff_value; } diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 4e155542a..9af1f0228 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -162,6 +162,8 @@ enum class OperationCode { AtomicImageXor, /// (MetaImage, int[N] coords) -> void AtomicImageExchange, /// (MetaImage, int[N] coords) -> void + AtomicAdd, /// (memory, {u}int) -> {u}int + Branch, /// (uint branch_target) -> void BranchIndirect, /// (uint branch_target) -> void PushFlowStack, /// (uint branch_target) -> void diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index aacd0a0da..ba1db4c11 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -394,7 +394,7 @@ private: std::tuple<Node, Node, GlobalMemoryBase> TrackGlobalMemory(NodeBlock& bb, Tegra::Shader::Instruction instr, - bool is_write); + bool is_read, 
bool is_write); /// Register new amending code and obtain the reference id. std::size_t DeclareAmend(Node new_amend); diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 271e67533..81fb9f633 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -95,7 +95,7 @@ constexpr std::array<Table, 74> DefinitionTable = {{ {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F}, {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16}, {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, - {TextureFormat::ZF32_X24S8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z32FS8}, + {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8}, {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1}, {TextureFormat::DXT1, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1_SRGB}, diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 829268b4c..84469b7ba 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -135,7 +135,7 @@ std::vector<CopyParams> SurfaceBaseImpl::BreakDownLayered(const SurfaceParams& i for (u32 level = 0; level < mipmaps; level++) { const u32 width = SurfaceParams::IntersectWidth(params, in_params, level, level); const u32 height = SurfaceParams::IntersectHeight(params, in_params, level, level); - result.emplace_back(width, height, layer, level); + result.emplace_back(0, 0, layer, 0, 0, layer, level, level, width, height, 1); } } return result; diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h index 992b5c022..9256fd6d9 100644 --- a/src/video_core/texture_cache/surface_params.h +++ b/src/video_core/texture_cache/surface_params.h @@ -209,6 +209,11 @@ public: return target == VideoCore::Surface::SurfaceTarget::TextureBuffer; } + /// Returns the number of layers in the surface. + std::size_t GetNumLayers() const { + return is_layered ? depth : 1; + } + /// Returns the debug name of the texture for use in graphic debuggers. std::string TargetName() const; @@ -287,10 +292,6 @@ private: /// Returns the size of a layer std::size_t GetLayerSize(bool as_host_size, bool uncompressed) const; - std::size_t GetNumLayers() const { - return is_layered ? depth : 1; - } - /// Returns true if these parameters are from a layered surface. bool IsLayered() const; };
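Reviewer note: as a quick reference for the unaligned memory path above, here is a minimal host-side C++ sketch of the bit arithmetic the new ExtractUnaligned, InsertUnaligned and Sign16Extend helpers emit. The names mirror the shader IR helpers, but this is an illustration of the arithmetic only, not emulator code:

    #include <cassert>
    #include <cstdint>

    // Pull `size` bits out of an aligned 32-bit word, starting at the byte
    // selected by (address & mask); mirrors ExtractUnaligned.
    constexpr std::uint32_t ExtractUnaligned(std::uint32_t word, std::uint32_t address,
                                             std::uint32_t mask, std::uint32_t size) {
        const std::uint32_t offset = (address & mask) << 3; // byte offset -> bit offset
        return (word >> offset) & ((1u << size) - 1);       // size is 8 or 16 here
    }

    // Replace those bits inside the existing word; mirrors InsertUnaligned.
    constexpr std::uint32_t InsertUnaligned(std::uint32_t dest, std::uint32_t value,
                                            std::uint32_t address, std::uint32_t mask,
                                            std::uint32_t size) {
        const std::uint32_t offset = (address & mask) << 3;
        const std::uint32_t field = ((1u << size) - 1) << offset;
        return (dest & ~field) | ((value << offset) & field);
    }

    // Sign-extend a 16-bit value held in the low half; mirrors Sign16Extend.
    constexpr std::uint32_t Sign16Extend(std::uint32_t value) {
        return (value & 0x8000) != 0 ? (value | 0xFFFF0000) : value;
    }

    int main() {
        // 8-bit load at address 0x1003: selects bits [24, 32).
        assert(ExtractUnaligned(0xAABBCCDD, 0x1003, 0b11, 8) == 0xAA);
        // 16-bit store at address 0x1002: rewrites only the upper half.
        assert(InsertUnaligned(0xAABBCCDD, 0x1234, 0x1002, 0b10, 16) == 0x1234CCDD);
        // LD_S/ST_S Signed16 results are sign-extended.
        assert(Sign16Extend(0x8001) == 0xFFFF8001);
        return 0;
    }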
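Similarly, the new ATOM/ATOMS decoding writes the value memory held before the operation back to gpr0, and the SPIR-V OpAtomicIAdd the decompiler emits likewise returns the original value. A tiny sketch of that fetch-and-add contract, again an illustration rather than emulator code:

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    int main() {
        std::atomic<std::uint32_t> word{41};
        // fetch_add returns the pre-operation value, like ATOM.ADD returning into gpr0.
        const std::uint32_t previous = word.fetch_add(1);
        assert(previous == 41);
        assert(word.load() == 42);
        return 0;
    }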
