diff options
Diffstat (limited to 'src/video_core')
21 files changed, 313 insertions, 167 deletions
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 98a8b5337..7ff44f06d 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -29,8 +29,8 @@ void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { } void Fermi2D::HandleSurfaceCopy() { - LOG_WARNING(HW_GPU, "Requested a surface copy with operation {}", - static_cast<u32>(regs.operation)); + LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}", + static_cast<u32>(regs.operation)); // TODO(Subv): Only raw copies are implemented. ASSERT(regs.operation == Operation::SrcCopy); diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index e9c15beff..b318aedb8 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -639,7 +639,7 @@ void Maxwell3D::ProcessSyncPoint() { } void Maxwell3D::DrawArrays() { - LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), + LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), regs.vertex_buffer.count); ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?"); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 052e6d24e..28272ef6f 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -566,6 +566,13 @@ enum class ImageAtomicOperation : u64 { Exch = 8, }; +enum class ShuffleOperation : u64 { + Idx = 0, // shuffleNV + Up = 1, // shuffleUpNV + Down = 2, // shuffleDownNV + Bfly = 3, // shuffleXorNV +}; + union Instruction { Instruction& operator=(const Instruction& instr) { value = instr.value; @@ -600,6 +607,15 @@ union Instruction { } vote; union { + BitField<30, 2, ShuffleOperation> operation; + BitField<48, 3, u64> pred48; + BitField<28, 1, u64> is_index_imm; + BitField<29, 1, u64> is_mask_imm; + BitField<20, 5, u64> index_imm; + BitField<34, 13, u64> mask_imm; + } shfl; + + union { BitField<8, 8, Register> gpr; BitField<20, 24, s64> offset; } gmem; @@ -934,6 +950,11 @@ union Instruction { } isetp; union { + BitField<48, 1, u64> is_signed; + BitField<49, 3, PredCondition> cond; + } icmp; + + union { BitField<0, 3, u64> pred0; BitField<3, 3, u64> pred3; BitField<12, 3, u64> pred12; @@ -1542,6 +1563,7 @@ public: BRK, DEPBAR, VOTE, + SHFL, BFE_C, BFE_R, BFE_IMM, @@ -1628,6 +1650,10 @@ public: SEL_C, SEL_R, SEL_IMM, + ICMP_RC, + ICMP_R, + ICMP_CR, + ICMP_IMM, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -1833,6 +1859,7 @@ private: INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"), INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"), INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"), + INST("1110111100010---", Id::SHFL, Type::Warp, "SHFL"), INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"), INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"), INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"), @@ -1892,6 +1919,10 @@ private: INST("0100110010100---", Id::SEL_C, Type::ArithmeticInteger, "SEL_C"), INST("0101110010100---", Id::SEL_R, Type::ArithmeticInteger, "SEL_R"), INST("0011100-10100---", Id::SEL_IMM, Type::ArithmeticInteger, "SEL_IMM"), + INST("010100110100----", Id::ICMP_RC, Type::ArithmeticInteger, "ICMP_RC"), + INST("010110110100----", Id::ICMP_R, Type::ArithmeticInteger, "ICMP_R"), + INST("010010110100----", Id::ICMP_CR, Type::ArithmeticInteger, "ICMP_CR"), + INST("0011011-0100----", Id::ICMP_IMM, Type::ArithmeticInteger, "ICMP_IMM"), INST("0101101111011---", Id::LEA_R2, Type::ArithmeticInteger, "LEA_R2"), INST("0101101111010---", Id::LEA_R1, Type::ArithmeticInteger, "LEA_R1"), INST("001101101101----", Id::LEA_IMM, Type::ArithmeticInteger, "LEA_IMM"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 2c47541cb..76cfe8107 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -122,6 +122,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { case RenderTargetFormat::RGBA16_UINT: case RenderTargetFormat::RGBA16_UNORM: case RenderTargetFormat::RGBA16_FLOAT: + case RenderTargetFormat::RGBX16_FLOAT: case RenderTargetFormat::RG32_FLOAT: case RenderTargetFormat::RG32_UINT: return 8; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 78bc0601a..29fa8e95b 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -42,6 +42,7 @@ enum class RenderTargetFormat : u32 { RGBA16_FLOAT = 0xCA, RG32_FLOAT = 0xCB, RG32_UINT = 0xCD, + RGBX16_FLOAT = 0xCE, BGRA8_UNORM = 0xCF, BGRA8_SRGB = 0xD0, RGB10_A2_UNORM = 0xD1, diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index 084f85e67..ab71870ab 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -83,6 +83,7 @@ static constexpr ConversionArray morton_to_linear_fns = { MortonCopy<true, PixelFormat::RG8U>, MortonCopy<true, PixelFormat::RG8S>, MortonCopy<true, PixelFormat::RG32UI>, + MortonCopy<true, PixelFormat::RGBX16F>, MortonCopy<true, PixelFormat::R32UI>, MortonCopy<true, PixelFormat::ASTC_2D_8X8>, MortonCopy<true, PixelFormat::ASTC_2D_8X5>, @@ -151,6 +152,7 @@ static constexpr ConversionArray linear_to_morton_fns = { MortonCopy<false, PixelFormat::RG8U>, MortonCopy<false, PixelFormat::RG8S>, MortonCopy<false, PixelFormat::RG32UI>, + MortonCopy<false, PixelFormat::RGBX16F>, MortonCopy<false, PixelFormat::R32UI>, nullptr, nullptr, diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp index 7c926bd48..a5d69d78d 100644 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp @@ -35,21 +35,16 @@ OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheK local_state.draw.draw_framebuffer = framebuffer.handle; local_state.ApplyFramebufferState(); - if (key.is_single_buffer) { - if (key.color_attachments[0] != GL_NONE && key.colors[0]) { - key.colors[0]->Attach(key.color_attachments[0], GL_DRAW_FRAMEBUFFER); - glDrawBuffer(key.color_attachments[0]); - } else { - glDrawBuffer(GL_NONE); - } - } else { - for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { - if (key.colors[index]) { - key.colors[index]->Attach(GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), - GL_DRAW_FRAMEBUFFER); - } + for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { + if (key.colors[index]) { + key.colors[index]->Attach(GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), + GL_DRAW_FRAMEBUFFER); } + } + if (key.colors_count) { glDrawBuffers(key.colors_count, key.color_attachments.data()); + } else { + glDrawBuffer(GL_NONE); } if (key.zeta) { @@ -67,9 +62,9 @@ std::size_t FramebufferCacheKey::Hash() const { } bool FramebufferCacheKey::operator==(const FramebufferCacheKey& rhs) const { - return std::tie(is_single_buffer, stencil_enable, colors_count, color_attachments, colors, - zeta) == std::tie(rhs.is_single_buffer, rhs.stencil_enable, rhs.colors_count, - rhs.color_attachments, rhs.colors, rhs.zeta); + return std::tie(stencil_enable, colors_count, color_attachments, colors, zeta) == + std::tie(rhs.stencil_enable, rhs.colors_count, rhs.color_attachments, rhs.colors, + rhs.zeta); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.h b/src/video_core/renderer_opengl/gl_framebuffer_cache.h index a3a996353..424344c48 100644 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.h +++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.h @@ -19,7 +19,6 @@ namespace OpenGL { struct alignas(sizeof(u64)) FramebufferCacheKey { - bool is_single_buffer = false; bool stencil_enable = false; u16 colors_count = 0; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 246b892c5..6a17bed72 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -382,99 +382,51 @@ void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading, shader_cache.LoadDiskCache(stop_loading, callback); } -std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( - OpenGLState& current_state, bool using_color_fb, bool using_depth_fb, bool preserve_contents, - std::optional<std::size_t> single_color_target) { +void RasterizerOpenGL::ConfigureFramebuffers() { MICROPROFILE_SCOPE(OpenGL_Framebuffer); auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents, - single_color_target}; - if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) { - // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or - // single color targets). This is done because the guest registers may not change but the - // host framebuffer may contain different attachments - return current_depth_stencil_usage; + if (!gpu.dirty.render_settings) { + return; } gpu.dirty.render_settings = false; - current_framebuffer_config_state = fb_config_state; texture_cache.GuardRenderTargets(true); - View depth_surface{}; - if (using_depth_fb) { - depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents); - } else { - texture_cache.SetEmptyDepthBuffer(); - } + View depth_surface = texture_cache.GetDepthBufferSurface(true); + const auto& regs = gpu.regs; + state.framebuffer_srgb.enabled = regs.framebuffer_srgb != 0; UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); // Bind the framebuffer surfaces - current_state.framebuffer_srgb.enabled = regs.framebuffer_srgb != 0; - FramebufferCacheKey fbkey; + for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { + View color_surface{texture_cache.GetColorBufferSurface(index, true)}; - if (using_color_fb) { - if (single_color_target) { - // Used when just a single color attachment is enabled, e.g. for clearing a color buffer - View color_surface{ - texture_cache.GetColorBufferSurface(*single_color_target, preserve_contents)}; - - if (color_surface) { - // Assume that a surface will be written to if it is used as a framebuffer, even if - // the shader doesn't actually write to it. - texture_cache.MarkColorBufferInUse(*single_color_target); - } - - fbkey.is_single_buffer = true; - fbkey.color_attachments[0] = - GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target); - fbkey.colors[0] = color_surface; - for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { - if (index != *single_color_target) { - texture_cache.SetEmptyColorBuffer(index); - } - } - } else { - // Multiple color attachments are enabled - for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { - View color_surface{texture_cache.GetColorBufferSurface(index, preserve_contents)}; - - if (color_surface) { - // Assume that a surface will be written to if it is used as a framebuffer, even - // if the shader doesn't actually write to it. - texture_cache.MarkColorBufferInUse(index); - } - - fbkey.color_attachments[index] = - GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index); - fbkey.colors[index] = color_surface; - } - fbkey.is_single_buffer = false; - fbkey.colors_count = regs.rt_control.count; + if (color_surface) { + // Assume that a surface will be written to if it is used as a framebuffer, even + // if the shader doesn't actually write to it. + texture_cache.MarkColorBufferInUse(index); } - } else { - // No color attachments are enabled - leave them as zero - fbkey.is_single_buffer = true; + + fbkey.color_attachments[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index); + fbkey.colors[index] = std::move(color_surface); } + fbkey.colors_count = regs.rt_control.count; if (depth_surface) { // Assume that a surface will be written to if it is used as a framebuffer, even if // the shader doesn't actually write to it. texture_cache.MarkDepthBufferInUse(); - fbkey.zeta = depth_surface; fbkey.stencil_enable = depth_surface->GetSurfaceParams().type == SurfaceType::DepthStencil; + fbkey.zeta = std::move(depth_surface); } texture_cache.GuardRenderTargets(false); - current_state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(fbkey); - SyncViewport(current_state); - - return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable}; + state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(fbkey); + SyncViewport(state); } void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, @@ -684,7 +636,7 @@ void RasterizerOpenGL::DrawPrelude() { SetupShaders(primitive_mode); texture_cache.GuardSamplers(false); - ConfigureFramebuffers(state); + ConfigureFramebuffers(); // Signal the buffer cache that we are not going to upload more things. const bool invalidate = buffer_cache.Unmap(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 682f0becc..9c10ebda3 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -77,39 +77,8 @@ public: const VideoCore::DiskResourceLoadCallback& callback) override; private: - struct FramebufferConfigState { - bool using_color_fb{}; - bool using_depth_fb{}; - bool preserve_contents{}; - std::optional<std::size_t> single_color_target; - - bool operator==(const FramebufferConfigState& rhs) const { - return std::tie(using_color_fb, using_depth_fb, preserve_contents, - single_color_target) == std::tie(rhs.using_color_fb, rhs.using_depth_fb, - rhs.preserve_contents, - rhs.single_color_target); - } - bool operator!=(const FramebufferConfigState& rhs) const { - return !operator==(rhs); - } - }; - - /** - * Configures the color and depth framebuffer states. - * - * @param current_state The current OpenGL state. - * @param using_color_fb If true, configure color framebuffers. - * @param using_depth_fb If true, configure the depth/stencil framebuffer. - * @param preserve_contents If true, tries to preserve data from a previously used - * framebuffer. - * @param single_color_target Specifies if a single color buffer target should be used. - * - * @returns If depth (first) or stencil (second) are being stored in the bound zeta texture - * (requires using_depth_fb to be true) - */ - std::pair<bool, bool> ConfigureFramebuffers( - OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true, - bool preserve_contents = true, std::optional<std::size_t> single_color_target = {}); + /// Configures the color and depth framebuffer states. + void ConfigureFramebuffers(); void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, bool using_depth_fb, bool using_stencil_fb); @@ -231,9 +200,6 @@ private: OGLVertexArray> vertex_array_cache; - FramebufferConfigState current_framebuffer_config_state; - std::pair<bool, bool> current_depth_stencil_usage{}; - static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 909ccb82c..0dbc4c02f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -214,7 +214,8 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn std::string source = "#version 430 core\n" "#extension GL_ARB_separate_shader_objects : enable\n" "#extension GL_NV_gpu_shader5 : enable\n" - "#extension GL_NV_shader_thread_group : enable\n"; + "#extension GL_NV_shader_thread_group : enable\n" + "#extension GL_NV_shader_thread_shuffle : enable\n"; if (entries.shader_viewport_layer_array) { source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index f7e86ab26..74cb59bc1 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -1029,10 +1029,10 @@ private: return {std::move(temporary), value.GetType()}; } - Expression GetOutputAttribute(const AbufNode* abuf) { + std::optional<Expression> GetOutputAttribute(const AbufNode* abuf) { switch (const auto attribute = abuf->GetIndex()) { case Attribute::Index::Position: - return {"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}; + return {{"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}}; case Attribute::Index::LayerViewportPointSize: switch (abuf->GetElement()) { case 0: @@ -1042,25 +1042,25 @@ private: if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { return {}; } - return {"gl_Layer", Type::Int}; + return {{"gl_Layer", Type::Int}}; case 2: if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { return {}; } - return {"gl_ViewportIndex", Type::Int}; + return {{"gl_ViewportIndex", Type::Int}}; case 3: UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader"); - return {"gl_PointSize", Type::Float}; + return {{"gl_PointSize", Type::Float}}; } return {}; case Attribute::Index::ClipDistances0123: - return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}; + return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}}; case Attribute::Index::ClipDistances4567: - return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}; + return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}}; default: if (IsGenericAttribute(attribute)) { - return {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), - Type::Float}; + return { + {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); return {}; @@ -1300,7 +1300,11 @@ private: target = {GetRegister(gpr->GetIndex()), Type::Float}; } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer()); - target = GetOutputAttribute(abuf); + auto output = GetOutputAttribute(abuf); + if (!output) { + return {}; + } + target = std::move(*output); } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { if (stage == ProgramType::Compute) { LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); @@ -1961,8 +1965,7 @@ private: Expression BallotThread(Operation operation) { const std::string value = VisitOperand(operation, 0).AsBool(); if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia warp intrinsics are not available and its required by a shader"); + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); // Stub on non-Nvidia devices by simulating all threads voting the same as the active // one. return {fmt::format("({} ? 0xFFFFFFFFU : 0U)", value), Type::Uint}; @@ -1973,8 +1976,7 @@ private: Expression Vote(Operation operation, const char* func) { const std::string value = VisitOperand(operation, 0).AsBool(); if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia vote intrinsics are not available and its required by a shader"); + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); // Stub with a warp size of one. return {value, Type::Bool}; } @@ -1991,15 +1993,54 @@ private: Expression VoteEqual(Operation operation) { if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia vote intrinsics are not available and its required by a shader"); - // We must return true here since a stub for a theoretical warp size of 1 will always - // return an equal result for all its votes. + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); + // We must return true here since a stub for a theoretical warp size of 1. + // This will always return an equal result across all votes. return {"true", Type::Bool}; } return Vote(operation, "allThreadsEqualNV"); } + template <const std::string_view& func> + Expression Shuffle(Operation operation) { + const std::string value = VisitOperand(operation, 0).AsFloat(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, "Nvidia shuffle intrinsics are required by this shader"); + // On a "single-thread" device we are either on the same thread or out of bounds. Both + // cases return the passed value. + return {value, Type::Float}; + } + + const std::string index = VisitOperand(operation, 1).AsUint(); + const std::string width = VisitOperand(operation, 2).AsUint(); + return {fmt::format("{}({}, {}, {})", func, value, index, width), Type::Float}; + } + + template <const std::string_view& func> + Expression InRangeShuffle(Operation operation) { + const std::string index = VisitOperand(operation, 0).AsUint(); + const std::string width = VisitOperand(operation, 1).AsUint(); + if (!device.HasWarpIntrinsics()) { + // On a "single-thread" device we are only in bounds when the requested index is 0. + return {fmt::format("({} == 0U)", index), Type::Bool}; + } + + const std::string in_range = code.GenerateTemporary(); + code.AddLine("bool {};", in_range); + code.AddLine("{}(0U, {}, {}, {});", func, index, width, in_range); + return {in_range, Type::Bool}; + } + + struct Func final { + Func() = delete; + ~Func() = delete; + + static constexpr std::string_view ShuffleIndexed = "shuffleNV"; + static constexpr std::string_view ShuffleUp = "shuffleUpNV"; + static constexpr std::string_view ShuffleDown = "shuffleDownNV"; + static constexpr std::string_view ShuffleButterfly = "shuffleXorNV"; + }; + static constexpr std::array operation_decompilers = { &GLSLDecompiler::Assign, @@ -2162,6 +2203,16 @@ private: &GLSLDecompiler::VoteAll, &GLSLDecompiler::VoteAny, &GLSLDecompiler::VoteEqual, + + &GLSLDecompiler::Shuffle<Func::ShuffleIndexed>, + &GLSLDecompiler::Shuffle<Func::ShuffleUp>, + &GLSLDecompiler::Shuffle<Func::ShuffleDown>, + &GLSLDecompiler::Shuffle<Func::ShuffleButterfly>, + + &GLSLDecompiler::InRangeShuffle<Func::ShuffleIndexed>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleUp>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleDown>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleButterfly>, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 4f135fe03..173b76c4e 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -97,6 +97,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // RG8U {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false}, // RG8S {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RG32UI + {GL_RGB16F, GL_RGBA16, GL_HALF_FLOAT, ComponentType::Float, false}, // RGBX16F {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // R32UI {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X5 diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index ea77dd211..9ed738171 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -145,7 +145,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, case Tegra::Texture::TextureMipmapFilter::None: return GL_LINEAR; case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_NEAREST_MIPMAP_LINEAR; + return GL_LINEAR_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: return GL_LINEAR_MIPMAP_LINEAR; } @@ -157,7 +157,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, case Tegra::Texture::TextureMipmapFilter::Nearest: return GL_NEAREST_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_LINEAR_MIPMAP_NEAREST; + return GL_NEAREST_MIPMAP_LINEAR; } } } diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 0bbbf6851..3c5acda3e 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -143,6 +143,7 @@ static constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex {vk::Format::eUndefined, ComponentType::Invalid, false}, // RG8U {vk::Format::eUndefined, ComponentType::Invalid, false}, // RG8S {vk::Format::eUndefined, ComponentType::Invalid, false}, // RG32UI + {vk::Format::eUndefined, ComponentType::Invalid, false}, // RGBX16F {vk::Format::eUndefined, ComponentType::Invalid, false}, // R32UI {vk::Format::eUndefined, ComponentType::Invalid, false}, // ASTC_2D_8X8 {vk::Format::eUndefined, ComponentType::Invalid, false}, // ASTC_2D_8X5 diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index b9153934e..f7fbbb6e4 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -1127,6 +1127,46 @@ private: return {}; } + Id ShuffleIndexed(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleUp(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleDown(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleButterfly(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleIndexed(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleUp(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleDown(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleButterfly(Operation) { + UNIMPLEMENTED(); + return {}; + } + Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type, const std::string& name) { const Id id = OpVariable(type, storage); @@ -1431,6 +1471,16 @@ private: &SPIRVDecompiler::VoteAll, &SPIRVDecompiler::VoteAny, &SPIRVDecompiler::VoteEqual, + + &SPIRVDecompiler::ShuffleIndexed, + &SPIRVDecompiler::ShuffleUp, + &SPIRVDecompiler::ShuffleDown, + &SPIRVDecompiler::ShuffleButterfly, + + &SPIRVDecompiler::InRangeShuffleIndexed, + &SPIRVDecompiler::InRangeShuffleUp, + &SPIRVDecompiler::InRangeShuffleDown, + &SPIRVDecompiler::InRangeShuffleButterfly, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index c8c1a7f40..b73f6536e 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -138,6 +138,35 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { SetRegister(bb, instr.gpr0, value); break; } + case OpCode::Id::ICMP_CR: + case OpCode::Id::ICMP_R: + case OpCode::Id::ICMP_RC: + case OpCode::Id::ICMP_IMM: { + const Node zero = Immediate(0); + + const auto [op_b, test] = [&]() -> std::pair<Node, Node> { + switch (opcode->get().GetId()) { + case OpCode::Id::ICMP_CR: + return {GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset), + GetRegister(instr.gpr39)}; + case OpCode::Id::ICMP_R: + return {GetRegister(instr.gpr20), GetRegister(instr.gpr39)}; + case OpCode::Id::ICMP_RC: + return {GetRegister(instr.gpr39), + GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset)}; + case OpCode::Id::ICMP_IMM: + return {Immediate(instr.alu.GetSignedImm20_20()), GetRegister(instr.gpr39)}; + default: + UNREACHABLE(); + return {zero, zero}; + } + }(); + const Node op_a = GetRegister(instr.gpr8); + const Node comparison = + GetPredicateComparisonInteger(instr.icmp.cond, instr.icmp.is_signed != 0, test, zero); + SetRegister(bb, instr.gpr0, Operation(OperationCode::Select, comparison, op_a, op_b)); + break; + } case OpCode::Id::LOP_C: case OpCode::Id::LOP_R: case OpCode::Id::LOP_IMM: { diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp index 04ca74f46..a8e481b3c 100644 --- a/src/video_core/shader/decode/warp.cpp +++ b/src/video_core/shader/decode/warp.cpp @@ -13,6 +13,7 @@ namespace VideoCommon::Shader { using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; using Tegra::Shader::Pred; +using Tegra::Shader::ShuffleOperation; using Tegra::Shader::VoteOperation; namespace { @@ -44,6 +45,52 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) { SetPredicate(bb, instr.vote.dest_pred, vote); break; } + case OpCode::Id::SHFL: { + Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm)) + : GetRegister(instr.gpr39); + Node width = [&] { + // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This has + // been done reversing Nvidia's math. It won't work on all cases due to SHFL having + // different parameters that don't properly map to GLSL's interface, but it should work + // for cases emitted by Nvidia's compiler. + if (instr.shfl.operation == ShuffleOperation::Up) { + return Operation( + OperationCode::ILogicalShiftRight, + Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)), + Immediate(8)); + } else { + return Operation(OperationCode::ILogicalShiftRight, + Operation(OperationCode::IAdd, Immediate(0x201F), + Operation(OperationCode::INegate, std::move(mask))), + Immediate(8)); + } + }(); + + const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> { + switch (instr.shfl.operation) { + case ShuffleOperation::Idx: + return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed}; + case ShuffleOperation::Up: + return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp}; + case ShuffleOperation::Down: + return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown}; + case ShuffleOperation::Bfly: + return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly}; + } + UNREACHABLE_MSG("Invalid SHFL operation: {}", + static_cast<u64>(instr.shfl.operation.Value())); + return {}; + }(); + + // Setting the predicate before the register is intentional to avoid overwriting. + Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm)) + : GetRegister(instr.gpr20); + SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width)); + SetRegister( + bb, instr.gpr0, + Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width))); + break; + } default: UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName()); break; diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 425111cc4..abf2cb1ab 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -181,6 +181,16 @@ enum class OperationCode { VoteAny, /// (bool) -> bool VoteEqual, /// (bool) -> bool + ShuffleIndexed, /// (uint value, uint index, uint width) -> uint + ShuffleUp, /// (uint value, uint index, uint width) -> uint + ShuffleDown, /// (uint value, uint index, uint width) -> uint + ShuffleButterfly, /// (uint value, uint index, uint width) -> uint + + InRangeShuffleIndexed, /// (uint index, uint width) -> bool + InRangeShuffleUp, /// (uint index, uint width) -> bool + InRangeShuffleDown, /// (uint index, uint width) -> bool + InRangeShuffleButterfly, /// (uint index, uint width) -> bool + Amount, }; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 53d0142cb..250afc6d6 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -159,6 +159,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) return PixelFormat::R32UI; case Tegra::RenderTargetFormat::RG32_UINT: return PixelFormat::RG32UI; + case Tegra::RenderTargetFormat::RGBX16_FLOAT: + return PixelFormat::RGBX16F; default: LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); UNREACHABLE(); @@ -415,6 +417,7 @@ ComponentType ComponentTypeFromRenderTarget(Tegra::RenderTargetFormat format) { case Tegra::RenderTargetFormat::RG8_SNORM: return ComponentType::SNorm; case Tegra::RenderTargetFormat::RGBA16_FLOAT: + case Tegra::RenderTargetFormat::RGBX16_FLOAT: case Tegra::RenderTargetFormat::R11G11B10_FLOAT: case Tegra::RenderTargetFormat::RGBA32_FLOAT: case Tegra::RenderTargetFormat::RG32_FLOAT: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 19268b7cd..1e1c432a5 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -57,36 +57,37 @@ enum class PixelFormat { RG8U = 39, RG8S = 40, RG32UI = 41, - R32UI = 42, - ASTC_2D_8X8 = 43, - ASTC_2D_8X5 = 44, - ASTC_2D_5X4 = 45, - BGRA8_SRGB = 46, - DXT1_SRGB = 47, - DXT23_SRGB = 48, - DXT45_SRGB = 49, - BC7U_SRGB = 50, - ASTC_2D_4X4_SRGB = 51, - ASTC_2D_8X8_SRGB = 52, - ASTC_2D_8X5_SRGB = 53, - ASTC_2D_5X4_SRGB = 54, - ASTC_2D_5X5 = 55, - ASTC_2D_5X5_SRGB = 56, - ASTC_2D_10X8 = 57, - ASTC_2D_10X8_SRGB = 58, + RGBX16F = 42, + R32UI = 43, + ASTC_2D_8X8 = 44, + ASTC_2D_8X5 = 45, + ASTC_2D_5X4 = 46, + BGRA8_SRGB = 47, + DXT1_SRGB = 48, + DXT23_SRGB = 49, + DXT45_SRGB = 50, + BC7U_SRGB = 51, + ASTC_2D_4X4_SRGB = 52, + ASTC_2D_8X8_SRGB = 53, + ASTC_2D_8X5_SRGB = 54, + ASTC_2D_5X4_SRGB = 55, + ASTC_2D_5X5 = 56, + ASTC_2D_5X5_SRGB = 57, + ASTC_2D_10X8 = 58, + ASTC_2D_10X8_SRGB = 59, MaxColorFormat, // Depth formats - Z32F = 59, - Z16 = 60, + Z32F = 60, + Z16 = 61, MaxDepthFormat, // DepthStencil formats - Z24S8 = 61, - S8Z24 = 62, - Z32FS8 = 63, + Z24S8 = 62, + S8Z24 = 63, + Z32FS8 = 64, MaxDepthStencilFormat, @@ -166,6 +167,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ 0, // RG8U 0, // RG8S 0, // RG32UI + 0, // RGBX16F 0, // R32UI 2, // ASTC_2D_8X8 2, // ASTC_2D_8X5 @@ -249,6 +251,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // RG8U 1, // RG8S 1, // RG32UI + 1, // RGBX16F 1, // R32UI 8, // ASTC_2D_8X8 8, // ASTC_2D_8X5 @@ -324,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // RG8U 1, // RG8S 1, // RG32UI + 1, // RGBX16F 1, // R32UI 8, // ASTC_2D_8X8 5, // ASTC_2D_8X5 @@ -399,6 +403,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 16, // RG8U 16, // RG8S 64, // RG32UI + 64, // RGBX16F 32, // R32UI 128, // ASTC_2D_8X8 128, // ASTC_2D_8X5 @@ -489,6 +494,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table SurfaceCompression::None, // RG8U SurfaceCompression::None, // RG8S SurfaceCompression::None, // RG32UI + SurfaceCompression::None, // RGBX16F SurfaceCompression::None, // R32UI SurfaceCompression::Converted, // ASTC_2D_8X8 SurfaceCompression::Converted, // ASTC_2D_8X5 |
