From 31eb658fea6ecae8fb4f0dde5abdeb1d8eb44ae0 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Tue, 19 May 2020 22:01:25 -0300 Subject: maxwell_3d: Initialize polygon modes NVN expects this to be initialized as Fill, otherwise games that never bind a rasterizer state will log an invalid polygon mode. --- src/video_core/engines/maxwell_3d.cpp | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/video_core/engines') diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 024c9e43b..6113da7f8 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -107,6 +107,8 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; + regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill; + regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill; shadow_state = regs; -- cgit v1.2.3 From f3f056c3b60fda242ae08f3866832dd307d537ba Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 20 May 2020 01:13:40 -0300 Subject: maxwell_3d: Initialize line widths Initialize line widths to avoid setting a line width of zero. --- src/video_core/engines/maxwell_3d.cpp | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/video_core/engines') diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 6113da7f8..b5a70b9fc 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -106,6 +106,8 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rasterize_enable = 1; regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; + regs.line_width_aliased = 1.0f; + regs.line_width_smooth = 1.0f; regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill; regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill; -- cgit v1.2.3 From b032ebdfee1928c4458eaf15faa0cff299371e65 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 29 May 2020 14:53:27 +1000 Subject: Implement macro JIT --- src/core/settings.h | 1 + src/video_core/CMakeLists.txt | 8 +- src/video_core/engines/maxwell_3d.cpp | 19 +- src/video_core/engines/maxwell_3d.h | 19 +- src/video_core/macro/macro.cpp | 45 ++ src/video_core/macro/macro.h | 128 ++++++ src/video_core/macro/macro_interpreter.cpp | 288 +++++++++++++ src/video_core/macro/macro_interpreter.h | 102 +++++ src/video_core/macro/macro_jit_x64.cpp | 633 +++++++++++++++++++++++++++++ src/video_core/macro/macro_jit_x64.h | 98 +++++ src/video_core/macro_interpreter.cpp | 352 ---------------- src/video_core/macro_interpreter.h | 109 ----- src/yuzu/configuration/config.cpp | 3 + src/yuzu/configuration/configure_debug.cpp | 3 + src/yuzu/configuration/configure_debug.ui | 13 + src/yuzu_cmd/config.cpp | 2 + src/yuzu_cmd/default_ini.h | 2 + 17 files changed, 1335 insertions(+), 490 deletions(-) create mode 100644 src/video_core/macro/macro.cpp create mode 100644 src/video_core/macro/macro.h create mode 100644 src/video_core/macro/macro_interpreter.cpp create mode 100644 src/video_core/macro/macro_interpreter.h create mode 100644 src/video_core/macro/macro_jit_x64.cpp create mode 100644 src/video_core/macro/macro_jit_x64.h delete mode 100644 src/video_core/macro_interpreter.cpp delete mode 100644 src/video_core/macro_interpreter.h (limited to 'src/video_core/engines') diff --git a/src/core/settings.h b/src/core/settings.h index 78eb33737..36cd66fd4 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -474,6 +474,7 @@ struct Values { bool reporting_services; bool quest_flag; bool disable_cpu_opt; + bool disable_macro_jit; // BCAT std::string bcat_backend; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index d6ee82836..2bf8d68ce 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -25,6 +25,12 @@ add_library(video_core STATIC engines/shader_bytecode.h engines/shader_header.h engines/shader_type.h + macro/macro.cpp + macro/macro.h + macro/macro_interpreter.cpp + macro/macro_interpreter.h + macro/macro_jit_x64.cpp + macro/macro_jit_x64.h fence_manager.h gpu.cpp gpu.h @@ -36,8 +42,6 @@ add_library(video_core STATIC gpu_thread.h guest_driver.cpp guest_driver.h - macro_interpreter.cpp - macro_interpreter.h memory_manager.cpp memory_manager.h morton.cpp diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 004f6b261..934a1d6db 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, - macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { + macro_engine(GetMacroEngine(*this)), upload_state{memory_manager, regs.upload} { dirty.flags.flip(); - InitializeRegisterDefaults(); } @@ -116,7 +115,7 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { +void Maxwell3D::CallMacroMethod(u32 method, std::vector&& parameters) { // Reset the current macro. executing_macro = 0; @@ -125,7 +124,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3 ((method - MacroRegistersStart) >> 1) % static_cast(macro_positions.size()); // Execute the current macro. - macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters); + macro_engine->Execute(macro_positions[entry], std::move(parameters)); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } @@ -161,8 +160,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { // Call the macro when there are no more parameters in the command buffer if (is_last_call) { - CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); - macro_params.clear(); + CallMacroMethod(executing_macro, std::move(macro_params)); } return; } @@ -197,7 +195,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { break; } case MAXWELL3D_REG_INDEX(macros.data): { - ProcessMacroUpload(arg); + macro_engine->AddCode(regs.macros.upload_address, arg); break; } case MAXWELL3D_REG_INDEX(macros.bind): { @@ -306,8 +304,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, // Call the macro when there are no more parameters in the command buffer if (amount == methods_pending) { - CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); - macro_params.clear(); + CallMacroMethod(executing_macro, std::move(macro_params)); } return; } @@ -420,9 +417,7 @@ void Maxwell3D::FlushMMEInlineDraw() { } void Maxwell3D::ProcessMacroUpload(u32 data) { - ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), - "upload_address exceeded macro_memory size!"); - macro_memory[regs.macros.upload_address++] = data; + macro_engine->AddCode(regs.macros.upload_address++, data); } void Maxwell3D::ProcessMacroBind(u32 data) { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 05dd6b39b..077bc9841 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -23,7 +23,7 @@ #include "video_core/engines/engine_upload.h" #include "video_core/engines/shader_type.h" #include "video_core/gpu.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro.h" #include "video_core/textures/texture.h" namespace Core { @@ -1411,15 +1411,6 @@ public: const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; - /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than - /// we've seen used. - using MacroMemory = std::array; - - /// Gets a reference to macro memory. - const MacroMemory& GetMacroMemory() const { - return macro_memory; - } - bool ShouldExecute() const { return execute_on; } @@ -1468,16 +1459,14 @@ private: std::array mme_inline{}; - /// Memory for macro code - MacroMemory macro_memory; - /// Macro method that is currently being executed / being fed parameters. u32 executing_macro = 0; /// Parameters that have been submitted to the macro call so far. std::vector macro_params; /// Interpreter for the macro codes uploaded to the GPU. - MacroInterpreter macro_interpreter; + std::unique_ptr macro_engine; + // MacroInterpreter macro_interpreter; static constexpr u32 null_cb_data = 0xFFFFFFFF; struct { @@ -1506,7 +1495,7 @@ private: * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters); + void CallMacroMethod(u32 method, std::vector&& parameters); /// Handles writes to the macro uploading register. void ProcessMacroUpload(u32 data); diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp new file mode 100644 index 000000000..85a6e5dd4 --- /dev/null +++ b/src/video_core/macro/macro.cpp @@ -0,0 +1,45 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/logging/log.h" +#include "core/settings.h" +#include "video_core/macro/macro.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +namespace Tegra { + +void MacroEngine::AddCode(u32 method, u32 data) { + uploaded_macro_code[method].push_back(data); +} + +void MacroEngine::Execute(u32 method, std::vector parameters) { + auto compiled_macro = macro_cache.find(method); + if (compiled_macro != macro_cache.end()) { + compiled_macro->second->Execute(parameters, method); + } else { + // Macro not compiled, check if it's uploaded and if so, compile it + auto macro_code = uploaded_macro_code.find(method); + if (macro_code == uploaded_macro_code.end()) { + UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); + return; + } + macro_cache[method] = Compile(macro_code->second); + macro_cache[method]->Execute(parameters, method); + } +} + +std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d) { + if (Settings::values.disable_macro_jit) { + return std::make_unique(maxwell3d); + } +#ifdef ARCHITECTURE_x86_64 + return std::make_unique(maxwell3d); +#else + return std::make_unique(maxwell3d); +#endif +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h new file mode 100644 index 000000000..28ca243d1 --- /dev/null +++ b/src/video_core/macro/macro.h @@ -0,0 +1,128 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include "common/bit_field.h" +#include "common/common_types.h" + +namespace Tegra { +namespace Engines { +class Maxwell3D; +} +namespace Macro { +constexpr std::size_t NUM_MACRO_REGISTERS = 8; +enum class Operation : u32 { + ALU = 0, + AddImmediate = 1, + ExtractInsert = 2, + ExtractShiftLeftImmediate = 3, + ExtractShiftLeftRegister = 4, + Read = 5, + Unused = 6, // This operation doesn't seem to be a valid encoding. + Branch = 7, +}; + +enum class ALUOperation : u32 { + Add = 0, + AddWithCarry = 1, + Subtract = 2, + SubtractWithBorrow = 3, + // Operations 4-7 don't seem to be valid encodings. + Xor = 8, + Or = 9, + And = 10, + AndNot = 11, + Nand = 12 +}; + +enum class ResultOperation : u32 { + IgnoreAndFetch = 0, + Move = 1, + MoveAndSetMethod = 2, + FetchAndSend = 3, + MoveAndSend = 4, + FetchAndSetMethod = 5, + MoveAndSetMethodFetchAndSend = 6, + MoveAndSetMethodSend = 7 +}; + +enum class BranchCondition : u32 { + Zero = 0, + NotZero = 1, +}; + +union Opcode { + u32 raw; + BitField<0, 3, Operation> operation; + BitField<4, 3, ResultOperation> result_operation; + BitField<4, 1, BranchCondition> branch_condition; + // If set on a branch, then the branch doesn't have a delay slot. + BitField<5, 1, u32> branch_annul; + BitField<7, 1, u32> is_exit; + BitField<8, 3, u32> dst; + BitField<11, 3, u32> src_a; + BitField<14, 3, u32> src_b; + // The signed immediate overlaps the second source operand and the alu operation. + BitField<14, 18, s32> immediate; + + BitField<17, 5, ALUOperation> alu_operation; + + // Bitfield instructions data + BitField<17, 5, u32> bf_src_bit; + BitField<22, 5, u32> bf_size; + BitField<27, 5, u32> bf_dst_bit; + + u32 GetBitfieldMask() const { + return (1 << bf_size) - 1; + } + + s32 GetBranchTarget() const { + return static_cast(immediate * sizeof(u32)); + } +}; + +union MethodAddress { + u32 raw; + BitField<0, 12, u32> address; + BitField<12, 6, u32> increment; +}; + +} // namespace Macro + +class CachedMacro { +public: + virtual ~CachedMacro() = default; + /** + * Executes the macro code with the specified input parameters. + * @param code The macro byte code to execute + * @param parameters The parameters of the macro + */ + virtual void Execute(std::vector& parameters, u32 method) = 0; +}; + +class MacroEngine { +public: + virtual ~MacroEngine() = default; + + // Store the uploaded macro code to compile them when they're called. + void AddCode(u32 method, u32 data); + + // Compiles the macro if its not in the cache, and executes the compiled macro + void Execute(u32 method, std::vector parameters); + +protected: + virtual std::unique_ptr Compile(const std::vector& code) = 0; + +private: + std::unordered_map> macro_cache; + std::unordered_map> uploaded_macro_code; +}; + +std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d); + +} // namespace Tegra diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp new file mode 100644 index 000000000..e63296a21 --- /dev/null +++ b/src/video_core/macro/macro_interpreter.cpp @@ -0,0 +1,288 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_interpreter.h" + +MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); + +namespace Tegra { +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} + +std::unique_ptr MacroInterpreter::Compile(const std::vector& code) { + return std::make_unique(maxwell3d, code); +} + +MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, + const std::vector& code) + : maxwell3d(maxwell3d), code(code) {} + +void MacroInterpreterImpl::Execute(std::vector& parameters, u32 method) { + MICROPROFILE_SCOPE(MacroInterp); + Reset(); + + registers[1] = parameters[0]; + num_parameters = parameters.size(); + + if (num_parameters > parameters_capacity) { + parameters_capacity = num_parameters; + this->parameters = std::make_unique(num_parameters); + } + std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32)); + this->num_parameters = num_parameters; + + // Execute the code until we hit an exit condition. + bool keep_executing = true; + while (keep_executing) { + keep_executing = Step(false); + } + + // Assert the the macro used all the input parameters + ASSERT(next_parameter_index == num_parameters); +} + +void MacroInterpreterImpl::Reset() { + registers = {}; + pc = 0; + delayed_pc = {}; + method_address.raw = 0; + num_parameters = 0; + // The next parameter index starts at 1, because $r1 already has the value of the first + // parameter. + next_parameter_index = 1; + carry_flag = false; +} + +bool MacroInterpreterImpl::Step(bool is_delay_slot) { + u32 base_address = pc; + + Macro::Opcode opcode = GetOpcode(); + pc += 4; + + // Update the program counter if we were delayed + if (delayed_pc) { + ASSERT(is_delay_slot); + pc = *delayed_pc; + delayed_pc = {}; + } + + switch (opcode.operation) { + case Macro::Operation::ALU: { + u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), + GetRegister(opcode.src_b)); + ProcessResult(opcode.result_operation, opcode.dst, result); + break; + } + case Macro::Operation::AddImmediate: { + ProcessResult(opcode.result_operation, opcode.dst, + GetRegister(opcode.src_a) + opcode.immediate); + break; + } + case Macro::Operation::ExtractInsert: { + u32 dst = GetRegister(opcode.src_a); + u32 src = GetRegister(opcode.src_b); + + src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask(); + dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); + dst |= src << opcode.bf_dst_bit; + ProcessResult(opcode.result_operation, opcode.dst, dst); + break; + } + case Macro::Operation::ExtractShiftLeftImmediate: { + u32 dst = GetRegister(opcode.src_a); + u32 src = GetRegister(opcode.src_b); + + u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit; + + ProcessResult(opcode.result_operation, opcode.dst, result); + break; + } + case Macro::Operation::ExtractShiftLeftRegister: { + u32 dst = GetRegister(opcode.src_a); + u32 src = GetRegister(opcode.src_b); + + u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst; + + ProcessResult(opcode.result_operation, opcode.dst, result); + break; + } + case Macro::Operation::Read: { + u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); + ProcessResult(opcode.result_operation, opcode.dst, result); + break; + } + case Macro::Operation::Branch: { + ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); + u32 value = GetRegister(opcode.src_a); + bool taken = EvaluateBranchCondition(opcode.branch_condition, value); + if (taken) { + // Ignore the delay slot if the branch has the annul bit. + if (opcode.branch_annul) { + pc = base_address + opcode.GetBranchTarget(); + return true; + } + + delayed_pc = base_address + opcode.GetBranchTarget(); + // Execute one more instruction due to the delay slot. + return Step(true); + } + break; + } + default: + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", + static_cast(opcode.operation.Value())); + } + + // An instruction with the Exit flag will not actually + // cause an exit if it's executed inside a delay slot. + if (opcode.is_exit && !is_delay_slot) { + // Exit has a delay slot, execute the next instruction + Step(true); + return false; + } + + return true; +} + +u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) { + switch (operation) { + case Macro::ALUOperation::Add: { + const u64 result{static_cast(src_a) + src_b}; + carry_flag = result > 0xffffffff; + return static_cast(result); + } + case Macro::ALUOperation::AddWithCarry: { + const u64 result{static_cast(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; + carry_flag = result > 0xffffffff; + return static_cast(result); + } + case Macro::ALUOperation::Subtract: { + const u64 result{static_cast(src_a) - src_b}; + carry_flag = result < 0x100000000; + return static_cast(result); + } + case Macro::ALUOperation::SubtractWithBorrow: { + const u64 result{static_cast(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; + carry_flag = result < 0x100000000; + return static_cast(result); + } + case Macro::ALUOperation::Xor: + return src_a ^ src_b; + case Macro::ALUOperation::Or: + return src_a | src_b; + case Macro::ALUOperation::And: + return src_a & src_b; + case Macro::ALUOperation::AndNot: + return src_a & ~src_b; + case Macro::ALUOperation::Nand: + return ~(src_a & src_b); + + default: + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", static_cast(operation)); + return 0; + } +} + +void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) { + switch (operation) { + case Macro::ResultOperation::IgnoreAndFetch: + // Fetch parameter and ignore result. + SetRegister(reg, FetchParameter()); + break; + case Macro::ResultOperation::Move: + // Move result. + SetRegister(reg, result); + break; + case Macro::ResultOperation::MoveAndSetMethod: + // Move result and use as Method Address. + SetRegister(reg, result); + SetMethodAddress(result); + break; + case Macro::ResultOperation::FetchAndSend: + // Fetch parameter and send result. + SetRegister(reg, FetchParameter()); + Send(result); + break; + case Macro::ResultOperation::MoveAndSend: + // Move and send result. + SetRegister(reg, result); + Send(result); + break; + case Macro::ResultOperation::FetchAndSetMethod: + // Fetch parameter and use result as Method Address. + SetRegister(reg, FetchParameter()); + SetMethodAddress(result); + break; + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: + // Move result and use as Method Address, then fetch and send parameter. + SetRegister(reg, result); + SetMethodAddress(result); + Send(FetchParameter()); + break; + case Macro::ResultOperation::MoveAndSetMethodSend: + // Move result and use as Method Address, then send bits 12:17 of result. + SetRegister(reg, result); + SetMethodAddress(result); + Send((result >> 12) & 0b111111); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented result operation {}", static_cast(operation)); + } +} + +bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const { + switch (cond) { + case Macro::BranchCondition::Zero: + return value == 0; + case Macro::BranchCondition::NotZero: + return value != 0; + } + UNREACHABLE(); + return true; +} + +Macro::Opcode MacroInterpreterImpl::GetOpcode() const { + ASSERT((pc % sizeof(u32)) == 0); + ASSERT(pc < code.size() * sizeof(u32)); + return {code[pc / sizeof(u32)]}; +} + +u32 MacroInterpreterImpl::GetRegister(u32 register_id) const { + return registers.at(register_id); +} + +void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) { + // Register 0 is hardwired as the zero register. + // Ensure no writes to it actually occur. + if (register_id == 0) { + return; + } + + registers.at(register_id) = value; +} + +void MacroInterpreterImpl::SetMethodAddress(u32 address) { + method_address.raw = address; +} + +void MacroInterpreterImpl::Send(u32 value) { + maxwell3d.CallMethodFromMME(method_address.address, value); + // Increment the method address by the method increment. + method_address.address.Assign(method_address.address.Value() + + method_address.increment.Value()); +} + +u32 MacroInterpreterImpl::Read(u32 method) const { + return maxwell3d.GetRegisterValue(method); +} + +u32 MacroInterpreterImpl::FetchParameter() { + ASSERT(next_parameter_index < num_parameters); + return parameters[next_parameter_index++]; +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h new file mode 100644 index 000000000..fb923f7b9 --- /dev/null +++ b/src/video_core/macro/macro_interpreter.h @@ -0,0 +1,102 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once +#include +#include +#include +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/macro/macro.h" + +namespace Tegra { +namespace Engines { +class Maxwell3D; +} + +class MacroInterpreter final : public MacroEngine { +public: + explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); + +protected: + std::unique_ptr Compile(const std::vector& code) override; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class MacroInterpreterImpl : public CachedMacro { +public: + MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector& code); + void Execute(std::vector& parameters, u32 method) override; + +private: + /// Resets the execution engine state, zeroing registers, etc. + void Reset(); + + /** + * Executes a single macro instruction located at the current program counter. Returns whether + * the interpreter should keep running. + * @param offset Offset to start execution at. + * @param is_delay_slot Whether the current step is being executed due to a delay slot in a + * previous instruction. + */ + bool Step(bool is_delay_slot); + + /// Calculates the result of an ALU operation. src_a OP src_b; + u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b); + + /// Performs the result operation on the input result and stores it in the specified register + /// (if necessary). + void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result); + + /// Evaluates the branch condition and returns whether the branch should be taken or not. + bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const; + + /// Reads an opcode at the current program counter location. + Macro::Opcode GetOpcode() const; + + /// Returns the specified register's value. Register 0 is hardcoded to always return 0. + u32 GetRegister(u32 register_id) const; + + /// Sets the register to the input value. + void SetRegister(u32 register_id, u32 value); + + /// Sets the method address to use for the next Send instruction. + void SetMethodAddress(u32 address); + + /// Calls a GPU Engine method with the input parameter. + void Send(u32 value); + + /// Reads a GPU register located at the method address. + u32 Read(u32 method) const; + + /// Returns the next parameter in the parameter queue. + u32 FetchParameter(); + + Engines::Maxwell3D& maxwell3d; + + /// Current program counter + u32 pc; + /// Program counter to execute at after the delay slot is executed. + std::optional delayed_pc; + + /// General purpose macro registers. + std::array registers = {}; + + /// Method address to use for the next Send instruction. + Macro::MethodAddress method_address = {}; + + /// Input parameters of the current macro. + std::unique_ptr parameters; + std::size_t num_parameters = 0; + std::size_t parameters_capacity = 0; + /// Index of the next parameter that will be fetched by the 'parm' instruction. + u32 next_parameter_index = 0; + + bool carry_flag = false; + const std::vector& code; +}; + +} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp new file mode 100644 index 000000000..1b657236a --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -0,0 +1,633 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "common/x64/xbyak_util.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47)); +MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); + +namespace Tegra { +using JitFunction = void (MacroJITx64Impl::*)(Macro::Opcode opcode); +const std::array InstructionTable{ + &MacroJITx64Impl::Compile_ALU, + &MacroJITx64Impl::Compile_AddImmediate, + &MacroJITx64Impl::Compile_ExtractInsert, + &MacroJITx64Impl::Compile_ExtractShiftLeftImmediate, + &MacroJITx64Impl::Compile_ExtractShiftLeftRegister, + &MacroJITx64Impl::Compile_Read, + nullptr, + &MacroJITx64Impl::Compile_Branch, +}; + +static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; +static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; +static const Xbyak::Reg64 STATE = Xbyak::util::r11; +static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; +static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; +static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13; +static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14; +static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; + +static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ + PARAMETERS, + REGISTERS, + STATE, + NEXT_PARAMETER, + RESULT, + METHOD_ADDRESS, + BRANCH_HOLDER, +}); + +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} + +std::unique_ptr MacroJITx64::Compile(const std::vector& code) { + return std::make_unique(maxwell3d, code); +} + +MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector& code) + : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { + Compile(); +} + +MacroJITx64Impl::~MacroJITx64Impl() = default; + +void MacroJITx64Impl::Execute(std::vector& parameters, u32 method) { + MICROPROFILE_SCOPE(MacroJitExecute); + ASSERT_OR_EXECUTE(program != nullptr, { return; }); + JITState state{}; + state.maxwell3d = &maxwell3d; + state.registers = {}; + state.parameters = parameters.data(); + program(&state); +} + +void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { + const bool is_a_zero = opcode.src_a == 0; + const bool is_b_zero = opcode.src_b == 0; + const bool valid_operation = !is_a_zero && !is_b_zero; + const bool is_move_operation = !is_a_zero && is_b_zero; + const bool has_zero_register = is_a_zero || is_b_zero; + + Xbyak::Reg64 src_a; + Xbyak::Reg32 src_b; + + if (!optimizer.zero_reg_skip) { + src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + src_b = Compile_GetRegister(opcode.src_b, ebx); + } else { + if (!is_a_zero) { + src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + } + if (!is_b_zero) { + src_b = Compile_GetRegister(opcode.src_b, ebx); + } + } + Xbyak::Label skip_carry{}; + + bool has_emitted = false; + + switch (opcode.alu_operation) { + case Macro::ALUOperation::Add: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + add(src_a, src_b); + } + } else { + add(src_a, src_b); + } + + if (!optimizer.can_skip_carry) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::AddWithCarry: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + adc(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Subtract: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + sub(src_a, src_b); + has_emitted = true; + } + } else { + sub(src_a, src_b); + has_emitted = true; + } + if (!optimizer.can_skip_carry && has_emitted) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::SubtractWithBorrow: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + sbb(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Xor: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + xor_(src_a, src_b); + } + } else { + xor_(src_a, src_b); + } + break; + case Macro::ALUOperation::Or: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + or_(src_a, src_b); + } + } else { + or_(src_a, src_b); + } + break; + case Macro::ALUOperation::And: + if (optimizer.zero_reg_skip) { + if (!has_zero_register) { + and_(src_a, src_b); + } + } else { + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::AndNot: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + not_(src_b); + and_(src_a, src_b); + } + } else { + not_(src_b); + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::Nand: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + and_(src_a, src_b); + not_(src_a); + } + } else { + and_(src_a, src_b); + not_(src_a); + } + break; + default: + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", + static_cast(opcode.alu_operation.Value())); + break; + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { + if (optimizer.skip_dummy_addimmediate) { + // Games tend to use this as an exit instruction placeholder. It's to encode an instruction + // without doing anything. In our case we can just not emit anything. + if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { + return; + } + } + // Check for redundant moves + if (optimizer.optimize_for_method_move && + opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + if (next_opcode.has_value()) { + const auto next = *next_opcode; + if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + return; + } + } + } + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, RESULT); + auto src = Compile_GetRegister(opcode.src_b, eax); + + if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) { + shr(src, opcode.bf_src_bit); + } else if (opcode.bf_src_bit == 31) { + xor_(src, src); + } + // Don't bother masking the whole register since we're using a 32 bit register + if (opcode.bf_size != 31 && opcode.bf_size != 0) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + + const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); + if (mask != 0xffffffff) { + and_(dst, mask); + } + or_(dst, src); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, eax); + auto src = Compile_GetRegister(opcode.src_b, RESULT); + + shr(src, al); + if (opcode.bf_size != 0 && opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + + if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, eax); + auto src = Compile_GetRegister(opcode.src_b, RESULT); + + if (opcode.bf_src_bit != 0) { + shr(src, opcode.bf_src_bit); + } + + if (opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } + shl(src, al); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) { + return maxwell3d->GetRegisterValue(method); +} + +static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { + maxwell3d->CallMethodFromMME(method_address.address, value); +} + +void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2, RESULT); + Common::X64::CallFarFunction(*this, &Read); + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(RESULT, Common::X64::ABI_RETURN.cvt32()); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { + Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); + mov(Common::X64::ABI_PARAM3, value); + Common::X64::CallFarFunction(*this, &Send); + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + + Xbyak::Label dont_process{}; + // Get increment + test(METHOD_ADDRESS, 0x3f000); + // If zero, method address doesn't update + je(dont_process); + + mov(ecx, METHOD_ADDRESS); + and_(METHOD_ADDRESS, 0xfff); + shr(ecx, 12); + and_(ecx, 0x3f); + lea(eax, ptr[rcx + METHOD_ADDRESS_64]); + sal(ecx, 12); + or_(eax, ecx); + + mov(METHOD_ADDRESS, eax); + + L(dont_process); +} + +void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { + ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); + const s32 jump_address = + static_cast(pc) + static_cast(opcode.GetBranchTarget() / sizeof(s32)); + + Xbyak::Label end; + auto value = Compile_GetRegister(opcode.src_a, eax); + test(value, value); + if (optimizer.has_delayed_pc) { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + jne(end, T_NEAR); + break; + case Macro::BranchCondition::NotZero: + je(end, T_NEAR); + break; + } + + if (opcode.branch_annul) { + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } else { + Xbyak::Label handle_post_exit{}; + Xbyak::Label skip{}; + jmp(skip, T_NEAR); + if (opcode.is_exit) { + L(handle_post_exit); + // Execute 1 instruction + mov(BRANCH_HOLDER, end_of_code); + // Jump to next instruction to skip delay slot check + jmp(labels[jump_address], T_NEAR); + } else { + L(handle_post_exit); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } + L(skip); + mov(BRANCH_HOLDER, handle_post_exit); + jmp(delay_skip[pc], T_NEAR); + } + } else { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + je(labels[jump_address], T_NEAR); + break; + case Macro::BranchCondition::NotZero: + jne(labels[jump_address], T_NEAR); + break; + } + } + + L(end); +} + +void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() { + optimizer.can_skip_carry = true; + optimizer.has_delayed_pc = false; + for (auto raw_op : code) { + Macro::Opcode op{}; + op.raw = raw_op; + + if (op.operation == Macro::Operation::ALU) { + // Scan for any ALU operations which actually use the carry flag, if they don't exist in + // our current code we can skip emitting the carry flag handling operations + if (op.alu_operation == Macro::ALUOperation::AddWithCarry || + op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { + optimizer.can_skip_carry = false; + } + } + + if (op.operation == Macro::Operation::Branch) { + if (!op.branch_annul) { + optimizer.has_delayed_pc = true; + } + } + } +} + +void MacroJITx64Impl::Compile() { + MICROPROFILE_SCOPE(MacroJitCompile); + bool keep_executing = true; + labels.fill(Xbyak::Label()); + + Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + // JIT state + mov(STATE, Common::X64::ABI_PARAM1); + mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 + + static_cast(offsetof(JITState, parameters))]); + mov(REGISTERS, Common::X64::ABI_PARAM1); + add(REGISTERS, static_cast(offsetof(JITState, registers))); + xor_(RESULT, RESULT); + xor_(METHOD_ADDRESS, METHOD_ADDRESS); + xor_(NEXT_PARAMETER, NEXT_PARAMETER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + + mov(dword[REGISTERS + 4], Compile_FetchParameter()); + + // Track get register for zero registers and mark it as no-op + optimizer.zero_reg_skip = true; + + // AddImmediate tends to be used as a NOP instruction, if we detect this we can + // completely skip the entire code path and no emit anything + optimizer.skip_dummy_addimmediate = true; + + // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting + // one if our register isn't "dirty" + optimizer.optimize_for_method_move = true; + + // Check to see if we can skip emitting certain instructions + Optimizer_ScanFlags(); + + const u32 op_count = static_cast(code.size()); + for (u32 i = 0; i < op_count; i++) { + if (i < op_count - 1) { + pc = i + 1; + next_opcode = GetOpCode(); + } else { + next_opcode = {}; + } + pc = i; + Compile_NextInstruction(); + } + + L(end_of_code); + + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + ret(); + ready(); + program = getCode(); +} + +bool MacroJITx64Impl::Compile_NextInstruction() { + const auto opcode = GetOpCode(); + if (labels[pc].getAddress()) { + return false; + } + + L(labels[pc]); + + const std::size_t op = static_cast(opcode.operation.Value()); + + if (InstructionTable[op] == nullptr) { + UNIMPLEMENTED_MSG("Unimplemented opcode {}", op); + } else { + ((*this).*InstructionTable[op])(opcode); + } + + if (optimizer.has_delayed_pc) { + if (opcode.is_exit) { + mov(rax, end_of_code); + test(BRANCH_HOLDER, BRANCH_HOLDER); + cmove(BRANCH_HOLDER, rax); + // Jump to next instruction to skip delay slot check + je(labels[pc + 1], T_NEAR); + } else { + // TODO(ogniK): Optimize delay slot branching + Xbyak::Label no_delay_slot{}; + test(BRANCH_HOLDER, BRANCH_HOLDER); + je(no_delay_slot, T_NEAR); + mov(rax, BRANCH_HOLDER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(rax); + L(no_delay_slot); + } + L(delay_skip[pc]); + if (opcode.is_exit) { + return false; + } + } else { + test(BRANCH_HOLDER, BRANCH_HOLDER); + jne(end_of_code, T_NEAR); + if (opcode.is_exit) { + inc(BRANCH_HOLDER); + return false; + } + } + return true; +} + +Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { + mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]); + inc(NEXT_PARAMETER); + return eax; +} + +Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[REGISTERS + index * sizeof(u32)]); + } + + return dst; +} + +Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[REGISTERS + index * sizeof(u32)]); + } + + return dst; +} + +void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) { + Xbyak::Label zero{}, end{}; + xor_(ecx, ecx); + shr(dst, 32); + setne(cl); + mov(dword[STATE + offsetof(JITState, carry_flag)], ecx); +} + +void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { + auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { + // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero + // register. + if (reg == 0) { + return; + } + mov(dword[REGISTERS + reg * sizeof(u32)], result); + }; + auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; + + switch (operation) { + case Macro::ResultOperation::IgnoreAndFetch: + SetRegister(reg, Compile_FetchParameter()); + break; + case Macro::ResultOperation::Move: + SetRegister(reg, RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethod: + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::FetchAndSend: + // Fetch parameter and send result. + SetRegister(reg, Compile_FetchParameter()); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::MoveAndSend: + // Move and send result. + SetRegister(reg, RESULT); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::FetchAndSetMethod: + // Fetch parameter and use result as Method Address. + SetRegister(reg, Compile_FetchParameter()); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: + // Move result and use as Method Address, then fetch and send parameter. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + Compile_Send(Compile_FetchParameter()); + break; + case Macro::ResultOperation::MoveAndSetMethodSend: + // Move result and use as Method Address, then send bits 12:17 of result. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + shr(RESULT, 12); + and_(RESULT, 0b111111); + Compile_Send(RESULT); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast(operation)); + } +} + +Macro::Opcode MacroJITx64Impl::GetOpCode() const { + ASSERT(pc < code.size()); + return {code[pc]}; +} + +std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const { + return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h new file mode 100644 index 000000000..71cd6a3b0 --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.h @@ -0,0 +1,98 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include "common/bit_field.h" +#include "common/common_types.h" +#include "common/x64/xbyak_abi.h" +#include "video_core/macro/macro.h" + +namespace Tegra { +namespace Engines { +class Maxwell3D; +} + +/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games +constexpr size_t MAX_CODE_SIZE = 0x10000; + +class MacroJITx64 final : public MacroEngine { +public: + explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); + +protected: + std::unique_ptr Compile(const std::vector& code) override; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { +public: + MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector& code); + ~MacroJITx64Impl(); + void Execute(std::vector& parameters, u32 method) override; + + void Compile_ALU(Macro::Opcode opcode); + void Compile_AddImmediate(Macro::Opcode opcode); + void Compile_ExtractInsert(Macro::Opcode opcode); + void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); + void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); + void Compile_Read(Macro::Opcode opcode); + void Compile_Branch(Macro::Opcode opcode); + +private: + void Optimizer_ScanFlags(); + + void Compile(); + bool Compile_NextInstruction(); + + Xbyak::Reg32 Compile_FetchParameter(); + Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); + Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst); + void Compile_WriteCarry(Xbyak::Reg64 dst); + + void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); + void Compile_Send(Xbyak::Reg32 value); + + Macro::Opcode GetOpCode() const; + std::bitset<32> PersistentCallerSavedRegs() const; + + struct JITState { + Engines::Maxwell3D* maxwell3d{}; + std::array registers{}; + u32* parameters{}; + u32 carry_flag{}; + }; + static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); + using ProgramType = void (*)(JITState*); + + struct OptimizerState { + bool can_skip_carry{}; + bool has_delayed_pc{}; + bool zero_reg_skip{}; + bool skip_dummy_addimmediate{}; + bool optimize_for_method_move{}; + }; + OptimizerState optimizer{}; + + std::optional next_opcode{}; + ProgramType program{nullptr}; + + std::array labels; + std::array delay_skip{}; + Xbyak::Label end_of_code{}; + + bool is_delay_slot{}; + u32 pc{}; + std::optional delayed_pc; + + const std::vector& code; + Engines::Maxwell3D& maxwell3d; +}; + +} // namespace Tegra diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp deleted file mode 100644 index 947364928..000000000 --- a/src/video_core/macro_interpreter.cpp +++ /dev/null @@ -1,352 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "common/assert.h" -#include "common/logging/log.h" -#include "common/microprofile.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/macro_interpreter.h" - -MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); - -namespace Tegra { -namespace { -enum class Operation : u32 { - ALU = 0, - AddImmediate = 1, - ExtractInsert = 2, - ExtractShiftLeftImmediate = 3, - ExtractShiftLeftRegister = 4, - Read = 5, - Unused = 6, // This operation doesn't seem to be a valid encoding. - Branch = 7, -}; -} // Anonymous namespace - -enum class MacroInterpreter::ALUOperation : u32 { - Add = 0, - AddWithCarry = 1, - Subtract = 2, - SubtractWithBorrow = 3, - // Operations 4-7 don't seem to be valid encodings. - Xor = 8, - Or = 9, - And = 10, - AndNot = 11, - Nand = 12 -}; - -enum class MacroInterpreter::ResultOperation : u32 { - IgnoreAndFetch = 0, - Move = 1, - MoveAndSetMethod = 2, - FetchAndSend = 3, - MoveAndSend = 4, - FetchAndSetMethod = 5, - MoveAndSetMethodFetchAndSend = 6, - MoveAndSetMethodSend = 7 -}; - -enum class MacroInterpreter::BranchCondition : u32 { - Zero = 0, - NotZero = 1, -}; - -union MacroInterpreter::Opcode { - u32 raw; - BitField<0, 3, Operation> operation; - BitField<4, 3, ResultOperation> result_operation; - BitField<4, 1, BranchCondition> branch_condition; - // If set on a branch, then the branch doesn't have a delay slot. - BitField<5, 1, u32> branch_annul; - BitField<7, 1, u32> is_exit; - BitField<8, 3, u32> dst; - BitField<11, 3, u32> src_a; - BitField<14, 3, u32> src_b; - // The signed immediate overlaps the second source operand and the alu operation. - BitField<14, 18, s32> immediate; - - BitField<17, 5, ALUOperation> alu_operation; - - // Bitfield instructions data - BitField<17, 5, u32> bf_src_bit; - BitField<22, 5, u32> bf_size; - BitField<27, 5, u32> bf_dst_bit; - - u32 GetBitfieldMask() const { - return (1 << bf_size) - 1; - } - - s32 GetBranchTarget() const { - return static_cast(immediate * sizeof(u32)); - } -}; - -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} - -void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) { - MICROPROFILE_SCOPE(MacroInterp); - Reset(); - - registers[1] = parameters[0]; - - if (num_parameters > parameters_capacity) { - parameters_capacity = num_parameters; - this->parameters = std::make_unique(num_parameters); - } - std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32)); - this->num_parameters = num_parameters; - - // Execute the code until we hit an exit condition. - bool keep_executing = true; - while (keep_executing) { - keep_executing = Step(offset, false); - } - - // Assert the the macro used all the input parameters - ASSERT(next_parameter_index == num_parameters); -} - -void MacroInterpreter::Reset() { - registers = {}; - pc = 0; - delayed_pc = {}; - method_address.raw = 0; - num_parameters = 0; - // The next parameter index starts at 1, because $r1 already has the value of the first - // parameter. - next_parameter_index = 1; - carry_flag = false; -} - -bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { - u32 base_address = pc; - - Opcode opcode = GetOpcode(offset); - pc += 4; - - // Update the program counter if we were delayed - if (delayed_pc) { - ASSERT(is_delay_slot); - pc = *delayed_pc; - delayed_pc = {}; - } - - switch (opcode.operation) { - case Operation::ALU: { - u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), - GetRegister(opcode.src_b)); - ProcessResult(opcode.result_operation, opcode.dst, result); - break; - } - case Operation::AddImmediate: { - ProcessResult(opcode.result_operation, opcode.dst, - GetRegister(opcode.src_a) + opcode.immediate); - break; - } - case Operation::ExtractInsert: { - u32 dst = GetRegister(opcode.src_a); - u32 src = GetRegister(opcode.src_b); - - src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask(); - dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); - dst |= src << opcode.bf_dst_bit; - ProcessResult(opcode.result_operation, opcode.dst, dst); - break; - } - case Operation::ExtractShiftLeftImmediate: { - u32 dst = GetRegister(opcode.src_a); - u32 src = GetRegister(opcode.src_b); - - u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit; - - ProcessResult(opcode.result_operation, opcode.dst, result); - break; - } - case Operation::ExtractShiftLeftRegister: { - u32 dst = GetRegister(opcode.src_a); - u32 src = GetRegister(opcode.src_b); - - u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst; - - ProcessResult(opcode.result_operation, opcode.dst, result); - break; - } - case Operation::Read: { - u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); - ProcessResult(opcode.result_operation, opcode.dst, result); - break; - } - case Operation::Branch: { - ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); - u32 value = GetRegister(opcode.src_a); - bool taken = EvaluateBranchCondition(opcode.branch_condition, value); - if (taken) { - // Ignore the delay slot if the branch has the annul bit. - if (opcode.branch_annul) { - pc = base_address + opcode.GetBranchTarget(); - return true; - } - - delayed_pc = base_address + opcode.GetBranchTarget(); - // Execute one more instruction due to the delay slot. - return Step(offset, true); - } - break; - } - default: - UNIMPLEMENTED_MSG("Unimplemented macro operation {}", - static_cast(opcode.operation.Value())); - } - - // An instruction with the Exit flag will not actually - // cause an exit if it's executed inside a delay slot. - if (opcode.is_exit && !is_delay_slot) { - // Exit has a delay slot, execute the next instruction - Step(offset, true); - return false; - } - - return true; -} - -MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const { - const auto& macro_memory{maxwell3d.GetMacroMemory()}; - ASSERT((pc % sizeof(u32)) == 0); - ASSERT((pc + offset) < macro_memory.size() * sizeof(u32)); - return {macro_memory[offset + pc / sizeof(u32)]}; -} - -u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) { - switch (operation) { - case ALUOperation::Add: { - const u64 result{static_cast(src_a) + src_b}; - carry_flag = result > 0xffffffff; - return static_cast(result); - } - case ALUOperation::AddWithCarry: { - const u64 result{static_cast(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; - carry_flag = result > 0xffffffff; - return static_cast(result); - } - case ALUOperation::Subtract: { - const u64 result{static_cast(src_a) - src_b}; - carry_flag = result < 0x100000000; - return static_cast(result); - } - case ALUOperation::SubtractWithBorrow: { - const u64 result{static_cast(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; - carry_flag = result < 0x100000000; - return static_cast(result); - } - case ALUOperation::Xor: - return src_a ^ src_b; - case ALUOperation::Or: - return src_a | src_b; - case ALUOperation::And: - return src_a & src_b; - case ALUOperation::AndNot: - return src_a & ~src_b; - case ALUOperation::Nand: - return ~(src_a & src_b); - - default: - UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", static_cast(operation)); - return 0; - } -} - -void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) { - switch (operation) { - case ResultOperation::IgnoreAndFetch: - // Fetch parameter and ignore result. - SetRegister(reg, FetchParameter()); - break; - case ResultOperation::Move: - // Move result. - SetRegister(reg, result); - break; - case ResultOperation::MoveAndSetMethod: - // Move result and use as Method Address. - SetRegister(reg, result); - SetMethodAddress(result); - break; - case ResultOperation::FetchAndSend: - // Fetch parameter and send result. - SetRegister(reg, FetchParameter()); - Send(result); - break; - case ResultOperation::MoveAndSend: - // Move and send result. - SetRegister(reg, result); - Send(result); - break; - case ResultOperation::FetchAndSetMethod: - // Fetch parameter and use result as Method Address. - SetRegister(reg, FetchParameter()); - SetMethodAddress(result); - break; - case ResultOperation::MoveAndSetMethodFetchAndSend: - // Move result and use as Method Address, then fetch and send parameter. - SetRegister(reg, result); - SetMethodAddress(result); - Send(FetchParameter()); - break; - case ResultOperation::MoveAndSetMethodSend: - // Move result and use as Method Address, then send bits 12:17 of result. - SetRegister(reg, result); - SetMethodAddress(result); - Send((result >> 12) & 0b111111); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented result operation {}", static_cast(operation)); - } -} - -u32 MacroInterpreter::FetchParameter() { - ASSERT(next_parameter_index < num_parameters); - return parameters[next_parameter_index++]; -} - -u32 MacroInterpreter::GetRegister(u32 register_id) const { - return registers.at(register_id); -} - -void MacroInterpreter::SetRegister(u32 register_id, u32 value) { - // Register 0 is hardwired as the zero register. - // Ensure no writes to it actually occur. - if (register_id == 0) { - return; - } - - registers.at(register_id) = value; -} - -void MacroInterpreter::SetMethodAddress(u32 address) { - method_address.raw = address; -} - -void MacroInterpreter::Send(u32 value) { - maxwell3d.CallMethodFromMME(method_address.address, value); - // Increment the method address by the method increment. - method_address.address.Assign(method_address.address.Value() + - method_address.increment.Value()); -} - -u32 MacroInterpreter::Read(u32 method) const { - return maxwell3d.GetRegisterValue(method); -} - -bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const { - switch (cond) { - case BranchCondition::Zero: - return value == 0; - case BranchCondition::NotZero: - return value != 0; - } - UNREACHABLE(); - return true; -} - -} // namespace Tegra diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro_interpreter.h deleted file mode 100644 index 631146d89..000000000 --- a/src/video_core/macro_interpreter.h +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include -#include - -#include "common/bit_field.h" -#include "common/common_types.h" - -namespace Tegra { -namespace Engines { -class Maxwell3D; -} - -class MacroInterpreter final { -public: - explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); - - /** - * Executes the macro code with the specified input parameters. - * @param offset Offset to start execution at. - * @param parameters The parameters of the macro. - */ - void Execute(u32 offset, std::size_t num_parameters, const u32* parameters); - -private: - enum class ALUOperation : u32; - enum class BranchCondition : u32; - enum class ResultOperation : u32; - - union Opcode; - - union MethodAddress { - u32 raw; - BitField<0, 12, u32> address; - BitField<12, 6, u32> increment; - }; - - /// Resets the execution engine state, zeroing registers, etc. - void Reset(); - - /** - * Executes a single macro instruction located at the current program counter. Returns whether - * the interpreter should keep running. - * @param offset Offset to start execution at. - * @param is_delay_slot Whether the current step is being executed due to a delay slot in a - * previous instruction. - */ - bool Step(u32 offset, bool is_delay_slot); - - /// Calculates the result of an ALU operation. src_a OP src_b; - u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b); - - /// Performs the result operation on the input result and stores it in the specified register - /// (if necessary). - void ProcessResult(ResultOperation operation, u32 reg, u32 result); - - /// Evaluates the branch condition and returns whether the branch should be taken or not. - bool EvaluateBranchCondition(BranchCondition cond, u32 value) const; - - /// Reads an opcode at the current program counter location. - Opcode GetOpcode(u32 offset) const; - - /// Returns the specified register's value. Register 0 is hardcoded to always return 0. - u32 GetRegister(u32 register_id) const; - - /// Sets the register to the input value. - void SetRegister(u32 register_id, u32 value); - - /// Sets the method address to use for the next Send instruction. - void SetMethodAddress(u32 address); - - /// Calls a GPU Engine method with the input parameter. - void Send(u32 value); - - /// Reads a GPU register located at the method address. - u32 Read(u32 method) const; - - /// Returns the next parameter in the parameter queue. - u32 FetchParameter(); - - Engines::Maxwell3D& maxwell3d; - - /// Current program counter - u32 pc; - /// Program counter to execute at after the delay slot is executed. - std::optional delayed_pc; - - static constexpr std::size_t NumMacroRegisters = 8; - - /// General purpose macro registers. - std::array registers = {}; - - /// Method address to use for the next Send instruction. - MethodAddress method_address = {}; - - /// Input parameters of the current macro. - std::unique_ptr parameters; - std::size_t num_parameters = 0; - std::size_t parameters_capacity = 0; - /// Index of the next parameter that will be fetched by the 'parm' instruction. - u32 next_parameter_index = 0; - - bool carry_flag = false; -}; -} // namespace Tegra diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index b08b87426..7e9073cc3 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() { Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool(); Settings::values.disable_cpu_opt = ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool(); + Settings::values.disable_macro_jit = + ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool(); qt_config->endGroup(); } @@ -1011,6 +1013,7 @@ void Config::SaveDebuggingValues() { WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false); WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false); WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false); + WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false); qt_config->endGroup(); } diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp index c2026763e..2c77441fd 100644 --- a/src/yuzu/configuration/configure_debug.cpp +++ b/src/yuzu/configuration/configure_debug.cpp @@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() { ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt); ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn()); ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug); + ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn()); + ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit); } void ConfigureDebug::ApplyConfiguration() { @@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() { Settings::values.quest_flag = ui->quest_flag->isChecked(); Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked(); Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked(); + Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked(); Debugger::ToggleConsole(); Log::Filter filter; filter.ParseFilterString(Settings::values.log_filter); diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui index e0d4c4a44..46f0208c6 100644 --- a/src/yuzu/configuration/configure_debug.ui +++ b/src/yuzu/configuration/configure_debug.ui @@ -148,6 +148,19 @@ + + + + true + + + When checked, it disables the macro Just In Time compiler. Enabled this makes games run slower + + + Disable Macro JIT + + + diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index c20d48c42..7240270f5 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -432,6 +432,8 @@ void Config::ReadValues() { Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false); Settings::values.disable_cpu_opt = sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false); + Settings::values.disable_macro_jit = + sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false); const auto title_list = sdl2_config->Get("AddOns", "title_ids", ""); std::stringstream ss(title_list); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index abc6e6e65..6f53e9659 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -291,6 +291,8 @@ quest_flag = # Determines whether or not JIT CPU optimizations are enabled # false: Optimizations Enabled, true: Optimizations Disabled disable_cpu_opt = +# Enables/Disables the macro JIT compiler +disable_macro_jit=false [WebService] # Whether or not to enable telemetry -- cgit v1.2.3 From 3a20e74f4056a03253e563510048e99ea548d5b4 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Tue, 2 Jun 2020 16:37:06 +1000 Subject: Pass by reference instead of copying parameters --- src/video_core/engines/maxwell_3d.cpp | 10 ++++++---- src/video_core/engines/maxwell_3d.h | 2 +- src/video_core/macro/macro.cpp | 2 +- src/video_core/macro/macro.h | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'src/video_core/engines') diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 934a1d6db..0c0a7c1ed 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -115,7 +115,7 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::vector&& parameters) { +void Maxwell3D::CallMacroMethod(u32 method, std::vector& parameters) { // Reset the current macro. executing_macro = 0; @@ -124,7 +124,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector&& parameters) { ((method - MacroRegistersStart) >> 1) % static_cast(macro_positions.size()); // Execute the current macro. - macro_engine->Execute(macro_positions[entry], std::move(parameters)); + macro_engine->Execute(macro_positions[entry], parameters); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } @@ -160,7 +160,8 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { // Call the macro when there are no more parameters in the command buffer if (is_last_call) { - CallMacroMethod(executing_macro, std::move(macro_params)); + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); } return; } @@ -304,7 +305,8 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, // Call the macro when there are no more parameters in the command buffer if (amount == methods_pending) { - CallMacroMethod(executing_macro, std::move(macro_params)); + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); } return; } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 077bc9841..651f37b4d 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1495,7 +1495,7 @@ private: * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::vector&& parameters); + void CallMacroMethod(u32 method, std::vector& parameters); /// Handles writes to the macro uploading register. void ProcessMacroUpload(u32 data); diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index 85a6e5dd4..d05a88479 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -15,7 +15,7 @@ void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); } -void MacroEngine::Execute(u32 method, std::vector parameters) { +void MacroEngine::Execute(u32 method, std::vector& parameters) { auto compiled_macro = macro_cache.find(method); if (compiled_macro != macro_cache.end()) { compiled_macro->second->Execute(parameters, method); diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index 28ca243d1..49fc4d6bc 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -113,7 +113,7 @@ public: void AddCode(u32 method, u32 data); // Compiles the macro if its not in the cache, and executes the compiled macro - void Execute(u32 method, std::vector parameters); + void Execute(u32 method, std::vector& parameters); protected: virtual std::unique_ptr Compile(const std::vector& code) = 0; -- cgit v1.2.3 From 411f5527d41ba5c4f09b914b4fb4df0c6493f744 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Wed, 3 Jun 2020 16:33:38 +1000 Subject: Mark parameters as const --- src/video_core/engines/maxwell_3d.cpp | 2 +- src/video_core/engines/maxwell_3d.h | 3 +-- src/video_core/macro/macro.cpp | 2 +- src/video_core/macro/macro.h | 4 ++-- src/video_core/macro/macro_interpreter.cpp | 2 +- src/video_core/macro/macro_interpreter.h | 2 +- src/video_core/macro/macro_jit_x64.cpp | 2 +- src/video_core/macro/macro_jit_x64.h | 5 +++-- 8 files changed, 11 insertions(+), 11 deletions(-) (limited to 'src/video_core/engines') diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 0c0a7c1ed..14ad40250 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -115,7 +115,7 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::vector& parameters) { +void Maxwell3D::CallMacroMethod(u32 method, const std::vector& parameters) { // Reset the current macro. executing_macro = 0; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 651f37b4d..b827b112f 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1466,7 +1466,6 @@ private: /// Interpreter for the macro codes uploaded to the GPU. std::unique_ptr macro_engine; - // MacroInterpreter macro_interpreter; static constexpr u32 null_cb_data = 0xFFFFFFFF; struct { @@ -1495,7 +1494,7 @@ private: * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::vector& parameters); + void CallMacroMethod(u32 method, const std::vector& parameters); /// Handles writes to the macro uploading register. void ProcessMacroUpload(u32 data); diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index d05a88479..89077a2d8 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -15,7 +15,7 @@ void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); } -void MacroEngine::Execute(u32 method, std::vector& parameters) { +void MacroEngine::Execute(u32 method, const std::vector& parameters) { auto compiled_macro = macro_cache.find(method); if (compiled_macro != macro_cache.end()) { compiled_macro->second->Execute(parameters, method); diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index 49fc4d6bc..b76ed891f 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -102,7 +102,7 @@ public: * @param code The macro byte code to execute * @param parameters The parameters of the macro */ - virtual void Execute(std::vector& parameters, u32 method) = 0; + virtual void Execute(const std::vector& parameters, u32 method) = 0; }; class MacroEngine { @@ -113,7 +113,7 @@ public: void AddCode(u32 method, u32 data); // Compiles the macro if its not in the cache, and executes the compiled macro - void Execute(u32 method, std::vector& parameters); + void Execute(u32 method, const std::vector& parameters); protected: virtual std::unique_ptr Compile(const std::vector& code) = 0; diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index e63296a21..5edff27aa 100644 --- a/src/video_core/macro/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -21,7 +21,7 @@ MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector& code) : maxwell3d(maxwell3d), code(code) {} -void MacroInterpreterImpl::Execute(std::vector& parameters, u32 method) { +void MacroInterpreterImpl::Execute(const std::vector& parameters, u32 method) { MICROPROFILE_SCOPE(MacroInterp); Reset(); diff --git a/src/video_core/macro/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h index fb923f7b9..90217fc89 100644 --- a/src/video_core/macro/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -29,7 +29,7 @@ private: class MacroInterpreterImpl : public CachedMacro { public: MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector& code); - void Execute(std::vector& parameters, u32 method) override; + void Execute(const std::vector& parameters, u32 method) override; private: /// Resets the execution engine state, zeroing registers, etc. diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 48501e582..11c1cc3be 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -47,7 +47,7 @@ MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vecto MacroJITx64Impl::~MacroJITx64Impl() = default; -void MacroJITx64Impl::Execute(std::vector& parameters, u32 method) { +void MacroJITx64Impl::Execute(const std::vector& parameters, u32 method) { MICROPROFILE_SCOPE(MacroJitExecute); ASSERT_OR_EXECUTE(program != nullptr, { return; }); JITState state{}; diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 729ed7713..6152cb501 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -13,6 +13,7 @@ #include "video_core/macro/macro.h" namespace Tegra { + namespace Engines { class Maxwell3D; } @@ -36,7 +37,7 @@ public: MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector& code); ~MacroJITx64Impl(); - void Execute(std::vector& parameters, u32 method) override; + void Execute(const std::vector& parameters, u32 method) override; void Compile_ALU(Macro::Opcode opcode); void Compile_AddImmediate(Macro::Opcode opcode); @@ -66,7 +67,7 @@ private: struct JITState { Engines::Maxwell3D* maxwell3d{}; std::array registers{}; - u32* parameters{}; + const u32* parameters{}; u32 carry_flag{}; }; static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); -- cgit v1.2.3 From eca3d16e548451f403cc4cc2e9fb38ed96693102 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Thu, 4 Jun 2020 22:23:07 +1000 Subject: Default init labels and use initializer list for macro engine --- src/video_core/engines/maxwell_3d.cpp | 2 +- src/video_core/macro/macro_jit_x64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src/video_core/engines') diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 14ad40250..951016c3e 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -25,7 +25,7 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, - macro_engine(GetMacroEngine(*this)), upload_state{memory_manager, regs.upload} { + macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} { dirty.flags.flip(); InitializeRegisterDefaults(); } diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 6152cb501..21ee157cf 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -85,7 +85,7 @@ private: std::optional next_opcode{}; ProgramType program{nullptr}; - std::array labels; + std::array labels{}; std::array delay_skip{}; Xbyak::Label end_of_code{}; -- cgit v1.2.3 From 5b2b6d594c6cfa77c3fb92faee63ad524bfe7204 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 4 Jun 2020 23:03:49 -0300 Subject: shader/texture: Join separate image and sampler pairs offline Games using D3D idioms can join images and samplers when a shader executes, instead of baking them into a combined sampler image. This is also possible on Vulkan. One approach to this solution would be to use separate samplers on Vulkan and leave this unimplemented on OpenGL, but we can't do this because there's no consistent way of determining which constant buffer holds a sampler and which one an image. We could in theory find the first bit and if it's in the TIC area, it's an image; but this falls apart when an image or sampler handle use an index of zero. The used approach is to track for a LOP.OR operation (this is done at an IR level, not at an ISA level), track again the constant buffers used as source and store this pair. Then, outside of shader execution, join the sample and image pair with a bitwise or operation. This approach won't work on games that truly use separate samplers in a meaningful way. For example, pooling textures in a 2D array and determining at runtime what sampler to use. This invalidates OpenGL's disk shader cache :) - Used mostly by D3D ports to Switch --- .../engines/const_buffer_engine_interface.h | 1 + src/video_core/engines/kepler_compute.cpp | 5 +- src/video_core/engines/kepler_compute.h | 2 + src/video_core/engines/maxwell_3d.cpp | 5 +- src/video_core/engines/maxwell_3d.h | 2 + src/video_core/renderer_opengl/gl_rasterizer.cpp | 16 ++++- .../renderer_opengl/gl_shader_disk_cache.cpp | 64 +++++++++++++----- .../renderer_opengl/gl_shader_disk_cache.h | 1 + src/video_core/renderer_vulkan/vk_rasterizer.cpp | 11 ++++ src/video_core/shader/decode/texture.cpp | 55 ++++++++++------ src/video_core/shader/node.h | 75 +++++++++------------- src/video_core/shader/node_helper.h | 2 +- src/video_core/shader/registry.cpp | 20 ++++++ src/video_core/shader/registry.h | 35 ++++++++++ src/video_core/shader/shader_ir.h | 4 +- src/video_core/shader/track.cpp | 24 +++++-- 16 files changed, 234 insertions(+), 88 deletions(-) (limited to 'src/video_core/engines') diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index ebe139504..f46e81bb7 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -93,6 +93,7 @@ public: virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0; virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const = 0; + virtual SamplerDescriptor AccessSampler(u32 handle) const = 0; virtual u32 GetBoundBuffer() const = 0; virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index f6237fc6a..a82b06a38 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con ASSERT(stage == ShaderType::Compute); const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; + return AccessSampler(memory_manager.Read(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read(tex_info_address)}; +SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 18ceedfaf..b7f668d88 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -219,6 +219,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 13ef2e42d..d539ae7b1 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -743,8 +743,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b const auto& shader = state.shader_stages[static_cast(stage)]; const auto& tex_info_buffer = shader.const_buffers[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.address + offset; + return AccessSampler(memory_manager.Read(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read(tex_info_address)}; +SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 05dd6b39b..c5f1b0499 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1403,6 +1403,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 55e79aaf6..aedcdcb78 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -65,10 +65,22 @@ constexpr std::size_t NumSupportedVertexAttributes = 16; template Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, ShaderType shader_type, std::size_t index = 0) { + if constexpr (std::is_same_v) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } if (entry.is_bindless) { - const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); - return engine.GetTextureInfo(tex_handle); + const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return engine.GetTextureInfo(handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); const u32 offset = entry.offset + static_cast(index * gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v) { diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 9e95a122b..653c3f2f9 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap; namespace { +using VideoCommon::Shader::SeparateSamplerKey; + using ShaderCacheVersionHash = std::array; struct ConstBufferKey { @@ -37,18 +39,26 @@ struct ConstBufferKey { u32 value = 0; }; -struct BoundSamplerKey { +struct BoundSamplerEntry { u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -struct BindlessSamplerKey { +struct SeparateSamplerEntry { + u32 cbuf1 = 0; + u32 cbuf2 = 0; + u32 offset1 = 0; + u32 offset2 = 0; + Tegra::Engines::SamplerDescriptor sampler; +}; + +struct BindlessSamplerEntry { u32 cbuf = 0; u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 20; +constexpr u32 NativeVersion = 21; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { u32 texture_handler_size_value; u32 num_keys; u32 num_bound_samplers; + u32 num_separate_samplers; u32 num_bindless_samplers; if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || file.ReadArray(&is_texture_handler_size_known, 1) != 1 || file.ReadArray(&texture_handler_size_value, 1) != 1 || file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_separate_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { return false; } @@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { } std::vector flat_keys(num_keys); - std::vector flat_bound_samplers(num_bound_samplers); - std::vector flat_bindless_samplers(num_bindless_samplers); + std::vector flat_bound_samplers(num_bound_samplers); + std::vector flat_separate_samplers(num_separate_samplers); + std::vector flat_bindless_samplers(num_bindless_samplers); if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != flat_bound_samplers.size() || + file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) != + flat_separate_samplers.size() || file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != flat_bindless_samplers.size()) { return false; } - for (const auto& key : flat_keys) { - keys.insert({{key.cbuf, key.offset}, key.value}); + for (const auto& entry : flat_keys) { + keys.insert({{entry.cbuf, entry.offset}, entry.value}); } - for (const auto& key : flat_bound_samplers) { - bound_samplers.emplace(key.offset, key.sampler); + for (const auto& entry : flat_bound_samplers) { + bound_samplers.emplace(entry.offset, entry.sampler); } - for (const auto& key : flat_bindless_samplers) { - bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + for (const auto& entry : flat_separate_samplers) { + SeparateSamplerKey key; + key.buffers = {entry.cbuf1, entry.cbuf2}; + key.offsets = {entry.offset1, entry.offset2}; + separate_samplers.emplace(key, entry.sampler); + } + for (const auto& entry : flat_bindless_samplers) { + bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler}); } return true; @@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || file.WriteObject(static_cast(keys.size())) != 1 || file.WriteObject(static_cast(bound_samplers.size())) != 1 || + file.WriteObject(static_cast(separate_samplers.size())) != 1 || file.WriteObject(static_cast(bindless_samplers.size())) != 1) { return false; } @@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); } - std::vector flat_bound_samplers; + std::vector flat_bound_samplers; flat_bound_samplers.reserve(bound_samplers.size()); for (const auto& [address, sampler] : bound_samplers) { - flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler}); + } + + std::vector flat_separate_samplers; + flat_separate_samplers.reserve(separate_samplers.size()); + for (const auto& [key, sampler] : separate_samplers) { + SeparateSamplerEntry entry; + std::tie(entry.cbuf1, entry.cbuf2) = key.buffers; + std::tie(entry.offset1, entry.offset2) = key.offsets; + entry.sampler = sampler; + flat_separate_samplers.push_back(entry); } - std::vector flat_bindless_samplers; + std::vector flat_bindless_samplers; flat_bindless_samplers.reserve(bindless_samplers.size()); for (const auto& [address, sampler] : bindless_samplers) { flat_bindless_samplers.push_back( - BindlessSamplerKey{address.first, address.second, sampler}); + BindlessSamplerEntry{address.first, address.second, sampler}); } return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == flat_bound_samplers.size() && + file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) == + flat_separate_samplers.size() && file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == flat_bindless_samplers.size(); } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index d5be52e40..a79cef0e9 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry { VideoCommon::Shader::ComputeInfo compute_info; VideoCommon::Shader::KeyMap keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::SeparateSamplerMap separate_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index a3d992ed3..42a20530d 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -117,6 +117,17 @@ template Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, std::size_t stage, std::size_t index = 0) { const auto stage_type = static_cast(stage); + if constexpr (std::is_same_v) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } if (entry.is_bindless) { const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset); return engine.GetTextureInfo(tex_handle); diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 8f0bb996e..29ebf65ba 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { return pc; } -ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, - std::optional buffer) { +ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( + SamplerInfo info, std::optional sampler) { if (info.IsComplete()) { return info; } - const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset) - : registry.ObtainBoundSampler(offset); if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D); @@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, std::optional ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo sampler_info) { - const auto offset = static_cast(sampler.index.Value()); - const auto info = GetSamplerInfo(sampler_info, offset); + const u32 offset = static_cast(sampler.index.Value()); + const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), @@ -404,20 +402,19 @@ std::optional ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, const Node sampler_register = GetRegister(reg); const auto [base_node, tracked_sampler_info] = TrackBindlessSampler(sampler_register, global_code, static_cast(global_code.size())); - ASSERT(base_node != nullptr); - if (base_node == nullptr) { + if (!base_node) { + UNREACHABLE(); return std::nullopt; } - if (const auto bindless_sampler_info = - std::get_if(&*tracked_sampler_info)) { - const u32 buffer = bindless_sampler_info->GetIndex(); - const u32 offset = bindless_sampler_info->GetOffset(); - info = GetSamplerInfo(info, offset, buffer); + if (const auto sampler_info = std::get_if(&*tracked_sampler_info)) { + const u32 buffer = sampler_info->index; + const u32 offset = sampler_info->offset; + info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer = buffer, offset = offset](const Sampler& entry) { + [buffer, offset](const Sampler& entry) { return entry.buffer == buffer && entry.offset == offset; }); if (it != used_samplers.end()) { @@ -431,10 +428,32 @@ std::optional ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array, *info.is_shadow, *info.is_buffer, false); } - if (const auto array_sampler_info = std::get_if(&*tracked_sampler_info)) { - const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; - index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); - info = GetSamplerInfo(info, base_offset); + if (const auto sampler_info = std::get_if(&*tracked_sampler_info)) { + const std::pair indices = sampler_info->indices; + const std::pair offsets = sampler_info->offsets; + info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); + + // Try to use an already created sampler if it exists + const auto it = std::find_if( + used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { + return offsets == std::pair{entry.offset, entry.secondary_offset} && + indices == std::pair{entry.buffer, entry.secondary_buffer}; + }); + if (it != used_samplers.end()) { + ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); + return *it; + } + + // Otherwise create a new mapping for this sampler + const u32 next_index = static_cast(used_samplers.size()); + return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer); + } + if (const auto sampler_info = std::get_if(&*tracked_sampler_info)) { + const u32 base_offset = sampler_info->base_offset / 4; + index_var = GetCustomVariable(sampler_info->bindless_var); + info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if( diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index c5e5165ff..8f230d57a 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -275,10 +275,11 @@ using Node = std::shared_ptr; using Node4 = std::array; using NodeBlock = std::vector; -class BindlessSamplerNode; -class ArraySamplerNode; +struct ArraySamplerNode; +struct BindlessSamplerNode; +struct SeparateSamplerNode; -using TrackSamplerData = std::variant; +using TrackSamplerData = std::variant; using TrackSampler = std::shared_ptr; struct Sampler { @@ -288,63 +289,51 @@ struct Sampler { : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_indexed{is_indexed} {} + /// Separate sampler constructor + constexpr explicit Sampler(u32 index, std::pair offsets, std::pair buffers, + Tegra::Shader::TextureType type, bool is_array, bool is_shadow, + bool is_buffer) + : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, + buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, + is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} + /// Bindless samplers constructor constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} - u32 index = 0; ///< Emulated index given for the this sampler. - u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. - u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers). - u32 size = 1; ///< Size of the sampler. + u32 index = 0; ///< Emulated index given for the this sampler. + u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. + u32 secondary_offset = 0; ///< Secondary offset in the const buffer. + u32 buffer = 0; ///< Buffer where the bindless sampler is read. + u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read. + u32 size = 1; ///< Size of the sampler. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) - bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. - bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. - bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. - bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. - bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. + bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. + bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. + bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. + bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_separated = false; ///< Whether the image and sampler is separated or not. }; /// Represents a tracked bindless sampler into a direct const buffer -class ArraySamplerNode final { -public: - explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) - : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetBaseOffset() const { - return base_offset; - } - - constexpr u32 GetIndexVar() const { - return bindless_var; - } - -private: +struct ArraySamplerNode { u32 index; u32 base_offset; u32 bindless_var; }; -/// Represents a tracked bindless sampler into a direct const buffer -class BindlessSamplerNode final { -public: - explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } +/// Represents a tracked separate sampler image pair that was folded statically +struct SeparateSamplerNode { + std::pair indices; + std::pair offsets; +}; -private: +/// Represents a tracked bindless sampler into a direct const buffer +struct BindlessSamplerNode { u32 index; u32 offset; }; diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 11231bbea..1e0886185 100644 --- a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) { template TrackSampler MakeTrackSampler(Args&&... args) { static_assert(std::is_convertible_v); - return std::make_shared(T(std::forward(args)...)); + return std::make_shared(T{std::forward(args)...}); } template diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index af70b3f35..cdf274e54 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -93,6 +93,26 @@ std::optional Registry::ObtainBoundSampler(u32 offset) { return value; } +std::optional Registry::ObtainSeparateSampler( + std::pair buffers, std::pair offsets) { + SeparateSamplerKey key; + key.buffers = buffers; + key.offsets = offsets; + const auto iter = separate_samplers.find(key); + if (iter != separate_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + + const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first); + const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second); + const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2); + separate_samplers.emplace(key, value); + return value; +} + std::optional Registry::ObtainBindlessSampler(u32 buffer, u32 offset) { const std::pair key = {buffer, offset}; diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 0c80d35fd..231206765 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h @@ -19,8 +19,39 @@ namespace VideoCommon::Shader { +struct SeparateSamplerKey { + std::pair buffers; + std::pair offsets; +}; + +} // namespace VideoCommon::Shader + +namespace std { + +template <> +struct hash { + std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept { + return std::hash{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^ + key.offsets.second); + } +}; + +template <> +struct equal_to { + bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs, + const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept { + return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets; + } +}; + +} // namespace std + +namespace VideoCommon::Shader { + using KeyMap = std::unordered_map, u32, Common::PairHash>; using BoundSamplerMap = std::unordered_map; +using SeparateSamplerMap = + std::unordered_map; using BindlessSamplerMap = std::unordered_map, Tegra::Engines::SamplerDescriptor, Common::PairHash>; @@ -73,6 +104,9 @@ public: std::optional ObtainBoundSampler(u32 offset); + std::optional ObtainSeparateSampler( + std::pair buffers, std::pair offsets); + std::optional ObtainBindlessSampler(u32 buffer, u32 offset); /// Inserts a key. @@ -128,6 +162,7 @@ private: Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; KeyMap keys; BoundSamplerMap bound_samplers; + SeparateSamplerMap separate_samplers; BindlessSamplerMap bindless_samplers; u32 bound_buffer; GraphicsInfo graphics_info; diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 8f522edf0..3a98b2104 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -330,8 +330,8 @@ private: OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); /// Queries the missing sampler info from the execution context. - SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset, - std::optional buffer = std::nullopt); + SamplerInfo GetSamplerInfo(SamplerInfo info, + std::optional sampler); /// Accesses a texture sampler. std::optional GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index 435f4facb..d5ed81442 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -64,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { if (const auto operation = std::get_if(&*node)) { operation->SetAmendIndex(amend_index); return true; - } else if (const auto conditional = std::get_if(&*node)) { + } + if (const auto conditional = std::get_if(&*node)) { conditional->SetAmendIndex(amend_index); return true; } @@ -110,10 +111,23 @@ std::pair ShaderIR::TrackBindlessSampler(Node tracked, const return TrackBindlessSampler(source, code, new_cursor); } if (const auto operation = std::get_if(&*tracked)) { - for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { - if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); - std::get<0>(found)) { - // Cbuf found in operand. + const OperationNode& op = *operation; + + const OperationCode opcode = operation->GetCode(); + if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) { + ASSERT(op.GetOperandsCount() == 2); + auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor); + auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor); + if (node_a && node_b) { + auto track = MakeTrackSampler(std::pair{index_a, index_b}, + std::pair{offset_a, offset_b}); + return {tracked, std::move(track)}; + } + } + std::size_t i = op.GetOperandsCount(); + while (i--) { + if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) { + // Constant buffer found in operand. return found; } } -- cgit v1.2.3 From c95c254f3eda75476ad221a4828033f4140a3470 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 23:32:41 -0300 Subject: texture_cache: Implement rendering to 3D textures This allows rendering to 3D textures with more than one slice. Applications are allowed to render to more than one slice of a texture using gl_Layer from a VTG shader. This also requires reworking how 3D texture collisions are handled, for now, this commit allows rendering to slices but not to miplevels. When a render target attempts to write to a mipmap, we fallback to the previous implementation (copying or flushing as needed). - Fixes color correction 3D textures on UE4 games (rainbow effects). - Allows Xenoblade games to render to 3D textures directly. --- src/video_core/engines/maxwell_3d.h | 1 + .../renderer_opengl/gl_texture_cache.cpp | 49 +++++---- src/video_core/renderer_opengl/gl_texture_cache.h | 6 +- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 9 +- .../renderer_vulkan/vk_texture_cache.cpp | 76 ++++++++++--- src/video_core/renderer_vulkan/vk_texture_cache.h | 33 +++--- src/video_core/texture_cache/surface_base.cpp | 7 +- src/video_core/texture_cache/surface_base.h | 13 ++- src/video_core/texture_cache/surface_params.cpp | 17 ++- src/video_core/texture_cache/texture_cache.h | 119 +++++++++------------ 10 files changed, 191 insertions(+), 139 deletions(-) (limited to 'src/video_core/engines') diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index b827b112f..79fc9bbea 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -598,6 +598,7 @@ public: BitField<4, 3, u32> block_height; BitField<8, 3, u32> block_depth; BitField<12, 1, InvMemoryLayout> type; + BitField<16, 1, u32> is_3d; } memory_layout; union { BitField<0, 16, u32> layers; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 57db5a08b..46df15dfc 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -263,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param target = GetTextureTarget(params.target); texture = CreateTexture(params, target, internal_format, texture_buffer); DecorateSurfaceName(); - main_view = CreateViewInner( - ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels), - true); + + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + + main_view = + CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true); } CachedSurface::~CachedSurface() = default; @@ -413,20 +418,23 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p CachedSurfaceView::~CachedSurfaceView() = default; -void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { +void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const { ASSERT(params.num_levels == 1); + if (params.target == SurfaceTarget::Texture3D) { + if (params.num_layers > 1) { + ASSERT(params.base_layer == 0); + glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); + } else { + glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, + params.base_level, params.base_layer); + } + return; + } + if (params.num_layers > 1) { - // Layered framebuffer attachments UNIMPLEMENTED_IF(params.base_layer != 0); - - switch (params.target) { - case SurfaceTarget::Texture2DArray: - glFramebufferTexture(target, attachment, GetTexture(), 0); - break; - default: - UNIMPLEMENTED(); - } + glFramebufferTexture(fb_target, attachment, GetTexture(), 0); return; } @@ -434,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { const GLuint texture = surface.GetTexture(); switch (surface.GetSurfaceParams().target) { case SurfaceTarget::Texture1D: - glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture2D: - glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture1DArray: case SurfaceTarget::Texture2DArray: case SurfaceTarget::TextureCubemap: case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(target, attachment, texture, params.base_level, + glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level, params.base_layer); break; default: @@ -500,8 +508,13 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const { OGLTextureView texture_view; texture_view.Create(); - glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, - params.num_levels, params.base_layer, params.num_layers); + if (target == GL_TEXTURE_3D) { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, 0, 1); + } else { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, params.base_layer, params.num_layers); + } ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); return texture_view; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8a2ac8603..bfc4ddf5d 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -80,8 +80,10 @@ public: explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy); ~CachedSurfaceView(); - /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER - void Attach(GLenum attachment, GLenum target) const; + /// @brief Attaches this texture view to the currently bound fb_target framebuffer + /// @param attachment Attachment to bind textures to + /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER) + void Attach(GLenum attachment, GLenum fb_target) const; GLuint GetTexture(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d86c46412..19b8f9da3 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -716,7 +716,7 @@ std::tuple RasterizerVulkan::ConfigureFramebuffers( if (!view) { return false; } - key.views.push_back(view->GetHandle()); + key.views.push_back(view->GetAttachment()); key.width = std::min(key.width, view->GetWidth()); key.height = std::min(key.height, view->GetHeight()); key.layers = std::min(key.layers, view->GetNumLayers()); @@ -1137,8 +1137,8 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu auto view = texture_cache.GetTextureSurface(texture.tic, entry); ASSERT(!view->IsBufferView()); - const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source, - texture.tic.z_source, texture.tic.w_source); + const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); const auto sampler = sampler_cache.GetSampler(texture.tsc); update_descriptor_queue.AddSampledImage(sampler, image_view); @@ -1164,7 +1164,8 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima UNIMPLEMENTED_IF(tic.IsBuffer()); - const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + const VkImageView image_view = + view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source); update_descriptor_queue.AddImage(image_view); const auto image_layout = update_descriptor_queue.GetLastImageLayout(); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ea487b770..430031665 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP ci.extent = {params.width, params.height, 1}; break; case SurfaceTarget::Texture3D: + ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; ci.extent = {params.width, params.height, params.depth}; break; case SurfaceTarget::TextureBuffer: @@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP return ci; } +u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) { + return (static_cast(x_source) << 24) | (static_cast(y_source) << 16) | + (static_cast(z_source) << 8) | static_cast(w_source); +} + } // Anonymous namespace CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, @@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, } // TODO(Rodrigo): Move this to a virtual function. - main_view = CreateViewInner( - ViewParams(params.target, 0, static_cast(params.GetNumLayers()), 0, params.num_levels), - true); + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels)); } CachedSurface::~CachedSurface() = default; @@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() { } View CachedSurface::CreateView(const ViewParams& params) { - return CreateViewInner(params, false); -} - -View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) { // TODO(Rodrigo): Add name decorations - return views[params] = std::make_shared(device, *this, params, is_proxy); + return views[params] = std::make_shared(device, *this, params); } void CachedSurface::UploadBuffer(const std::vector& staging_buffer) { @@ -342,18 +347,27 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { } CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy) + const ViewParams& params) : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, - base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, - num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) - : VK_IMAGE_VIEW_TYPE_1D} {} + base_level{params.base_level}, num_levels{params.num_levels}, + image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} { + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + base_layer = 0; + num_layers = 1; + base_slice = params.base_layer; + num_slices = params.num_layers; + } else { + base_layer = params.base_layer; + num_layers = params.num_layers; + } +} CachedSurfaceView::~CachedSurfaceView() = default; -VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { +VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); if (last_image_view && last_swizzle == new_swizzle) { return last_image_view; @@ -399,6 +413,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y }); } + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ASSERT(base_slice == 0); + ASSERT(num_slices == params.depth); + } + VkImageViewCreateInfo ci; ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; ci.pNext = nullptr; @@ -417,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y return last_image_view = *image_view; } +VkImageView CachedSurfaceView::GetAttachment() { + if (render_target) { + return *render_target; + } + + VkImageViewCreateInfo ci; + ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + ci.pNext = nullptr; + ci.flags = 0; + ci.image = surface.GetImageHandle(); + ci.format = surface.GetImage().GetFormat(); + ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, + VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY}; + ci.subresourceRange.aspectMask = aspect_mask; + ci.subresourceRange.baseMipLevel = base_level; + ci.subresourceRange.levelCount = num_levels; + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D; + ci.subresourceRange.baseArrayLayer = base_slice; + ci.subresourceRange.layerCount = num_slices; + } else { + ci.viewType = image_view_type; + ci.subresourceRange.baseArrayLayer = base_layer; + ci.subresourceRange.layerCount = num_layers; + } + render_target = device.GetLogical().CreateImageView(ci); + return *render_target; +} + VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, VKResourceManager& resource_manager, VKMemoryManager& memory_manager, VKScheduler& scheduler, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f211ccb1e..807e26c8a 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -91,7 +91,6 @@ protected: void DecorateSurfaceName(); View CreateView(const ViewParams& params) override; - View CreateViewInner(const ViewParams& params, bool is_proxy); private: void UploadBuffer(const std::vector& staging_buffer); @@ -120,23 +119,20 @@ private: class CachedSurfaceView final : public VideoCommon::ViewBase { public: explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy); + const ViewParams& params); ~CachedSurfaceView(); - VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source); + VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source); + + VkImageView GetAttachment(); bool IsSameSurface(const CachedSurfaceView& rhs) const { return &surface == &rhs.surface; } - VkImageView GetHandle() { - return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G, - Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A); - } - u32 GetWidth() const { return params.GetMipWidth(base_level); } @@ -180,14 +176,6 @@ public: } private: - static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source) { - return (static_cast(x_source) << 24) | (static_cast(y_source) << 16) | - (static_cast(z_source) << 8) | static_cast(w_source); - } - // Store a copy of these values to avoid double dereference when reading them const SurfaceParams params; const VkImage image; @@ -196,15 +184,18 @@ private: const VKDevice& device; CachedSurface& surface; - const u32 base_layer; - const u32 num_layers; const u32 base_level; const u32 num_levels; const VkImageViewType image_view_type; + u32 base_layer = 0; + u32 num_layers = 0; + u32 base_slice = 0; + u32 num_slices = 0; VkImageView last_image_view = nullptr; u32 last_swizzle = 0; + vk::ImageView render_target; std::unordered_map view_cache; }; diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 715f39d0d..94d3a6ae5 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -248,12 +248,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, // Use an extra temporal buffer auto& tmp_buffer = staging_cache.GetBuffer(1); - // Special case for 3D Texture Segments - const bool must_read_current_data = - params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D; tmp_buffer.resize(guest_memory_size); host_ptr = tmp_buffer.data(); - if (must_read_current_data) { + + if (params.target == SurfaceTarget::Texture3D) { + // Special case for 3D texture segments memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); } diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h index 79e10ffbb..173f2edba 100644 --- a/src/video_core/texture_cache/surface_base.h +++ b/src/video_core/texture_cache/surface_base.h @@ -217,8 +217,8 @@ public: } bool IsProtected() const { - // Only 3D Slices are to be protected - return is_target && params.block_depth > 0; + // Only 3D slices are to be protected + return is_target && params.target == SurfaceTarget::Texture3D; } bool IsRenderTarget() const { @@ -250,6 +250,11 @@ public: return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels)); } + TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) { + return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth, + base_level, num_levels)); + } + std::optional EmplaceIrregularView(const SurfaceParams& view_params, const GPUVAddr view_addr, const std::size_t candidate_size, const u32 mipmap, @@ -272,8 +277,8 @@ public: std::optional EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr, const std::size_t candidate_size) { if (params.target == SurfaceTarget::Texture3D || - (params.num_levels == 1 && !params.is_layered) || - view_params.target == SurfaceTarget::Texture3D) { + view_params.target == SurfaceTarget::Texture3D || + (params.num_levels == 1 && !params.is_layered)) { return {}; } const auto layer_mipmap{GetLayerMipmap(view_addr)}; diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 884fabffe..642eeb850 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz params.num_levels = 1; params.emulated_levels = 1; - const bool is_layered = config.layers > 1 && params.block_depth == 0; - params.is_layered = is_layered; - params.depth = is_layered ? config.layers.Value() : 1; - params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; + if (config.memory_layout.is_3d != 0) { + params.depth = config.layers.Value(); + params.is_layered = false; + params.target = SurfaceTarget::Texture3D; + } else if (config.layers > 1) { + params.depth = config.layers.Value(); + params.is_layered = true; + params.target = SurfaceTarget::Texture2DArray; + } else { + params.depth = 1; + params.is_layered = false; + params.target = SurfaceTarget::Texture2D; + } return params; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 6f63217a2..4ee0d76b9 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -508,12 +508,12 @@ private: return RecycleStrategy::Flush; } // 3D Textures decision - if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) { + if (params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } for (const auto& s : overlaps) { const auto& s_params = s->GetSurfaceParams(); - if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) { + if (s_params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } } @@ -726,76 +726,60 @@ private: * @param params The parameters on the new surface. * @param gpu_addr The starting address of the new surface. * @param cpu_addr The starting address of the new surface on physical memory. - * @param preserve_contents Indicates that the new surface should be loaded from memory or - * left blank. */ std::optional> Manage3DSurfaces(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, - const VAddr cpu_addr, - bool preserve_contents) { - if (params.target == SurfaceTarget::Texture3D) { - bool failed = false; - if (params.num_levels > 1) { - // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach - return std::nullopt; - } - TSurface new_surface = GetUncachedSurface(gpu_addr, params); - bool modified = false; - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.target != SurfaceTarget::Texture2D) { - failed = true; - break; - } - if (src_params.height != params.height) { - failed = true; - break; - } - if (src_params.block_depth != params.block_depth || - src_params.block_height != params.block_height) { - failed = true; - break; + GPUVAddr gpu_addr, VAddr cpu_addr) { + if (params.num_levels > 1) { + // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach + return std::nullopt; + } + + if (overlaps.size() == 1) { + const auto& surface = overlaps[0]; + const SurfaceParams& overlap_params = surface->GetSurfaceParams(); + // Don't attempt to render to textures with more than one level for now + // The texture has to be to the right or the sample address if we want to render to it + if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) { + const u32 offset = static_cast(cpu_addr - surface->GetCpuAddr()); + const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + if (slice < overlap_params.depth) { + auto view = surface->Emplace3DView(slice, params.depth, 0, 1); + return std::make_pair(std::move(surface), std::move(view)); } - const u32 offset = static_cast(surface->GetCpuAddr() - cpu_addr); - const auto offsets = params.GetBlockOffsetXYZ(offset); - const auto z = std::get<2>(offsets); - modified |= surface->IsModified(); - const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height, - 1); - ImageCopy(surface, new_surface, copy_params); } - if (failed) { + } + + if (params.depth == 1) { + return std::nullopt; + } + + TSurface new_surface = GetUncachedSurface(gpu_addr, params); + bool modified = false; + for (auto& surface : overlaps) { + const SurfaceParams& src_params = surface->GetSurfaceParams(); + if (src_params.height != params.height || + src_params.block_depth != params.block_depth || + src_params.block_height != params.block_height) { return std::nullopt; } - for (const auto& surface : overlaps) { - Unregister(surface); - } - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - auto view = new_surface->GetMainView(); - return {{std::move(new_surface), view}}; - } else { - for (const auto& surface : overlaps) { - if (!surface->MatchTarget(params.target)) { - if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { - if (Settings::IsGPULevelExtreme()) { - return std::nullopt; - } - Unregister(surface); - return InitializeSurface(gpu_addr, params, preserve_contents); - } - return std::nullopt; - } - if (surface->GetCpuAddr() != cpu_addr) { - continue; - } - if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { - return {{surface, surface->GetMainView()}}; - } - } - return InitializeSurface(gpu_addr, params, preserve_contents); + modified |= surface->IsModified(); + + const u32 offset = static_cast(surface->GetCpuAddr() - cpu_addr); + const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + const u32 width = params.width; + const u32 height = params.height; + const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); + ImageCopy(surface, new_surface, copy_params); } + for (const auto& surface : overlaps) { + Unregister(surface); + } + new_surface->MarkAsModified(modified, Tick()); + Register(new_surface); + + auto view = new_surface->GetMainView(); + return std::make_pair(std::move(new_surface), std::move(view)); } /** @@ -873,10 +857,9 @@ private: } } - // Check if it's a 3D texture - if (params.block_depth > 0) { - auto surface = - Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); + // Manage 3D textures + if (params.target == SurfaceTarget::Texture3D) { + auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr); if (surface) { return *surface; } -- cgit v1.2.3 From 6ce5f3120be6a65a798d3abc6fda0fe6171d0296 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 5 Jun 2020 01:42:19 +1000 Subject: Macro HLE support --- src/video_core/CMakeLists.txt | 2 + src/video_core/engines/maxwell_3d.cpp | 2 +- src/video_core/engines/maxwell_3d.h | 4 ++ src/video_core/macro/macro.cpp | 35 ++++++++-- src/video_core/macro/macro.h | 19 ++++- src/video_core/macro/macro_hle.cpp | 108 +++++++++++++++++++++++++++++ src/video_core/macro/macro_hle.h | 43 ++++++++++++ src/video_core/macro/macro_interpreter.cpp | 3 +- src/video_core/macro/macro_jit_x64.cpp | 3 +- 9 files changed, 209 insertions(+), 10 deletions(-) create mode 100644 src/video_core/macro/macro_hle.cpp create mode 100644 src/video_core/macro/macro_hle.h (limited to 'src/video_core/engines') diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 099bb446e..2dc752aa9 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -27,6 +27,8 @@ add_library(video_core STATIC engines/shader_type.h macro/macro.cpp macro/macro.h + macro/macro_hle.cpp + macro/macro_hle.h macro/macro_interpreter.cpp macro/macro_interpreter.h macro/macro_jit_x64.cpp diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index ea3c8a963..c01436295 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -128,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector& parameters) ((method - MacroRegistersStart) >> 1) % static_cast(macro_positions.size()); // Execute the current macro. - macro_engine->Execute(macro_positions[entry], parameters); + macro_engine->Execute(*this, macro_positions[entry], parameters); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index d5fe25065..5926c4d2d 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1418,6 +1418,10 @@ public: return execute_on; } + VideoCore::RasterizerInterface& GetRasterizer() { + return rasterizer; + } + /// Notify a memory write has happened. void OnMemoryWrite() { dirty.flags |= dirty.on_write_stores; diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index 89077a2d8..c8aa2534a 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -2,23 +2,37 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include #include "common/assert.h" #include "common/logging/log.h" #include "core/settings.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/macro/macro.h" +#include "video_core/macro/macro_hle.h" #include "video_core/macro/macro_interpreter.h" #include "video_core/macro/macro_jit_x64.h" namespace Tegra { +MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d) + : hle_macros{std::make_unique(maxwell3d)} {} + +MacroEngine::~MacroEngine() {} + void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); } -void MacroEngine::Execute(u32 method, const std::vector& parameters) { +void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method, + const std::vector& parameters) { auto compiled_macro = macro_cache.find(method); if (compiled_macro != macro_cache.end()) { - compiled_macro->second->Execute(parameters, method); + const auto& cache_info = compiled_macro->second; + if (cache_info.has_hle_program) { + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } } else { // Macro not compiled, check if it's uploaded and if so, compile it auto macro_code = uploaded_macro_code.find(method); @@ -26,8 +40,21 @@ void MacroEngine::Execute(u32 method, const std::vector& parameters) { UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); return; } - macro_cache[method] = Compile(macro_code->second); - macro_cache[method]->Execute(parameters, method); + auto& cache_info = macro_cache[method]; + cache_info.hash = boost::hash_value(macro_code->second); + cache_info.lle_program = Compile(macro_code->second); + + auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); + if (hle_program.has_value()) { + cache_info.has_hle_program = true; + cache_info.hle_program = std::move(hle_program.value()); + } + + if (cache_info.has_hle_program) { + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } } } diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index b76ed891f..5fa8023af 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -11,9 +11,11 @@ #include "common/common_types.h" namespace Tegra { + namespace Engines { class Maxwell3D; } + namespace Macro { constexpr std::size_t NUM_MACRO_REGISTERS = 8; enum class Operation : u32 { @@ -94,6 +96,8 @@ union MethodAddress { } // namespace Macro +class HLEMacro; + class CachedMacro { public: virtual ~CachedMacro() = default; @@ -107,20 +111,29 @@ public: class MacroEngine { public: - virtual ~MacroEngine() = default; + MacroEngine(Engines::Maxwell3D& maxwell3d); + virtual ~MacroEngine(); // Store the uploaded macro code to compile them when they're called. void AddCode(u32 method, u32 data); // Compiles the macro if its not in the cache, and executes the compiled macro - void Execute(u32 method, const std::vector& parameters); + void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector& parameters); protected: virtual std::unique_ptr Compile(const std::vector& code) = 0; private: - std::unordered_map> macro_cache; + struct CacheInfo { + std::unique_ptr lle_program{}; + std::unique_ptr hle_program{}; + u64 hash{}; + bool has_hle_program{}; + }; + + std::unordered_map macro_cache; std::unordered_map> uploaded_macro_code; + std::unique_ptr hle_macros; }; std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d); diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp new file mode 100644 index 000000000..51827c822 --- /dev/null +++ b/src/video_core/macro/macro_hle.cpp @@ -0,0 +1,108 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include +#include +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_hle.h" +#include "video_core/rasterizer_interface.h" + +namespace Tegra { + +// HLE'd functions +static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, + const std::vector& parameters) { + const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B); + + maxwell3d.regs.draw.topology.Assign( + static_cast(parameters[0] & + ~(0x3ffffff << 26))); + maxwell3d.regs.vb_base_instance = parameters[5]; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.regs.vb_element_base = parameters[3]; + maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.index_array.first = parameters[4]; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(true, true); + } + maxwell3d.regs.index_array.count = 0; + maxwell3d.mme_draw.instance_count = 0; +} + +static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, + const std::vector& parameters) { + const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + + maxwell3d.regs.vertex_buffer.first = parameters[3]; + maxwell3d.regs.vertex_buffer.count = parameters[1]; + maxwell3d.regs.vb_base_instance = parameters[4]; + maxwell3d.regs.draw.topology.Assign( + static_cast(parameters[0])); + maxwell3d.mme_draw.instance_count = count; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(false, true); + } + maxwell3d.regs.vertex_buffer.count = 0; + maxwell3d.mme_draw.instance_count = 0; +} + +static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, + const std::vector& parameters) { + const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + const u32 element_base = parameters[4]; + const u32 base_instance = parameters[5]; + maxwell3d.regs.index_array.first = parameters[3]; + maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base? + maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.vb_element_base = element_base; + maxwell3d.regs.vb_base_instance = base_instance; + maxwell3d.regs.const_buffer.cb_pos = 0x640; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.regs.const_buffer.cb_data[0] = element_base; + maxwell3d.regs.const_buffer.cb_data[1] = base_instance; + maxwell3d.regs.draw.topology.Assign( + static_cast(parameters[0])); + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(true, true); + } + maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base? + maxwell3d.regs.index_array.count = 0; + maxwell3d.regs.vb_element_base = 0x0; + maxwell3d.regs.vb_base_instance = 0x0; + maxwell3d.regs.const_buffer.cb_pos = 0x640; + maxwell3d.regs.const_buffer.cb_data[0] = 0; + maxwell3d.regs.const_buffer.cb_data[1] = 0; + maxwell3d.mme_draw.instance_count = 0; +} + +static const std::unordered_map hle_funcs{ + {0x771BB18C62444DA0, &HLE_771BB18C62444DA0}, + {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD}, + {0x0217920100488FF7, &HLE_0217920100488FF7}, +}; + +HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +HLEMacro::~HLEMacro() = default; + +std::optional> HLEMacro::GetHLEProgram(u64 hash) const { + auto it = hle_funcs.find(hash); + if (it != hle_funcs.end()) { + return std::make_unique(maxwell3d, it->second); + } else { + return {}; + } +} + +HLEMacroImpl::~HLEMacroImpl() = default; + +HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func) + : maxwell3d(maxwell3d), func(func) {} + +void HLEMacroImpl::Execute(const std::vector& parameters, u32 method) { + func(maxwell3d, parameters); +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h new file mode 100644 index 000000000..de7f43dc4 --- /dev/null +++ b/src/video_core/macro/macro_hle.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include "common/common_types.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector& parameters); + +class HLEMacro { +public: + HLEMacro(Engines::Maxwell3D& maxwell3d); + ~HLEMacro(); + std::optional> GetHLEProgram(u64 hash) const; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class HLEMacroImpl : public CachedMacro { +public: + explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func); + ~HLEMacroImpl(); + + void Execute(const std::vector& parameters, u32 method) override; + +private: + Engines::Maxwell3D& maxwell3d; + HLEFunction func; +}; + +} // namespace Tegra diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index 5edff27aa..aa5256419 100644 --- a/src/video_core/macro/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -11,7 +11,8 @@ MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} std::unique_ptr MacroInterpreter::Compile(const std::vector& code) { return std::make_unique(maxwell3d, code); diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 30abb66e5..07292702f 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -28,7 +28,8 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ BRANCH_HOLDER, }); -MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} std::unique_ptr MacroJITx64::Compile(const std::vector& code) { return std::make_unique(maxwell3d, code); -- cgit v1.2.3 From fabdf5d3850c078d173653f259845c26a2ce6e7d Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 5 Jun 2020 13:09:52 +1000 Subject: Addressed issues --- src/video_core/engines/maxwell_3d.h | 4 ++++ src/video_core/macro/macro.cpp | 2 +- src/video_core/macro/macro.h | 2 +- src/video_core/macro/macro_hle.cpp | 20 ++++++++++---------- src/video_core/macro/macro_hle.h | 2 +- 5 files changed, 17 insertions(+), 13 deletions(-) (limited to 'src/video_core/engines') diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 5926c4d2d..ef1618990 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1422,6 +1422,10 @@ public: return rasterizer; } + const VideoCore::RasterizerInterface& GetRasterizer() const { + return rasterizer; + } + /// Notify a memory write has happened. void OnMemoryWrite() { dirty.flags |= dirty.on_write_stores; diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index c8aa2534a..ef7dad349 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -17,7 +17,7 @@ namespace Tegra { MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d) : hle_macros{std::make_unique(maxwell3d)} {} -MacroEngine::~MacroEngine() {} +MacroEngine::~MacroEngine() = default; void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index 5fa8023af..4d00b84b0 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -111,7 +111,7 @@ public: class MacroEngine { public: - MacroEngine(Engines::Maxwell3D& maxwell3d); + explicit MacroEngine(Engines::Maxwell3D& maxwell3d); virtual ~MacroEngine(); // Store the uploaded macro code to compile them when they're called. diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 887f40310..1f1348df3 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -2,7 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include +#include #include #include "video_core/engines/maxwell_3d.h" #include "video_core/macro/macro_hle.h" @@ -78,22 +78,22 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, maxwell3d.CallMethodFromMME(0x8e5, 0x0); } -static const std::unordered_map hle_funcs{ - {0x771BB18C62444DA0, &HLE_771BB18C62444DA0}, - {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD}, - {0x0217920100488FF7, &HLE_0217920100488FF7}, +static const std::array, 3> hle_funcs{ + std::make_pair(0x771BB18C62444DA0, &HLE_771BB18C62444DA0), + std::make_pair(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD), + std::make_pair(0x0217920100488FF7, &HLE_0217920100488FF7), }; HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} HLEMacro::~HLEMacro() = default; std::optional> HLEMacro::GetHLEProgram(u64 hash) const { - auto it = hle_funcs.find(hash); - if (it != hle_funcs.end()) { - return std::make_unique(maxwell3d, it->second); - } else { - return {}; + const auto it = std::find_if(hle_funcs.begin(), hle_funcs.end(), + [hash](auto& pair) { return pair.first == hash; }); + if (it == hle_funcs.end()) { + return std::nullopt; } + return std::make_unique(maxwell3d, it->second); } HLEMacroImpl::~HLEMacroImpl() = default; diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h index de7f43dc4..7cd492a8f 100644 --- a/src/video_core/macro/macro_hle.h +++ b/src/video_core/macro/macro_hle.h @@ -20,7 +20,7 @@ using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector> GetHLEProgram(u64 hash) const; -- cgit v1.2.3