diff options
Diffstat (limited to 'src/video_core/shader')
| -rw-r--r-- | src/video_core/shader/shader.cpp | 5 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 79 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 252 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 15 |
4 files changed, 276 insertions, 75 deletions
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 4e9836c80..f89117521 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -9,6 +9,7 @@ #include "common/hash.h" #include "common/make_unique.h" +#include "common/microprofile.h" #include "common/profiler.h" #include "video_core/debug_utils/debug_utils.h" @@ -51,15 +52,19 @@ void Setup(UnitState<false>& state) { } void Shutdown() { +#ifdef ARCHITECTURE_x86_64 shader_map.clear(); +#endif // ARCHITECTURE_x86_64 } static Common::Profiling::TimingCategory shader_category("Vertex Shader"); +MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240)); OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { auto& config = g_state.regs.vs; Common::Profiling::ScopeTimer timer(shader_category); + MICROPROFILE_SCOPE(GPU_VertexShader); state.program_counter = config.main_offset; state.debug.max_offset = 0; diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index e14de0768..69e4efa68 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) { if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = std::max(src1[i], src2[i]); + // NOTE: Exact form required to match NaN semantics to hardware: + // max(0, NaN) -> NaN + // max(NaN, 0) -> 0 + dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i]; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; @@ -190,19 +193,29 @@ void RunInterpreter(UnitState<Debug>& state) { if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = std::min(src1[i], src2[i]); + // NOTE: Exact form required to match NaN semantics to hardware: + // min(0, NaN) -> NaN + // min(NaN, 0) -> 0 + dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i]; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::DP3: case OpCode::Id::DP4: + case OpCode::Id::DPH: + case OpCode::Id::DPHI: { Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + + OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode(); + if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI) + src1[3] = float24::FromFloat32(1.0f); + float24 dot = float24::FromFloat32(0.f); - int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 3 : 4; + int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4; for (int i = 0; i < num_components; ++i) dot = dot + src1[i] * src2[i]; @@ -221,13 +234,12 @@ void RunInterpreter(UnitState<Debug>& state) { { Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32()); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; - // TODO: Be stable against division by zero! - // TODO: I think this might be wrong... we should only use one component here - dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32()); + dest[i] = rcp_res; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; @@ -238,13 +250,12 @@ void RunInterpreter(UnitState<Debug>& state) { { Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32())); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; - // TODO: Be stable against division by zero! - // TODO: I think this might be wrong... we should only use one component here - dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32())); + dest[i] = rsq_res; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; @@ -278,6 +289,20 @@ void RunInterpreter(UnitState<Debug>& state) { break; } + case OpCode::Id::SGE: + case OpCode::Id::SGEI: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f); + } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); + break; + case OpCode::Id::SLT: case OpCode::Id::SLTI: Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); @@ -334,6 +359,42 @@ void RunInterpreter(UnitState<Debug>& state) { Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code); break; + case OpCode::Id::EX2: + { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + + // EX2 only takes first component exp2 and writes it to all dest components + float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32())); + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = ex2_res; + } + + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); + break; + } + + case OpCode::Id::LG2: + { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + + // LG2 only takes the first component log2 and writes it to all dest components + float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32())); + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = lg2_res; + } + + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); + break; + } + default: LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x", (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex); diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 836942c6b..d3cfe109e 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -23,14 +23,14 @@ const JitFunction instr_table[64] = { &JitCompiler::Compile_ADD, // add &JitCompiler::Compile_DP3, // dp3 &JitCompiler::Compile_DP4, // dp4 - nullptr, // dph + &JitCompiler::Compile_DPH, // dph nullptr, // unknown - nullptr, // ex2 - nullptr, // lg2 + &JitCompiler::Compile_EX2, // ex2 + &JitCompiler::Compile_LG2, // lg2 nullptr, // unknown &JitCompiler::Compile_MUL, // mul - nullptr, // lge - nullptr, // slt + &JitCompiler::Compile_SGE, // sge + &JitCompiler::Compile_SLT, // slt &JitCompiler::Compile_FLR, // flr &JitCompiler::Compile_MAX, // max &JitCompiler::Compile_MIN, // min @@ -44,10 +44,10 @@ const JitFunction instr_table[64] = { nullptr, // unknown nullptr, // unknown nullptr, // unknown - nullptr, // dphi + &JitCompiler::Compile_DPH, // dphi nullptr, // unknown - nullptr, // sgei - &JitCompiler::Compile_SLTI, // slti + &JitCompiler::Compile_SGE, // sgei + &JitCompiler::Compile_SLT, // slti nullptr, // unknown nullptr, // unknown nullptr, // unknown @@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1; static const X64Reg SRC2 = XMM2; /// Loaded with the third swizzled source register, otherwise can be used as a scratch register static const X64Reg SRC3 = XMM3; +/// Additional scratch register +static const X64Reg SCRATCH2 = XMM4; /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one static const X64Reg ONE = XMM14; /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR @@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); BLENDPS(SCRATCH, R(src), mask); } else { - MOVAPS(XMM4, R(src)); - UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination + MOVAPS(SCRATCH2, R(src)); + UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination // Compute selector to selectively copy source components to destination for SHUFPS instruction @@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); - SHUFPS(SCRATCH, R(XMM4), sel); + SHUFPS(SCRATCH, R(SCRATCH2), sel); } // Store dest back to memory @@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { } } +void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { + MOVAPS(scratch, R(src1)); + CMPPS(scratch, R(src2), CMP_ORD); + + MULPS(src1, R(src2)); + + MOVAPS(src2, R(src1)); + CMPPS(src2, R(src2), CMP_UNORD); + + XORPS(scratch, R(src2)); + ANDPS(src1, R(scratch)); +} + void JitCompiler::Compile_EvaluateCondition(Instruction instr) { // Note: NXOR is used below to check for equality switch (instr.flow_control.op) { @@ -280,6 +295,22 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) { CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); } +void JitCompiler::Compile_PushCallerSavedXMM() { +#ifndef _WIN32 + SUB(64, R(RSP), Imm8(2 * 16)); + MOVUPS(MDisp(RSP, 16), ONE); + MOVUPS(MDisp(RSP, 0), NEGBIT); +#endif +} + +void JitCompiler::Compile_PopCallerSavedXMM() { +#ifndef _WIN32 + MOVUPS(NEGBIT, MDisp(RSP, 0)); + MOVUPS(ONE, MDisp(RSP, 16)); + ADD(64, R(RSP), Imm8(2 * 16)); +#endif +} + void JitCompiler::Compile_ADD(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); @@ -291,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - if (Common::GetCPUCaps().sse4_1) { - DPPS(SRC1, R(SRC2), 0x7f); - } else { - MULPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); - MOVAPS(SRC3, R(SRC1)); - SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); + MOVAPS(SRC3, R(SRC1)); + SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); - ADDPS(SRC1, R(SRC2)); - ADDPS(SRC1, R(SRC3)); - } + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); + ADDPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); Compile_DestEnable(instr, SRC1); } @@ -314,27 +341,117 @@ void JitCompiler::Compile_DP4(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_DPH(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + if (Common::GetCPUCaps().sse4_1) { - DPPS(SRC1, R(SRC2), 0xff); + // Set 4th component to 1.0 + BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 } else { - MULPS(SRC1, R(SRC2)); + // Set 4th component to 1.0 + MOVAPS(SCRATCH, R(SRC1)); + UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__ + UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1 + } - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - ADDPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - ADDPS(SRC1, R(SRC2)); - } + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_EX2(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + MOVSS(XMM0, R(SRC1)); + // The following will actually break the stack alignment + ABI_PushAllCallerSavedRegsAndAdjustStack(); + Compile_PushCallerSavedXMM(); + ABI_CallFunction(reinterpret_cast<const void*>(exp2f)); + Compile_PopCallerSavedXMM(); + ABI_PopAllCallerSavedRegsAndAdjustStack(); + + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); + MOVAPS(SRC1, R(XMM0)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_LG2(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + MOVSS(XMM0, R(SRC1)); + + // The following will actually break the stack alignment + ABI_PushAllCallerSavedRegsAndAdjustStack(); + Compile_PushCallerSavedXMM(); + ABI_CallFunction(reinterpret_cast<const void*>(log2f)); + Compile_PopCallerSavedXMM(); + ABI_PopAllCallerSavedRegsAndAdjustStack(); + + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); + MOVAPS(SRC1, R(XMM0)); Compile_DestEnable(instr, SRC1); } void JitCompiler::Compile_MUL(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - MULPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_SGE(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + CMPPS(SRC1, R(SRC2), CMP_NLT); + ANDPS(SRC1, R(ONE)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_SLT(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + CMPPS(SRC1, R(SRC2), CMP_LT); + ANDPS(SRC1, R(ONE)); + Compile_DestEnable(instr, SRC1); } @@ -354,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) { void JitCompiler::Compile_MAX(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. MAXPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } @@ -361,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) { void JitCompiler::Compile_MIN(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. MINPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } @@ -374,8 +493,8 @@ void JitCompiler::Compile_MOVA(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - // Convert floats to integers (only care about X and Y components) - CVTPS2DQ(SRC1, R(SRC1)); + // Convert floats to integers using truncation (only care about X and Y components) + CVTTPS2DQ(SRC1, R(SRC1)); // Get result MOVQ_xmm(R(RAX), SRC1); @@ -415,22 +534,13 @@ void JitCompiler::Compile_MOV(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_SLTI(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 1, instr.common.src2i, SRC2); - - CMPSS(SRC1, R(SRC2), CMP_LT); - ANDPS(SRC1, R(ONE)); - - Compile_DestEnable(instr, SRC1); -} - void JitCompiler::Compile_RCP(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica + // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica // performs this operation more accurately. This should be checked on hardware. - RCPPS(SRC1, R(SRC1)); + RCPSS(SRC1, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX Compile_DestEnable(instr, SRC1); } @@ -438,9 +548,10 @@ void JitCompiler::Compile_RCP(Instruction instr) { void JitCompiler::Compile_RSQ(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica + // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica // performs this operation more accurately. This should be checked on hardware. - RSQRTPS(SRC1, R(SRC1)); + RSQRTSS(SRC1, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX Compile_DestEnable(instr, SRC1); } @@ -475,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) { } void JitCompiler::Compile_CMP(Instruction instr) { + using Op = Instruction::Common::CompareOpType::Op; + Op op_x = instr.common.compare_op.x; + Op op_y = instr.common.compare_op.y; + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; + // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to + // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here + // because they don't match when used with NaNs. + static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE }; + + bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); + Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1; + Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2; - if (instr.common.compare_op.x == instr.common.compare_op.y) { + if (op_x == op_y) { // Compare X-component and Y-component together - CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); + CMPPS(lhs_x, R(rhs_x), cmp[op_x]); + MOVQ_xmm(R(COND0), lhs_x); - MOVQ_xmm(R(COND0), SRC1); MOV(64, R(COND1), R(COND0)); } else { + bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); + Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1; + Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2; + // Compare X-component - MOVAPS(SCRATCH, R(SRC1)); - CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); + MOVAPS(SCRATCH, R(lhs_x)); + CMPSS(SCRATCH, R(rhs_x), cmp[op_x]); // Compare Y-component - CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); + CMPPS(lhs_y, R(rhs_y), cmp[op_y]); MOVQ_xmm(R(COND0), SCRATCH); - MOVQ_xmm(R(COND1), SRC1); + MOVQ_xmm(R(COND1), lhs_y); } SHR(32, R(COND0), Imm8(31)); @@ -513,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) { Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); } - if (Common::GetCPUCaps().fma) { - VFMADD213PS(SRC1, SRC2, R(SRC3)); - } else { - MULPS(SRC1, R(SRC2)); - ADDPS(SRC1, R(SRC3)); - } + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + ADDPS(SRC1, R(SRC3)); Compile_DestEnable(instr, SRC1); } @@ -646,12 +768,12 @@ CompiledShader* JitCompiler::Compile() { // Used to set a register to one static const __m128 one = { 1.f, 1.f, 1.f, 1.f }; MOV(PTRBITS, R(RAX), ImmPtr(&one)); - MOVAPS(ONE, MDisp(RAX, 0)); + MOVAPS(ONE, MatR(RAX)); // Used to negate registers static const __m128 neg = { -0.f, -0.f, -0.f, -0.f }; MOV(PTRBITS, R(RAX), ImmPtr(&neg)); - MOVAPS(NEGBIT, MDisp(RAX, 0)); + MOVAPS(NEGBIT, MatR(RAX)); looping = false; diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index b88f2a0d2..58828ecc8 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -37,7 +37,12 @@ public: void Compile_ADD(Instruction instr); void Compile_DP3(Instruction instr); void Compile_DP4(Instruction instr); + void Compile_DPH(Instruction instr); + void Compile_EX2(Instruction instr); + void Compile_LG2(Instruction instr); void Compile_MUL(Instruction instr); + void Compile_SGE(Instruction instr); + void Compile_SLT(Instruction instr); void Compile_FLR(Instruction instr); void Compile_MAX(Instruction instr); void Compile_MIN(Instruction instr); @@ -45,7 +50,6 @@ public: void Compile_RSQ(Instruction instr); void Compile_MOVA(Instruction instr); void Compile_MOV(Instruction instr); - void Compile_SLTI(Instruction instr); void Compile_NOP(Instruction instr); void Compile_END(Instruction instr); void Compile_CALL(Instruction instr); @@ -64,9 +68,18 @@ private: void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); + /** + * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying + * zero by inf. Clobbers `src2` and `scratch`. + */ + void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch); + void Compile_EvaluateCondition(Instruction instr); void Compile_UniformCondition(Instruction instr); + void Compile_PushCallerSavedXMM(); + void Compile_PopCallerSavedXMM(); + /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks. unsigned* offset_ptr = nullptr; |
