aboutsummaryrefslogtreecommitdiff
path: root/src/video_core/shader
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core/shader')
-rw-r--r--src/video_core/shader/shader.cpp5
-rw-r--r--src/video_core/shader/shader_interpreter.cpp79
-rw-r--r--src/video_core/shader/shader_jit_x64.cpp252
-rw-r--r--src/video_core/shader/shader_jit_x64.h15
4 files changed, 276 insertions, 75 deletions
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 4e9836c80..f89117521 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -9,6 +9,7 @@
#include "common/hash.h"
#include "common/make_unique.h"
+#include "common/microprofile.h"
#include "common/profiler.h"
#include "video_core/debug_utils/debug_utils.h"
@@ -51,15 +52,19 @@ void Setup(UnitState<false>& state) {
}
void Shutdown() {
+#ifdef ARCHITECTURE_x86_64
shader_map.clear();
+#endif // ARCHITECTURE_x86_64
}
static Common::Profiling::TimingCategory shader_category("Vertex Shader");
+MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240));
OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
auto& config = g_state.regs.vs;
Common::Profiling::ScopeTimer timer(shader_category);
+ MICROPROFILE_SCOPE(GPU_VertexShader);
state.program_counter = config.main_offset;
state.debug.max_offset = 0;
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index e14de0768..69e4efa68 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) {
if (!swizzle.DestComponentEnabled(i))
continue;
- dest[i] = std::max(src1[i], src2[i]);
+ // NOTE: Exact form required to match NaN semantics to hardware:
+ // max(0, NaN) -> NaN
+ // max(NaN, 0) -> 0
+ dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
}
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
@@ -190,19 +193,29 @@ void RunInterpreter(UnitState<Debug>& state) {
if (!swizzle.DestComponentEnabled(i))
continue;
- dest[i] = std::min(src1[i], src2[i]);
+ // NOTE: Exact form required to match NaN semantics to hardware:
+ // min(0, NaN) -> NaN
+ // min(NaN, 0) -> 0
+ dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
}
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
case OpCode::Id::DP3:
case OpCode::Id::DP4:
+ case OpCode::Id::DPH:
+ case OpCode::Id::DPHI:
{
Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+ OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode();
+ if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI)
+ src1[3] = float24::FromFloat32(1.0f);
+
float24 dot = float24::FromFloat32(0.f);
- int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 3 : 4;
+ int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4;
for (int i = 0; i < num_components; ++i)
dot = dot + src1[i] * src2[i];
@@ -221,13 +234,12 @@ void RunInterpreter(UnitState<Debug>& state) {
{
Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+ float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32());
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
- // TODO: Be stable against division by zero!
- // TODO: I think this might be wrong... we should only use one component here
- dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32());
+ dest[i] = rcp_res;
}
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
@@ -238,13 +250,12 @@ void RunInterpreter(UnitState<Debug>& state) {
{
Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+ float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32()));
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
- // TODO: Be stable against division by zero!
- // TODO: I think this might be wrong... we should only use one component here
- dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32()));
+ dest[i] = rsq_res;
}
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
@@ -278,6 +289,20 @@ void RunInterpreter(UnitState<Debug>& state) {
break;
}
+ case OpCode::Id::SGE:
+ case OpCode::Id::SGEI:
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
+ }
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+ break;
+
case OpCode::Id::SLT:
case OpCode::Id::SLTI:
Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
@@ -334,6 +359,42 @@ void RunInterpreter(UnitState<Debug>& state) {
Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code);
break;
+ case OpCode::Id::EX2:
+ {
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+ // EX2 only computes exp2 of the first component and writes it to all enabled dest components
+ float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32()));
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = ex2_res;
+ }
+
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+ break;
+ }
+
+ case OpCode::Id::LG2:
+ {
+ Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+ Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+ // LG2 only computes log2 of the first component and writes it to all enabled dest components
+ float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32()));
+ for (int i = 0; i < 4; ++i) {
+ if (!swizzle.DestComponentEnabled(i))
+ continue;
+
+ dest[i] = lg2_res;
+ }
+
+ Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+ break;
+ }
+
default:
LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
(int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 836942c6b..d3cfe109e 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -23,14 +23,14 @@ const JitFunction instr_table[64] = {
&JitCompiler::Compile_ADD, // add
&JitCompiler::Compile_DP3, // dp3
&JitCompiler::Compile_DP4, // dp4
- nullptr, // dph
+ &JitCompiler::Compile_DPH, // dph
nullptr, // unknown
- nullptr, // ex2
- nullptr, // lg2
+ &JitCompiler::Compile_EX2, // ex2
+ &JitCompiler::Compile_LG2, // lg2
nullptr, // unknown
&JitCompiler::Compile_MUL, // mul
- nullptr, // lge
- nullptr, // slt
+ &JitCompiler::Compile_SGE, // sge
+ &JitCompiler::Compile_SLT, // slt
&JitCompiler::Compile_FLR, // flr
&JitCompiler::Compile_MAX, // max
&JitCompiler::Compile_MIN, // min
@@ -44,10 +44,10 @@ const JitFunction instr_table[64] = {
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
- nullptr, // dphi
+ &JitCompiler::Compile_DPH, // dphi
nullptr, // unknown
- nullptr, // sgei
- &JitCompiler::Compile_SLTI, // slti
+ &JitCompiler::Compile_SGE, // sgei
+ &JitCompiler::Compile_SLT, // slti
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
@@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;
static const X64Reg SRC2 = XMM2;
/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
static const X64Reg SRC3 = XMM3;
+/// Additional scratch register
+static const X64Reg SCRATCH2 = XMM4;
/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
static const X64Reg ONE = XMM14;
/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
@@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
BLENDPS(SCRATCH, R(src), mask);
} else {
- MOVAPS(XMM4, R(src));
- UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination
+ MOVAPS(SCRATCH2, R(src));
+ UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination
UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination
// Compute selector to selectively copy source components to destination for SHUFPS instruction
@@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
- SHUFPS(SCRATCH, R(XMM4), sel);
+ SHUFPS(SCRATCH, R(SCRATCH2), sel);
}
// Store dest back to memory
@@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
}
}
+void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { // Emits src1 = src1 * src2, zeroing lanes whose product is NaN although neither input was
+ MOVAPS(scratch, R(src1));
+ CMPPS(scratch, R(src2), CMP_ORD); // scratch = all-ones in lanes where neither input is NaN
+
+ MULPS(src1, R(src2));
+
+ MOVAPS(src2, R(src1));
+ CMPPS(src2, R(src2), CMP_UNORD); // src2 = all-ones in lanes where the product is NaN
+
+ XORPS(scratch, R(src2)); // Mask is zero only where the inputs were ordered but the product is NaN
+ ANDPS(src1, R(scratch)); // Clear those lanes to +0.0; all other lanes (including NaN inputs) keep the product
+}
+
void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
// Note: NXOR is used below to check for equality
switch (instr.flow_control.op) {
@@ -280,6 +295,22 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) {
CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
}
+void JitCompiler::Compile_PushCallerSavedXMM() { // Emits code that spills the ONE and NEGBIT constant registers to the stack
+#ifndef _WIN32 // NOTE(review): presumably skipped on Windows because its x64 ABI preserves these XMM registers across calls - confirm
+ SUB(64, R(RSP), Imm8(2 * 16)); // Reserve two 16-byte slots
+ MOVUPS(MDisp(RSP, 16), ONE);
+ MOVUPS(MDisp(RSP, 0), NEGBIT);
+#endif
+}
+
+void JitCompiler::Compile_PopCallerSavedXMM() { // Emits code that reloads NEGBIT and ONE from the stack and releases the two 16-byte slots
+#ifndef _WIN32 // NOTE(review): presumably skipped on Windows because its x64 ABI preserves these XMM registers across calls - confirm
+ MOVUPS(NEGBIT, MDisp(RSP, 0));
+ MOVUPS(ONE, MDisp(RSP, 16));
+ ADD(64, R(RSP), Imm8(2 * 16)); // Release the 32 bytes reserved for the two registers
+#endif
+}
+
void JitCompiler::Compile_ADD(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
@@ -291,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- if (Common::GetCPUCaps().sse4_1) {
- DPPS(SRC1, R(SRC2), 0x7f);
- } else {
- MULPS(SRC1, R(SRC2));
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
- MOVAPS(SRC2, R(SRC1));
- SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
- MOVAPS(SRC3, R(SRC1));
- SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
+ MOVAPS(SRC3, R(SRC1));
+ SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
- ADDPS(SRC1, R(SRC2));
- ADDPS(SRC1, R(SRC3));
- }
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
+ ADDPS(SRC1, R(SRC2));
+ ADDPS(SRC1, R(SRC3));
Compile_DestEnable(instr, SRC1);
}
@@ -314,27 +341,117 @@ void JitCompiler::Compile_DP4(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+ ADDPS(SRC1, R(SRC2));
+
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+ ADDPS(SRC1, R(SRC2));
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_DPH(Instruction instr) {
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+ } else {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ }
+
if (Common::GetCPUCaps().sse4_1) {
- DPPS(SRC1, R(SRC2), 0xff);
+ // Set 4th component to 1.0
+ BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
} else {
- MULPS(SRC1, R(SRC2));
+ // Set 4th component to 1.0
+ MOVAPS(SCRATCH, R(SRC1));
+ UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__
+ UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1
+ }
- MOVAPS(SRC2, R(SRC1));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
- ADDPS(SRC1, R(SRC2));
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
- MOVAPS(SRC2, R(SRC1));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
- ADDPS(SRC1, R(SRC2));
- }
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+ ADDPS(SRC1, R(SRC2));
+
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+ ADDPS(SRC1, R(SRC2));
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_EX2(Instruction instr) { // Emits a call to exp2f on the first swizzled source component and broadcasts the result
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ MOVSS(XMM0, R(SRC1)); // Scalar argument goes in the low lane of XMM0; NOTE(review): assumes the host ABI passes and returns floats in XMM0 - confirm
+ // The following will actually break the stack alignment
+ ABI_PushAllCallerSavedRegsAndAdjustStack();
+ Compile_PushCallerSavedXMM(); // Keep the ONE/NEGBIT constants alive across the call
+ ABI_CallFunction(reinterpret_cast<const void*>(exp2f));
+ Compile_PopCallerSavedXMM();
+ ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+ SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); // Broadcast lane 0 to all four lanes
+ MOVAPS(SRC1, R(XMM0));
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_LG2(Instruction instr) { // Emits a call to log2f on the first swizzled source component and broadcasts the result
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ MOVSS(XMM0, R(SRC1)); // Scalar argument goes in the low lane of XMM0; NOTE(review): assumes the host ABI passes and returns floats in XMM0 - confirm
+
+ // The following will actually break the stack alignment
+ ABI_PushAllCallerSavedRegsAndAdjustStack();
+ Compile_PushCallerSavedXMM(); // Keep the ONE/NEGBIT constants alive across the call
+ ABI_CallFunction(reinterpret_cast<const void*>(log2f));
+ Compile_PopCallerSavedXMM();
+ ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+ SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); // Broadcast lane 0 to all four lanes
+ MOVAPS(SRC1, R(XMM0));
Compile_DestEnable(instr, SRC1);
}
void JitCompiler::Compile_MUL(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- MULPS(SRC1, R(SRC2));
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_SGE(Instruction instr) { // Emits dest = (src1 >= src2) ? 1.0 : 0.0 per component, for both SGE and SGEI
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+ } else {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ }
+
+ // SSE has no GE predicate and NLT is true for NaN operands (unlike the interpreter's >=), so test src2 <= src1
+ CMPPS(SRC2, R(SRC1), CMP_LE);
+ ANDPS(SRC2, R(ONE)); // Turn the all-ones compare mask into 1.0f, leaving 0.0f elsewhere
+ Compile_DestEnable(instr, SRC2);
+}
+
+void JitCompiler::Compile_SLT(Instruction instr) { // Emits dest = (src1 < src2) ? 1.0 : 0.0 per component, for both SLT and SLTI
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { // SLTI reads its operands from the src1i/src2i fields
+ Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+ } else {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ }
+
+ CMPPS(SRC1, R(SRC2), CMP_LT); // Packed compare: all four lanes, all-ones where src1 < src2
+ ANDPS(SRC1, R(ONE)); // Turn the all-ones compare mask into 1.0f, leaving 0.0f elsewhere
+
Compile_DestEnable(instr, SRC1);
}
@@ -354,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
void JitCompiler::Compile_MAX(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
MAXPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
@@ -361,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
void JitCompiler::Compile_MIN(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
MINPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
@@ -374,8 +493,8 @@ void JitCompiler::Compile_MOVA(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- // Convert floats to integers (only care about X and Y components)
- CVTPS2DQ(SRC1, R(SRC1));
+ // Convert floats to integers using truncation (only care about X and Y components)
+ CVTTPS2DQ(SRC1, R(SRC1));
// Get result
MOVQ_xmm(R(RAX), SRC1);
@@ -415,22 +534,13 @@ void JitCompiler::Compile_MOV(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_SLTI(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
- Compile_SwizzleSrc(instr, 1, instr.common.src2i, SRC2);
-
- CMPSS(SRC1, R(SRC2), CMP_LT);
- ANDPS(SRC1, R(ONE));
-
- Compile_DestEnable(instr, SRC1);
-}
-
void JitCompiler::Compile_RCP(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica
+ // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
// performs this operation more accurately. This should be checked on hardware.
- RCPPS(SRC1, R(SRC1));
+ RCPSS(SRC1, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX
Compile_DestEnable(instr, SRC1);
}
@@ -438,9 +548,10 @@ void JitCompiler::Compile_RCP(Instruction instr) {
void JitCompiler::Compile_RSQ(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica
+ // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
// performs this operation more accurately. This should be checked on hardware.
- RSQRTPS(SRC1, R(SRC1));
+ RSQRTSS(SRC1, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX
Compile_DestEnable(instr, SRC1);
}
@@ -475,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {
}
void JitCompiler::Compile_CMP(Instruction instr) {
+ using Op = Instruction::Common::CompareOpType::Op;
+ Op op_x = instr.common.compare_op.x;
+ Op op_y = instr.common.compare_op.y;
+
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT };
+ // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
+ // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
+ // because they don't match when used with NaNs.
+ static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE };
+
+ bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
+ Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1;
+ Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2;
- if (instr.common.compare_op.x == instr.common.compare_op.y) {
+ if (op_x == op_y) {
// Compare X-component and Y-component together
- CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]);
+ CMPPS(lhs_x, R(rhs_x), cmp[op_x]);
+ MOVQ_xmm(R(COND0), lhs_x);
- MOVQ_xmm(R(COND0), SRC1);
MOV(64, R(COND1), R(COND0));
} else {
+ bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
+ Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1;
+ Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2;
+
// Compare X-component
- MOVAPS(SCRATCH, R(SRC1));
- CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]);
+ MOVAPS(SCRATCH, R(lhs_x));
+ CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);
// Compare Y-component
- CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]);
+ CMPPS(lhs_y, R(rhs_y), cmp[op_y]);
MOVQ_xmm(R(COND0), SCRATCH);
- MOVQ_xmm(R(COND1), SRC1);
+ MOVQ_xmm(R(COND1), lhs_y);
}
SHR(32, R(COND0), Imm8(31));
@@ -513,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
}
- if (Common::GetCPUCaps().fma) {
- VFMADD213PS(SRC1, SRC2, R(SRC3));
- } else {
- MULPS(SRC1, R(SRC2));
- ADDPS(SRC1, R(SRC3));
- }
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+ ADDPS(SRC1, R(SRC3));
Compile_DestEnable(instr, SRC1);
}
@@ -646,12 +768,12 @@ CompiledShader* JitCompiler::Compile() {
// Used to set a register to one
static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
MOV(PTRBITS, R(RAX), ImmPtr(&one));
- MOVAPS(ONE, MDisp(RAX, 0));
+ MOVAPS(ONE, MatR(RAX));
// Used to negate registers
static const __m128 neg = { -0.f, -0.f, -0.f, -0.f };
MOV(PTRBITS, R(RAX), ImmPtr(&neg));
- MOVAPS(NEGBIT, MDisp(RAX, 0));
+ MOVAPS(NEGBIT, MatR(RAX));
looping = false;
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index b88f2a0d2..58828ecc8 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -37,7 +37,12 @@ public:
void Compile_ADD(Instruction instr);
void Compile_DP3(Instruction instr);
void Compile_DP4(Instruction instr);
+ void Compile_DPH(Instruction instr);
+ void Compile_EX2(Instruction instr);
+ void Compile_LG2(Instruction instr);
void Compile_MUL(Instruction instr);
+ void Compile_SGE(Instruction instr);
+ void Compile_SLT(Instruction instr);
void Compile_FLR(Instruction instr);
void Compile_MAX(Instruction instr);
void Compile_MIN(Instruction instr);
@@ -45,7 +50,6 @@ public:
void Compile_RSQ(Instruction instr);
void Compile_MOVA(Instruction instr);
void Compile_MOV(Instruction instr);
- void Compile_SLTI(Instruction instr);
void Compile_NOP(Instruction instr);
void Compile_END(Instruction instr);
void Compile_CALL(Instruction instr);
@@ -64,9 +68,18 @@ private:
void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
+ /**
+ * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
+ * zero by inf. Clobbers `src2` and `scratch`.
+ */
+ void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
+
void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr);
+ void Compile_PushCallerSavedXMM();
+ void Compile_PopCallerSavedXMM();
+
/// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
unsigned* offset_ptr = nullptr;