3 files changed, 28 insertions, 33 deletions
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index fb65a3a0a..3ab4af374 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -243,6 +243,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
                     g_state.geometry_pipeline.Setup(shader_engine);
                     g_state.geometry_pipeline.SubmitVertex(output);
+
+                    // TODO: If drawing after every immediate mode triangle kills performance,
+                    // change it to flush triangles whenever a drawing config register changes
+                    // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550
+                    VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+                    if (g_debug_context) {
+                        g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch,
+                                                 nullptr);
+                    }
                 }
             }
         }
@@ -250,16 +259,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     }
 
     case PICA_REG_INDEX(pipeline.gpu_mode):
-        if (regs.pipeline.gpu_mode == PipelineRegs::GPUMode::Configuring) {
-            MICROPROFILE_SCOPE(GPU_Drawing);
-
-            // Draw immediate mode triangles when GPU Mode is set to GPUMode::Configuring
-            VideoCore::g_renderer->Rasterizer()->DrawTriangles();
-
-            if (g_debug_context) {
-                g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
-            }
-        }
+        // This register likely just enables vertex processing and doesn't need any special handling
         break;
 
     case PICA_REG_INDEX_WORKAROUND(pipeline.command_buffer.trigger[0], 0x23c):
@@ -398,6 +398,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                                                       range.second, range.first);
         }
 
+        VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+        if (g_debug_context) {
+            g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+        }
+
         break;
     }
 
@@ -632,6 +637,6 @@ void ProcessCommandList(const u32* list, u32 size) {
     }
 }
 
-} // namespace
+} // namespace CommandProcessor
 
-} // namespace
+} // namespace Pica
diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h
index 5d7e10066..2eafa7e9e 100644
--- a/src/video_core/pica_types.h
+++ b/src/video_core/pica_types.h
@@ -58,11 +58,12 @@ public:
     }
 
     Float<M, E> operator*(const Float<M, E>& flt) const {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            return Zero();
-        return Float<M, E>::FromFloat32(ToFloat32() * flt.ToFloat32());
+        float result = value * flt.ToFloat32();
+        // PICA yields 0, not NaN, for 0 * inf; a product with a NaN operand is left as computed
+        if (!std::isnan(value) && !std::isnan(flt.ToFloat32()))
+            if (std::isnan(result)) // both operands non-NaN, so the NaN came from 0 * inf
+                result = 0.f;
+        return Float<M, E>::FromFloat32(result);
     }
 
     Float<M, E> operator/(const Float<M, E>& flt) const {
@@ -78,12 +79,7 @@ public:
     }
 
     Float<M, E>& operator*=(const Float<M, E>& flt) {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            *this = Zero();
-        else
-            value *= flt.ToFloat32();
+        value = operator*(flt).value; // delegate to operator* so both share one multiply rule
         return *this;
     }
 
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 7ce83a055..d8567f314 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -8,17 +8,11 @@
 
 namespace VideoCore {
 
-/**
- * Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
- * arranged in a Z-order curve. More details on the bit manipulation at:
- * https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
- */
+// Z-order (Morton) index inside an 8x8 tile: bits of x and y (each taken mod 8) interleaved
 static inline u32 MortonInterleave(u32 x, u32 y) {
-    u32 i = (x & 7) | ((y & 7) << 8); // ---- -210
-    i = (i ^ (i << 2)) & 0x1313;      // ---2 --10
-    i = (i ^ (i << 1)) & 0x1515;      // ---2 -1-0
-    i = (i | (i >> 7)) & 0x3F;
-    return i;
+    static const u32 xlut[] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15}; // x -> bits 0,2,4
+    static const u32 ylut[] = {0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a}; // y -> bits 1,3,5
+    return xlut[x % 8] + ylut[y % 8]; // the two bit sets are disjoint, so + behaves like |
 }
 
 /**