13 files changed, 561 insertions, 321 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 13c3f7b22..4c1e6449a 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -19,6 +19,7 @@ set(HEADERS
             renderer_opengl/gl_shaders.h
             renderer_opengl/renderer_opengl.h
             clipper.h
+            color.h
             command_processor.h
             gpu_debugger.h
             math.h
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 1744066ba..ba3876a76 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -15,30 +15,18 @@ namespace Clipper {
 
 struct ClippingEdge {
 public:
-    enum Type {
-        POS_X = 0,
-        NEG_X = 1,
-        POS_Y = 2,
-        NEG_Y = 3,
-        POS_Z = 4,
-        NEG_Z = 5,
-    };
-
-    ClippingEdge(Type type, float24 position) : type(type), pos(position) {}
+    ClippingEdge(Math::Vec4<float24> coeffs,
+                 Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0),
+                                                                float24::FromFloat32(0),
+                                                                float24::FromFloat32(0),
+                                                                float24::FromFloat32(0)))
+        : coeffs(coeffs),
+          bias(bias)
+    {
+    }
 
     bool IsInside(const OutputVertex& vertex) const {
-        switch (type) {
-        case POS_X: return vertex.pos.x <= pos * vertex.pos.w;
-        case NEG_X: return vertex.pos.x >= pos * vertex.pos.w;
-        case POS_Y: return vertex.pos.y <= pos * vertex.pos.w;
-        case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w;
-
-        // TODO: Check z compares ... should be 0..1 instead?
-        case POS_Z: return vertex.pos.z <= pos * vertex.pos.w;
-
-        default:
-        case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w;
-        }
+        return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
     }
 
     bool IsOutSide(const OutputVertex& vertex) const {
@@ -46,31 +34,17 @@ public:
     }
 
     OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
-        auto dotpr = [this](const OutputVertex& vtx) {
-            switch (type) {
-            case POS_X: return vtx.pos.x - vtx.pos.w;
-            case NEG_X: return -vtx.pos.x - vtx.pos.w;
-            case POS_Y: return vtx.pos.y - vtx.pos.w;
-            case NEG_Y: return -vtx.pos.y - vtx.pos.w;
-
-            // TODO: Verify z clipping
-            case POS_Z: return vtx.pos.z - vtx.pos.w;
-
-            default:
-            case NEG_Z: return -vtx.pos.w;
-            }
-        };
-
-        float24 dp = dotpr(v0);
-        float24 dp_prev = dotpr(v1);
+        float24 dp = Math::Dot(v0.pos + bias, coeffs);
+        float24 dp_prev = Math::Dot(v1.pos + bias, coeffs);
         float24 factor = dp_prev / (dp_prev - dp);
 
         return OutputVertex::Lerp(factor, v0, v1);
     }
 
 private:
-    Type type;
     float24 pos;
+    Math::Vec4<float24> coeffs;
+    Math::Vec4<float24> bias;
 };
 
 static void InitScreenCoordinates(OutputVertex& vtx)
@@ -98,10 +72,9 @@ static void InitScreenCoordinates(OutputVertex& vtx)
     vtx.tc2 *= inv_w;
     vtx.pos.w = inv_w;
 
-    // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
     vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
     vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
+    vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
 }
 
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
@@ -117,14 +90,29 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
     auto* output_list = &buffer_a;
     auto* input_list  = &buffer_b;
 
+    // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
+    // TODO: Not sure if this is a valid approach (NOTE(review): with a +EPSILON bias, IsInside() accepts
+    //       w >= -EPSILON; verify the sign). Also should probably use the smallest float24 epsilon.
+    static const float24 EPSILON = float24::FromFloat32(0.00001);
+    static const float24 f0 = float24::FromFloat32(0.0);
+    static const float24 f1 = float24::FromFloat32(1.0);
+    static const std::array<ClippingEdge, 7> clipping_edges = {{
+        { Math::MakeVec( f1,  f0,  f0, -f1) },  // x = +w
+        { Math::MakeVec(-f1,  f0,  f0, -f1) },  // x = -w
+        { Math::MakeVec( f0,  f1,  f0, -f1) },  // y = +w
+        { Math::MakeVec( f0, -f1,  f0, -f1) },  // y = -w
+        { Math::MakeVec( f0,  f0,  f1,  f0) },  // z =  0
+        { Math::MakeVec( f0,  f0, -f1, -f1) },  // z = -w
+        { Math::MakeVec( f0,  f0,  f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON) }, // w = EPSILON
+    }};
+
+    // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
+    //       drop the whole primitive instead of clipping the primitive properly. We should test if
+    //       this happens on the 3DS, too.
+
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
-    for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)),
-                       ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)),
-                       ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)),
-                       ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)),
-                       ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
-                       ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
+    for (auto edge : clipping_edges) {
 
         std::swap(input_list, output_list);
         output_list->clear();
diff --git a/src/video_core/color.h b/src/video_core/color.h
index e86ac1265..35da901f2 100644
--- a/src/video_core/color.h
+++ b/src/video_core/color.h
@@ -5,28 +5,152 @@
 #pragma once
 
 #include "common/common_types.h"
+#include "video_core/math.h"
 
 namespace Color {
 
 /// Convert a 1-bit color component to 8 bit
-static inline u8 Convert1To8(u8 value) {
+inline u8 Convert1To8(u8 value) {
     return value * 255;
 }
 
 /// Convert a 4-bit color component to 8 bit
-static inline u8 Convert4To8(u8 value) {
+inline u8 Convert4To8(u8 value) {
     return (value << 4) | value;
 }
 
 /// Convert a 5-bit color component to 8 bit
-static inline u8 Convert5To8(u8 value) {
+inline u8 Convert5To8(u8 value) {
     return (value << 3) | (value >> 2);
 }
 
 /// Convert a 6-bit color component to 8 bit
-static inline u8 Convert6To8(u8 value) {
+inline u8 Convert6To8(u8 value) {
     return (value << 2) | (value >> 4);
 }
 
+/// Convert an 8-bit color component to 1 bit by keeping only its most significant bit
+inline u8 Convert8To1(u8 value) {
+    return value >> 7;
+}
+
+/// Convert an 8-bit color component to 4 bit by keeping its upper 4 bits (lower bits are dropped)
+inline u8 Convert8To4(u8 value) {
+    return value >> 4;
+}
+
+/// Convert an 8-bit color component to 5 bit by keeping its upper 5 bits (lower bits are dropped)
+inline u8 Convert8To5(u8 value) {
+    return value >> 3;
+}
+
+/// Convert an 8-bit color component to 6 bit by keeping its upper 6 bits (lower bits are dropped)
+inline u8 Convert8To6(u8 value) {
+    return value >> 2;
+}
+
+/**
+ * Decode a color stored in RGBA8 format (components are read from bytes[3] down to bytes[0])
+ * @param bytes Pointer to encoded source color; four bytes are read
+ * @return Result color decoded as Math::Vec4<u8>
+ */
+inline const Math::Vec4<u8> DecodeRGBA8(const u8* bytes) {
+    return { bytes[3], bytes[2], bytes[1], bytes[0] };
+}
+
+/**
+ * Decode a color stored in RGB8 format (components are read from bytes[2] down to bytes[0])
+ * @param bytes Pointer to encoded source color; three bytes are read
+ * @return Result color decoded as Math::Vec4<u8>, with the fourth component set to 255
+ */
+inline const Math::Vec4<u8> DecodeRGB8(const u8* bytes) {
+    return { bytes[2], bytes[1], bytes[0], 255 };
+}
+
+/**
+ * Decode a color stored in RGB565 format (5 bits at bit 11, 6 bits at bit 5, 5 bits at bit 0)
+ * @param bytes Pointer to encoded source color; accessed as a single u16_le value
+ * @return Result color decoded as Math::Vec4<u8>, with the fourth component set to 255
+ */
+inline const Math::Vec4<u8> DecodeRGB565(const u8* bytes) {
+    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
+    return { Convert5To8((pixel >> 11) & 0x1F), Convert6To8((pixel >> 5) & 0x3F),
+        Convert5To8(pixel & 0x1F), 255 };
+}
+
+/**
+ * Decode a color stored in RGB5A1 format (5-bit fields at bits 11, 6 and 1; 1-bit field at bit 0)
+ * @param bytes Pointer to encoded source color; accessed as a single u16_le value
+ * @return Result color decoded as Math::Vec4<u8>, each field expanded to 8 bits
+ */
+inline const Math::Vec4<u8> DecodeRGB5A1(const u8* bytes) {
+    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
+    return { Convert5To8((pixel >> 11) & 0x1F), Convert5To8((pixel >> 6) & 0x1F),
+        Convert5To8((pixel >> 1) & 0x1F), Convert1To8(pixel & 0x1) };
+}
+
+/**
+ * Decode a color stored in RGBA4 format (4-bit fields at bits 12, 8, 4 and 0)
+ * @param bytes Pointer to encoded source color; accessed as a single u16_le value
+ * @return Result color decoded as Math::Vec4<u8>, each field expanded to 8 bits
+ */
+inline const Math::Vec4<u8> DecodeRGBA4(const u8* bytes) {
+    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
+    return { Convert4To8((pixel >> 12) & 0xF), Convert4To8((pixel >> 8) & 0xF),
+        Convert4To8((pixel >> 4) & 0xF), Convert4To8(pixel & 0xF) };
+}
+
+/**
+ * Encode a color as RGBA8 format (r, g, b, a are stored to bytes[3], bytes[2], bytes[1], bytes[0])
+ * @param color Source color to encode
+ * @param bytes Destination pointer to store encoded color; four bytes are written
+ */
+inline void EncodeRGBA8(const Math::Vec4<u8>& color, u8* bytes) {
+    bytes[3] = color.r();
+    bytes[2] = color.g();
+    bytes[1] = color.b();
+    bytes[0] = color.a();
+}
+
+/**
+ * Encode a color as RGB8 format (r, g, b are stored to bytes[2], bytes[1], bytes[0])
+ * @param color Source color to encode; its alpha component is not stored
+ * @param bytes Destination pointer to store encoded color; three bytes are written
+ */
+inline void EncodeRGB8(const Math::Vec4<u8>& color, u8* bytes) {
+    bytes[2] = color.r();
+    bytes[1] = color.g();
+    bytes[0] = color.b();
+}
+
+/**
+ * Encode a color as RGB565 format (upper 5/6/5 bits of r/g/b packed at bits 11/5/0)
+ * @param color Source color to encode; its alpha component is not stored
+ * @param bytes Destination pointer to store encoded color; written as a single u16_le value
+ */
+inline void EncodeRGB565(const Math::Vec4<u8>& color, u8* bytes) {
+    *reinterpret_cast<u16_le*>(bytes) = (Convert8To5(color.r()) << 11) |
+        (Convert8To6(color.g()) << 5) | Convert8To5(color.b());
+}
+
+/**
+ * Encode a color as RGB5A1 format (upper 5 bits of r/g/b at bits 11/6/1, top alpha bit at bit 0)
+ * @param color Source color to encode
+ * @param bytes Destination pointer to store encoded color; written as a single u16_le value
+ */
+inline void EncodeRGB5A1(const Math::Vec4<u8>& color, u8* bytes) {
+    *reinterpret_cast<u16_le*>(bytes) = (Convert8To5(color.r()) << 11) |
+        (Convert8To5(color.g()) << 6) | (Convert8To5(color.b()) << 1) | Convert8To1(color.a());
+}
+
+/**
+ * Encode a color as RGBA4 format (upper 4 bits of r/g/b/a packed at bits 12/8/4/0)
+ * @param color Source color to encode
+ * @param bytes Destination pointer to store encoded color; written as a single u16_le value
+ */
+inline void EncodeRGBA4(const Math::Vec4<u8>& color, u8* bytes) {
+    *reinterpret_cast<u16_le*>(bytes) = (Convert8To4(color.r()) << 12) |
+        (Convert8To4(color.g()) << 8) | (Convert8To4(color.b()) << 4) | Convert8To4(color.a());
+}
 
 } // namespace
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 0d9f4ba66..e031871e8 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -2,6 +2,10 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <boost/range/algorithm/fill.hpp>
+
+#include "common/profiler.h"
+
 #include "clipper.h"
 #include "command_processor.h"
 #include "math.h"
@@ -23,9 +27,7 @@ static int float_regs_counter = 0;
 
 static u32 uniform_write_buffer[4];
 
-// Used for VSLoadProgramData and VSLoadSwizzleData
-static u32 vs_binary_write_offset = 0;
-static u32 vs_swizzle_write_offset = 0;
+Common::Profiling::TimingCategory category_drawing("Drawing");
 
 static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
@@ -55,6 +57,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX(trigger_draw):
         case PICA_REG_INDEX(trigger_draw_indexed):
         {
+            Common::Profiling::ScopeTimer scope_timer(category_drawing);
+
             DebugUtils::DumpTevStageConfig(registers.GetTevStages());
 
             if (g_debug_context)
@@ -65,10 +69,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             // Information about internal vertex attributes
             u32 vertex_attribute_sources[16];
-            std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef);
+            boost::fill(vertex_attribute_sources, 0xdeadbeef);
             u32 vertex_attribute_strides[16];
             u32 vertex_attribute_formats[16];
-            u32 vertex_attribute_elements[16];
+
+            // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
+            // This is one of the hacks required to deal with uninitialized vertex attributes.
+            // TODO: Fix this properly.
+            u32 vertex_attribute_elements[16] = {};
             u32 vertex_attribute_element_size[16];
 
             // Setup attribute data from loaders
@@ -252,11 +260,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             break;
         }
 
-        // Seems to be used to reset the write pointer for VSLoadProgramData
-        case PICA_REG_INDEX(vs_program.begin_load):
-            vs_binary_write_offset = 0;
-            break;
-
         // Load shader program code
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
@@ -267,16 +270,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
         {
-            VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value);
-            vs_binary_write_offset++;
+            VertexShader::SubmitShaderMemoryChange(registers.vs_program.offset, value);
+            registers.vs_program.offset++;
             break;
         }
 
-        // Seems to be used to reset the write pointer for VSLoadSwizzleData
-        case PICA_REG_INDEX(vs_swizzle_patterns.begin_load):
-            vs_swizzle_write_offset = 0;
-            break;
-
         // Load swizzle pattern data
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
@@ -287,8 +285,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
         {
-            VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value);
-            vs_swizzle_write_offset++;
+            VertexShader::SubmitSwizzleDataChange(registers.vs_swizzle_patterns.offset, value);
+            registers.vs_swizzle_patterns.offset++;
             break;
         }
 
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 8c4ec1044..745c4f4ed 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -23,6 +23,7 @@
 #include "video_core/color.h"
 #include "video_core/math.h"
 #include "video_core/pica.h"
+#include "video_core/utils.h"
 
 #include "debug_utils.h"
 
@@ -189,7 +190,7 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
                                         );
 
                     if (it == output_info_table.end()) {
-                        output_info_table.push_back({});
+                        output_info_table.emplace_back();
                         output_info_table.back().type = type;
                         output_info_table.back().component_mask = component_mask;
                         output_info_table.back().id = i;
@@ -285,7 +286,7 @@ void OnPicaRegWrite(u32 id, u32 value)
     if (!is_pica_tracing)
         return;
 
-    pica_trace->writes.push_back({id, value});
+    pica_trace->writes.emplace_back(id, value);
 }
 
 std::unique_ptr<PicaTrace> FinishPicaTracing()
@@ -306,111 +307,69 @@ std::unique_ptr<PicaTrace> FinishPicaTracing()
 }
 
 const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const TextureInfo& info, bool disable_alpha) {
-    // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each
-    // of which is composed of four 2x2 subtiles each of which is composed of four texels.
-    // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g.
-    // texels are laid out in a 2x2 subtile like this:
-    // 2 3
-    // 0 1
-    //
-    // The full 8x8 tile has the texels arranged like this:
-    //
-    // 42 43 46 47 58 59 62 63
-    // 40 41 44 45 56 57 60 61
-    // 34 35 38 39 50 51 54 55
-    // 32 33 36 37 48 49 52 53
-    // 10 11 14 15 26 27 30 31
-    // 08 09 12 13 24 25 28 29
-    // 02 03 06 07 18 19 22 23
-    // 00 01 04 05 16 17 20 21
-
-    const unsigned int block_width = 8;
-    const unsigned int block_height = 8;
-
     const unsigned int coarse_x = x & ~7;
     const unsigned int coarse_y = y & ~7;
 
-    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
-    // arranged in a Z-order curve. More details on the bit manipulation at:
-    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
-    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
-    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
-    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
-    i = (i | (i >> 7)) & 0x3F;
-
     if (info.format != Regs::TextureFormat::ETC1 &&
         info.format != Regs::TextureFormat::ETC1A4) {
         // TODO(neobrain): Fix code design to unify vertical block offsets!
         source += coarse_y * info.stride;
     }
-    const unsigned int offset = coarse_x * block_height;
-
+    
     // TODO: Assert that width/height are multiples of block dimensions
 
     switch (info.format) {
     case Regs::TextureFormat::RGBA8:
     {
-        const u8* source_ptr = source + offset * 4 + i * 4;
-        return { source_ptr[3], source_ptr[2], source_ptr[1], disable_alpha ? (u8)255 : source_ptr[0] };
+        auto res = Color::DecodeRGBA8(source + VideoCore::GetMortonOffset(x, y, 4));
+        return { res.r(), res.g(), res.b(), disable_alpha ? 255 : res.a() };
     }
 
     case Regs::TextureFormat::RGB8:
     {
-        const u8* source_ptr = source + offset * 3 + i * 3;
-        return { source_ptr[2], source_ptr[1], source_ptr[0], 255 };
+        auto res = Color::DecodeRGB8(source + VideoCore::GetMortonOffset(x, y, 3));
+        return { res.r(), res.g(), res.b(), 255 };
     }
 
-    case Regs::TextureFormat::RGBA5551:
+    case Regs::TextureFormat::RGB5A1:
     {
-        const u16 source_ptr = *(const u16*)(source + offset * 2 + i * 2);
-        u8 r = (source_ptr >> 11) & 0x1F;
-        u8 g = ((source_ptr) >> 6) & 0x1F;
-        u8 b = (source_ptr >> 1) & 0x1F;
-        u8 a = source_ptr & 1;
-        return Math::MakeVec<u8>(Color::Convert5To8(r), Color::Convert5To8(g),
-                                 Color::Convert5To8(b), disable_alpha ? 255 : Color::Convert1To8(a));
+        auto res = Color::DecodeRGB5A1(source + VideoCore::GetMortonOffset(x, y, 2));
+        return { res.r(), res.g(), res.b(), disable_alpha ? 255 : res.a() };
     }
 
     case Regs::TextureFormat::RGB565:
     {
-        const u16 source_ptr = *(const u16*)(source + offset * 2 + i * 2);
-        u8 r = Color::Convert5To8((source_ptr >> 11) & 0x1F);
-        u8 g = Color::Convert6To8(((source_ptr) >> 5) & 0x3F);
-        u8 b = Color::Convert5To8((source_ptr) & 0x1F);
-        return Math::MakeVec<u8>(r, g, b, 255);
+        auto res = Color::DecodeRGB565(source + VideoCore::GetMortonOffset(x, y, 2));
+        return { res.r(), res.g(), res.b(), 255 };
     }
 
     case Regs::TextureFormat::RGBA4:
     {
-        const u8* source_ptr = source + offset * 2 + i * 2;
-        u8 r = Color::Convert4To8(source_ptr[1] >> 4);
-        u8 g = Color::Convert4To8(source_ptr[1] & 0xF);
-        u8 b = Color::Convert4To8(source_ptr[0] >> 4);
-        u8 a = Color::Convert4To8(source_ptr[0] & 0xF);
-        return { r, g, b, disable_alpha ? (u8)255 : a };
+        auto res = Color::DecodeRGBA4(source + VideoCore::GetMortonOffset(x, y, 2));
+        return { res.r(), res.g(), res.b(), disable_alpha ? 255 : res.a() };
     }
 
     case Regs::TextureFormat::IA8:
     {
-        const u8* source_ptr = source + offset * 2 + i * 2;
+        const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 2);
 
         if (disable_alpha) {
             // Show intensity as red, alpha as green
             return { source_ptr[1], source_ptr[0], 0, 255 };
         } else {
-            return { source_ptr[1], source_ptr[1], source_ptr[1], source_ptr[0]};
+            return { source_ptr[1], source_ptr[1], source_ptr[1], source_ptr[0] };
         }
     }
 
     case Regs::TextureFormat::I8:
     {
-        const u8* source_ptr = source + offset + i;
+        const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 1);
         return { *source_ptr, *source_ptr, *source_ptr, 255 };
     }
 
     case Regs::TextureFormat::A8:
     {
-        const u8* source_ptr = source + offset + i;
+        const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 1);
 
         if (disable_alpha) {
             return { *source_ptr, *source_ptr, *source_ptr, 255 };
@@ -421,7 +380,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::IA4:
     {
-        const u8* source_ptr = source + offset + i;
+        const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 1);
 
         u8 i = Color::Convert4To8(((*source_ptr) & 0xF0) >> 4);
         u8 a = Color::Convert4To8((*source_ptr) & 0xF);
@@ -436,9 +395,10 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::A4:
     {
-        const u8* source_ptr = source + offset / 2 + i / 2;
+        u32 morton_offset = VideoCore::GetMortonOffset(x, y, 1);
+        const u8* source_ptr = source + morton_offset / 2;
 
-        u8 a = (coarse_x % 2) ? ((*source_ptr)&0xF) : (((*source_ptr) & 0xF0) >> 4);
+        u8 a = (morton_offset % 2) ? ((*source_ptr & 0xF0) >> 4) : (*source_ptr & 0xF);
         a = Color::Convert4To8(a);
 
         if (disable_alpha) {
@@ -545,7 +505,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
                 }
 
                 // Add modifier
-                unsigned table_index = (x < 2) ? table_index_2.Value() : table_index_1.Value();
+                unsigned table_index = (x < 2) ? table_index_1.Value() : table_index_2.Value();
 
                 static const auto etc1_modifier_table = std::array<std::array<u8, 2>, 8>{{
                     {  2,  8 }, {  5, 17 }, {  9,  29 }, { 13,  42 },
diff --git a/src/video_core/gpu_debugger.h b/src/video_core/gpu_debugger.h
index 03641d93b..48ac269e3 100644
--- a/src/video_core/gpu_debugger.h
+++ b/src/video_core/gpu_debugger.h
@@ -58,8 +58,8 @@ public:
         if (observers.empty())
             return;
 
-        gx_command_history.push_back(GSP_GPU::Command());
-        GSP_GPU::Command& cmd = gx_command_history[gx_command_history.size()-1];
+        gx_command_history.emplace_back();
+        GSP_GPU::Command& cmd = gx_command_history.back();
 
         memcpy(&cmd, command_data, sizeof(GSP_GPU::Command));
 
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 9c1a12dc8..b14de9278 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -39,13 +39,6 @@ namespace Pica {
 
 struct Regs {
 
-// helper macro to properly align structure members.
-// Calling INSERT_PADDING_WORDS will add a new member variable with a name like "pad121",
-// depending on the current source line to make sure variable names are unique.
-#define INSERT_PADDING_WORDS_HELPER1(x, y) x ## y
-#define INSERT_PADDING_WORDS_HELPER2(x, y) INSERT_PADDING_WORDS_HELPER1(x, y)
-#define INSERT_PADDING_WORDS(num_words) u32 INSERT_PADDING_WORDS_HELPER2(pad, __LINE__)[(num_words)];
-
     INSERT_PADDING_WORDS(0x10);
 
     u32 trigger_irq;
@@ -118,8 +111,9 @@ struct Regs {
 
     struct TextureConfig {
         enum WrapMode : u32 {
-            ClampToEdge = 0,
-            Repeat      = 2,
+            ClampToEdge    = 0,
+            Repeat         = 2,
+            MirroredRepeat = 3,
         };
 
         INSERT_PADDING_WORDS(0x1);
@@ -131,7 +125,7 @@ struct Regs {
 
         union {
             BitField< 8, 2, WrapMode> wrap_s;
-            BitField<11, 2, WrapMode> wrap_t;
+            BitField<12, 2, WrapMode> wrap_t;
         };
 
         INSERT_PADDING_WORDS(0x1);
@@ -151,7 +145,7 @@ struct Regs {
     enum class TextureFormat : u32 {
         RGBA8        =  0,
         RGB8         =  1,
-        RGBA5551     =  2,
+        RGB5A1       =  2,
         RGB565       =  3,
         RGBA4        =  4,
         IA8          =  5,
@@ -173,7 +167,7 @@ struct Regs {
         case TextureFormat::RGB8:
             return 6;
 
-        case TextureFormat::RGBA5551:
+        case TextureFormat::RGB5A1:
         case TextureFormat::RGB565:
         case TextureFormat::RGBA4:
         case TextureFormat::IA8:
@@ -223,6 +217,8 @@ struct Regs {
     struct TevStageConfig {
         enum class Source : u32 {
             PrimaryColor           = 0x0,
+            PrimaryFragmentColor   = 0x1,
+
             Texture0               = 0x3,
             Texture1               = 0x4,
             Texture2               = 0x5,
@@ -265,6 +261,9 @@ struct Regs {
             AddSigned       = 3,
             Lerp            = 4,
             Subtract        = 5,
+
+            MultiplyThenAdd = 8,
+            AddThenMultiply = 9,
         };
 
         union {
@@ -337,7 +336,7 @@ struct Regs {
         };
 
         union {
-            enum BlendEquation : u32 {
+            enum class BlendEquation : u32 {
                 Add             = 0,
                 Subtract        = 1,
                 ReverseSubtract = 2,
@@ -410,10 +409,11 @@ struct Regs {
     } output_merger;
 
     struct {
+        // Components are laid out in reverse byte order, most significant bits first.
         enum ColorFormat : u32 {
             RGBA8    = 0,
             RGB8     = 1,
-            RGBA5551 = 2,
+            RGB5A1   = 2,
             RGB565   = 3,
             RGBA4    = 4,
         };
@@ -421,7 +421,7 @@ struct Regs {
         INSERT_PADDING_WORDS(0x6);
 
         u32 depth_format;
-        u32 color_format;
+        BitField<16, 3, u32> color_format;
 
         INSERT_PADDING_WORDS(0x4);
 
@@ -678,7 +678,9 @@ struct Regs {
     INSERT_PADDING_WORDS(0x2);
 
     struct {
-        u32 begin_load;
+        // Offset of the next instruction to write code to.
+        // Incremented with each instruction write.
+        u32 offset;
 
         // Writing to these registers sets the "current" word in the shader program.
         // TODO: It's not clear how the hardware stores what the "current" word is.
@@ -690,7 +692,9 @@ struct Regs {
     // This register group is used to load an internal table of swizzling patterns,
     // which are indexed by each shader instruction to specify vector component swizzling.
     struct {
-        u32 begin_load;
+        // Offset of the next swizzle pattern to write.
+        // Incremented with each swizzle pattern write.
+        u32 offset;
 
         // Writing to these registers sets the "current" swizzle pattern in the table.
         // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
@@ -699,10 +703,6 @@ struct Regs {
 
     INSERT_PADDING_WORDS(0x22);
 
-#undef INSERT_PADDING_WORDS_HELPER1
-#undef INSERT_PADDING_WORDS_HELPER2
-#undef INSERT_PADDING_WORDS
-
     // Map register indices to names readable by humans
     // Used for debugging purposes, so performance is not an issue here
     static std::string GetCommandName(int index) {
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 3faa10153..5861c1926 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -5,13 +5,16 @@
 #include <algorithm>
 
 #include "common/common_types.h"
+#include "common/math_util.h"
 
+#include "core/hw/gpu.h"
+#include "debug_utils/debug_utils.h"
 #include "math.h"
+#include "color.h"
 #include "pica.h"
 #include "rasterizer.h"
 #include "vertex_shader.h"
-
-#include "debug_utils/debug_utils.h"
+#include "video_core/utils.h"
 
 namespace Pica {
 
@@ -19,40 +22,101 @@ namespace Rasterizer {
 
 static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
-    u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
-    u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
 
-    // Assuming RGBA8 format until actual framebuffer format handling is implemented
-    *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
+    // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
+    // NOTE: The framebuffer height register contains the actual FB height minus one.
+    y = (registers.framebuffer.height - y);
+
+    const u32 coarse_y = y & ~7;
+    u32 bytes_per_pixel = GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
+    u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * registers.framebuffer.width * bytes_per_pixel;
+    u8* dst_pixel = Memory::GetPointer(PAddrToVAddr(addr)) + dst_offset;
+
+    switch (registers.framebuffer.color_format) {
+    case registers.framebuffer.RGBA8:
+        Color::EncodeRGBA8(color, dst_pixel);
+        break;
+
+    case registers.framebuffer.RGB8:
+        Color::EncodeRGB8(color, dst_pixel);
+        break;
+
+    case registers.framebuffer.RGB5A1:
+        Color::EncodeRGB5A1(color, dst_pixel);
+        break;
+
+    case registers.framebuffer.RGB565:
+        Color::EncodeRGB565(color, dst_pixel);
+        break;
+
+    case registers.framebuffer.RGBA4:
+        Color::EncodeRGBA4(color, dst_pixel);
+        break;
+
+    default:
+        LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format.Value());
+        UNIMPLEMENTED();
+    }
 }
 
 static const Math::Vec4<u8> GetPixel(int x, int y) {
     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
-    u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
-
-    u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
-    Math::Vec4<u8> ret;
-    ret.a() = value >> 24;
-    ret.r() = (value >> 16) & 0xFF;
-    ret.g() = (value >> 8) & 0xFF;
-    ret.b() = value & 0xFF;
-    return ret;
+
+    y = (registers.framebuffer.height - y);
+
+    const u32 coarse_y = y & ~7;
+    u32 bytes_per_pixel = GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
+    u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * registers.framebuffer.width * bytes_per_pixel;
+    u8* src_pixel = Memory::GetPointer(PAddrToVAddr(addr)) + src_offset;
+
+    switch (registers.framebuffer.color_format) {
+    case registers.framebuffer.RGBA8:
+        return Color::DecodeRGBA8(src_pixel);
+
+    case registers.framebuffer.RGB8:
+        return Color::DecodeRGB8(src_pixel);
+
+    case registers.framebuffer.RGB5A1:
+        return Color::DecodeRGB5A1(src_pixel);
+
+    case registers.framebuffer.RGB565:
+        return Color::DecodeRGB565(src_pixel);
+
+    case registers.framebuffer.RGBA4:
+        return Color::DecodeRGBA4(src_pixel);
+
+    default:
+        LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format.Value());
+        UNIMPLEMENTED();
+    }
+
+    return {};
  }
 
 static u32 GetDepth(int x, int y) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
-    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
+    u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));
+
+    y = (registers.framebuffer.height - y);
+    
+    const u32 coarse_y = y & ~7;
+    u32 stride = registers.framebuffer.width * 2;
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
-    return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
+    return *(u16*)(depth_buffer + VideoCore::GetMortonOffset(x, y, 2) + coarse_y * stride);
 }
 
 static void SetDepth(int x, int y, u16 value) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
-    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
+    u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));
+
+    y = (registers.framebuffer.height - y);
+
+    const u32 coarse_y = y & ~7;
+    u32 stride = registers.framebuffer.width * 2;
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
-    *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
+    *(u16*)(depth_buffer + VideoCore::GetMortonOffset(x, y, 2) + coarse_y * stride) = value;
 }
 
 // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
@@ -90,30 +154,43 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
     return Math::Cross(vec1, vec2).z;
 };
 
-void ProcessTriangle(const VertexShader::OutputVertex& v0,
-                     const VertexShader::OutputVertex& v1,
-                     const VertexShader::OutputVertex& v2)
+/**
+ * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
+ * culling via recursion.
+ */
+static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
+                                    const VertexShader::OutputVertex& v1,
+                                    const VertexShader::OutputVertex& v2,
+                                    bool reversed = false)
 {
     // vertex positions in rasterizer coordinates
-    auto FloatToFix = [](float24 flt) {
-                          return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f));
-                      };
-    auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
-                                             return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
-                                         };
+    static auto FloatToFix = [](float24 flt) {
+        // TODO: Rounding here is necessary to prevent garbage pixels at
+        //       triangle borders. Is that the correct solution, though?
+        return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
+    };
+    static auto ScreenToRasterizerCoordinates = [](const Math::Vec3<float24>& vec) {
+        return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
+    };
 
     Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
                                    ScreenToRasterizerCoordinates(v1.screenpos),
                                    ScreenToRasterizerCoordinates(v2.screenpos) };
 
-    if (registers.cull_mode == Regs::CullMode::KeepClockWise) {
-        // Reverse vertex order and use the CCW code path.
-        std::swap(vtxpos[1], vtxpos[2]);
-    }
+    if (registers.cull_mode == Regs::CullMode::KeepAll) {
+        // Make sure we always end up with a triangle wound counter-clockwise
+        if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
+            ProcessTriangleInternal(v0, v2, v1, true);
+            return;
+        }
+    } else {
+        if (!reversed && registers.cull_mode == Regs::CullMode::KeepClockWise) {
+            // Reverse vertex order and use the CCW code path.
+            ProcessTriangleInternal(v0, v2, v1, true);
+            return;
+        }
 
-    if (registers.cull_mode != Regs::CullMode::KeepAll) {
         // Cull away triangles which are wound clockwise.
-        // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
         if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
             return;
     }
@@ -155,9 +232,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     auto textures = registers.GetTextures();
     auto tev_stages = registers.GetTevStages();
 
+    // Enter rasterization loop, starting at the center of the top-left bounding box corner.
     // TODO: Not sure if looping through x first might be faster
-    for (u16 y = min_y; y < max_y; y += 0x10) {
-        for (u16 x = min_x; x < max_x; x += 0x10) {
+    for (u16 y = min_y + 8; y < max_y; y += 0x10) {
+        for (u16 x = min_x + 8; x < max_x; x += 0x10) {
 
             // Calculate the barycentric coordinates w0, w1 and w2
             int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
@@ -220,7 +298,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
 
                 int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
                 int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
-                auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
+                static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
                     switch (mode) {
                         case Regs::TextureConfig::ClampToEdge:
                             val = std::max(val, 0);
@@ -228,7 +306,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                             return val;
 
                         case Regs::TextureConfig::Repeat:
-                            return (int)(((unsigned)val) % size);
+                            return (int)((unsigned)val % size);
+
+                        case Regs::TextureConfig::MirroredRepeat:
+                        {
+                            int coord = (int)((unsigned)val % (2 * size));
+                            if (coord >= size)
+                                coord = 2 * size - 1 - coord;
+                            return coord;
+                        }
 
                         default:
                             LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode);
@@ -236,6 +322,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                             return 0;
                     }
                 };
+
+                // Textures are laid out from bottom to top, hence we invert the t coordinate.
+                // NOTE: This may not be the right place for the inversion.
+                // TODO: Check if this applies to ETC textures, too.
                 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
                 t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
 
@@ -262,7 +352,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
 
                 auto GetSource = [&](Source source) -> Math::Vec4<u8> {
                     switch (source) {
+                    // TODO: What's the difference between these two?
                     case Source::PrimaryColor:
+                    case Source::PrimaryFragmentColor:
                         return primary_color;
 
                     case Source::Texture0:
@@ -378,6 +470,25 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                         return result.Cast<u8>();
                     }
 
+                    case Operation::MultiplyThenAdd:
+                    {
+                        auto result = (input[0] * input[1] + 255 * input[2].Cast<int>()) / 255;
+                        result.r() = std::min(255, result.r());
+                        result.g() = std::min(255, result.g());
+                        result.b() = std::min(255, result.b());
+                        return result.Cast<u8>();
+                    }
+
+                    case Operation::AddThenMultiply:
+                    {
+                        auto result = input[0] + input[1];
+                        result.r() = std::min(255, result.r());
+                        result.g() = std::min(255, result.g());
+                        result.b() = std::min(255, result.b());
+                        result = (result * input[2].Cast<int>()) / 255;
+                        return result.Cast<u8>();
+                    }
+
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                         UNIMPLEMENTED();
@@ -402,6 +513,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     case Operation::Subtract:
                         return std::max(0, (int)input[0] - (int)input[1]);
 
+                    case Operation::MultiplyThenAdd:
+                        return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255);
+
+                    case Operation::AddThenMultiply:
+                        return (std::min(255, (input[0] + input[1])) * input[2]) / 255;
+
                     default:
                         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
                         UNIMPLEMENTED();
@@ -475,7 +592,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
 
             // TODO: Does depth indeed only get written even if depth testing is enabled?
             if (registers.output_merger.depth_test_enable) {
-                u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 +
+                u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 +
                             v1.screenpos[2].ToFloat32() * w1 +
                             v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
                 u16 ref_z = GetDepth(x >> 4, y >> 4);
@@ -524,6 +641,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             }
 
             auto dest = GetPixel(x >> 4, y >> 4);
+            Math::Vec4<u8> blend_output = combiner_output;
 
             if (registers.output_merger.alphablend_enable) {
                 auto params = registers.output_merger.alpha_blending;
@@ -574,7 +692,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
 
                     default:
                         LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
-                        exit(0);
+                        UNIMPLEMENTED();
                         break;
                     }
                 };
@@ -607,86 +725,78 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
 
                     default:
                         LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
-                        exit(0);
+                        UNIMPLEMENTED();
+                        break;
+                    }
+                };
+
+                using BlendEquation = decltype(params)::BlendEquation;
+                static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor,
+                                                       const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor,
+                                                       BlendEquation equation) {
+                    Math::Vec4<int> result;
+
+                    auto src_result = (src  *  srcfactor).Cast<int>();
+                    auto dst_result = (dest * destfactor).Cast<int>();
+
+                    switch (equation) {
+                    case BlendEquation::Add:
+                        result = (src_result + dst_result) / 255;
+                        break;
+
+                    case BlendEquation::Subtract:
+                        result = (src_result - dst_result) / 255;
+                        break;
+
+                    case BlendEquation::ReverseSubtract:
+                        result = (dst_result - src_result) / 255;
+                        break;
+
+                    // TODO: How do these two actually work?
+                    //       OpenGL doesn't include the blend factors in the min/max computations,
+                    //       but is this what the 3DS actually does?
+                    case BlendEquation::Min:
+                        result.r() = std::min(src.r(), dest.r());
+                        result.g() = std::min(src.g(), dest.g());
+                        result.b() = std::min(src.b(), dest.b());
+                        result.a() = std::min(src.a(), dest.a());
                         break;
+
+                    case BlendEquation::Max:
+                        result.r() = std::max(src.r(), dest.r());
+                        result.g() = std::max(src.g(), dest.g());
+                        result.b() = std::max(src.b(), dest.b());
+                        result.a() = std::max(src.a(), dest.a());
+                        break;
+
+                    default:
+                        LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", equation);
+                        UNIMPLEMENTED();
                     }
+
+                    return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255),
+                                    MathUtil::Clamp(result.g(), 0, 255),
+                                    MathUtil::Clamp(result.b(), 0, 255),
+                                    MathUtil::Clamp(result.a(), 0, 255));
                 };
 
                 auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
                                                LookupFactorA(params.factor_source_a));
                 auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
                                                LookupFactorA(params.factor_dest_a));
-                                               
-                auto src_result = (combiner_output * srcfactor).Cast<int>();
-                auto dst_result = (dest * dstfactor).Cast<int>();
-
-                switch (params.blend_equation_rgb) {
-                case params.Add:
-                {
-                    auto result = (src_result + dst_result) / 255;
-                    result.r() = std::min(255, result.r());
-                    result.g() = std::min(255, result.g());
-                    result.b() = std::min(255, result.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-                
-                case params.Subtract:
-                {
-                    auto result = (src_result - dst_result) / 255;
-                    result.r() = std::max(0, result.r());
-                    result.g() = std::max(0, result.g());
-                    result.b() = std::max(0, result.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-                
-                case params.ReverseSubtract:
-                {
-                    auto result = (dst_result - src_result) / 255;
-                    result.r() = std::max(0, result.r());
-                    result.g() = std::max(0, result.g());
-                    result.b() = std::max(0, result.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-                
-                case params.Min:
-                {
-                    // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
-                    Math::Vec4<int> result;
-                    result.r() = std::min(combiner_output.r(),dest.r());
-                    result.g() = std::min(combiner_output.g(),dest.g());
-                    result.b() = std::min(combiner_output.b(),dest.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-                
-                case params.Max:
-                {
-                    // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
-                    Math::Vec4<int> result;
-                    result.r() = std::max(combiner_output.r(),dest.r());
-                    result.g() = std::max(combiner_output.g(),dest.g());
-                    result.b() = std::max(combiner_output.b(),dest.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
 
-                default:
-                    LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value());
-                    exit(0);
-                }
+                blend_output     = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb);
+                blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
             } else {
                 LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
-                exit(0);
+                UNIMPLEMENTED();
             }
 
             const Math::Vec4<u8> result = {
-                registers.output_merger.red_enable   ? combiner_output.r() : dest.r(),
-                registers.output_merger.green_enable ? combiner_output.g() : dest.g(),
-                registers.output_merger.blue_enable  ? combiner_output.b() : dest.b(),
-                registers.output_merger.alpha_enable ? combiner_output.a() : dest.a()
+                registers.output_merger.red_enable   ? blend_output.r() : dest.r(),
+                registers.output_merger.green_enable ? blend_output.g() : dest.g(),
+                registers.output_merger.blue_enable  ? blend_output.b() : dest.b(),
+                registers.output_merger.alpha_enable ? blend_output.a() : dest.a()
             };
 
             DrawPixel(x >> 4, y >> 4, result);
@@ -694,6 +804,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     }
 }
 
+void ProcessTriangle(const VertexShader::OutputVertex& v0,
+                     const VertexShader::OutputVertex& v1,
+                     const VertexShader::OutputVertex& v2) {
+    ProcessTriangleInternal(v0, v1, v2); // Public entry point; "reversed" keeps its default (false)
+}
+
 } // namespace Rasterizer
 
 } // namespace Pica
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 272695174..95ab96340 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -4,7 +4,10 @@
 
 #include "core/hw/gpu.h"
 #include "core/mem_map.h"
+
 #include "common/emu_window.h"
+#include "common/profiler_reporting.h"
+
 #include "video_core/video_core.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
 #include "video_core/renderer_opengl/gl_shader_util.h"
@@ -75,9 +78,18 @@ void RendererOpenGL::SwapBuffers() {
 
     DrawScreens();
 
+    auto& profiler = Common::Profiling::GetProfilingManager();
+    profiler.FinishFrame();
+    {
+        auto aggregator = Common::Profiling::GetTimingResultsAggregator();
+        aggregator->AddFrame(profiler.GetPreviousFrameResults());
+    }
+
     // Swap buffers
     render_window->PollEvents();
     render_window->SwapBuffers();
+
+    profiler.BeginFrame();
 }
 
 /**
@@ -242,28 +254,26 @@ void RendererOpenGL::DrawSingleScreenRotated(const TextureInfo& texture, float x
  * Draws the emulated screens to the emulator window.
  */
 void RendererOpenGL::DrawScreens() {
-    auto viewport_extent = GetViewportExtent();
-    glViewport(viewport_extent.left, viewport_extent.top, viewport_extent.GetWidth(), viewport_extent.GetHeight()); // TODO: Or bottom?
+    auto layout = render_window->GetFramebufferLayout(); // Overall size plus one rectangle per screen
+
+    glViewport(0, 0, layout.width, layout.height); // Viewport covers the whole layout area
     glClear(GL_COLOR_BUFFER_BIT);
 
     glUseProgram(program_id);
 
     // Set projection matrix
-    std::array<GLfloat, 3*2> ortho_matrix = MakeOrthographicMatrix((float)resolution_width, (float)resolution_height);
+    std::array<GLfloat, 3 * 2> ortho_matrix = MakeOrthographicMatrix((float)layout.width,
+        (float)layout.height); // Built from the same layout dimensions as the viewport
     glUniformMatrix3x2fv(uniform_modelview_matrix, 1, GL_FALSE, ortho_matrix.data());
 
     // Bind texture in Texture Unit 0
     glActiveTexture(GL_TEXTURE0);
     glUniform1i(uniform_color_texture, 0);
 
-    const float max_width = std::max((float)VideoCore::kScreenTopWidth, (float)VideoCore::kScreenBottomWidth);
-    const float top_x = 0.5f * (max_width - VideoCore::kScreenTopWidth);
-    const float bottom_x = 0.5f * (max_width - VideoCore::kScreenBottomWidth);
-
-    DrawSingleScreenRotated(textures[0], top_x, 0,
-        (float)VideoCore::kScreenTopWidth, (float)VideoCore::kScreenTopHeight);
-    DrawSingleScreenRotated(textures[1], bottom_x, (float)VideoCore::kScreenTopHeight,
-        (float)VideoCore::kScreenBottomWidth, (float)VideoCore::kScreenBottomHeight);
+    DrawSingleScreenRotated(textures[0], (float)layout.top_screen.left, (float)layout.top_screen.top,
+        (float)layout.top_screen.GetWidth(), (float)layout.top_screen.GetHeight()); // Placed in layout.top_screen
+    DrawSingleScreenRotated(textures[1], (float)layout.bottom_screen.left,(float)layout.bottom_screen.top,
+        (float)layout.bottom_screen.GetWidth(), (float)layout.bottom_screen.GetHeight()); // Placed in layout.bottom_screen
 
     m_current_frame++;
 }
@@ -280,34 +290,6 @@ void RendererOpenGL::SetWindow(EmuWindow* window) {
     render_window = window;
 }
 
-MathUtil::Rectangle<unsigned> RendererOpenGL::GetViewportExtent() {
-    unsigned framebuffer_width;
-    unsigned framebuffer_height;
-    std::tie(framebuffer_width, framebuffer_height) = render_window->GetFramebufferSize();
-
-    float window_aspect_ratio = static_cast<float>(framebuffer_height) / framebuffer_width;
-    float emulation_aspect_ratio = static_cast<float>(resolution_height) / resolution_width;
-
-    MathUtil::Rectangle<unsigned> viewport_extent;
-    if (window_aspect_ratio > emulation_aspect_ratio) {
-        // Window is narrower than the emulation content => apply borders to the top and bottom
-        unsigned viewport_height = static_cast<unsigned>(std::round(emulation_aspect_ratio * framebuffer_width));
-        viewport_extent.left = 0;
-        viewport_extent.top = (framebuffer_height - viewport_height) / 2;
-        viewport_extent.right = viewport_extent.left + framebuffer_width;
-        viewport_extent.bottom = viewport_extent.top + viewport_height;
-    } else {
-        // Otherwise, apply borders to the left and right sides of the window.
-        unsigned viewport_width = static_cast<unsigned>(std::round(framebuffer_height / emulation_aspect_ratio));
-        viewport_extent.left = (framebuffer_width - viewport_width) / 2;
-        viewport_extent.top = 0;
-        viewport_extent.right = viewport_extent.left + viewport_width;
-        viewport_extent.bottom = viewport_extent.top + framebuffer_height;
-    }
-
-    return viewport_extent;
-}
-
 /// Initialize the renderer
 void RendererOpenGL::Init() {
     render_window->MakeCurrent();
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 6fd640425..bda793fa5 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -35,4 +35,54 @@ struct TGAHeader {
  */
 void DumpTGA(std::string filename, short width, short height, u8* raw_data);
 
+/**
+ * Interleaves the lower 3 bits of x (even bit positions) and y (odd bit positions) into a 6-bit
+ * intra-tile index following a Z-order curve. More details on the bit manipulation at:
+ * https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+ */
+static inline u32 MortonInterleave(u32 x, u32 y) {
+    u32 i = (x & 7) | ((y & 7) << 8); // ---- -210 (x in the low byte, y in the high byte)
+    i = (i ^ (i << 2)) & 0x1313;      // ---2 --10 (same spreading applied to both bytes)
+    i = (i ^ (i << 1)) & 0x1515;      // ---2 -1-0
+    i = (i | (i >> 7)) & 0x3F;        // --yx yxyx (y bits folded into the gaps between x bits)
+    return i;
+}
+
+/**
+ * Calculates the byte offset of pixel (x, y) within its row of 8x8 tiles: the offset of the
+ * tile column plus the Morton-ordered offset inside the tile. The vertical tile position is
+ * not part of the result; it has to be added separately (e.g. as (y & ~7) * stride).
+ */
+static inline u32 GetMortonOffset(u32 x, u32 y, u32 bytes_per_pixel) {
+    // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each
+    // of which is composed of four 2x2 subtiles each of which is composed of four texels.
+    // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g.
+    // texels are laid out in a 2x2 subtile like this:
+    // 2 3
+    // 0 1
+    //
+    // The full 8x8 tile has the texels arranged like this:
+    //
+    // 42 43 46 47 58 59 62 63
+    // 40 41 44 45 56 57 60 61
+    // 34 35 38 39 50 51 54 55
+    // 32 33 36 37 48 49 52 53
+    // 10 11 14 15 26 27 30 31
+    // 08 09 12 13 24 25 28 29
+    // 02 03 06 07 18 19 22 23
+    // 00 01 04 05 16 17 20 21
+    //
+    // This pattern is what's called Z-order curve, or Morton order.
+
+    const unsigned int block_height = 8;
+
+    const unsigned int coarse_x = x & ~7; // x rounded down to the start of its tile
+
+    u32 i = VideoCore::MortonInterleave(x, y); // Index of the pixel inside its tile (0..63)
+
+    const unsigned int offset = coarse_x * block_height; // Pixels in the tiles to the left
+
+    return (i + offset) * bytes_per_pixel;
+}
+
 } // namespace
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 80935a50a..bc8c0041c 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -85,8 +85,12 @@ struct VertexShaderState {
     };
 
     struct CallStackElement {
-        u32 final_address;
-        u32 return_address;
+        u32 final_address;  // Address at which this scope ends (we then loop or jump to return_address)
+        u32 return_address; // Where to jump when leaving scope
+        u8 repeat_counter;  // Remaining repetitions; the element is popped once this has reached zero
+        u8 loop_increment;  // Added to address_registers[2] (the loop counter) after each iteration
+                            // TODO: Should this be a signed value? Does it even matter?
+        u32 loop_address;   // The address where we'll return to after each loop iteration
     };
 
     // TODO: Is there a maximal size for this?
@@ -105,9 +109,16 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
     while (true) {
         if (!state.call_stack.empty()) {
-            if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) {
-                state.program_counter = &shader_memory[state.call_stack.top().return_address];
-                state.call_stack.pop();
+            auto& top = state.call_stack.top();
+            if (state.program_counter - shader_memory.data() == top.final_address) {
+                state.address_registers[2] += top.loop_increment;
+
+                if (top.repeat_counter-- == 0) {
+                    state.program_counter = &shader_memory[top.return_address];
+                    state.call_stack.pop();
+                } else {
+                    state.program_counter = &shader_memory[top.loop_address];
+                }
 
                 // TODO: Is "trying again" accurate to hardware?
                 continue;
@@ -118,9 +129,10 @@ static void ProcessShaderCode(VertexShaderState& state) {
         const Instruction& instr = *(const Instruction*)state.program_counter;
         const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
 
-        auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions, u32 return_offset) {
+        static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions,
+                              u32 return_offset, u8 repeat_count, u8 loop_increment) {
             state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
-            state.call_stack.push({ offset + num_instructions, return_offset });
+            state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
         };
         u32 binary_offset = state.program_counter - shader_memory.data();
 
@@ -457,7 +469,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 call(state,
                      instr.flow_control.dest_offset,
                      instr.flow_control.num_instructions,
-                     binary_offset + 1);
+                     binary_offset + 1, 0, 0);
                 break;
 
             case Instruction::OpCode::CALLU:
@@ -465,7 +477,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        binary_offset + 1);
+                        binary_offset + 1, 0, 0);
                 }
                 break;
 
@@ -474,7 +486,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        binary_offset + 1);
+                        binary_offset + 1, 0, 0);
                 }
                 break;
 
@@ -486,12 +498,12 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                          binary_offset + 1,
                          instr.flow_control.dest_offset - binary_offset - 1,
-                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
                          instr.flow_control.dest_offset,
                          instr.flow_control.num_instructions,
-                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 }
 
                 break;
@@ -504,17 +516,30 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                          binary_offset + 1,
                          instr.flow_control.dest_offset - binary_offset - 1,
-                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
                          instr.flow_control.dest_offset,
                          instr.flow_control.num_instructions,
-                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 }
 
                 break;
             }
 
+            case Instruction::OpCode::LOOP:
+            {
+                state.address_registers[2] = shader_uniforms.i[instr.flow_control.int_uniform_id].y;
+
+                call(state,
+                     binary_offset + 1,
+                     instr.flow_control.dest_offset - binary_offset + 1,
+                     instr.flow_control.dest_offset + 1,
+                     shader_uniforms.i[instr.flow_control.int_uniform_id].x,
+                     shader_uniforms.i[instr.flow_control.int_uniform_id].z);
+                break;
+            }
+
             default:
                 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index 0a236595c..b9d4ede3a 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -18,7 +18,6 @@ namespace VideoCore {
 
 EmuWindow*      g_emu_window    = nullptr;     ///< Frontend emulator window
 RendererBase*   g_renderer      = nullptr;     ///< Renderer plugin
-int             g_current_frame = 0;
 
 /// Initialize the video core
 void Init(EmuWindow* emu_window) {
@@ -27,8 +26,6 @@ void Init(EmuWindow* emu_window) {
     g_renderer->SetWindow(g_emu_window);
     g_renderer->Init();
 
-    g_current_frame = 0;
-
     LOG_DEBUG(Render, "initialized OK");
 }
 
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index b782f17bd..1b51d39bf 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -30,7 +30,6 @@ static const int kScreenBottomHeight    = 240;  ///< 3DS bottom screen height
 // ---------------------
 
 extern RendererBase*   g_renderer;              ///< Renderer plugin
-extern int             g_current_frame;         ///< Current frame
 extern EmuWindow*      g_emu_window;            ///< Emu window
 
 /// Start the video core