12 files changed, 306 insertions, 125 deletions
diff --git a/src/video_core/color.h b/src/video_core/color.h
index 43d635e2c..4d2026eb0 100644
--- a/src/video_core/color.h
+++ b/src/video_core/color.h
@@ -5,6 +5,8 @@
 #pragma once
 
 #include "common/common_types.h"
+#include "common/swap.h"
+
 #include "video_core/math.h"
 
 namespace Color {
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index e031871e8..1ea7cad07 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -27,6 +27,10 @@ static int float_regs_counter = 0;
 
 static u32 uniform_write_buffer[4];
 
+static int default_attr_counter = 0; // Number of words currently held in default_attr_write_buffer
+
+static u32 default_attr_write_buffer[3]; // Staging area for the three words of one packed default attribute
+
 Common::Profiling::TimingCategory category_drawing("Drawing");
 
 static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
@@ -71,12 +75,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             u32 vertex_attribute_sources[16];
             boost::fill(vertex_attribute_sources, 0xdeadbeef);
             u32 vertex_attribute_strides[16];
-            u32 vertex_attribute_formats[16];
+            Regs::VertexAttributeFormat vertex_attribute_formats[16];
 
-            // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
-            // This is one of the hacks required to deal with uninitalized vertex attributes.
-            // TODO: Fix this properly.
-            u32 vertex_attribute_elements[16] = {};
+            u32 vertex_attribute_elements[16];
             u32 vertex_attribute_element_size[16];
 
             // Setup attribute data from loaders
@@ -90,7 +91,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                     u32 attribute_index = loader_config.GetComponent(component);
                     vertex_attribute_sources[attribute_index] = load_address;
                     vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
-                    vertex_attribute_formats[attribute_index] = static_cast<u32>(attribute_config.GetFormat(attribute_index));
+                    vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
                     vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
                     vertex_attribute_element_size[attribute_index] = attribute_config.GetElementSizeInBytes(attribute_index);
                     load_address += attribute_config.GetStride(attribute_index);
@@ -101,7 +102,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed));
 
             const auto& index_info = registers.index_array;
-            const u8* index_address_8 = Memory::GetPointer(PAddrToVAddr(base_address + index_info.offset));
+            const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
             const u16* index_address_16 = (u16*)index_address_8;
             bool index_u16 = index_info.format != 0;
 
@@ -126,26 +127,29 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 input.attr[0].w = debug_token;
 
                 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
-                    for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
-                        const u8* srcdata = Memory::GetPointer(PAddrToVAddr(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]));
-
-                        // TODO(neobrain): Ocarina of Time 3D has GetNumTotalAttributes return 8,
-                        // yet only provides 2 valid source data addresses. Need to figure out
-                        // what's wrong there, until then we just continue when address lookup fails
-                        if (srcdata == nullptr)
-                            continue;
-
-                        const float srcval = (vertex_attribute_formats[i] == 0) ? *(s8*)srcdata :
-                                             (vertex_attribute_formats[i] == 1) ? *(u8*)srcdata :
-                                             (vertex_attribute_formats[i] == 2) ? *(s16*)srcdata :
-                                                                                  *(float*)srcdata;
-                        input.attr[i][comp] = float24::FromFloat32(srcval);
-                        LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
-                                  comp, i, vertex, index,
-                                  attribute_config.GetPhysicalBaseAddress(),
-                                  vertex_attribute_sources[i] - base_address,
-                                  vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
-                                  input.attr[i][comp].ToFloat32());
+                    if (attribute_config.IsDefaultAttribute(i)) {
+                        input.attr[i] = VertexShader::GetDefaultAttribute(i);
+                        LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
+                                  i, vertex, index,
+                                  input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
+                                  input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
+                    } else {
+                        for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                            const u8* srcdata = Memory::GetPhysicalPointer(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]);
+
+                            const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
+                                (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
+                                (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata :
+                                *(float*)srcdata;
+
+                            input.attr[i][comp] = float24::FromFloat32(srcval);
+                            LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
+                                      comp, i, vertex, index,
+                                      attribute_config.GetPhysicalBaseAddress(),
+                                      vertex_attribute_sources[i] - base_address,
+                                      vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
+                                      input.attr[i][comp].ToFloat32());
+                        }
                     }
                 }
 
@@ -224,7 +228,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             //       it directly write the values?
             uniform_write_buffer[float_regs_counter++] = value;
 
-            // Uniforms are written in a packed format such that 4 float24 values are encoded in
+            // Uniforms are written in a packed format such that four float24 values are encoded in
             // three 32-bit numbers. We write to internal memory once a full such vector is
             // written.
             if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) ||
@@ -259,6 +263,46 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             }
             break;
         }
+        
+        // Load default ("fall-back") vertex input attributes
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
+        {
+            // TODO: Does actual hardware indeed keep an intermediate buffer or does
+            //       it directly write the values?
+            default_attr_write_buffer[default_attr_counter++] = value;
+
+            // Default attributes are written in a packed format: four float24 values are encoded
+            // in three 32-bit words. The attribute is only decoded and stored once all three
+            // words have been written.
+            if (default_attr_counter >= 3) {
+                default_attr_counter = 0;
+
+                auto& setup = registers.vs_default_attributes_setup;
+
+                if (setup.index >= 16) { // the three buffered words are dropped in this case
+                    LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
+                    break;
+                }
+
+                Math::Vec4<float24>& attribute = VertexShader::GetDefaultAttribute(setup.index);
+                
+                // NOTE: The destination component order is indeed "backwards": the first word starts with w, the last ends with x
+                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
+                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
+                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
+                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
+
+                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
+                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
+                          attribute.w.ToFloat32());
+
+                // Advance the index so the next three words fill the following attribute. TODO: Verify that this actually modifies the register!
+                setup.index = setup.index + 1;
+            }
+            break;
+        }
 
         // Load shader program code
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 83982b4f2..883df48a5 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -393,6 +393,17 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
         }
     }
 
+    case Regs::TextureFormat::I4: // 4 bits per texel, i.e. two texels share one source byte
+    {
+        u32 morton_offset = VideoCore::GetMortonOffset(x, y, 1);
+        const u8* source_ptr = source + morton_offset / 2;
+
+        u8 i = (morton_offset % 2) ? ((*source_ptr & 0xF0) >> 4) : (*source_ptr & 0xF); // odd offset: high nibble, even offset: low nibble
+        i = Color::Convert4To8(i); // presumably widens the 4-bit value to 8 bits; confirm in Color::Convert4To8
+
+        return { i, i, i, 255 }; // same value in the first three components, fourth fixed at 255
+    }
+
     case Regs::TextureFormat::A4:
     {
         u32 morton_offset = VideoCore::GetMortonOffset(x, y, 1);
@@ -507,7 +518,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
                 // Add modifier
                 unsigned table_index = (x < 2) ? table_index_1.Value() : table_index_2.Value();
 
-                static const auto etc1_modifier_table = std::array<std::array<u8, 2>, 8>{{
+                static const std::array<std::array<u8, 2>, 8> etc1_modifier_table = {{
                     {  2,  8 }, {  5, 17 }, {  9,  29 }, { 13,  42 },
                     { 18, 60 }, { 24, 80 }, { 33, 106 }, { 47, 183 }
                 }};
@@ -597,7 +608,7 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
 
     png_init_io(png_ptr, fp.GetHandle());
 
-    // Write header (8 bit colour depth)
+    // Write header (8 bit color depth)
     png_set_IHDR(png_ptr, info_ptr, texture_config.width, texture_config.height,
         8, PNG_COLOR_TYPE_RGB /*_ALPHA*/, PNG_INTERLACE_NONE,
         PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index fe20cd77d..3fbf95721 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -10,10 +10,11 @@
 #include <map>
 #include <vector>
 
+#include "common/assert.h"
 #include "common/bit_field.h"
+#include "common/common_funcs.h"
 #include "common/common_types.h"
-
-#include "core/mem_map.h"
+#include "common/logging/log.h"
 
 namespace Pica {
 
@@ -153,7 +154,7 @@ struct Regs {
         I8           =  7,
         A8           =  8,
         IA4          =  9,
-
+        I4           = 10,
         A4           = 11,
         ETC1         = 12,  // compressed
         ETC1A4       = 13,  // compressed
@@ -223,7 +224,8 @@ struct Regs {
             Texture1               = 0x4,
             Texture2               = 0x5,
             Texture3               = 0x6,
-            // 0x7-0xc = primary color??
+
+            PreviousBuffer         = 0xd,
             Constant               = 0xe,
             Previous               = 0xf,
         };
@@ -296,7 +298,18 @@ struct Regs {
             BitField<24, 8, u32> const_a;
         };
 
-        INSERT_PADDING_WORDS(0x1);
+        union { // Scale selectors for this stage's color and alpha results
+            BitField< 0, 2, u32> color_scale;
+            BitField<16, 2, u32> alpha_scale;
+        };
+
+        inline unsigned GetColorMultiplier() const { // 1, 2 or 4; a color_scale of 3 falls back to 1
+            return (color_scale < 3) ? (1 << color_scale) : 1;
+        }
+
+        inline unsigned GetAlphaMultiplier() const { // 1, 2 or 4; an alpha_scale of 3 falls back to 1
+            return (alpha_scale < 3) ? (1 << alpha_scale) : 1;
+        }
     };
 
     TevStageConfig tev_stage0;
@@ -306,11 +319,36 @@ struct Regs {
     TevStageConfig tev_stage2;
     INSERT_PADDING_WORDS(0x3);
     TevStageConfig tev_stage3;
-    INSERT_PADDING_WORDS(0x13);
+    INSERT_PADDING_WORDS(0x3);
+
+    union {
+        // Tev stages 0-3 write their output to the combiner buffer if the corresponding bit in
+        // the respective mask is set (bit N of each mask belongs to stage N)
+        BitField< 8, 4, u32> update_mask_rgb;
+        BitField<12, 4, u32> update_mask_a;
+
+        bool TevStageUpdatesCombinerBufferColor(unsigned stage_index) const { // always false for stage_index >= 4
+            return (stage_index < 4) && (update_mask_rgb & (1 << stage_index));
+        }
+
+        bool TevStageUpdatesCombinerBufferAlpha(unsigned stage_index) const { // always false for stage_index >= 4
+            return (stage_index < 4) && (update_mask_a & (1 << stage_index));
+        }
+    } tev_combiner_buffer_input;
+    
+    INSERT_PADDING_WORDS(0xf);
     TevStageConfig tev_stage4;
     INSERT_PADDING_WORDS(0x3);
     TevStageConfig tev_stage5;
-    INSERT_PADDING_WORDS(0x3);
+
+    union { // Color register with four 8-bit channels packed into one 32-bit word
+        BitField< 0, 8, u32> r;
+        BitField< 8, 8, u32> g;
+        BitField<16, 8, u32> b;
+        BitField<24, 8, u32> a;
+    } tev_combiner_buffer_color;
+
+    INSERT_PADDING_WORDS(0x2);
 
     const std::array<Regs::TevStageConfig,6> GetTevStages() const {
         return { tev_stage0, tev_stage1,
@@ -423,9 +461,7 @@ struct Regs {
         D24S8  = 3
     };
 
-    /*
-     * Returns the number of bytes in the specified depth format
-     */
+    // Returns the number of bytes in the specified depth format
     static u32 BytesPerDepthPixel(DepthFormat format) {
         switch (format) {
         case DepthFormat::D16:
@@ -440,6 +476,20 @@ struct Regs {
         }
     }
 
+    // Returns the number of bits of the depth component in the specified depth format (D24S8 reports 24)
+    static u32 DepthBitsPerPixel(DepthFormat format) {
+        switch (format) {
+        case DepthFormat::D16:
+            return 16;
+        case DepthFormat::D24:
+        case DepthFormat::D24S8:
+            return 24;
+        default: // Unknown format: log it and flag it as unimplemented; no value is returned on this path
+            LOG_CRITICAL(HW_GPU, "Unknown depth format %u", static_cast<u32>(format)); // cast so the argument matches %u
+            UNIMPLEMENTED();
+        }
+    }
+
     struct {
         // Components are laid out in reverse byte order, most significant bits first.
         enum ColorFormat : u32 {
@@ -489,14 +539,14 @@ struct Regs {
 
     INSERT_PADDING_WORDS(0xe0);
 
-    struct {
-        enum class Format : u64 {
-            BYTE = 0,
-            UBYTE = 1,
-            SHORT = 2,
-            FLOAT = 3,
-        };
+    enum class VertexAttributeFormat : u64 { // Element type of a vertex attribute, stored in the 2-bit formatN fields
+        BYTE = 0,  // 1 byte per element
+        UBYTE = 1, // 1 byte per element
+        SHORT = 2, // 2 bytes per element
+        FLOAT = 3, // 4 bytes per element
+    };
 
+    struct {
         BitField<0, 29, u32> base_address;
 
         u32 GetPhysicalBaseAddress() const {
@@ -505,29 +555,29 @@ struct Regs {
 
         // Descriptor for internal vertex attributes
         union {
-            BitField< 0,  2, Format> format0; // size of one element
+            BitField< 0,  2, VertexAttributeFormat> format0; // size of one element
             BitField< 2,  2, u64> size0;      // number of elements minus 1
-            BitField< 4,  2, Format> format1;
+            BitField< 4,  2, VertexAttributeFormat> format1;
             BitField< 6,  2, u64> size1;
-            BitField< 8,  2, Format> format2;
+            BitField< 8,  2, VertexAttributeFormat> format2;
             BitField<10,  2, u64> size2;
-            BitField<12,  2, Format> format3;
+            BitField<12,  2, VertexAttributeFormat> format3;
             BitField<14,  2, u64> size3;
-            BitField<16,  2, Format> format4;
+            BitField<16,  2, VertexAttributeFormat> format4;
             BitField<18,  2, u64> size4;
-            BitField<20,  2, Format> format5;
+            BitField<20,  2, VertexAttributeFormat> format5;
             BitField<22,  2, u64> size5;
-            BitField<24,  2, Format> format6;
+            BitField<24,  2, VertexAttributeFormat> format6;
             BitField<26,  2, u64> size6;
-            BitField<28,  2, Format> format7;
+            BitField<28,  2, VertexAttributeFormat> format7;
             BitField<30,  2, u64> size7;
-            BitField<32,  2, Format> format8;
+            BitField<32,  2, VertexAttributeFormat> format8;
             BitField<34,  2, u64> size8;
-            BitField<36,  2, Format> format9;
+            BitField<36,  2, VertexAttributeFormat> format9;
             BitField<38,  2, u64> size9;
-            BitField<40,  2, Format> format10;
+            BitField<40,  2, VertexAttributeFormat> format10;
             BitField<42,  2, u64> size10;
-            BitField<44,  2, Format> format11;
+            BitField<44,  2, VertexAttributeFormat> format11;
             BitField<46,  2, u64> size11;
 
             BitField<48, 12, u64> attribute_mask;
@@ -536,8 +586,8 @@ struct Regs {
             BitField<60,  4, u64> num_extra_attributes;
         };
 
-        inline Format GetFormat(int n) const {
-            Format formats[] = {
+        inline VertexAttributeFormat GetFormat(int n) const {
+            VertexAttributeFormat formats[] = {
                 format0, format1, format2, format3,
                 format4, format5, format6, format7,
                 format8, format9, format10, format11
@@ -555,14 +605,18 @@ struct Regs {
         }
 
         inline int GetElementSizeInBytes(int n) const {
-            return (GetFormat(n) == Format::FLOAT) ? 4 :
-                (GetFormat(n) == Format::SHORT) ? 2 : 1;
+            return (GetFormat(n) == VertexAttributeFormat::FLOAT) ? 4 :
+                (GetFormat(n) == VertexAttributeFormat::SHORT) ? 2 : 1;
         }
 
         inline int GetStride(int n) const {
             return GetNumElements(n) * GetElementSizeInBytes(n);
         }
 
+        inline bool IsDefaultAttribute(int id) const { // true if id has no format slot (only 0-11 exist) or its attribute_mask bit is set
+            return (id >= 12) || (attribute_mask & (1 << id)) != 0;
+        }
+
         inline int GetNumTotalAttributes() const {
             return (int)num_extra_attributes+1;
         }
@@ -625,7 +679,18 @@ struct Regs {
     u32 trigger_draw;
     u32 trigger_draw_indexed;
 
-    INSERT_PADDING_WORDS(0x2e);
+    INSERT_PADDING_WORDS(0x2);
+
+    // These registers are used to set up the default "fall-back" vertex shader attributes
+    struct {
+        // Index of the current default attribute
+        u32 index;
+        
+        // Writing to these registers sets the "current" default attribute (three words per attribute).
+        u32 set_value[3];
+    } vs_default_attributes_setup;
+    
+    INSERT_PADDING_WORDS(0x28);
 
     enum class TriangleTopology : u32 {
         List        = 0,
@@ -669,7 +734,7 @@ struct Regs {
         BitField<56, 4, u64> attribute14_register;
         BitField<60, 4, u64> attribute15_register;
 
-        int GetRegisterForAttribute(int attribute_index) {
+        int GetRegisterForAttribute(int attribute_index) const {
             u64 fields[] = {
                 attribute0_register,  attribute1_register,  attribute2_register,  attribute3_register,
                 attribute4_register,  attribute5_register,  attribute6_register,  attribute7_register,
@@ -766,8 +831,10 @@ struct Regs {
         ADD_FIELD(tev_stage1);
         ADD_FIELD(tev_stage2);
         ADD_FIELD(tev_stage3);
+        ADD_FIELD(tev_combiner_buffer_input);
         ADD_FIELD(tev_stage4);
         ADD_FIELD(tev_stage5);
+        ADD_FIELD(tev_combiner_buffer_color);
         ADD_FIELD(output_merger);
         ADD_FIELD(framebuffer);
         ADD_FIELD(vertex_attributes);
@@ -775,6 +842,7 @@ struct Regs {
         ADD_FIELD(num_vertices);
         ADD_FIELD(trigger_draw);
         ADD_FIELD(trigger_draw_indexed);
+        ADD_FIELD(vs_default_attributes_setup);
         ADD_FIELD(triangle_topology);
         ADD_FIELD(vs_bool_uniforms);
         ADD_FIELD(vs_int_uniforms);
@@ -840,8 +908,10 @@ ASSERT_REG_POSITION(tev_stage0, 0xc0);
 ASSERT_REG_POSITION(tev_stage1, 0xc8);
 ASSERT_REG_POSITION(tev_stage2, 0xd0);
 ASSERT_REG_POSITION(tev_stage3, 0xd8);
+ASSERT_REG_POSITION(tev_combiner_buffer_input, 0xe0);
 ASSERT_REG_POSITION(tev_stage4, 0xf0);
 ASSERT_REG_POSITION(tev_stage5, 0xf8);
+ASSERT_REG_POSITION(tev_combiner_buffer_color, 0xfd);
 ASSERT_REG_POSITION(output_merger, 0x100);
 ASSERT_REG_POSITION(framebuffer, 0x110);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
@@ -849,6 +919,7 @@ ASSERT_REG_POSITION(index_array, 0x227);
 ASSERT_REG_POSITION(num_vertices, 0x228);
 ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
+ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232);
 ASSERT_REG_POSITION(triangle_topology, 0x25e);
 ASSERT_REG_POSITION(vs_bool_uniforms, 0x2b0);
 ASSERT_REG_POSITION(vs_int_uniforms, 0x2b1);
@@ -978,15 +1049,4 @@ union CommandHeader {
     BitField<31,  1, u32> group_commands;
 };
 
-// TODO: Ugly, should fix PhysicalToVirtualAddress instead
-inline static u32 PAddrToVAddr(u32 addr) {
-    if (addr >= Memory::VRAM_PADDR && addr < Memory::VRAM_PADDR + Memory::VRAM_SIZE) {
-        return addr - Memory::VRAM_PADDR + Memory::VRAM_VADDR;
-    } else if (addr >= Memory::FCRAM_PADDR && addr < Memory::FCRAM_PADDR + Memory::FCRAM_SIZE) {
-        return addr - Memory::FCRAM_PADDR + Memory::HEAP_LINEAR_VADDR;
-    } else {
-        return 0;
-    }
-}
-
 } // namespace
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index dd46f0ec3..59eff48f9 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -6,8 +6,11 @@
 
 #include "common/common_types.h"
 #include "common/math_util.h"
+#include "common/profiler.h"
 
 #include "core/hw/gpu.h"
+#include "core/memory.h"
+
 #include "debug_utils/debug_utils.h"
 #include "math.h"
 #include "color.h"
@@ -30,7 +33,7 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     const u32 coarse_y = y & ~7;
     u32 bytes_per_pixel = GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
     u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * registers.framebuffer.width * bytes_per_pixel;
-    u8* dst_pixel = Memory::GetPointer(PAddrToVAddr(addr)) + dst_offset;
+    u8* dst_pixel = Memory::GetPhysicalPointer(addr) + dst_offset;
 
     switch (registers.framebuffer.color_format) {
     case registers.framebuffer.RGBA8:
@@ -67,7 +70,7 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
     const u32 coarse_y = y & ~7;
     u32 bytes_per_pixel = GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
     u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * registers.framebuffer.width * bytes_per_pixel;
-    u8* src_pixel = Memory::GetPointer(PAddrToVAddr(addr)) + src_offset;
+    u8* src_pixel = Memory::GetPhysicalPointer(addr) + src_offset;
 
     switch (registers.framebuffer.color_format) {
     case registers.framebuffer.RGBA8:
@@ -90,12 +93,12 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
         UNIMPLEMENTED();
     }
 
-    return {};
+    return {0, 0, 0, 0};
 }
 
 static u32 GetDepth(int x, int y) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
-    u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));
+    u8* depth_buffer = Memory::GetPhysicalPointer(addr);
 
     y = (registers.framebuffer.height - y);
     
@@ -122,7 +125,7 @@ static u32 GetDepth(int x, int y) {
 
 static void SetDepth(int x, int y, u32 value) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
-    u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));
+    u8* depth_buffer = Memory::GetPhysicalPointer(addr);
 
     y = (registers.framebuffer.height - y);
 
@@ -186,6 +189,8 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
     return Math::Cross(vec1, vec2).z;
 };
 
+static Common::Profiling::TimingCategory rasterization_category("Rasterization");
+
 /**
  * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
  * culling via recursion.
@@ -195,6 +200,8 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                                     const VertexShader::OutputVertex& v2,
                                     bool reversed = false)
 {
+    Common::Profiling::ScopeTimer timer(rasterization_category);
+
     // vertex positions in rasterizer coordinates
     static auto FloatToFix = [](float24 flt) {
         // TODO: Rounding here is necessary to prevent garbage pixels at
@@ -342,10 +349,10 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 
                         case Regs::TextureConfig::MirroredRepeat:
                         {
-                            int coord = (int)((unsigned)val % (2 * size));
+                            unsigned int coord = ((unsigned)val % (2 * size));
                             if (coord >= size)
                                 coord = 2 * size - 1 - coord;
-                            return coord;
+                            return (int)coord;
                         }
 
                         default:
@@ -361,7 +368,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
                 t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
 
-                u8* texture_data = Memory::GetPointer(PAddrToVAddr(texture.config.GetPhysicalAddress()));
+                u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
                 auto info = DebugUtils::TextureInfo::FromPicaRegister(texture.config, texture.format);
 
                 texture_color[i] = DebugUtils::LookupTexture(texture_data, s, t, info);
@@ -376,7 +383,13 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
             // with some basic arithmetic. Alpha combiners can be configured separately but work
             // analogously.
             Math::Vec4<u8> combiner_output;
-            for (const auto& tev_stage : tev_stages) {
+            Math::Vec4<u8> combiner_buffer = {
+                registers.tev_combiner_buffer_color.r, registers.tev_combiner_buffer_color.g,
+                registers.tev_combiner_buffer_color.b, registers.tev_combiner_buffer_color.a
+            };
+
+            for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size(); ++tev_stage_index) {
+                const auto& tev_stage = tev_stages[tev_stage_index];
                 using Source = Regs::TevStageConfig::Source;
                 using ColorModifier = Regs::TevStageConfig::ColorModifier;
                 using AlphaModifier = Regs::TevStageConfig::AlphaModifier;
@@ -398,6 +411,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     case Source::Texture2:
                         return texture_color[2];
 
+                    case Source::PreviousBuffer:
+                        return combiner_buffer;
+
                     case Source::Constant:
                         return {tev_stage.const_r, tev_stage.const_g, tev_stage.const_b, tev_stage.const_a};
 
@@ -407,7 +423,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner source %d\n", (int)source);
                         UNIMPLEMENTED();
-                        return {};
+                        return {0, 0, 0, 0};
                     }
                 };
 
@@ -490,6 +506,16 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                         return result.Cast<u8>();
                     }
 
+                    case Operation::AddSigned: // input[0] + input[1] - 128 per color channel, clamped to [0, 255]
+                    {
+                        // TODO(bunnei): Verify that the color conversion from (float) 0.5f to (byte) 128 is correct
+                        auto result = input[0].Cast<int>() + input[1].Cast<int>() - Math::MakeVec<int>(128, 128, 128);
+                        result.r() = MathUtil::Clamp<int>(result.r(), 0, 255);
+                        result.g() = MathUtil::Clamp<int>(result.g(), 0, 255);
+                        result.b() = MathUtil::Clamp<int>(result.b(), 0, 255);
+                        return result.Cast<u8>();
+                    }
+
                     case Operation::Lerp:
                         return ((input[0] * input[2] + input[1] * (Math::MakeVec<u8>(255, 255, 255) - input[2]).Cast<u8>()) / 255).Cast<u8>();
 
@@ -524,7 +550,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                         UNIMPLEMENTED();
-                        return {};
+                        return {0, 0, 0};
                     }
                 };
 
@@ -578,7 +604,20 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                 };
                 auto alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result);
 
-                combiner_output = Math::MakeVec(color_output, alpha_output);
+                combiner_output[0] = std::min((unsigned)255, color_output.r() * tev_stage.GetColorMultiplier());
+                combiner_output[1] = std::min((unsigned)255, color_output.g() * tev_stage.GetColorMultiplier());
+                combiner_output[2] = std::min((unsigned)255, color_output.b() * tev_stage.GetColorMultiplier());
+                combiner_output[3] = std::min((unsigned)255, alpha_output * tev_stage.GetAlphaMultiplier());
+
+                if (registers.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor(tev_stage_index)) {
+                    combiner_buffer.r() = combiner_output.r();
+                    combiner_buffer.g() = combiner_output.g();
+                    combiner_buffer.b() = combiner_output.b();
+                }
+
+                if (registers.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha(tev_stage_index)) {
+                    combiner_buffer.a() = combiner_output.a();
+                }
             }
 
             if (registers.output_merger.alpha_test.enable) {
@@ -624,9 +663,10 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 
             // TODO: Does depth indeed only get written even if depth testing is enabled?
             if (registers.output_merger.depth_test_enable) {
-                u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 +
-                            v1.screenpos[2].ToFloat32() * w1 +
-                            v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
+                unsigned num_bits = Pica::Regs::DepthBitsPerPixel(registers.framebuffer.depth_format);
+                u32 z = (u32)((v0.screenpos[2].ToFloat32() * w0 +
+                               v1.screenpos[2].ToFloat32() * w1 +
+                               v2.screenpos[2].ToFloat32() * w2) * ((1 << num_bits) - 1) / wsum);
                 u32 ref_z = GetDepth(x >> 4, y >> 4);
 
                 bool pass = false;
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index b77f29c11..b62409538 100644
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include "common/common.h"
+#include "common/common_types.h"
 
 class RendererBase : NonCopyable {
 public:
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 4273a177f..71ceb021b 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -5,9 +5,11 @@
 #include "core/hw/gpu.h"
 #include "core/hw/hw.h"
 #include "core/hw/lcd.h"
-#include "core/mem_map.h"
+#include "core/memory.h"
+#include "core/settings.h"
 
 #include "common/emu_window.h"
+#include "common/logging/log.h"
 #include "common/profiler_reporting.h"
 
 #include "video_core/video_core.h"
@@ -117,15 +119,15 @@ void RendererOpenGL::SwapBuffers() {
 void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& framebuffer,
                                              const TextureInfo& texture) {
 
-    const VAddr framebuffer_vaddr = Memory::PhysicalToVirtualAddress(
-        framebuffer.active_fb == 0 ? framebuffer.address_left1 : framebuffer.address_left2);
+    const PAddr framebuffer_addr = framebuffer.active_fb == 0 ?
+            framebuffer.address_left1 : framebuffer.address_left2;
 
     LOG_TRACE(Render_OpenGL, "0x%08x bytes from 0x%08x(%dx%d), fmt %x",
         framebuffer.stride * framebuffer.height,
-        framebuffer_vaddr, (int)framebuffer.width,
+        framebuffer_addr, (int)framebuffer.width,
         (int)framebuffer.height, (int)framebuffer.format);
 
-    const u8* framebuffer_data = Memory::GetPointer(framebuffer_vaddr);
+    const u8* framebuffer_data = Memory::GetPhysicalPointer(framebuffer_addr);
 
     int bpp = GPU::Regs::BytesPerPixel(framebuffer.color_format);
     size_t pixel_stride = framebuffer.stride / bpp;
@@ -172,7 +174,7 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color
  * Initializes the OpenGL state and creates persistent objects.
  */
 void RendererOpenGL::InitOpenGLObjects() {
-    glClearColor(1.0f, 1.0f, 1.0f, 0.0f);
+    glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, 0.0f);
     glDisable(GL_DEPTH_TEST);
 
     // Link shaders and get variable locations
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index bda793fa5..ffb3e73a3 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -13,10 +13,10 @@ namespace VideoCore {
 /// Structure for the TGA texture format (for dumping)
 struct TGAHeader {
     char  idlength;
-    char  colourmaptype;
+    char  colormaptype;
     char  datatypecode;
-    short int colourmaporigin;
-    short int colourmaplength;
+    short int colormaporigin;
+    short int colormaplength;
     short int x_origin;
     short int y_origin;
     short width;
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index e8d865172..981d1a356 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -8,10 +8,9 @@
 
 #include <common/file_util.h>
 
-#include <core/mem_map.h>
-
 #include <nihstro/shader_bytecode.h>
 
+#include "common/profiler.h"
 
 #include "pica.h"
 #include "vertex_shader.h"
@@ -35,6 +34,8 @@ static struct {
     std::array<Math::Vec4<u8>,4> i;
 } shader_uniforms;
 
+static Math::Vec4<float24> vs_default_attributes[16];
+
 // TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
 // For now, we just keep these local arrays around.
 static std::array<u32, 1024> shader_memory;
@@ -60,6 +61,10 @@ Math::Vec4<u8>& GetIntUniform(u32 index) {
     return shader_uniforms.i[index];
 }
 
+Math::Vec4<float24>& GetDefaultAttribute(u32 index) { // returns a mutable reference into vs_default_attributes
+    return vs_default_attributes[index]; // unchecked access: the array has 16 entries, so index must be < 16
+}
+
 const std::array<u32, 1024>& GetShaderBinary() {
     return shader_memory;
 }
@@ -229,6 +234,15 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 break;
             }
 
+            case OpCode::Id::FLR: // per-component std::floor of src1 written to dest
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue; // leave dest[i] untouched when the swizzle does not enable this component
+
+                    dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32())); // floor is computed in 32-bit float, then converted back to float24
+                }
+                break;
+
             case OpCode::Id::MAX:
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
@@ -360,12 +374,15 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
         case OpCode::Type::MultiplyAdd:
         {
-            if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) {
+            if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) || 
+                (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) {
                 const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id];
 
-                const float24* src1_ = LookupSourceRegister(instr.mad.src1);
-                const float24* src2_ = LookupSourceRegister(instr.mad.src2);
-                const float24* src3_ = LookupSourceRegister(instr.mad.src3);
+                bool is_inverted = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI);
+
+                const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
+                const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted));
+                const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted));
 
                 const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
                 const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
@@ -556,7 +573,11 @@ static void ProcessShaderCode(VertexShaderState& state) {
     }
 }
 
+static Common::Profiling::TimingCategory shader_category("Vertex Shader");
+
 OutputVertex RunShader(const InputVertex& input, int num_attributes) {
+    Common::Profiling::ScopeTimer timer(shader_category);
+
     VertexShaderState state;
 
     const u32* main = &shader_memory[registers.vs_main_offset];
@@ -568,22 +589,23 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
     const auto& attribute_register_map = registers.vs_input_register_map;
     float24 dummy_register;
     boost::fill(state.input_register_table, &dummy_register);
-    if(num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
-    if(num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
-    if(num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
-    if(num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x;
-    if(num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x;
-    if(num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x;
-    if(num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x;
-    if(num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x;
-    if(num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x;
-    if(num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x;
-    if(num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x;
-    if(num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x;
-    if(num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x;
-    if(num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x;
-    if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
-    if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
+    
+    if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
+    if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
+    if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
+    if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x;
+    if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x;
+    if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x;
+    if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x;
+    if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x;
+    if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x;
+    if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x;
+    if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x;
+    if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x;
+    if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x;
+    if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x;
+    if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
+    if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
 
     state.conditional_code[0] = false;
     state.conditional_code[1] = false;
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
index 3a68a3409..c26709bbc 100644
--- a/src/video_core/vertex_shader.h
+++ b/src/video_core/vertex_shader.h
@@ -74,6 +74,7 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes);
 Math::Vec4<float24>& GetFloatUniform(u32 index);
 bool& GetBoolUniform(u32 index);
 Math::Vec4<u8>& GetIntUniform(u32 index);
+Math::Vec4<float24>& GetDefaultAttribute(u32 index);
 
 const std::array<u32, 1024>& GetShaderBinary();
 const std::array<u32, 1024>& GetSwizzlePatterns();
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index b9d4ede3a..42e3bdd5b 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -2,7 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include "common/common.h"
+#include "common/logging/log.h"
 #include "common/emu_window.h"
 
 #include "core/core.h"
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index 1b51d39bf..f885bec21 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -4,7 +4,6 @@
 
 #pragma once
 
-#include "common/common.h"
 #include "common/emu_window.h"
 
 #include "renderer_base.h"